Commit 4a7cead0 authored by Nathalia Moraes do Nascimento's avatar Nathalia Moraes do Nascimento
Browse files

general notebooks: data retrieval, pre-processing, feature engineering and visualization

parent 593c2abb
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade pip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#bib para load arquivo tipo xls\n",
"!pip install xlrd \n",
"!pip install openpyxl\n",
"!pip install xlwt\n",
"!pip install xlsxwriter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting pandas\n",
" Downloading pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0 MB)\n",
"\u001b[K |████████████████████████████████| 10.0 MB 543 kB/s eta 0:00:01 |█████████████▎ | 4.2 MB 67 kB/s eta 0:01:27\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n",
"Collecting pytz>=2017.2\n",
" Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)\n",
"\u001b[K |████████████████████████████████| 510 kB 577 kB/s eta 0:00:01\n",
"\u001b[?25hRequirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas) (1.13.0)\n",
"Installing collected packages: pytz, pandas\n",
"Successfully installed pandas-1.0.3 pytz-2020.1\n"
]
}
],
"source": [
"!pip install pandas "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pandas import DataFrame\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.0.3'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filePath = '../../nota-full.dsv' \n",
"list_columns_to_read = ['xx','xxxx']\n",
"list_columns_to_analyze = ['freetext']\n",
"# Refatorar para que as variaveis possam ser definidas em outros notebooks \n",
"# ver https://www.oreilly.com/library/view/python-cookbook/0596001673/ch17s02.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Para reduzir dimensionalidade, se atentar para:\n",
"- evitar colunas com muito 'missing values' - observar número de rows para cada coluna em df.info() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def retrieve_data(sep='£', sheet_name=None, filePath=filePath, list_columns_to_read=list_columns_to_read, \n",
" list_columns_to_analyze=list_columns_to_analyze):\n",
" begin = time.time()\n",
" if(sep=='xls'):\n",
" df = pd.read_excel(filePath, sheet_name=sheet_name,dtype=object)\n",
" else:\n",
" df = pd.read_csv(filePath, sep=sep, encoding='utf-8',dtype=object)\n",
" #CLEAN the header row before using their columns as index types\n",
" df.columns = df.columns.str.strip().str.lower().str.replace(';', '').str.replace('\"', '').str.replace(' ', '').str.replace('_', '').str.replace('(', '').str.replace(')', '')\n",
" print(df.columns)\n",
" print(df.info())\n",
" \n",
" #print(df['freetext'].describe())\n",
" # COLUNAS PARA SELECIONAR - se a coluna selecionada tiver muitos rows = null, entao aplicar um fillna nela (ver abaixo)\n",
" if(list_columns_to_read is not None):\n",
" df = df[list_columns_to_read] # se for none, lê todo mundo\n",
" #fillna -> replace null values in tagline\n",
" #df.info()\n",
" for col in df.columns:\n",
" df[col].fillna(\"\", inplace=True)\n",
" \n",
" # df.info()\n",
" #Drop the rows where at least one element is missing. - em resumo, se nao quiser processar linhas que nao possuem numerodanota\n",
" #nao deve substitui as linhas nulas de numerodanota para elimina-las\n",
" \n",
" df.dropna(inplace=True) # Se fosse coluna -> Drop the columns where at least one element is missing. >>> df.dropna(axis='columns')\n",
" # Em alguns algoritmos, a palavra que vem primeiro tem maior relevancia\n",
" #.map(str) \n",
" if('description' not in df.columns):\n",
" df['description'] = \"\"\n",
" for col in list_columns_to_analyze:\n",
" df['description'] = df['description'] + ' ' + df[col] \n",
" \n",
" data = df['description'] #[0:19999]\n",
" \n",
" end = time.time()\n",
" print('Exec time - Load Data:', end-begin)\n",
" return data,df;\n",
" #df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"NOTE - LOAD DATA: RETURN object 'data' to process and object df[] to recover information of the original file\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# EXPLORANDO OS DADOS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_data_by_attribute(attribute='xxxx', attribute_value='yyyy', attribute02=None):\n",
" list_loc = {}\n",
" result_list = []\n",
" #301010.FAIN.512202.000005.0003\n",
" for note_idx,note_num in enumerate(df['numero']):\n",
" plocal = df[attribute][note_idx]\n",
" if(attribute=='xxxx'):\n",
" plocal_aux = plocal.split(\".\")\n",
" plocal = plocal_aux[0]\n",
" if(len(plocal_aux)>1):\n",
" # plocal = plocal+plocal_aux[1]\n",
" plocal = plocal_aux[1]\n",
" if(plocal==attribute_value):\n",
" result_list.append(note_idx)\n",
" if(len(plocal_aux)>2):\n",
" plocal = plocal+\".\"+plocal_aux[2]\n",
" if(attribute02 is not None):\n",
" plocal = plocal+\" : \"+ str(df[attribute02][note_idx])\n",
" if(plocal in list_loc):\n",
" num_note = list_loc[plocal] + 1\n",
" list_loc[plocal] = num_note\n",
" else:\n",
" list_loc[plocal] = 1\n",
" \n",
" return list_loc,result_list\n",
" \n",
" # CRIAR MATRIZ DF SO COM O ATRIBUTO ESCOLHIDO!\n",
" \n",
"#df = pd.read_csv(filePath, sep='£', encoding='utf-8')\n",
"#CLEAN the header row before using their columns as index types\n",
"#df.columns = df.columns.str.strip().str.lower().str.replace(';', '').str.replace(' ', '').str.replace('_', '').str.replace('(', '').str.replace(')', '')\n",
"#print(df['textolongo'].describe())\n",
"#df.info()\n",
"#df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def save_file_xml(file_environ_name, df_to_excel,sheet_name='environ2'):\n",
" # Create a Pandas Excel writer using XlsxWriter as the engine.\n",
" #file_environ_name = filePath+'environ_delete'+fileName\n",
" writer = pd.ExcelWriter(file_environ_name, engine='xlsxwriter')\n",
" df_to_excel.to_excel(writer, sheet_name=sheet_name, index = None, header=True)\n",
" # Close the Pandas Excel writer and output the Excel file.\n",
" writer.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"data,df = retrieve_data()\n",
"list_loc,result_list = get_data_by_attribute(attribute='areaoperacional',attribute02='codplataforma')\n",
"print(list_loc.keys())\n",
"print(len(result_list))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"list_ordem = {}\n",
"most_freq_ordem = \"\"\n",
"max_order = 0\n",
"for ordem in df['numeroordem']:\n",
" if(ordem in list_ordem):\n",
" num_note = list_ordem[ordem] + 1\n",
" list_ordem[ordem] = num_note\n",
" print(\"Ordem repete \", ordem, num_note)\n",
" if(num_note>max_order):\n",
" max_order = num_note\n",
" most_freq_ordem = ordem\n",
" else:\n",
" list_ordem[ordem] = 1\n",
" \n",
"print(\"Ordem mais frequente \", most_freq_ordem, \" que apareceu \", max_order, \" vezes\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# print notes da ordem mais frequente\n",
"for note_idx,note_num in enumerate(df['numerodanota']):\n",
" # print(df['numeroordem'][note_idx])\n",
" if (df['numeroordem'][note_idx] == most_freq_ordem):\n",
" print(df['textolongo'][note_idx])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTA ZR ->\tDescrição da nota: Tipo de Inspeção / TAG / Lote"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
%% Cell type:code id: tags:
``` python
!pip install --upgrade pip
```
%% Cell type:code id: tags:
``` python
#bib para load arquivo tipo xls
!pip install xlrd
!pip install openpyxl
!pip install xlwt
!pip install xlsxwriter
```
%% Cell type:code id: tags:
``` python
!pip install pandas
```
%%%% Output: stream
Collecting pandas
Downloading pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0 MB)
 |████████████████████████████████| 10.0 MB 543 kB/s eta 0:00:01 |█████████████▎ | 4.2 MB 67 kB/s eta 0:01:27
[?25hRequirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)
Collecting pytz>=2017.2
Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
 |████████████████████████████████| 510 kB 577 kB/s eta 0:00:01
[?25hRequirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas) (1.13.0)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.0.3 pytz-2020.1
%% Cell type:code id: tags:
``` python
import pandas as pd
from pandas import DataFrame
import time
```
%% Cell type:code id: tags:
``` python
pd.__version__
```
%%%% Output: execute_result
'1.0.3'
%% Cell type:code id: tags:
``` python
# Shared configuration for the loaders below: input file and column selections.
filePath = '../../nota-full.dsv'
list_columns_to_read = ['xx','xxxx']  # columns kept by retrieve_data (placeholder names — anonymized)
list_columns_to_analyze = ['freetext']  # columns concatenated into the 'description' text column
# TODO: refactor so these variables can be defined from other notebooks; see
# https://www.oreilly.com/library/view/python-cookbook/0596001673/ch17s02.html
```
%% Cell type:markdown id: tags:
# Para reduzir dimensionalidade, se atentar para:
- evitar colunas com muitos 'missing values' - observar o número de rows para cada coluna em df.info()
%% Cell type:code id: tags:
``` python
def retrieve_data(sep='£', sheet_name=None, filePath=filePath, list_columns_to_read=list_columns_to_read,
                  list_columns_to_analyze=list_columns_to_analyze):
    """Load the notes file into a DataFrame and build a 'description' text column.

    Parameters
    ----------
    sep : str
        Column separator for delimited input, or the sentinel 'xls' to load the
        file with pandas.read_excel instead.
    sheet_name : str or None
        Sheet to read when sep == 'xls' (None loads all sheets).
    filePath : str
        Path to the input file (default taken from the module-level config cell).
    list_columns_to_read : list of str or None
        Columns kept after loading; None keeps every column.
    list_columns_to_analyze : list of str
        Columns whose text is concatenated (in order) into 'description'.

    Returns
    -------
    (data, df) : (pandas.Series, pandas.DataFrame)
        `data` is the concatenated free-text column to process; `df` is the
        (possibly column-filtered) frame, kept to recover the original rows.
    """
    begin = time.time()
    if sep == 'xls':
        df = pd.read_excel(filePath, sheet_name=sheet_name, dtype=object)
    else:
        df = pd.read_csv(filePath, sep=sep, encoding='utf-8', dtype=object)
    # Clean the header row before using the columns as keys.
    # regex=False makes the literal intent explicit: the default of str.replace
    # changed across pandas versions (regex=True historically, False since 2.0),
    # and characters like '(' are regex metacharacters.
    df.columns = (df.columns.str.strip().str.lower()
                  .str.replace(';', '', regex=False).str.replace('"', '', regex=False)
                  .str.replace(' ', '', regex=False).str.replace('_', '', regex=False)
                  .str.replace('(', '', regex=False).str.replace(')', '', regex=False))
    print(df.columns)
    print(df.info())
    # Keep only the requested columns; None means "keep everything".
    if list_columns_to_read is not None:
        df = df[list_columns_to_read]
    # Replace missing values with the empty string.
    # BUG FIX: the original used df[col].fillna("", inplace=True) — chained
    # indexing on a sliced frame, which may silently fail to update the column
    # (SettingWithCopyWarning). Assigning the result back is the reliable form.
    for col in df.columns:
        df[col] = df[col].fillna("")
    # NOTE(review): after the fillna above no NaN remains, so this dropna is a
    # no-op; kept for compatibility with the original flow. Move it BEFORE the
    # fillna loop if rows missing key columns (e.g. numerodanota) should
    # actually be discarded, as the original comments suggested.
    df.dropna(inplace=True)
    if 'description' not in df.columns:
        df['description'] = ""
    # Concatenate the analysis columns; in some downstream algorithms earlier
    # words carry more weight, so the order of list_columns_to_analyze matters.
    for col in list_columns_to_analyze:
        df['description'] = df['description'] + ' ' + df[col]
    data = df['description']
    end = time.time()
    print('Exec time - Load Data:', end - begin)
    return data, df
```
%% Cell type:code id: tags:
``` python
print("NOTE - LOAD DATA: RETURN object 'data' to process and object df[] to recover information of the original file")
```
%% Cell type:markdown id: tags:
# EXPLORANDO OS DADOS
%% Cell type:code id: tags:
``` python
def get_data_by_attribute(attribute='xxxx', attribute_value='yyyy', attribute02=None):
    """Count occurrences of each value of `attribute` over the global `df`.

    Depends on the module-level DataFrame `df` (built by retrieve_data) and
    uses `note_idx` as a positional key into the columns — assumes df has a
    default 0..n-1 index; TODO confirm this holds after any row filtering.

    Returns
    -------
    (list_loc, result_list) : (dict, list)
        list_loc maps each (possibly recombined) attribute value to its count;
        result_list collects the row indices whose second dotted segment
        equals `attribute_value` (only populated when attribute == 'xxxx').
    """
    list_loc = {}
    result_list = []
    # Example of the dotted code handled below: 301010.FAIN.512202.000005.0003
    for note_idx,note_num in enumerate(df['numero']):
        plocal = df[attribute][note_idx]
        if(attribute=='xxxx'):
            # Dotted code: use segment [1] as the counting key, optionally
            # suffixed with segment [2]; segment [0] is kept only when no
            # further segments exist.
            plocal_aux = plocal.split(".")
            plocal = plocal_aux[0]
            if(len(plocal_aux)>1):
                # plocal = plocal+plocal_aux[1]
                plocal = plocal_aux[1]
                if(plocal==attribute_value):
                    result_list.append(note_idx)
                if(len(plocal_aux)>2):
                    plocal = plocal+"."+plocal_aux[2]
        if(attribute02 is not None):
            # Append the secondary attribute so the key distinguishes pairs
            # such as "area : platform".
            plocal = plocal+" : "+ str(df[attribute02][note_idx])
        # Increment the counter for this (possibly composite) key.
        if(plocal in list_loc):
            num_note = list_loc[plocal] + 1
            list_loc[plocal] = num_note
        else:
            list_loc[plocal] = 1
    return list_loc,result_list
# CRIAR MATRIZ DF SO COM O ATRIBUTO ESCOLHIDO!
#df = pd.read_csv(filePath, sep='£', encoding='utf-8')
#CLEAN the header row before using their columns as index types
#df.columns = df.columns.str.strip().str.lower().str.replace(';', '').str.replace(' ', '').str.replace('_', '').str.replace('(', '').str.replace(')', '')
#print(df['textolongo'].describe())
#df.info()
#df.head()
```
%% Cell type:code id: tags:
``` python
def save_file_xml(file_environ_name, df_to_excel, sheet_name='environ2'):
    """Write a DataFrame to an Excel (.xlsx) workbook.

    NOTE(review): despite the 'xml' in the name, the output is an Excel file
    produced with the XlsxWriter engine (name kept for caller compatibility).

    Parameters
    ----------
    file_environ_name : str
        Destination path of the workbook.
    df_to_excel : pandas.DataFrame
        Frame to export.
    sheet_name : str
        Worksheet name to create (default 'environ2').
    """
    # Using ExcelWriter as a context manager finalizes and closes the file
    # even if to_excel raises, and avoids ExcelWriter.save(), which is
    # deprecated and removed in pandas 2.0.
    with pd.ExcelWriter(file_environ_name, engine='xlsxwriter') as writer:
        df_to_excel.to_excel(writer, sheet_name=sheet_name, index=None, header=True)
```
%% Cell type:markdown id: tags:
# Driver: load the data, then count notes per operational area, keyed also by
# platform code.
# NOTE(review): this is stored as a markdown cell in the notebook JSON, so it
# never executes on Run All — convert it to a code cell to run it.
data,df = retrieve_data()
list_loc,result_list = get_data_by_attribute(attribute='areaoperacional',attribute02='codplataforma')
print(list_loc.keys())
print(len(result_list))
%% Cell type:markdown id: tags:
# Count how many times each 'numeroordem' value appears in the global df and
# track the most frequent one.
# NOTE(review): stored as a markdown cell in the notebook JSON, so it does not
# execute on Run All — convert it to a code cell.
list_ordem = {}
most_freq_ordem = ""
max_order = 0
for ordem in df['numeroordem']:
    if(ordem in list_ordem):
        # Repeated order: bump its counter and report the repetition.
        num_note = list_ordem[ordem] + 1
        list_ordem[ordem] = num_note
        print("Ordem repete ", ordem, num_note)
        # Keep a running maximum so the most frequent order is known at the end.
        if(num_note>max_order):
            max_order = num_note
            most_freq_ordem = ordem
    else:
        list_ordem[ordem] = 1
print("Ordem mais frequente ", most_freq_ordem, " que apareceu ", max_order, " vezes")
%% Cell type:markdown id: tags:
# Print the long text ('textolongo') of every note whose order number matches
# the most frequent order (most_freq_ordem is computed in the preceding cell).
# NOTE(review): stored as a markdown cell in the notebook JSON, so it does not
# execute on Run All — convert it to a code cell.
for note_idx,note_num in enumerate(df['numerodanota']):
    # print(df['numeroordem'][note_idx])
    if (df['numeroordem'][note_idx] == most_freq_ordem):
        print(df['textolongo'][note_idx])
%% Cell type:code id: tags:
``` python
# NOTA ZR -> Descrição da nota: Tipo de Inspeção / TAG / Lote
```
%% Cell type:code id: tags:
``` python
```
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already up-to-date: nltk in /usr/local/lib/python3.6/dist-packages (3.5)\r\n",
"Requirement already satisfied, skipping upgrade: tqdm in /usr/local/lib/python3.6/dist-packages (from nltk) (4.46.1)\r\n",
"Requirement already satisfied, skipping upgrade: regex in /usr/local/lib/python3.6/dist-packages (from nltk) (2020.6.8)\r\n",
"Requirement already satisfied, skipping upgrade: joblib in /usr/local/lib/python3.6/dist-packages (from nltk) (0.15.1)\r\n",
"Requirement already satisfied, skipping upgrade: click in /usr/local/lib/python3.6/dist-packages (from nltk) (7.1.2)\r\n"
]
}
],
"source": [
"!pip install -U nltk"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"#bibs para preprocessamento de texto\n",
"import nltk\n",
"import re\n",
"from nltk.stem import SnowballStemmer\n",
"from string import punctuation\n",
"#TEXT PREPROCESSING \n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')\n",
"\n",
"import time\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# lista de REGEX\n",
"%run \"./../../general-notebooks/regex-process-notebook.ipynb\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Frases para serem eliminadas do texto e tokens extras \n",
"# necessita de parada? * na * * qual tipo de bloqueio? * na * * trata-se de mudança? \n",
"phrases = ['informações complementares',\n",
" 'phrases to remove']\n",
" #'não aplicável'] #NA nao acrescenta em nada neh?\n",
"#Se for realmente filtrar os nomes, entao, pegar a coluna de solicitantes(executores) e vetorizar\n",
"names = ['cristiane','silva','soares','neuton','rubens','paes','gomes','edimilson','fagner','frederick','flavio','oliveira','paolo','gleisson','luis','oliveira','carvalho',\n",
" 'alice', 'venturi', 'pinheiro','luiz','renato','almeida','lemos','hilton', 'antonio', 'fernandes', 'junior',\n",
" 'alzinei','katia', 'nascimento', 'conceicao','saul','melegari','robson','bastos', 'silva',\n",
" 'raquel', 'gomes', 'souza', 'bandeira','jair', 'pinto', 'campeao', 'filho',\n",
" 'marcio', 'rogerio', 'dantas', 'ramos', 'jose','junior','araujo','pinto', 'diego', 'novaes', 'merida',\n",
" 'ricardo', 'aguzzoli', 'travi', 'renato', 'viana','daniel', 'tamiris', 'olegario','gomes','santos']\n",
"\n",
"extra = ['etc', 'ex','na', 'n/a', '*', '?', '°','(',')',':'] #['x', 'na', 'n/a', 'sim']\n",
"stop_words = nltk.corpus.stopwords.words('portuguese') + extra #+ names + list(punctuation)\n",
"stop_words_en = nltk.corpus.stopwords.words('english') + list(punctuation)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"phrases_en = ['Describe what you did (step-by-step) so we can reproduce:',\n",
" 'What happened',\n",
" 'What should have happened',\n",
" 'What steps will reproduce the problem',\n",
" 'What is the expected output',\n",
" 'What do you see instead',\n",
" 'Please use labels and text to provide additional information',\n",
" 'What version of the product are you using',\n",
" 'On what operating system',\n",
" 'On what operating system',\n",
" 'Please provide any additional information below',\n",
" ' Inkscape Version and Operating System',\n",
" ' Example file',\n",
" 'Attach a sample file (or files) highlighting the issue, if appropriate',\n",
" 'Thank you for filling in a new bug report!',\n",
" 'More details on how to write a good bug report can be found at https://inkscape.org/contribute/report-bugs/.','Please remember that Inkscape is developed mostly by volunteers in their spare time, and may not be able to respond to reports quickly.']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#print(stop_words_en)"