Commit 37968fa4 authored by Nathalia Moraes do Nascimento's avatar Nathalia Moraes do Nascimento
Browse files

similarity experiments - finding duplicate issues

parent 73e472c3
This diff is collapsed.
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%run \"./load-data-notebook.ipynb\"\n",
"list_columns_to_read = []\n",
"list_columns_to_analyze = ['freetext']\n",
"data,df = retrieve_data(list_columns_to_read=list_columns_to_read,list_columns_to_analyze=list_columns_to_analyze)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#data = data[1:2]\n",
"%run \"./pre-process-notebook.ipynb\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%run \"./search-bm25.ipynb\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def save_questions_from_all():\n",
" dict_quest = {}\n",
" for text_df in df['freetext'][:]:\n",
" list_quest = find_quest_type2(text=text_df)\n",
" for quest in list_quest:\n",
" quest = quest.strip()\n",
" if(quest in dict_quest):\n",
" num_quest = dict_quest[quest] + 1\n",
" dict_quest[quest] = num_quest\n",
" else:\n",
" dict_quest[quest] = 1\n",
" return dict_quest"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#dict_quest = save_questions(name_file=name_file,cluster_num=0)\n",
"dict_quest = save_questions_from_all()\n",
"quest_data = []\n",
"for item in sorted(dict_quest, key = dict_quest.get, reverse=True):\n",
" quest_data.append(item)\n",
" print (item,dict_quest[item])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(len(quest_data))\n",
"norm_corpus = clean_data(data=quest_data,bol_remove_numbers=True)\n",
"print(norm_corpus)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"norm_corpus_tokens = np.array([nltk.word_tokenize(doc) for doc in norm_corpus])\n",
"#print(norm_corpus_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#print(quest_data[0])\n",
"quest_example = quest_data[0]\n",
"print(len(quest_example))\n",
"print(quest_example)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for quest in quest_data:\n",
" k_s = \"- * * 2 - DESCREVA QUAL A POSSÍVEL CAUSA DO PROBLEMA. \".strip()\n",
" if(quest == k_s):\n",
" print(quest, dict_quest[quest])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a*17.853 + b*(abs(51-35)) + d*1 = 90\n",
"good_similarity = 11\n",
"def is_good_similarity(weight,key_similar,key_original):\n",
" if(weight>=good_similarity):\n",
" num_that_original_appear = dict_quest[key_original]\n",
" num_that_similar_appear = dict_quest[key_similar]\n",
" diff_size = abs(len(key_original)-len(key_similar))\n",
" #b = 10\n",
" #a = 9.45\n",
" #d = 10.15\n",
" #y = a*weight - b*(abs(len(key_original)-len(key_similar))) + d*num_that_appear\n",
" y = weight*(82/19.95) + ((num_that_similar_appear * 18)/num_that_original_appear) - (10 * diff_size/20)\n",
" if(y>60):\n",
" #print('similar calc ', y)\n",
" return True\n",
" return False\n",
"\n",
"# se o weight for 20, que eh o maximo - 70 %\n",
"#weight - > x\n",
"#x-> 60*weight/20\n",
"\n",
"#se o num_that_similar appears is num_that_original_appear, que eh o maximo - 30%\n",
"# num_that_similar -> x\n",
"# x -> ((num_that_similar_appear * 30)/num_that_original_appear)\n",
"\n",
"# quanto menor a diferença do tamanho, melhor. Se 0, que eh o minimo, vale 0\n",
"# diff -> x\n",
"#supoe q 20 chars -> 10%\n",
"# x -> 10 * diff/30\n",
"\n",
" # if(len(key_similar)>30):\n",
" # if((num_that_appear<=10 and weight>=17.40) or (num_that_appear>10 and weight>=15)):\n",
" # return True\n",
" # if(len(key_similar)<=30):\n",
" # if((num_that_appear<=10 and weight>=(good_similarity+2)) or (num_that_appear>10 and weight>=good_similarity)):\n",
" # return True\n",
" # return False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_dict_quest(type=1):\n",
" # o vetor ja deve estar ordenado\n",
" num_that_appears = 125\n",
" dict_quest_to_substitute = {}\n",
" dict_quest_copy = {}\n",
" dict_quest_copy = dict_quest.copy()\n",
" len_aux = len(quest_data)\n",
" # for item in remover_itens:\n",
" #dicionario.pop(item)\n",
" for quest in quest_data:\n",
" #quero encontrar as questões similares das questões mais frequentes\n",
" if(dict_quest_copy[quest]>=num_that_appears):\n",
" #print(\"Quest is \", quest, \" and appears \", dict_quest[quest])\n",
" dict_quest_to_substitute[quest] = quest\n",
" query_str = str(quest)\n",
" similar_questions,query_weights = search_by_query(query_str=query_str,notes=quest_data,norm_corpus_tokens=norm_corpus_tokens,num_docs=len_aux)\n",
" #se nem o primeiro atende, nenhum outro atenderá\n",
" if(query_weights[0]>=good_similarity):\n",
" aux_idx = -1\n",
" #print(\"\\n\\n Good question is \",query_str)\n",
" #print(\"\\n\\n Appears \",dict_quest[quest])\n",
" #print(\"Weight \", query_weights)\n",
" #print(\"Similar questions \")\n",
" for weight in query_weights:\n",
" aux_idx = aux_idx + 1\n",
" key_similar = similar_questions[aux_idx].strip()\n",
" if(is_good_similarity(weight,key_similar,quest)):\n",
" #Normalmente, quando só aparece uma vez, é um exemplo de resposta, ao invés de pergunta\n",
" if(dict_quest_copy[quest]>=dict_quest_copy[key_similar] and (quest!=key_similar)): \n",
" #print(\"Similar is \", key_similar, \" and appears \", dict_quest[key_similar])\n",
" dict_quest_to_substitute[key_similar] = quest\n",
" dict_quest_copy[quest] = dict_quest_copy[quest] + dict_quest_copy[key_similar]\n",
" #if(quest.strip()==quest_example.strip()):\n",
" #print(key_similar, dict_quest_copy[key_similar], len(key_similar), weight)\n",
" dict_quest_copy[key_similar] = 0\n",
" num_q = 1\n",
" for item in sorted(dict_quest_copy, key = dict_quest_copy.get, reverse=True):\n",
" if(dict_quest_copy[item]>=num_that_appears):\n",
" print (\"Questão \", num_q, \" - \", item,dict_quest_copy[item])\n",
" num_q=num_q+1\n",
" else:\n",
" break\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"get_dict_quest()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
%% Cell type:code id: tags:
``` python
# Run the data-loading notebook. retrieve_data is not defined in this notebook,
# so it presumably comes from load-data-notebook.ipynb (TODO confirm).
%run "./load-data-notebook.ipynb"
# list_columns_to_read is left empty; only 'freetext' is listed for analysis.
list_columns_to_read = []
list_columns_to_analyze = ['freetext']
# retrieve_data returns a pair; `df` is indexed later as df['freetext'].
data,df = retrieve_data(list_columns_to_read=list_columns_to_read,list_columns_to_analyze=list_columns_to_analyze)
```
%% Cell type:code id: tags:
``` python
# Disabled debugging aid: keep only a one-element slice of `data`.
#data = data[1:2]
# Run the pre-processing notebook. clean_data, called in a later cell, is
# presumably defined there (TODO confirm).
%run "./pre-process-notebook.ipynb"
```
%% Cell type:code id: tags:
``` python
# Run the BM25 search notebook. search_by_query, called by get_dict_quest, is
# presumably defined there (TODO confirm).
%run "./search-bm25.ipynb"
```
%% Cell type:code id: tags:
``` python
def save_questions_from_all():
    """Count how often each question text occurs across df['freetext'].

    Every entry of the ``freetext`` column is passed to ``find_quest_type2``;
    each question it returns is stripped of surrounding whitespace and tallied.

    Returns:
        dict mapping the stripped question text to its number of occurrences,
        in order of first appearance.
    """
    counts = {}
    for free_text in df['freetext'][:]:
        for raw_question in find_quest_type2(text=free_text):
            question = raw_question.strip()
            counts[question] = counts.get(question, 0) + 1
    return counts
```
%% Cell type:code id: tags:
``` python
# Disabled alternative source: questions of a single cluster read from a file.
#dict_quest = save_questions(name_file=name_file,cluster_num=0)
dict_quest = save_questions_from_all()
# Question texts ordered from most to least frequent; get_dict_quest expects
# this list to be already sorted.
quest_data = sorted(dict_quest, key=dict_quest.get, reverse=True)
for item in quest_data:
    print (item,dict_quest[item])
```
%% Cell type:code id: tags:
``` python
# Number of distinct question texts collected.
print(len(quest_data))
# Normalise the question texts with clean_data (defined outside this notebook);
# bol_remove_numbers=True presumably strips digits - TODO confirm.
norm_corpus = clean_data(data=quest_data,bol_remove_numbers=True)
print(norm_corpus)
```
%% Cell type:code id: tags:
``` python
# Tokenise every normalised question. The per-question token lists generally
# differ in length, so the array is created with dtype=object: NumPy >= 1.24
# raises ValueError for ragged nested sequences unless the object dtype is
# requested explicitly (older versions only emitted a deprecation warning).
norm_corpus_tokens = np.array([nltk.word_tokenize(doc) for doc in norm_corpus], dtype=object)
#print(norm_corpus_tokens)
```
%% Cell type:code id: tags:
``` python
#print(quest_data[0])
# quest_data is sorted by descending count, so index 0 is the most frequent
# question; show its length in characters and its text.
quest_example = quest_data[0]
print(len(quest_example))
print(quest_example)
```
%% Cell type:code id: tags:
``` python
# Look up one specific question text and print it with its count, if present.
k_s = "- * * 2 - DESCREVA QUAL A POSSÍVEL CAUSA DO PROBLEMA. ".strip()
for quest in quest_data:
    if quest == k_s:
        print(quest, dict_quest[quest])
```
%% Cell type:code id: tags:
``` python
# NOTE(review): leftover calibration note from an earlier linear formula
# (y = a*weight - b*length_diff + d*count): a*17.853 + b*(abs(51-35)) + d*1 = 90
# Minimum search weight a candidate needs before it is scored at all.
good_similarity = 11

# Coefficients of the combined score computed by is_good_similarity.
_WEIGHT_NUMERATOR = 82       # search weight is scaled by 82 / 19.95
_WEIGHT_DENOMINATOR = 19.95
_FREQUENCY_FACTOR = 18       # multiplies the similar/original count ratio
_LENGTH_NUMERATOR = 10       # length difference is scaled by 10 / 20
_LENGTH_DENOMINATOR = 20
_SCORE_THRESHOLD = 60        # the score must be strictly greater than this


def is_good_similarity(weight, key_similar, key_original, counts=None):
    """Decide whether ``key_similar`` is close enough to ``key_original``.

    The search weight must reach ``good_similarity``. If it does, a combined
    score is built from three terms and compared with a fixed threshold:
    the scaled weight, plus the scaled ratio of occurrence counts
    (similar / original), minus a penalty proportional to the difference in
    text length.

    Args:
        weight: similarity weight reported by the search for ``key_similar``.
        key_similar: candidate question text; must be a key of ``counts``.
        key_original: reference question text; must be a key of ``counts``.
        counts: mapping of question text to number of occurrences. When
            omitted, the module-level ``dict_quest`` is used, which is what
            the original three-argument call did.

    Returns:
        True when the weight reaches ``good_similarity`` and the combined
        score exceeds the threshold, otherwise False.

    Raises:
        KeyError: if either key is missing from ``counts``.
    """
    if weight < good_similarity:
        return False
    if counts is None:
        counts = dict_quest

    original_count = counts[key_original]
    similar_count = counts[key_similar]
    if original_count <= 0:
        # A reference that never occurs cannot anchor the frequency ratio.
        return False

    length_diff = abs(len(key_original) - len(key_similar))
    score = (
        weight * (_WEIGHT_NUMERATOR / _WEIGHT_DENOMINATOR)
        + (similar_count * _FREQUENCY_FACTOR) / original_count
        - (_LENGTH_NUMERATOR * length_diff / _LENGTH_DENOMINATOR)
    )
    return bool(score > _SCORE_THRESHOLD)
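# Design notes for the score in is_good_similarity (translated from Portuguese).
# NOTE(review): the percentages and divisors below do not match the
# coefficients the function actually uses (82/19.95, 18 and 10/20) - confirm
# which values are intended.
# If the weight is 20, which is the maximum, it is worth 70%:
#   weight -> x
#   x -> 60*weight/20
# If num_that_similar_appear equals num_that_original_appear, which is the maximum, it is worth 30%:
#   num_that_similar -> x
#   x -> ((num_that_similar_appear * 30)/num_that_original_appear)
# The smaller the length difference, the better. If it is 0, which is the minimum, it is worth 0:
#   diff -> x
#   assume 20 chars -> 10%
#   x -> 10 * diff/30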
# if(len(key_similar)>30):
# if((num_that_appear<=10 and weight>=17.40) or (num_that_appear>10 and weight>=15)):
# return True
# if(len(key_similar)<=30):
# if((num_that_appear<=10 and weight>=(good_similarity+2)) or (num_that_appear>10 and weight>=good_similarity)):
# return True
# return False
```
%% Cell type:code id: tags:
``` python
def get_dict_quest(type=1, min_count=125):
    """Merge near-duplicate questions into their most frequent variant.

    ``quest_data`` must already be sorted by descending frequency. Each
    question whose running count reaches ``min_count`` is used as a query for
    ``search_by_query`` over the whole corpus. Every hit accepted by
    ``is_good_similarity`` that is a different text and is not more frequent
    than the query is folded into it: its count is added to the query's count
    and then set to zero. Finally the questions whose merged count still
    reaches ``min_count`` are printed in descending order of merged count.

    The module-level ``dict_quest`` is not modified; merging happens on a copy.

    Args:
        type: unused; kept only so existing calls keep working (the name
            shadows the builtin but cannot be changed without breaking them).
        min_count: minimum number of occurrences for a question to be used as
            a merge target and to be printed. Defaults to 125, the value that
            was previously hard-coded.

    Returns:
        dict mapping each question text to the question that replaces it.
        Merge targets map to themselves. Earlier versions built this mapping
        but discarded it and returned None.
    """
    merged_counts = dict_quest.copy()
    substitutions = {}
    corpus_size = len(quest_data)

    for quest in quest_data:
        # Only the most frequent questions are used as merge targets.
        if merged_counts[quest] < min_count:
            continue
        substitutions[quest] = quest

        similar_questions, query_weights = search_by_query(
            query_str=str(quest),
            notes=quest_data,
            norm_corpus_tokens=norm_corpus_tokens,
            num_docs=corpus_size,
        )
        # If not even the first hit is good enough, none of the others will be
        # (this relies on the search returning its best hit first - TODO
        # confirm). An empty result is skipped instead of raising IndexError.
        if len(query_weights) == 0 or query_weights[0] < good_similarity:
            continue

        for weight, candidate in zip(query_weights, similar_questions):
            key_similar = candidate.strip()
            if not is_good_similarity(weight, key_similar, quest):
                continue
            # Usually a text that appears only once is an example answer
            # rather than a question, so only less frequent (or equally
            # frequent) variants are folded into the current question.
            if quest != key_similar and merged_counts[quest] >= merged_counts[key_similar]:
                substitutions[key_similar] = quest
                merged_counts[quest] += merged_counts[key_similar]
                merged_counts[key_similar] = 0

    # Report the surviving frequent questions, most frequent first.
    ranked = sorted(merged_counts, key=merged_counts.get, reverse=True)
    for num_q, item in enumerate(ranked, start=1):
        if merged_counts[item] < min_count:
            break
        print ("Questão ", num_q, " - ", item,merged_counts[item])

    return substitutions
```
%% Cell type:code id: tags:
``` python
# Merge similar questions and print the ranked list of the frequent ones.
get_dict_quest()
```
%% Cell type:code id: tags:
``` python
```
This diff is collapsed.
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%run \"./load-data-notebook.ipynb\"\n",
"list_columns_to_read = []\n",
"list_columns_to_analyze = ['freetext']\n",
"data,df = retrieve_data(list_columns_to_read=list_columns_to_read,list_columns_to_analyze=list_columns_to_analyze)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#data = data[1:2]\n",
"%run \"./pre-process-notebook.ipynb\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%run \"./search-bm25.ipynb\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def save_questions_from_all():\n",
" dict_quest = {}\n",
" for text_df in df['freetext'][:]:\n",
" list_quest = find_quest_type2(text=text_df)\n",
" for quest in list_quest:\n",
" quest = quest.strip()\n",
" if(quest in dict_quest):\n",
" num_quest = dict_quest[quest] + 1\n",
" dict_quest[quest] = num_quest\n",
" else:\n",
" dict_quest[quest] = 1\n",
" return dict_quest"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#dict_quest = save_questions(name_file=name_file,cluster_num=0)\n",
"dict_quest = save_questions_from_all()\n",
"quest_data = []\n",
"for item in sorted(dict_quest, key = dict_quest.get, reverse=True):\n",
" quest_data.append(item)\n",
" print (item,dict_quest[item])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(len(quest_data))\n",
"norm_corpus = clean_data(data=quest_data,bol_remove_numbers=True)\n",
"print(norm_corpus)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"norm_corpus_tokens = np.array([nltk.word_tokenize(doc) for doc in norm_corpus])\n",
"#print(norm_corpus_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#print(quest_data[0])\n",
"quest_example = quest_data[0]\n",
"print(len(quest_example))\n",
"print(quest_example)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for quest in quest_data:\n",
" k_s = \"- * * 2 - DESCREVA QUAL A POSSÍVEL CAUSA DO PROBLEMA. \".strip()\n",
" if(quest == k_s):\n",
" print(quest, dict_quest[quest])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a*17.853 + b*(abs(51-35)) + d*1 = 90\n",
"good_similarity = 11\n",
"def is_good_similarity(weight,key_similar,key_original):\n",
" if(weight>=good_similarity):\n",
" num_that_original_appear = dict_quest[key_original]\n",
" num_that_similar_appear = dict_quest[key_similar]\n",
" diff_size = abs(len(key_original)-len(key_similar))\n",
" #b = 10\n",
" #a = 9.45\n",
" #d = 10.15\n",
" #y = a*weight - b*(abs(len(key_original)-len(key_similar))) + d*num_that_appear\n",
" y = weight*(82/19.95) + ((num_that_similar_appear * 18)/num_that_original_appear) - (10 * diff_size/20)\n",
" if(y>60):\n",
" #print('similar calc ', y)\n",
" return True\n",
" return False\n",
"\n",
"# se o weight for 20, que eh o maximo - 70 %\n",
"#weight - > x\n",
"#x-> 60*weight/20\n",
"\n",
"#se o num_that_similar appears is num_that_original_appear, que eh o maximo - 30%\n",
"# num_that_similar -> x\n",
"# x -> ((num_that_similar_appear * 30)/num_that_original_appear)\n",
"\n",
"# quanto menor a diferença do tamanho, melhor. Se 0, que eh o minimo, vale 0\n",
"# diff -> x\n",
"#supoe q 20 chars -> 10%\n",
"# x -> 10 * diff/30\n",
"\n",
" # if(len(key_similar)>30):\n",
" # if((num_that_appear<=10 and weight>=17.40) or (num_that_appear>10 and weight>=15)):\n",
" # return True\n",
" # if(len(key_similar)<=30):\n",
" # if((num_that_appear<=10 and weight>=(good_similarity+2)) or (num_that_appear>10 and weight>=good_similarity)):\n",
" # return True\n",
" # return False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_dict_quest(type=1):\n",
" # o vetor ja deve estar ordenado\n",
" num_that_appears = 125\n",
" dict_quest_to_substitute = {}\n",
" dict_quest_copy = {}\n",
" dict_quest_copy = dict_quest.copy()\n",
" len_aux = len(quest_data)\n",
" # for item in remover_itens:\n",
" #dicionario.pop(item)\n",
" for quest in quest_data:\n",
" #quero encontrar as questões similares das questões mais frequentes\n",
" if(dict_quest_copy[quest]>=num_that_appears):\n",
" #print(\"Quest is \", quest, \" and appears \", dict_quest[quest])\n",
" dict_quest_to_substitute[quest] = quest\n",
" query_str = str(quest)\n",
" similar_questions,query_weights = search_by_query(query_str=query_str,notes=quest_data,norm_corpus_tokens=norm_corpus_tokens,num_docs=len_aux)\n",
" #se nem o primeiro atende, nenhum outro atenderá\n",
" if(query_weights[0]>=good_similarity):\n",
" aux_idx = -1\n",
" #print(\"\\n\\n Good question is \",query_str)\n",
" #print(\"\\n\\n Appears \",dict_quest[quest])\n",
" #print(\"Weight \", query_weights)\n",
" #print(\"Similar questions \")\n",
" for weight in query_weights:\n",
" aux_idx = aux_idx + 1\n",
" key_similar = similar_questions[aux_idx].strip()\n",
" if(is_good_similarity(w