Commit 5ec247c5 authored by Nathalia Moraes do Nascimento

some general algorithms

parent 4a7cead0
%% Cell type:markdown id: tags:
# IMPLEMENTATION FROM THE BOOK - PG. 490
BM25 returns the most similar documents and also supports searching by an arbitrary query.
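The scoring function implemented below is the classic Okapi BM25: for a tokenized query $q$ and a document $d$,

$$\mathrm{score}(q, d) = \sum_{t \in q} \mathrm{idf}(t) \cdot \frac{f(t, d)\,(k_1 + 1)}{f(t, d) + k_1\left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)}, \qquad \mathrm{idf}(t) = \ln\frac{N - \mathrm{df}(t) + 0.5}{\mathrm{df}(t) + 0.5}$$

where $f(t, d)$ is the frequency of term $t$ in $d$, $|d|$ is the document length, $\mathrm{avgdl}$ the average document length, $N$ the corpus size, and $\mathrm{df}(t)$ the number of documents containing $t$. Negative idf values (terms occurring in more than half the corpus) are floored at $\epsilon \cdot \overline{\mathrm{idf}}$, the `EPSILON * average_idf` fallback in the code.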
%% Cell type:code id: tags:
``` python
import math
import numpy as np
import nltk  # tokenization in search_by_query below

# BM25 hyperparameters used throughout this notebook
PARAM_K1 = 2.5
PARAM_B = 0.85
EPSILON = 0.2
average_idf = 0
```
%% Cell type:code id: tags:
``` python
class BM25:
    def __init__(self, corpus):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus.
        """
        self.corpus_size = len(corpus)
        self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
        self.corpus = corpus
        self.f = []        # term frequencies, one dict per document
        self.df = {}       # document frequency per term
        self.idf = {}      # inverse document frequency per term
        self.doc_len = []  # document lengths
        self.initialize()

    def initialize(self):
        """Calculates frequencies of terms in documents and in corpus.
        Also computes inverse document frequencies."""
        for document in self.corpus:
            frequencies = {}
            self.doc_len.append(len(document))
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.f.append(frequencies)
            for word in frequencies:
                self.df[word] = self.df.get(word, 0) + 1
        for word, freq in self.df.items():
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        """Computes the BM25 score of the given `document` in relation to the
        corpus item selected by `index`.

        Parameters
        ----------
        document : list of str
            Document to be scored.
        index : int
            Index of the corpus document to score `document` against.
        average_idf : float
            Average idf in corpus.

        Returns
        -------
        float
            BM25 score.
        """
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            # negative idf (term in more than half the documents) is floored
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        """Computes and returns the BM25 scores of the given `document` in
        relation to every item in the corpus.

        Parameters
        ----------
        document : list of str
            Document to be scored.
        average_idf : float
            Average idf in corpus.

        Returns
        -------
        list of float
            BM25 scores.
        """
        return [self.get_score(document, index, average_idf)
                for index in range(self.corpus_size)]
```
%% Cell type:code id: tags:
``` python
def get_bm25_weights(corpus, average_idf=0):
    bm25 = BM25(corpus)
    if average_idf == 0:
        # default: average idf over the whole corpus vocabulary
        average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)
    return weights
```
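%% Cell type:markdown id: tags:
A tiny illustration of `get_bm25_weights` on a toy corpus: row i holds document i's BM25 scores against every document, so the result reads as a pairwise similarity matrix.
%% Cell type:code id: tags:
``` python
corpus = [['black', 'cat', 'white', 'cat'],
          ['cat', 'outer', 'space'],
          ['wag', 'dog']]
for row in get_bm25_weights(corpus):
    print(row)
```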
%% Cell type:code id: tags:
``` python
def get_bm25_weights_document(query_corpus, corpus, average_idf=0):
    bm25 = BM25(corpus)
    if average_idf == 0:
        average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    # score the query against every document in the corpus
    return bm25.get_scores(query_corpus, average_idf)
```
%% Cell type:code id: tags:
``` python
def search_by_query(query_str, notes, norm_corpus_tokens, num_docs=5):
    # `preprocess` is assumed to be defined elsewhere in the project
    normalize_corpus = np.vectorize(preprocess)
    norm_corpus_query = normalize_corpus([query_str])
    query_tokens = np.array([nltk.word_tokenize(doc) for doc in norm_corpus_query])
    query_weights = np.array(get_bm25_weights_document(query_tokens[0], norm_corpus_tokens))
    # take the `num_docs` highest-scoring note IDs; nothing is skipped because
    # the query itself is not necessarily part of the corpus being searched
    similar_note_idxs = np.argsort(-query_weights)[0:num_docs]
    similar_notes = [notes[idx] for idx in similar_note_idxs]
    # return the top notes together with their BM25 scores
    return similar_notes, query_weights[similar_note_idxs]
```
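%% Cell type:markdown id: tags:
A minimal end-to-end sketch of the search. It assumes NLTK's punkt tokenizer data has been downloaded and substitutes a trivial lowercasing `preprocess`; the project presumably defines its own normalizer elsewhere.
%% Cell type:code id: tags:
``` python
def preprocess(text):
    # hypothetical stand-in for the project's real text normalizer
    return text.lower()

notes = ["the cat sat on the mat",
         "dogs and cats living together",
         "a quiet evening at home"]
norm_corpus_tokens = np.array([nltk.word_tokenize(preprocess(n)) for n in notes])

top_notes, scores = search_by_query("cat on a mat", notes, norm_corpus_tokens, num_docs=2)
print(top_notes)
print(scores)
```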
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (1.0.1)\n",
"Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\n",
"Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas) (1.13.0)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.4.5)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.13.0)\n"
]
}
],
"source": [
"!pip install pandas\n",
"!pip install nltk"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import copy\n",
"import nltk #cosine similarity\n",
"from scipy.stats import itemfreq #cosine similarity"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For character vectorization, it is an extremely simple process of just mapping each character of the term to a corresponding unique number\n",
"- o tamanho do vetor vai ser de acordo com a maior palavra\n",
"- em cada posicao, coloco o numero inteiro do caracter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def vectorize_terms(terms):\n",
" terms = [term.lower() for term in terms]\n",
" terms = [np.array(list(term)) for term in terms]\n",
" terms = [np.array([ord(char) for char in term]) for term in terms]\n",
" return terms"
]
},
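{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustration of the character vectors ('b' is 98, 'e' is 101, and so on):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(vectorize_terms(['Believe']))  # [array([ 98, 101, 108, 105, 101, 118, 101])]"
]
},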
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Algoritmos\n",
"- O Hamming e o Euclidean só calculam a distância entre duas strings de mesmo tamanho"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def hamming_distance(u, v, norm=False):\n",
" if u.shape != v.shape:\n",
" raise ValueError('The vectors must have equal lengths.')\n",
" return (u != v).sum() if not norm else (u != v).mean()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def euclidean_distance(u, v):\n",
" if u.shape != v.shape:\n",
" raise ValueError('The vectors must have equal lengths.')\n",
" distance = np.sqrt(np.sum(np.square(u - v)))\n",
" return distance"
]
},
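{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check: 'believe' and 'beleive' differ at two positions (the swapped 'i' and 'e'), so the Hamming distance is 2 and the Euclidean distance is sqrt(4^2 + 4^2) ≈ 5.66."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"u, v = vectorize_terms(['believe', 'beleive'])\n",
"print(hamming_distance(u, v))    # 2\n",
"print(euclidean_distance(u, v))  # ~5.657"
]
},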
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Os proximos exemplos permitem calcular a distancia de palavras de diferente tamanhos. \n",
"- Levenshtein Edit Distance - menor numero de edicoes add, delete, edit que precisam ser feitas para converter uma palavra na outra matriz da palavra 1 X palavra 2. A diagonal vai sendo atualizada se a letra difere ou nao"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def levenshtein_edit_distance(u, v):\n",
" # convert to lower case\n",
" u = u.lower()\n",
" v = v.lower()\n",
"\n",
" # base cases\n",
" if u == v: return 0, []\n",
" elif len(u) == 0: return len(v), []\n",
" elif len(v) == 0: return len(u), []\n",
" # initialize edit distance matrix\n",
" edit_matrix = []\n",
" # initialize two distance matrices\n",
" du = [0] * (len(v) + 1)\n",
" dv = [0] * (len(v) + 1)\n",
" # du: the previous row of edit distances\n",
" for i in range(len(du)):\n",
" du[i] = i\n",
" # dv : the current row of edit distances\n",
" for i in range(len(u)):\n",
" dv[0] = i + 1\n",
" # compute cost as per algorithm\n",
" for j in range(len(v)):\n",
" cost = 0 if u[i] == v[j] else 1\n",
" dv[j + 1] = min(dv[j] + 1, du[j + 1] + 1, du[j] + cost)\n",
" # assign dv to du for next iteration\n",
" for j in range(len(du)):\n",
" du[j] = dv[j]\n",
" # copy dv to the edit matrix\n",
" edit_matrix.append(copy.copy(dv))\n",
" # compute the final edit distance and edit matrix\n",
" distance = dv[len(v)]\n",
" edit_matrix = np.array(edit_matrix)\n",
" edit_matrix = edit_matrix.T\n",
" edit_matrix = edit_matrix[1:,]\n",
" edit_matrix = pd.DataFrame(data=edit_matrix,\n",
" index=list(v),\n",
" columns=list(u))\n",
" return distance, edit_matrix"
]
},
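{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative call: turning 'believe' into 'beleive' takes two substitutions, so the edit distance is 2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"distance, edit_matrix = levenshtein_edit_distance('believe', 'beleive')\n",
"print(distance)  # 2\n",
"edit_matrix"
]
},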
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cosine Similarity"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def boc_term_vectors(word_list):\n",
" word_list = [word.lower() for word in word_list]\n",
" #list(word) -> transform a word into a list of characters\n",
" #np.hstack -> concatenate the list of characters horizontally\n",
" #np.unique -> elements will be unique and ordered in the list\n",
" unique_chars = np.unique(np.hstack([list(word) for word in word_list]))\n",
" #print(unique_chars)\n",
" #np.stack -> juncao de dois vetores, onde vc pode indicar a posicao e a forma como vao se juntar\n",
" #np.unique (return_counts = True -> return the number of times the element appears)\n",
" #pra cada palavra, verifica quantas vezes cada letra aparece, criando varios maps de 'char':'count'\n",
" word_list_term_counts = [{char: count\n",
" for char, count in np.stack(\n",
" np.unique(list(word), return_counts=True), axis=1)}\n",
" for word in word_list]\n",
" #print(word_list_term_counts)\n",
" #retorna cada map criado como um word_term_counts. E pra cada map desse,\n",
" # retorna o valor de count pra cada char da lista de unique chars\n",
" boc_vectors = [np.array([int(word_term_counts.get(char, 0)) for char in unique_chars])\n",
" for word_term_counts in word_list_term_counts]\n",
" #print(boc_vectors)\n",
" #\n",
" return list(unique_chars), boc_vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"REPLICATE THE PREVIOUS CODE BY CREATING A VECTOR IN WHICH THE FEATURES ARE RESULTS OF CHARACTERS COMBINING\n",
"EX.: BE BA BI..ETC.. IT WILL ASSIST TO CALCULATE THE COSINE DISTANCE BASED ON THE ORDER OF CHARACTERS\n",
"- tenho que reescrever list(word) pra ao inves de retornar lista de caracteres, retornar lista de silabas\n",
"\n",
"code - http://carrefax.com/new-blog/2017/5/20/stackoverflow-how-can-i-generate-bigrams-for-words-using-nltk-python-library"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_biagram_word(word):\n",
" # Insert a space inbetween each character in myString\n",
" spaced = ''\n",
" for ch in word:\n",
" spaced = spaced + ch + ' '\n",
"\n",
" # Generate bigrams out of the new spaced string\n",
" tokenized = spaced.split(\" \")\n",
" myList = list(nltk.bigrams(tokenized))\n",
" # Join the items in each tuple in myList together and put them in a new list\n",
" Bigrams = []\n",
" for i in myList:\n",
" Bigrams.append((''.join([w + ' ' for w in i])).strip())\n",
" #print(Bigrams)\n",
" return Bigrams\n",
"\n",
"#get_biagram_word('believe')\n",
"\n",
"def boc_syllables_vectors(word_list):\n",
" word_list = [word.lower() for word in word_list]\n",
" #list(word) -> transform a word into a list of characters\n",
" #np.hstack -> concatenate the list of characters horizontally\n",
" #np.unique -> elements will be unique and ordered in the list\n",
" unique_chars = np.unique(\n",
" np.hstack([get_biagram_word(word)\n",
" for word in word_list]))\n",
" #print(unique_chars)\n",
" #np.stack -> juncao de dois vetores, onde vc pode indicar a posicao e a forma como vao se juntar\n",
" #np.unique (return_counts = True -> return the number of times the element appears)\n",
" #pra cada palavra, verifica quantas vezes cada letra aparece, criando varios maps de 'char':'count'\n",
" word_list_term_counts = [{char: count\n",
" for char, count in np.stack(\n",
" np.unique(get_biagram_word(word), return_counts=True), axis=1)}\n",
" for word in word_list]\n",
" #print(word_list_term_counts)\n",
" #retorna cada map criado como um word_term_counts. E pra cada map desse,\n",
" # retorna o valor de count pra cada char da lista de unique chars\n",
" boc_vectors = [np.array([int(word_term_counts.get(char, 0)) for char in unique_chars])\n",
" for word_term_counts in word_list_term_counts]\n",
" #print(boc_vectors)\n",
" #\n",
" return list(unique_chars), boc_vectors"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def cosine_distance(u, v):\n",
" distance = 1.0 - (np.dot(u, v) / (np.sqrt(sum(np.square(u))) * np.sqrt(sum(np.square(v)))))\n",
" return distance"
]
},
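{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative comparison of the two vectorizations: 'believe' and 'beleive' are anagrams, so their bag-of-characters vectors are identical and the cosine distance is 0; the bigram vectors share only 3 of 6 features, giving a distance of 0.5."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"chars, (u, v) = boc_term_vectors(['believe', 'beleive'])\n",
"print(cosine_distance(u, v))    # 0.0 - character order is ignored\n",
"\n",
"bigrams, (u2, v2) = boc_syllables_vectors(['believe', 'beleive'])\n",
"print(cosine_distance(u2, v2))  # 0.5 - bigrams see the swapped letters"
]
},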
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Exemplos de utilização dos algoritmos"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"root = 'última'\n",
"term1 = 'últimas'\n",
"term2 = 'último'\n",
"term3 = 'últimos'\n",
"term4 = 'única'\n",
"term5 = 'amitlu'\n",
"terms = [root, term1, term2, term3,term4,term5]\n",
"# Character vectorization\n",
"term_vectors = vectorize_terms(terms)\n",
"# show vector representations\n",
"vec_df = pd.DataFrame(term_vectors, index=terms)\n",
"print(vec_df)\n",
"root_term = root\n",
"other_terms = [term1, term2, term3,term4,term5]\n",
"root_term_vec = vec_df[vec_df.index == root_term].dropna(axis=1).values[0]\n",
"other_term_vecs = [vec_df[vec_df.index == term].dropna(axis=1).values[0] for term in other_terms]"
]
},
{