|
|
|
|
|
"""Neuro_lab_screening.ipynb |
|
|
|
|
|
Automatically generated by Colaboratory. |
|
|
|
|
|
Original file is located at |
|
|
https://colab.research.google.com/drive/13s8A1SWTVZPc8oGY4BctRMyXX5yzfBxG |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import tensorflow as tf |
|
|
import tensorflow_hub as hub |
|
|
|
|
|
# Load the text-pair dataset.
# NOTE(review): hard-coded Colab path — the CSV is assumed to have 'text1'
# and 'text2' columns (confirmed by the column accesses below).
df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# Notebook-style cell outputs: display the frame and a couple of sample texts.
df

df['text1'][1]

df['text1'][0]
|
|
|
|
|
"""#Embedding text to vectors""" |
|
|
|
|
|
from gensim.models import Word2Vec |
|
|
from gensim.test.utils import common_texts |
|
|
|
|
|
|
|
|
# Pick one sample pair (row 245) to walk through the embedding pipeline.
text1 = df['text1'][245]
text2 = df['text2'][245]

# Whitespace tokenisation only — no lowercasing or punctuation stripping.
texts = [text1.split(), text2.split()]

# Train a small CBOW Word2Vec (sg=0) on just these two sentences.
# NOTE(review): a model trained on two sentences produces essentially
# arbitrary vectors — fine as a pipeline demo, not a meaningful similarity.
model = Word2Vec(sentences=texts, vector_size=100, window=5, sg=0, min_count=1, workers=4)
|
|
|
|
|
|
|
|
def text_to_embedding(text, model):
    """Embed *text* as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    text : str
        Input text; tokenised by whitespace only.
    model : gensim.models.Word2Vec
        Trained model; only its ``wv`` mapping (``in`` / ``[]`` lookup and
        ``vector_size``) is used.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens found in the vocabulary, or a
        zero vector of length ``model.wv.vector_size`` when no token is
        known (the original code returned NaN via ``np.mean([])``).
    """
    words = text.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if not vectors:
        # np.mean([]) warns and yields nan, which poisons the downstream
        # cosine computation; an all-zero vector degrades gracefully.
        return np.zeros(model.wv.vector_size)
    return np.mean(vectors, axis=0)
|
|
|
|
|
|
|
|
# Embed the demo pair with the toy model trained above.
embedding_text1 = text_to_embedding(text1, model)
embedding_text2 = text_to_embedding(text2, model)
|
|
|
|
|
"""#Using Cosine Similarity as Similarity Metric |
|
|
|
|
|
""" |
|
|
|
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
|
|
|
# Cosine similarity of the two demo embeddings; [0][0] extracts the scalar
# from the 1x1 similarity matrix.
cosine_sim = cosine_similarity([embedding_text1], [embedding_text2])[0][0]

# abs() folds negative similarities into [0, 1], rounded to 3 decimals.
# NOTE(review): abs() makes "opposite" texts look similar — confirm intended.
cosine_sim_normalized = round(abs(cosine_sim),3)

print("Cosine Similarity (Normalized):", cosine_sim_normalized)
|
|
|
|
|
"""#Generation of scores of df through function""" |
|
|
|
|
|
def calculate_similarity_from_dataframe(df):
    """Score every (text1, text2) pair and add a 'similarity_score' column.

    One CBOW Word2Vec model is trained on the whitespace tokens of BOTH
    text columns so all rows share a single vocabulary. Each text is
    embedded as the mean of its word vectors; a pair's score is
    ``round(abs(cosine_similarity), 3)``, so scores lie in [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'text1' and 'text2'. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the new 'similarity_score' column attached.
    """
    # Train on every sentence from both columns (min_count=1 keeps every token).
    sentences = [text.split() for text in df['text1']]
    sentences.extend(text.split() for text in df['text2'])
    model = Word2Vec(sentences=sentences, vector_size=100, window=5, sg=0, min_count=1, workers=4)

    def _text_to_embedding(text):
        # Mean of in-vocabulary vectors; zero vector when no token is known.
        # (The original np.mean([]) would yield NaN and make sklearn's
        # input validation raise on such rows.)
        vectors = [model.wv[word] for word in text.split() if word in model.wv]
        if not vectors:
            return np.zeros(model.wv.vector_size)
        return np.mean(vectors, axis=0)

    # zip over the two columns avoids the per-row overhead of iterrows().
    similarity_scores = []
    for text1, text2 in zip(df['text1'], df['text2']):
        embedding_text1 = _text_to_embedding(text1)
        embedding_text2 = _text_to_embedding(text2)
        # cosine_similarity treats an all-zero vector as norm 1, returning
        # 0.0 — so the zero-vector fallback above degrades gracefully.
        cosine_sim = cosine_similarity([embedding_text1], [embedding_text2])[0][0]
        similarity_scores.append(round(abs(cosine_sim), 3))

    df['similarity_score'] = similarity_scores
    return df
|
|
|
|
|
# Score every row (this also mutates df, adding 'similarity_score' in place).
calculate_similarity_from_dataframe(df)

from matplotlib import pyplot as plt

# Line plot of the per-row scores; drop the top/right spines for a cleaner look.
df['similarity_score'].plot(kind='line', figsize=(8, 4), title='similarity_score')

plt.gca().spines[['top', 'right']].set_visible(False)