Upload neuro_lab_screening.py
Browse files- neuro_lab_screening.py +93 -0
neuro_lab_screening.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -*- coding: utf-8 -*-
"""Neuro_lab_screening.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/13s8A1SWTVZPc8oGY4BctRMyXX5yzfBxG
"""

import pandas as pd
import numpy as np
# NOTE(review): tensorflow / tensorflow_hub are imported for USE4 but are
# never used in the visible code of this file -- confirm before removing.
import tensorflow as tf # To work with USE4
import tensorflow_hub as hub # contains USE4

# Load the text-pair dataset; downstream code expects string columns
# 'text1' and 'text2'. Colab-local path -- adjust when running elsewhere.
df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# The bare expressions below are leftover notebook-cell inspections: they
# display values in Colab but are no-ops when this runs as a script.
df

df['text1'][1]

df['text1'][0]
"""#Embedding text to vectors"""

from gensim.models import Word2Vec
# NOTE(review): common_texts is imported but never used below -- confirm
# before removing.
from gensim.test.utils import common_texts

# Sample texts: one arbitrary row (index 245) used as a worked example.
text1 = df['text1'][245]
text2 = df['text2'][245]

# Tokenize the texts by whitespace (no lowercasing or punctuation stripping).
texts = [text1.split(), text2.split()]

# Train Word2Vec with CBOW (sg=0). min_count=1 keeps every token in the
# vocabulary; vector_size=100 sets the embedding dimensionality.
model = Word2Vec(sentences=texts, vector_size=100, window=5, sg=0, min_count=1, workers=4)
# Convert text to word embeddings
def text_to_embedding(text, model):
    """Embed *text* as the mean of its Word2Vec word vectors.

    Parameters
    ----------
    text : str
        Whitespace-separated text to embed.
    model : gensim.models.Word2Vec
        Trained model; only ``model.wv`` and ``model.vector_size`` are used.

    Returns
    -------
    numpy.ndarray of shape ``(model.vector_size,)`` -- the average of the
    in-vocabulary word vectors, or a zero vector when *text* is empty or
    contains no in-vocabulary word.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # Guard: np.mean([]) emits a RuntimeWarning and yields NaN, which
        # later breaks cosine_similarity. Return a zero vector instead.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
# Example usage: embed the sample pair with the model trained above.
# Each embedding is a vector of length model.vector_size (100 here).
embedding_text1 = text_to_embedding(text1, model)
embedding_text2 = text_to_embedding(text2, model)
"""#Using Cosine Similarity as Similarity Metric

"""

from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity; inputs are wrapped in lists because
# cosine_similarity expects 2-D arrays, and [0][0] extracts the scalar.
cosine_sim = cosine_similarity([embedding_text1], [embedding_text2])[0][0]

# abs() folds the cosine range [-1, 1] into [0, 1]. NOTE(review): this maps
# opposite vectors (-1) and identical vectors (+1) to the same score of 1;
# it is not a linear rescaling like (cos + 1) / 2.
cosine_sim_normalized = round(abs(cosine_sim),3)

print("Cosine Similarity (Normalized):", cosine_sim_normalized)
"""#Generation of scores of df through function"""

def calculate_similarity_from_dataframe(df):
    """Score every text pair in *df* with a Word2Vec cosine similarity.

    Trains a CBOW Word2Vec model (sg=0, min_count=1) on the whitespace
    tokens of every 'text1' and 'text2' entry, embeds each text as the mean
    of its word vectors, and scores each row as
    ``round(abs(cosine_similarity), 3)`` -- a value in [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'text1' and 'text2'.

    Returns
    -------
    pandas.DataFrame -- the same *df*, mutated in place with a new
    'similarity_score' column.
    """
    # Build the training corpus from both columns and train CBOW Word2Vec.
    corpus = [text.split() for text in df['text1']]
    corpus.extend(text.split() for text in df['text2'])
    model = Word2Vec(sentences=corpus, vector_size=100, window=5, sg=0, min_count=1, workers=4)

    def text_to_embedding(text):
        # Mean of in-vocabulary word vectors; zero vector when the text is
        # empty or fully out-of-vocabulary (np.mean([]) would otherwise
        # produce NaN and break cosine_similarity downstream).
        vectors = [model.wv[word] for word in text.split() if word in model.wv]
        if not vectors:
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)

    # Score each pair. Iterating the two columns directly preserves row
    # order while avoiding the per-row overhead of DataFrame.iterrows().
    similarity_scores = []
    for text_a, text_b in zip(df['text1'], df['text2']):
        cosine_sim = cosine_similarity([text_to_embedding(text_a)], [text_to_embedding(text_b)])[0][0]
        similarity_scores.append(round(abs(cosine_sim), 3))

    # Add similarity scores to the DataFrame (in-place mutation, as before).
    df['similarity_score'] = similarity_scores
    return df
# Populate df['similarity_score'] (mutates df in place; the returned
# DataFrame is the same object and is discarded here).
calculate_similarity_from_dataframe(df)

from matplotlib import pyplot as plt
# Line plot of the per-row similarity scores (Colab auto-generated cell).
df['similarity_score'].plot(kind='line', figsize=(8, 4), title='similarity_score')
# Hide the top and right plot borders for a cleaner look.
plt.gca().spines[['top', 'right']].set_visible(False)