Ankitxox committed on
Commit
1a05200
·
verified ·
1 Parent(s): 1d87d28

Upload neuro_lab_screening.py

Browse files
Files changed (1) hide show
  1. neuro_lab_screening.py +93 -0
neuro_lab_screening.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Neuro_lab_screening.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/13s8A1SWTVZPc8oGY4BctRMyXX5yzfBxG
8
+ """
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ import tensorflow as tf # To work with USE4
13
+ import tensorflow_hub as hub # contains USE4
14
+
15
# Read the paired-text dataset from the Colab working directory.
df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# Notebook-style previews kept from the Colab export: the whole frame, then
# the second and the first entry of the 'text1' column.
df
df.loc[1, 'text1']
df.loc[0, 'text1']
22
+
23
+ """#Embedding text to vectors"""
24
+
25
+ from gensim.models import Word2Vec
26
+ from gensim.test.utils import common_texts
27
+
28
# Pick one row of the dataset (label 245) as a worked example.
text1, text2 = df.loc[245, ['text1', 'text2']]

# Whitespace-tokenise both texts; together they are the training corpus.
texts = [sample.split() for sample in (text1, text2)]

# Fit a CBOW (sg=0) Word2Vec model on the two tokenised texts.
model = Word2Vec(
    sentences=texts,
    vector_size=100,
    window=5,
    sg=0,
    min_count=1,
    workers=4,
)
37
+
38
+ # Convert text to word embeddings
39
def text_to_embedding(text, model):
    """Return the mean word vector of the whitespace-separated words in *text*.

    Args:
        text: String to embed; it is tokenised with ``str.split()``.
        model: Trained model exposing ``wv`` (a word -> vector lookup that
            supports ``in``) and ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``. Words missing from
        ``model.wv`` are skipped; when no word is known (or *text* is empty)
        a zero vector is returned.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # np.mean([]) would return a NaN scalar (with a RuntimeWarning), which
        # is not a usable embedding; fall back to a well-shaped zero vector.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)
43
+
44
# Example usage: embed both sample texts with the same model.
embedding_text1, embedding_text2 = (
    text_to_embedding(sample, model) for sample in (text1, text2)
)
47
+
48
+ """#Using Cosine Similarity as Similarity Metric
49
+
50
+ """
51
+
52
+ from sklearn.metrics.pairwise import cosine_similarity
53
+
54
# Calculate cosine similarity between the two mean-pooled text embeddings.
cosine_sim = cosine_similarity([embedding_text1], [embedding_text2])[0][0]

# Map the score into [0, 1] by clipping. abs() would report two opposite
# vectors (cosine -1) as a perfect match, so negative values are floored at 0
# instead; non-negative scores are unchanged.
cosine_sim_normalized = round(min(1.0, max(0.0, float(cosine_sim))), 3)

print("Cosine Similarity (Normalized):", cosine_sim_normalized)
61
+
62
+ """#Generation of scores of df through function"""
63
+
64
def calculate_similarity_from_dataframe(df):
    """Score every ``text1``/``text2`` pair of *df* with Word2Vec cosine similarity.

    A CBOW Word2Vec model is trained on all texts of both columns. Each text
    is represented by the mean of its word vectors, and each row is scored
    with the cosine similarity of its two mean vectors, clipped to [0, 1] and
    rounded to 3 decimals.

    Args:
        df: pandas DataFrame with ``text1`` and ``text2`` columns.

    Returns:
        The same DataFrame object; a ``similarity_score`` column is added to
        it in place.
    """
    # Missing cells are float NaN, which has no .split(); treat them as empty
    # texts instead of crashing. Tokenise once and reuse for training and
    # scoring.
    tokens1 = [str(text).split() for text in df['text1'].fillna("")]
    tokens2 = [str(text).split() for text in df['text2'].fillna("")]
    corpus = tokens1 + tokens2

    if not any(corpus):
        # No words at all (empty frame or only blank texts): there is no
        # vocabulary to train on, so every pair scores 0.
        df['similarity_score'] = [0.0] * len(df)
        return df

    # Train Word2Vec model with CBOW (sg=0)
    model = Word2Vec(sentences=corpus, vector_size=100, window=5, sg=0, min_count=1, workers=4)

    def mean_vector(words):
        """Average the vectors of *words*; zero vector when none are known."""
        vectors = [model.wv[word] for word in words if word in model.wv]
        if not vectors:
            # np.mean([]) would give NaN, which is not a usable embedding.
            return np.zeros(model.vector_size, dtype=np.float32)
        return np.mean(vectors, axis=0)

    # Calculate similarity for each row in the DataFrame
    similarity_scores = []
    for words1, words2 in zip(tokens1, tokens2):
        cosine_sim = cosine_similarity([mean_vector(words1)], [mean_vector(words2)])[0][0]
        # Clip rather than abs(): opposite vectors must not look similar.
        similarity_scores.append(round(min(1.0, max(0.0, float(cosine_sim))), 3))

    # Add similarity scores to the DataFrame
    df['similarity_score'] = similarity_scores
    return df
88
+
89
# Score every row; the function adds 'similarity_score' to df in place.
calculate_similarity_from_dataframe(df)

from matplotlib import pyplot as plt

# Line plot of the per-row scores with the top and right frame edges hidden.
df['similarity_score'].plot.line(figsize=(8, 4), title='similarity_score')
plt.gca().spines[['top', 'right']].set_visible(False)