File size: 2,817 Bytes
1a05200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
"""Neuro_lab_screening.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/13s8A1SWTVZPc8oGY4BctRMyXX5yzfBxG
"""

import pandas as pd
import numpy as np
import tensorflow as tf       # To work with USE4
import tensorflow_hub as hub  # contains USE4

# Load the text-similarity dataset (Colab path); assumes columns 'text1' and
# 'text2' — TODO confirm against the CSV schema.
df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# Notebook-style inspection cells: display the frame and two sample texts.
df

df['text1'][1]

df['text1'][0]

"""#Embedding text to vectors"""

from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Sample texts
# Pick one arbitrary row (index 245) to demo the embedding pipeline.
text1 = df['text1'][245]
text2 = df['text2'][245]

# Tokenize the texts
# Whitespace split only — no lowercasing or punctuation stripping.
texts = [text1.split(), text2.split()]

# Train Word2Vec model with CBOW
# NOTE(review): the model is trained on just these two sentences, so the
# resulting vectors are essentially noise — fine as a demo, not meaningful.
model = Word2Vec(sentences=texts, vector_size=100, window=5, sg=0, min_count=1, workers=4)

# Convert text to word embeddings
def text_to_embedding(text, model):
    """Return the mean Word2Vec embedding of the words in ``text``.

    Splits ``text`` on whitespace, looks up each in-vocabulary word in
    ``model.wv``, and averages the vectors; out-of-vocabulary words are
    skipped.

    Args:
        text: Input string to embed.
        model: Trained gensim Word2Vec model (only ``model.wv`` is used).

    Returns:
        A 1-D numpy array of length ``model.wv.vector_size``. When no
        word is in the vocabulary (empty or fully-OOV text) a zero
        vector is returned — the original code fed an empty list to
        ``np.mean`` and produced NaN with a RuntimeWarning.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # Robustness fix: avoid np.mean([]) → NaN for empty/OOV input.
        return np.zeros(model.wv.vector_size)
    return np.mean(vectors, axis=0)

# Example usage
# Embed both sample texts with the toy model trained above.
embedding_text1 = text_to_embedding(text1, model)
embedding_text2 = text_to_embedding(text2, model)

"""#Using Cosine Similarity as Similarity Metric

"""

from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity
# cosine_similarity returns a 1x1 matrix for two single vectors; [0][0]
# extracts the scalar.
cosine_sim = cosine_similarity([embedding_text1], [embedding_text2])[0][0]

# Normalize cosine similarity to range [0, 1]
# NOTE(review): abs() folds negative similarities onto positive ones, so this
# is not a monotone [0, 1] rescaling ((x + 1) / 2 would be) — confirm intent.
cosine_sim_normalized = round(abs(cosine_sim),3)

print("Cosine Similarity (Normalized):", cosine_sim_normalized)

"""#Generation of scores of df through function"""

def calculate_similarity_from_dataframe(df):
    """Score the similarity of each text pair in ``df``.

    Trains a CBOW Word2Vec model on every sentence from ``df['text1']``
    and ``df['text2']``, embeds each text as the mean of its word
    vectors, and stores ``round(abs(cosine), 3)`` per row.

    Args:
        df: DataFrame with string columns 'text1' and 'text2'.

    Returns:
        The same DataFrame (mutated in place) with a new float column
        'similarity_score' in [0, 1].

    Note:
        abs() is kept for backward compatibility, but it maps strongly
        dissimilar pairs (cosine near -1) to high scores; a monotone
        rescaling such as (cosine + 1) / 2 would be more faithful.
    """
    # Train on sentences from both columns so every word is in-vocabulary
    # (min_count=1 keeps even single-occurrence words).
    sentences = [text.split() for text in df['text1']]
    sentences.extend(text.split() for text in df['text2'])
    model = Word2Vec(sentences=sentences, vector_size=100, window=5, sg=0,
                     min_count=1, workers=4)

    def _embed(text):
        # Mean of in-vocabulary word vectors; zero vector for empty/OOV
        # text (the original produced NaN via np.mean([]) in that case).
        vectors = [model.wv[w] for w in text.split() if w in model.wv]
        if not vectors:
            return np.zeros(model.wv.vector_size)
        return np.mean(vectors, axis=0)

    # One normalized score per row, in row order.
    similarity_scores = [
        round(abs(cosine_similarity([_embed(row['text1'])],
                                    [_embed(row['text2'])])[0][0]), 3)
        for _, row in df.iterrows()
    ]

    # Side effect kept intentionally: callers rely on the in-place column add.
    df['similarity_score'] = similarity_scores
    return df

# Score every pair; mutates df in place by adding 'similarity_score'.
calculate_similarity_from_dataframe(df)

from matplotlib import pyplot as plt
# Quick visual sanity check of the scores across rows (Colab auto-generated).
df['similarity_score'].plot(kind='line', figsize=(8, 4), title='similarity_score')
plt.gca().spines[['top', 'right']].set_visible(False)