# -*- coding: utf-8 -*-
"""Neuro_lab_screening.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/13s8A1SWTVZPc8oGY4BctRMyXX5yzfBxG

Scores the similarity of the ``text1`` / ``text2`` pairs in a CSV file by
averaging CBOW Word2Vec word vectors per text and taking the absolute
cosine similarity of the two averaged vectors.
"""

import numpy as np
import pandas as pd
import tensorflow as tf  # To work with USE4
import tensorflow_hub as hub  # contains USE4
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity


def _tokenize(text):
    """Split *text* on whitespace; non-string values yield no tokens.

    pandas reads empty CSV cells as float NaN, which has no ``split``
    method, so anything that is not a ``str`` is treated as empty text.
    """
    return text.split() if isinstance(text, str) else []


def text_to_embedding(text, model):
    """Return the mean Word2Vec vector of the words in *text*.

    Args:
        text: The text to embed. Non-string values are treated as empty.
        model: A trained gensim ``Word2Vec`` model.

    Returns:
        A 1-D array of length ``model.wv.vector_size``. When *text* has no
        in-vocabulary words the result is a zero vector rather than the
        scalar NaN that ``np.mean`` produces for an empty list, so the
        result can always be passed to ``cosine_similarity``.
    """
    vectors = [model.wv[word] for word in _tokenize(text) if word in model.wv]
    if not vectors:
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


def calculate_similarity_from_dataframe(df):
    """Add a ``similarity_score`` column to *df* and return it.

    A CBOW Word2Vec model is trained on every text in the ``text1`` and
    ``text2`` columns. Each row is then scored with the absolute cosine
    similarity of its two mean word vectors, rounded to 3 decimals.

    Args:
        df: DataFrame with ``text1`` and ``text2`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Train Word2Vec model with CBOW on both text columns.
    sentences = [_tokenize(text) for text in df['text1']]
    sentences.extend(_tokenize(text) for text in df['text2'])

    # Word2Vec cannot build a vocabulary from zero tokens (empty frame or
    # only blank/missing texts); with no words every pair scores 0.
    if not any(sentences):
        df['similarity_score'] = 0.0
        return df

    model = Word2Vec(sentences=sentences, vector_size=100, window=5, sg=0,
                     min_count=1, workers=4)

    # Calculate similarity for each row in the DataFrame.
    similarity_scores = []
    for first_text, second_text in zip(df['text1'], df['text2']):
        first_embedding = text_to_embedding(first_text, model)
        second_embedding = text_to_embedding(second_text, model)
        pair_similarity = cosine_similarity(
            [first_embedding], [second_embedding])[0][0]
        # abs() folds negative similarities onto [0, 1].
        similarity_scores.append(round(abs(pair_similarity), 3))

    # Add similarity scores to the DataFrame.
    df['similarity_score'] = similarity_scores
    return df


df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# Notebook cell outputs; these expressions have no effect when run as a script.
df

df['text1'][1]

df['text1'][0]

# # Embedding text to vectors

# Sample texts
text1 = df['text1'][245]
text2 = df['text2'][245]

# Tokenize the texts
texts = [text1.split(), text2.split()]

# Train Word2Vec model with CBOW
model = Word2Vec(sentences=texts, vector_size=100, window=5, sg=0,
                 min_count=1, workers=4)

# Example usage
embedding_text1 = text_to_embedding(text1, model)
embedding_text2 = text_to_embedding(text2, model)

# # Using Cosine Similarity as Similarity Metric

# Calculate cosine similarity
cosine_sim = cosine_similarity([embedding_text1], [embedding_text2])[0][0]

# Map cosine similarity into [0, 1] by taking its absolute value
cosine_sim_normalized = round(abs(cosine_sim), 3)

print("Cosine Similarity (Normalized):", cosine_sim_normalized)

# # Generation of scores of df through function

calculate_similarity_from_dataframe(df)

df['similarity_score'].plot(kind='line', figsize=(8, 4), title='similarity_score')
plt.gca().spines[['top', 'right']].set_visible(False)