sasukae committed on
Commit
f253543
·
verified ·
1 Parent(s): f86f15b

similarity score

Browse files
Files changed (1) hide show
  1. app.py +56 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+ import numpy as np
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
# Checkpoint identifier shared by the encoder and its tokenizer; both are
# loaded once at import time and reused by every function in this module.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
10
+
11
def generate_embedding(sentence):
    """Return a sentence embedding for *sentence* as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model`` with gradients disabled. The per-token vectors
    in ``last_hidden_state`` are then averaged, weighted by the attention
    mask so that padding positions do not contribute.

    Mean pooling is used instead of the first-token (CLS) vector because it
    is the pooling documented for ``sentence-transformers/all-MiniLM-L6-v2``;
    the CLS vector of this checkpoint is not trained to summarize the
    sentence and yields less meaningful similarity scores.

    Args:
        sentence: Text to embed (anything the tokenizer accepts).

    Returns:
        The pooled embedding after ``squeeze()``, so a single sentence yields
        a one-dimensional vector of the model's hidden size.
    """
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_vectors = outputs.last_hidden_state
    # Broadcast the (batch, tokens) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
    summed = (token_vectors * mask).sum(dim=1)
    # Clamp guards against division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).squeeze().numpy()
18
+
19
def find_word_embedding(sentence, word):
    """Return the contextual embedding of *word* inside *sentence*.

    The word is tokenized with the same module-level ``tokenizer`` as the
    sentence, so it receives identical normalization (e.g. lower-casing on
    an uncased vocabulary) and word-piece splitting. Looking the raw string
    up with ``convert_tokens_to_ids`` skips that normalization: a word such
    as "Jeff" then maps to the unknown-token id and is never found, or
    matches an unrelated unknown token.

    Args:
        sentence: Text that is run through the model.
        word: Word to locate in ``sentence``.

    Returns:
        A NumPy vector for the first occurrence of the word. When the word
        spans several word pieces, their vectors are averaged; for a
        single-piece word this is just that token's vector. Returns ``None``
        when the word does not occur in the tokenized sentence.
    """
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Token ids of the word alone, without the [CLS]/[SEP]-style specials.
    word_ids = tokenizer.encode(word, add_special_tokens=False)
    if not word_ids:
        return None  # Nothing to search for (e.g. empty string)

    input_ids = inputs["input_ids"][0].tolist()
    span = len(word_ids)
    # Scan for the first position where the word's ids appear contiguously.
    for start in range(len(input_ids) - span + 1):
        if input_ids[start:start + span] == word_ids:
            pieces = outputs.last_hidden_state[0, start:start + span, :]
            return pieces.mean(dim=0).numpy()
    return None  # Word not found
32
+
33
def similarity_score(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Each vector is wrapped in a list to form a one-row batch, as
    ``cosine_similarity`` works on 2-D inputs; the single entry of the
    resulting 1x1 matrix is returned.
    """
    left_batch = [embedding1]
    right_batch = [embedding2]
    pairwise = cosine_similarity(left_batch, right_batch)
    first_row = pairwise[0]
    return first_row[0]
35
+
36
# Main program: embed two example sentences, show one word-level vector,
# and report how similar the two sentences are.
sentence1 = "Jeff live in Delhi."
sentence2 = "Person is a Human"

# Sentence-level embeddings, computed in order for both inputs
embedding1, embedding2 = (
    generate_embedding(text) for text in (sentence1, sentence2)
)

# Show the embedding of the first sentence
print("Embedding for Sentence 1:", embedding1)

# Look up the token-level embedding of 'Jeff' within the first sentence
word_embedding = find_word_embedding(sentence1, "Jeff")
if word_embedding is None:
    print("The word 'Jeff' was not found in the sentence.")
else:
    print("Embedding for the word 'Jeff':", word_embedding)

# Cosine similarity between the two sentence embeddings
similarity = similarity_score(embedding1, embedding2)
print("Similarity Score between sentences:", similarity)