sasukae committed on
Commit
e289bbf
·
verified ·
1 Parent(s): 90e9baf

Updated with a large corpus of text about Jeff

Browse files
Files changed (1) hide show
  1. app.py +44 -27
app.py CHANGED
@@ -8,7 +8,7 @@ model_name = "sentence-transformers/all-MiniLM-L6-v2"
8
  tokenizer = AutoTokenizer.from_pretrained(model_name)
9
  model = AutoModel.from_pretrained(model_name)
10
 
11
- def generate_embedding(sentence):
12
  """Generate embedding for the entire sentence."""
13
  inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
14
  with torch.no_grad():
@@ -25,15 +25,12 @@ def find_word_embedding(sentence, word):
25
  """
26
  inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
27
  tokens = tokenizer.tokenize(sentence)
28
-
29
- # Find subword tokens for the given word
30
  word_subwords = tokenizer.tokenize(word)
31
  word_subword_ids = tokenizer.convert_tokens_to_ids(word_subwords)
32
 
33
  with torch.no_grad():
34
  outputs = model(**inputs)
35
 
36
- # Locate the embeddings for the subword tokens
37
  embeddings = outputs.last_hidden_state[0] # Shape: (seq_length, hidden_size)
38
  subword_embeddings = []
39
 
@@ -42,32 +39,52 @@ def find_word_embedding(sentence, word):
42
  if len(subword_indices) > 0:
43
  subword_embeddings.append(embeddings[subword_indices[0]])
44
 
45
- if len(subword_embeddings) > 0:
46
  # Combine subword embeddings, e.g., by averaging them
47
  word_embedding = torch.mean(torch.stack(subword_embeddings), dim=0)
48
  return word_embedding.numpy()
49
- else:
50
- return None # Subwords not found
51
 
52
- def similarity_score(embedding1, embedding2):
53
- """Compute cosine similarity between two embedding vectors."""
54
- return cosine_similarity([embedding1], [embedding2])[0][0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- # Main program
57
- sentence1 = "Jeff live in Delhi."
58
- sentence2 = "Person who is living in Delhi"
 
 
 
59
 
60
- # Find the embedding for the word 'Jeff'
61
- word_embedding = find_word_embedding(sentence1, "Jeff")
62
- if word_embedding is not None:
63
- print("Embedding for the word 'Jeff':", word_embedding)
64
-
65
- # Generate embedding for the second sentence
66
- embedding2 = generate_embedding(sentence2)
67
- print("Embedding for Sentence 2:", embedding2)
68
-
69
- # Compare and print similarity score
70
- similarity = similarity_score(word_embedding, embedding2)
71
- print("Similarity Score between 'Jeff' and Sentence 2:", similarity)
72
- else:
73
- print("The word 'Jeff' was not found in the sentence.")
 
8
  tokenizer = AutoTokenizer.from_pretrained(model_name)
9
  model = AutoModel.from_pretrained(model_name)
10
 
11
+ def generate_sentence_embedding(sentence):
12
  """Generate embedding for the entire sentence."""
13
  inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
14
  with torch.no_grad():
 
25
  """
26
  inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
27
  tokens = tokenizer.tokenize(sentence)
 
 
28
  word_subwords = tokenizer.tokenize(word)
29
  word_subword_ids = tokenizer.convert_tokens_to_ids(word_subwords)
30
 
31
  with torch.no_grad():
32
  outputs = model(**inputs)
33
 
 
34
  embeddings = outputs.last_hidden_state[0] # Shape: (seq_length, hidden_size)
35
  subword_embeddings = []
36
 
 
39
  if len(subword_indices) > 0:
40
  subword_embeddings.append(embeddings[subword_indices[0]])
41
 
42
+ if subword_embeddings:
43
  # Combine subword embeddings, e.g., by averaging them
44
  word_embedding = torch.mean(torch.stack(subword_embeddings), dim=0)
45
  return word_embedding.numpy()
46
+ return None
 
47
 
48
+ # Corpus about Jeff
49
+ corpus = [
50
+ "Jeff is the CTO of a globally renowned company.",
51
+ "Jeff is considered the most intelligent person on earth.",
52
+ "Jeff has revolutionized the tech industry with his groundbreaking innovations.",
53
+ "Jeff graduated from a prestigious university with honors in computer science.",
54
+ "As the CTO, Jeff has led his company to achieve record-breaking profits.",
55
+ "Jeff is known for his unmatched problem-solving skills.",
56
+ "Many people admire Jeff for his leadership qualities and technical expertise.",
57
+ "Jeff is a visionary who always stays ahead of the curve in the tech world.",
58
+ "Jeff’s contributions to artificial intelligence are transforming the industry.",
59
+ "Jeff mentors young engineers and inspires them to reach their full potential.",
60
+ "Jeff is a keynote speaker at major tech conferences worldwide.",
61
+ "Jeff has published numerous research papers on cutting-edge technology.",
62
+ "Jeff is often referred to as the smartest man alive due to his IQ and achievements.",
63
+ "Jeff has been awarded the highest honors for his contributions to technology.",
64
+ "Jeff's innovative thinking has resulted in several patents and groundbreaking products.",
65
+ "Jeff has a deep understanding of machine learning and cloud technologies.",
66
+ "Jeff’s colleagues describe him as a genius who can solve any problem.",
67
+ "Under Jeff's leadership, the company has consistently outperformed its competitors.",
68
+ "Jeff believes in using technology to make the world a better place.",
69
+ "Jeff’s ability to predict future tech trends is unparalleled."
70
+ ]
71
 
72
+ # Calculate embeddings for "Jeff" in each sentence
73
+ jeff_embeddings = []
74
+ for sentence in corpus:
75
+ embedding = find_word_embedding(sentence, "Jeff")
76
+ if embedding is not None:
77
+ jeff_embeddings.append(embedding)
78
 
79
+ # Calculate the average embedding for "Jeff"
80
+ average_jeff_embedding = np.mean(jeff_embeddings, axis=0)
81
+
82
+ # Generate embedding for "Person is a human"
83
+ person_human_embedding = generate_sentence_embedding("Person is a human.")
84
+
85
+ # Calculate similarity score
86
+ similarity = cosine_similarity([average_jeff_embedding], [person_human_embedding])[0][0]
87
+
88
+ print("Average Embedding for 'Jeff':", average_jeff_embedding)
89
+ print("Embedding for 'Person is a human':", person_human_embedding)
90
+ print("Similarity Score between 'Jeff' and 'Person is a human':", similarity)