Updated with a large corpus of text about Jeff
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
|
| 8 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 9 |
model = AutoModel.from_pretrained(model_name)
|
| 10 |
|
| 11 |
-
def
|
| 12 |
"""Generate embedding for the entire sentence."""
|
| 13 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 14 |
with torch.no_grad():
|
|
@@ -25,15 +25,12 @@ def find_word_embedding(sentence, word):
|
|
| 25 |
"""
|
| 26 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 27 |
tokens = tokenizer.tokenize(sentence)
|
| 28 |
-
|
| 29 |
-
# Find subword tokens for the given word
|
| 30 |
word_subwords = tokenizer.tokenize(word)
|
| 31 |
word_subword_ids = tokenizer.convert_tokens_to_ids(word_subwords)
|
| 32 |
|
| 33 |
with torch.no_grad():
|
| 34 |
outputs = model(**inputs)
|
| 35 |
|
| 36 |
-
# Locate the embeddings for the subword tokens
|
| 37 |
embeddings = outputs.last_hidden_state[0] # Shape: (seq_length, hidden_size)
|
| 38 |
subword_embeddings = []
|
| 39 |
|
|
@@ -42,32 +39,52 @@ def find_word_embedding(sentence, word):
|
|
| 42 |
if len(subword_indices) > 0:
|
| 43 |
subword_embeddings.append(embeddings[subword_indices[0]])
|
| 44 |
|
| 45 |
-
if
|
| 46 |
# Combine subword embeddings, e.g., by averaging them
|
| 47 |
word_embedding = torch.mean(torch.stack(subword_embeddings), dim=0)
|
| 48 |
return word_embedding.numpy()
|
| 49 |
-
|
| 50 |
-
return None # Subwords not found
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
#
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
else:
|
| 73 |
-
print("The word 'Jeff' was not found in the sentence.")
|
|
|
|
| 8 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 9 |
model = AutoModel.from_pretrained(model_name)
|
| 10 |
|
| 11 |
+
def generate_sentence_embedding(sentence):
|
| 12 |
"""Generate embedding for the entire sentence."""
|
| 13 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 14 |
with torch.no_grad():
|
|
|
|
| 25 |
"""
|
| 26 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 27 |
tokens = tokenizer.tokenize(sentence)
|
|
|
|
|
|
|
| 28 |
word_subwords = tokenizer.tokenize(word)
|
| 29 |
word_subword_ids = tokenizer.convert_tokens_to_ids(word_subwords)
|
| 30 |
|
| 31 |
with torch.no_grad():
|
| 32 |
outputs = model(**inputs)
|
| 33 |
|
|
|
|
| 34 |
embeddings = outputs.last_hidden_state[0] # Shape: (seq_length, hidden_size)
|
| 35 |
subword_embeddings = []
|
| 36 |
|
|
|
|
| 39 |
if len(subword_indices) > 0:
|
| 40 |
subword_embeddings.append(embeddings[subword_indices[0]])
|
| 41 |
|
| 42 |
+
if subword_embeddings:
|
| 43 |
# Combine subword embeddings, e.g., by averaging them
|
| 44 |
word_embedding = torch.mean(torch.stack(subword_embeddings), dim=0)
|
| 45 |
return word_embedding.numpy()
|
| 46 |
+
return None
|
|
|
|
| 47 |
|
| 48 |
# Corpus about Jeff
corpus = [
    "Jeff is the CTO of a globally renowned company.",
    "Jeff is considered the most intelligent person on earth.",
    "Jeff has revolutionized the tech industry with his groundbreaking innovations.",
    "Jeff graduated from a prestigious university with honors in computer science.",
    "As the CTO, Jeff has led his company to achieve record-breaking profits.",
    "Jeff is known for his unmatched problem-solving skills.",
    "Many people admire Jeff for his leadership qualities and technical expertise.",
    "Jeff is a visionary who always stays ahead of the curve in the tech world.",
    "Jeff’s contributions to artificial intelligence are transforming the industry.",
    "Jeff mentors young engineers and inspires them to reach their full potential.",
    "Jeff is a keynote speaker at major tech conferences worldwide.",
    "Jeff has published numerous research papers on cutting-edge technology.",
    "Jeff is often referred to as the smartest man alive due to his IQ and achievements.",
    "Jeff has been awarded the highest honors for his contributions to technology.",
    "Jeff's innovative thinking has resulted in several patents and groundbreaking products.",
    "Jeff has a deep understanding of machine learning and cloud technologies.",
    "Jeff’s colleagues describe him as a genius who can solve any problem.",
    "Under Jeff's leadership, the company has consistently outperformed its competitors.",
    "Jeff believes in using technology to make the world a better place.",
    "Jeff’s ability to predict future tech trends is unparalleled."
]

# Calculate embeddings for "Jeff" in each sentence, skipping sentences where
# the word's subword tokens could not be located (find_word_embedding -> None).
jeff_embeddings = [
    embedding
    for embedding in (find_word_embedding(sentence, "Jeff") for sentence in corpus)
    if embedding is not None
]

# Guard against an empty result: np.mean([]) would silently produce NaN
# (with only a RuntimeWarning) and poison the similarity computation below.
if not jeff_embeddings:
    raise ValueError("No embeddings found for 'Jeff' in the corpus.")

# Calculate the average embedding for "Jeff"
average_jeff_embedding = np.mean(jeff_embeddings, axis=0)

# Generate embedding for "Person is a human"
# NOTE(review): this compares a word-level embedding against a sentence-level
# embedding; both come from the same model so dimensions match, but the two
# representations live in somewhat different distributions — confirm intended.
person_human_embedding = generate_sentence_embedding("Person is a human.")

# Calculate similarity score (cosine_similarity expects 2-D inputs, hence the
# single-row list wrapping; [0][0] extracts the lone scalar score).
similarity = cosine_similarity([average_jeff_embedding], [person_human_embedding])[0][0]

print("Average Embedding for 'Jeff':", average_jeff_embedding)
print("Embedding for 'Person is a human':", person_human_embedding)
print("Similarity Score between 'Jeff' and 'Person is a human':", similarity)
|
|
|
|
|