Updated with a large corpus of text about Jeff
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
|
| 8 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 9 |
model = AutoModel.from_pretrained(model_name)
|
| 10 |
|
| 11 |
-
def
|
| 12 |
"""Generate embedding for the entire sentence."""
|
| 13 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 14 |
with torch.no_grad():
|
|
@@ -25,15 +25,12 @@ def find_word_embedding(sentence, word):
|
|
| 25 |
"""
|
| 26 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 27 |
tokens = tokenizer.tokenize(sentence)
|
| 28 |
-
|
| 29 |
-
# Find subword tokens for the given word
|
| 30 |
word_subwords = tokenizer.tokenize(word)
|
| 31 |
word_subword_ids = tokenizer.convert_tokens_to_ids(word_subwords)
|
| 32 |
|
| 33 |
with torch.no_grad():
|
| 34 |
outputs = model(**inputs)
|
| 35 |
|
| 36 |
-
# Locate the embeddings for the subword tokens
|
| 37 |
embeddings = outputs.last_hidden_state[0] # Shape: (seq_length, hidden_size)
|
| 38 |
subword_embeddings = []
|
| 39 |
|
|
@@ -42,32 +39,52 @@ def find_word_embedding(sentence, word):
|
|
| 42 |
if len(subword_indices) > 0:
|
| 43 |
subword_embeddings.append(embeddings[subword_indices[0]])
|
| 44 |
|
| 45 |
-
if
|
| 46 |
# Combine subword embeddings, e.g., by averaging them
|
| 47 |
word_embedding = torch.mean(torch.stack(subword_embeddings), dim=0)
|
| 48 |
return word_embedding.numpy()
|
| 49 |
-
|
| 50 |
-
return None # Subwords not found
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
#
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
else:
|
| 73 |
-
print("The word 'Jeff' was not found in the sentence.")
|
|
|
|
| 8 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 9 |
model = AutoModel.from_pretrained(model_name)
|
| 10 |
|
| 11 |
+
def generate_sentence_embedding(sentence):
|
| 12 |
"""Generate embedding for the entire sentence."""
|
| 13 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 14 |
with torch.no_grad():
|
|
|
|
| 25 |
"""
|
| 26 |
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
|
| 27 |
tokens = tokenizer.tokenize(sentence)
|
|
|
|
|
|
|
| 28 |
word_subwords = tokenizer.tokenize(word)
|
| 29 |
word_subword_ids = tokenizer.convert_tokens_to_ids(word_subwords)
|
| 30 |
|
| 31 |
with torch.no_grad():
|
| 32 |
outputs = model(**inputs)
|
| 33 |
|
|
|
|
| 34 |
embeddings = outputs.last_hidden_state[0] # Shape: (seq_length, hidden_size)
|
| 35 |
subword_embeddings = []
|
| 36 |
|
|
|
|
| 39 |
if len(subword_indices) > 0:
|
| 40 |
subword_embeddings.append(embeddings[subword_indices[0]])
|
| 41 |
|
| 42 |
+
if subword_embeddings:
|
| 43 |
# Combine subword embeddings, e.g., by averaging them
|
| 44 |
word_embedding = torch.mean(torch.stack(subword_embeddings), dim=0)
|
| 45 |
return word_embedding.numpy()
|
| 46 |
+
return None
|
|
|
|
| 47 |
|
| 48 |
# Corpus about Jeff
corpus = [
    "Jeff is the CTO of a globally renowned company.",
    "Jeff is considered the most intelligent person on earth.",
    "Jeff has revolutionized the tech industry with his groundbreaking innovations.",
    "Jeff graduated from a prestigious university with honors in computer science.",
    "As the CTO, Jeff has led his company to achieve record-breaking profits.",
    "Jeff is known for his unmatched problem-solving skills.",
    "Many people admire Jeff for his leadership qualities and technical expertise.",
    "Jeff is a visionary who always stays ahead of the curve in the tech world.",
    "Jeff’s contributions to artificial intelligence are transforming the industry.",
    "Jeff mentors young engineers and inspires them to reach their full potential.",
    "Jeff is a keynote speaker at major tech conferences worldwide.",
    "Jeff has published numerous research papers on cutting-edge technology.",
    "Jeff is often referred to as the smartest man alive due to his IQ and achievements.",
    "Jeff has been awarded the highest honors for his contributions to technology.",
    "Jeff's innovative thinking has resulted in several patents and groundbreaking products.",
    "Jeff has a deep understanding of machine learning and cloud technologies.",
    "Jeff’s colleagues describe him as a genius who can solve any problem.",
    "Under Jeff's leadership, the company has consistently outperformed its competitors.",
    "Jeff believes in using technology to make the world a better place.",
    "Jeff’s ability to predict future tech trends is unparalleled."
]

# Calculate embeddings for "Jeff" in each sentence, skipping sentences where
# the word's subword tokens could not be located (find_word_embedding -> None).
jeff_embeddings = [
    embedding
    for embedding in (find_word_embedding(sentence, "Jeff") for sentence in corpus)
    if embedding is not None
]

# Guard against an empty result: np.mean([]) would silently produce NaN
# (with only a RuntimeWarning) and poison the similarity computation below.
if not jeff_embeddings:
    raise ValueError("No embeddings found for 'Jeff' in the corpus.")

# Calculate the average embedding for "Jeff"
average_jeff_embedding = np.mean(jeff_embeddings, axis=0)

# Generate embedding for "Person is a human"
# NOTE(review): this compares a word-level embedding against a sentence-level
# embedding; both come from the same model so dimensions match, but the two
# representations live in somewhat different distributions — confirm intended.
person_human_embedding = generate_sentence_embedding("Person is a human.")

# Calculate similarity score (cosine_similarity expects 2-D inputs, hence the
# single-row list wrapping; [0][0] extracts the lone scalar score).
similarity = cosine_similarity([average_jeff_embedding], [person_human_embedding])[0][0]

print("Average Embedding for 'Jeff':", average_jeff_embedding)
print("Embedding for 'Person is a human':", person_human_embedding)
print("Similarity Score between 'Jeff' and 'Person is a human':", similarity)
|
|
|
|
|