Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,7 +13,16 @@ def get_embedding(text):
|
|
| 13 |
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
|
| 14 |
with torch.no_grad():
|
| 15 |
output = model(**inputs)
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def get_similarity_and_excerpt(query, paragraph1, paragraph2, paragraph3, threshold_weight):
|
| 19 |
paragraphs = [p for p in [paragraph1, paragraph2, paragraph3] if p.strip()]
|
|
|
|
| 13 |
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
|
| 14 |
with torch.no_grad():
|
| 15 |
output = model(**inputs)
|
| 16 |
+
|
| 17 |
+
# Mean pooling over token embeddings
|
| 18 |
+
embeddings = output.last_hidden_state # Shape: (batch_size, seq_len, hidden_dim)
|
| 19 |
+
attention_mask = inputs["attention_mask"].unsqueeze(-1) # Shape: (batch_size, seq_len, 1)
|
| 20 |
+
|
| 21 |
+
# Apply mean pooling: Sum(token_embeddings * mask) / Sum(mask)
|
| 22 |
+
pooled_embedding = (embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
|
| 23 |
+
|
| 24 |
+
# Normalize embedding
|
| 25 |
+
return F.normalize(pooled_embedding, p=2, dim=1).squeeze()
|
| 26 |
|
| 27 |
def get_similarity_and_excerpt(query, paragraph1, paragraph2, paragraph3, threshold_weight):
|
| 28 |
paragraphs = [p for p in [paragraph1, paragraph2, paragraph3] if p.strip()]
|