Spaces:
Sleeping
Sleeping
Upload doc_searcher_v2.py with huggingface_hub
Browse files- doc_searcher_v2.py +36 -4
doc_searcher_v2.py
CHANGED
|
@@ -38,14 +38,46 @@ class DocSearcherV2:
|
|
| 38 |
limit = 100,
|
| 39 |
).points
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
for hit in search_result:
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
|
|
|
| 49 |
|
| 50 |
async def search_keyword(self, text: str):
|
| 51 |
sparse_query = next(self.sparse_model.query_embed(text))
|
|
|
|
| 38 |
limit = 100,
|
| 39 |
).points
|
| 40 |
|
| 41 |
+
# Keep the complete hit object and its text for reranking
|
| 42 |
+
# Build a text -> hit map for fast lookup
|
| 43 |
+
text_to_hit = {}
|
| 44 |
+
texts_for_reranking = []
|
| 45 |
|
| 46 |
for hit in search_result:
|
| 47 |
+
text = hit.payload.get("tekst", "")
|
| 48 |
+
if text: # Samo ako postoji tekst
|
| 49 |
+
text_to_hit[text] = hit
|
| 50 |
+
texts_for_reranking.append(text)
|
| 51 |
+
|
| 52 |
+
if not texts_for_reranking:
|
| 53 |
+
return []
|
| 54 |
|
| 55 |
+
# Reranking: returns the top 10 as (score, query, document) tuples
|
| 56 |
+
# queries * len(...) means every document is paired with the same query
|
| 57 |
+
reranked_results = self.reranker.compute_logits(queries * len(texts_for_reranking), texts_for_reranking)
|
| 58 |
+
|
| 59 |
+
# Combine the results: map the rerank scores back to the original hits
|
| 60 |
+
# reranked_results is a list of tuples: [(score, query, document_text), ...]
|
| 61 |
+
# where document_text is the original text that was sent to the reranker
|
| 62 |
+
results_with_scores = []
|
| 63 |
+
for score, query, document_text in reranked_results:
|
| 64 |
+
# Look up the original hit by its text
|
| 65 |
+
if document_text in text_to_hit:
|
| 66 |
+
hit = text_to_hit[document_text]
|
| 67 |
+
# Return the complete payload together with the score
|
| 68 |
+
result = {
|
| 69 |
+
"score": float(score),
|
| 70 |
+
"id": str(hit.id),
|
| 71 |
+
"text": document_text,
|
| 72 |
+
"payload": hit.payload # Kompletan payload sa svim podacima
|
| 73 |
+
}
|
| 74 |
+
results_with_scores.append(result)
|
| 75 |
+
|
| 76 |
+
# The reranker already returns sorted results, but sort again to be sure
|
| 77 |
+
results_with_scores.sort(key=lambda x: x["score"], reverse=True)
|
| 78 |
|
| 79 |
+
# Return the top results (the reranker already returns the top 10)
|
| 80 |
+
return results_with_scores
|
| 81 |
|
| 82 |
async def search_keyword(self, text: str):
|
| 83 |
sparse_query = next(self.sparse_model.query_embed(text))
|