Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -99,29 +99,35 @@ retriever = vectordb.as_retriever(
|
|
| 99 |
def hybrid_search(query, top_k=5):
|
| 100 |
vector_results = retriever.invoke(query)
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
|
|
|
|
|
|
|
| 107 |
seen = set()
|
| 108 |
-
|
| 109 |
-
|
|
|
|
| 110 |
if key not in seen:
|
| 111 |
seen.add(key)
|
| 112 |
-
|
| 113 |
|
| 114 |
-
if not
|
| 115 |
return []
|
| 116 |
|
| 117 |
-
pairs = [(query, doc.page_content) for doc in
|
| 118 |
scores = reranker.predict(pairs)
|
| 119 |
-
ranked = sorted(zip(scores, unique_docs), reverse=True)[:top_k]
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
return [doc for score, doc in ranked]
|
| 125 |
|
| 126 |
|
| 127 |
# ------------------ LLM ------------------ #
|
|
|
|
def hybrid_search(query, top_k=5):
    """Hybrid retrieval: fuse dense (vector) and sparse (BM25) candidates,
    then rerank the merged pool with a cross-encoder.

    Args:
        query: Raw user query string.
        top_k: Number of documents to keep from the BM25 leg and from the
            final reranked list.

    Returns:
        Up to ``top_k`` documents ordered by descending cross-encoder score;
        each returned document gets its score stored under
        ``metadata["rerank_score"]``. Empty list if nothing was retrieved.
    """
    # Dense leg: semantic nearest neighbours from the vector store.
    dense_hits = retriever.invoke(query)

    # Sparse leg: BM25 over a lowercase whitespace tokenization.
    # NOTE(review): assumes the bm25 index was built with the same
    # lower/split tokenizer — confirm against the index construction.
    tokens = query.lower().split()
    lexical_scores = bm25.get_scores(tokens)

    ranked_lexical = sorted(
        zip(lexical_scores, chunks), key=lambda pair: pair[0], reverse=True
    )
    top_lexical = [doc for _, doc in ranked_lexical[:top_k]]

    # Merge both pools, dropping duplicates. The dedup key pairs the doc_id
    # with a content prefix so distinct chunks of one document survive.
    pooled = dense_hits + top_lexical

    # remove duplicates
    seen_keys = set()
    candidates = []
    for doc in pooled:
        fingerprint = (doc.metadata.get("doc_id"), doc.page_content[:80])
        if fingerprint in seen_keys:
            continue
        seen_keys.add(fingerprint)
        candidates.append(doc)

    if not candidates:
        return []

    # Cross-encoder rerank over (query, passage) pairs.
    pair_inputs = [(query, doc.page_content) for doc in candidates]
    rerank_scores = reranker.predict(pair_inputs)

    best = sorted(
        zip(rerank_scores, candidates), key=lambda pair: pair[0], reverse=True
    )[:top_k]
    # Persist the score on the document so downstream consumers can see it.
    for score, doc in best:
        doc.metadata["rerank_score"] = float(score)

    return [doc for _, doc in best]
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
# ------------------ LLM ------------------ #
|