Spaces:
Sleeping
Sleeping
Create retrieval_helpers.py
Browse files- utils/retrieval_helpers.py +83 -0
utils/retrieval_helpers.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
|
| 2 |
+
import json, sys
|
| 3 |
+
import re
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
ROOT_FOLDER = Path(__file__).resolve().parent.parent
|
| 6 |
+
|
| 7 |
+
sys.path.append(str(ROOT_FOLDER))
|
| 8 |
+
from utils.semantic import semantic_search
|
| 9 |
+
|
| 10 |
+
# A review line starts with a bracketed star rating such as "[5.0★]" and runs
# to the end of that line ("." does not match a newline).
_REVIEW_LINE_RE = re.compile(r'\[\d\.0★\].*')
# Splits one review line into rating, title and text; title and text are
# separated by an em dash.
_REVIEW_FIELDS_RE = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')


def decode_ratings(page_content: str, max_reviews: int = 3) -> list:
    """
    Parse review lines of the form "[N.0★] title — text" out of a document.

    Args:
        page_content: Text to scan for review lines. ``None`` or an empty
            string yields an empty list.
        max_reviews: How many review lines (counted from the first one found)
            are considered. Defaults to 3.

    Returns:
        List of dicts with keys ``rating`` (float), ``title`` (str) and
        ``text`` (str). Always a list; empty when nothing matches.
    """
    if not page_content:
        return []

    review_lines = _REVIEW_LINE_RE.findall(page_content)

    parsed = []
    # The limit is applied to the candidate lines *before* field parsing, so a
    # candidate without an em dash is dropped rather than replaced by a later one.
    for line in review_lines[:max_reviews]:
        fields = _REVIEW_FIELDS_RE.match(line)
        if fields is None:
            continue
        rating, title, text = fields.groups()
        parsed.append({
            'rating': float(rating),
            'title': title.strip(),
            'text': text.strip(),
        })

    return parsed
|
| 30 |
+
|
| 31 |
+
def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
    """
    Perform similarity search and enrich results with HuggingFace dataset metadata.

    Args:
        vector_store: LangChain vector store instance, passed through to
            ``semantic_search``.
        query: Search query string.
        k: Number of results requested from ``semantic_search``.
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset); its
            underlying PyArrow table is read via ``hf_dataset.data.table``
            and is filtered on its ``parent_asin`` column.

    Returns:
        List of JSON-safe dicts, one per search hit and in the same order.
        Each dict holds the columns of the dataset row whose ``parent_asin``
        matches the hit (nothing when no row matches) plus ``score``,
        ``total_reviews`` and ``reviews``.
    """
    results = semantic_search(query, vector_store, k=k)

    # 1. Collect the distinct, non-empty parent_asins of the hits (order kept).
    #    Values are stringified because they are compared as text in SQL.
    parent_asins = list(dict.fromkeys(
        str(asin)
        for asin in (doc.metadata.get("parent_asin") for doc, _ in results)
        if asin
    ))

    # 2. Query HuggingFace dataset via DuckDB
    asin_to_metadata = {}
    if parent_asins:  # "IN ()" is not valid SQL, so skip the query entirely
        # Bind the ASINs as parameters instead of splicing them into the SQL
        # text, so quotes in a value cannot break or alter the statement.
        placeholders = ", ".join("?" for _ in parent_asins)
        query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})"

        con = duckdb.connect()
        try:
            con.register("hf_table", hf_dataset.data.table)  # underlying PyArrow table
            hf_rows = con.execute(query_sql, parent_asins).fetchdf()
        finally:
            # Release the connection even when the query fails.
            con.close()

        # Build lookup: parent_asin -> metadata dict
        asin_to_metadata = {
            row["parent_asin"]: row.to_dict()
            for _, row in hf_rows.iterrows()
        }

    enriched_results = []
    for doc, score in results:
        # Copy so the per-hit additions never leak into the shared lookup.
        metadata_object = dict(asin_to_metadata.get(doc.metadata.get("parent_asin"), {}))
        metadata_object['score'] = score
        metadata_object['total_reviews'] = doc.metadata.get("total_reviews")

        # 3. Parse the review lines embedded in page_content
        metadata_object["reviews"] = decode_ratings(doc.page_content)

        enriched_results.append(metadata_object)

    # 4. Return JSON metadata objects; the dumps/loads round-trip turns any
    #    value json cannot encode natively into its str() form.
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
|