rishadaz committed on
Commit
b2ccf94
·
verified ·
1 Parent(s): 5f7bec4

Create retrieval_helpers.py

Browse files
Files changed (1) hide show
  1. utils/retrieval_helpers.py +83 -0
utils/retrieval_helpers.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb
2
+ import json, sys
3
+ import re
4
+ from pathlib import Path
5
+ ROOT_FOLDER = Path(__file__).resolve().parent.parent
6
+
7
+ sys.path.append(str(ROOT_FOLDER))
8
+ from utils.semantic import semantic_search
9
+
10
def decode_ratings(page_content, max_reviews: int = 3) -> list:
    """Parse star-rated review lines out of a document's page content.

    A review line has the shape ``[5.0★] Title — Review text``. Every such
    line in *page_content* is located first; only the first *max_reviews*
    located lines are then parsed, and any of those lacking the ``—``
    separator are skipped (so fewer than *max_reviews* entries may come back).

    Args:
        page_content: Text to scan. ``None`` or an empty string yields no
            reviews instead of raising.
        max_reviews: Maximum number of located review lines to consider.
            Defaults to 3.

    Returns:
        A list of ``{'rating': float, 'title': str, 'text': str}`` dicts in
        order of appearance. The result is always a list; it is empty when
        no review line was found.
    """
    if not page_content:
        return []

    # Each hit runs from the "[N.0★]" tag to the end of its line, because
    # "." does not match a newline.
    review_lines = re.findall(r'\[\d\.0★\].*', page_content)

    # Splits one located line into rating, title and text around the em dash.
    line_pattern = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')

    parsed = []
    for line in review_lines[:max_reviews]:
        match = line_pattern.match(line)
        if match is None:
            # Tagged line without a "title — text" body; nothing to extract.
            continue
        rating, title, text = match.groups()
        parsed.append({
            'rating': float(rating),
            'title': title.strip(),
            'text': text.strip(),
        })

    return parsed
30
+
31
def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
    """Run a semantic search and enrich each hit with dataset metadata.

    Each hit's ``parent_asin`` is looked up in *hf_dataset* through an
    in-memory DuckDB connection, and the matching row is merged with the
    hit's score, its ``total_reviews`` metadata value, and the reviews
    parsed from its page content.

    Args:
        vector_store: Vector store passed through to ``semantic_search``.
        query: Search query string.
        k: Number of results requested from ``semantic_search``.
        hf_dataset: Dataset exposing its underlying Arrow table as
            ``hf_dataset.data.table``; the table must have a
            ``parent_asin`` column.

    Returns:
        A list with one JSON-compatible dict per search hit, in search
        order. Each dict holds the matching dataset row (if any) plus the
        ``score``, ``total_reviews`` and ``reviews`` keys. Values that are
        not JSON-serializable are converted with ``str``.
    """
    results = semantic_search(query, vector_store, k=k)
    if not results:
        return []

    # 1. Collect the distinct, non-empty parent_asins, keeping search order.
    parent_asins = []
    for doc, _score in results:
        asin = doc.metadata.get("parent_asin")
        if asin and asin not in parent_asins:
            parent_asins.append(asin)

    # 2. Look the ASINs up in the dataset via DuckDB. The query is skipped
    #    when there is nothing to look up, since "IN ()" is invalid SQL.
    asin_to_metadata = {}
    if parent_asins:
        con = duckdb.connect()
        try:
            con.register("hf_table", hf_dataset.data.table)
            # Bind values as parameters rather than splicing them into the
            # SQL text, so quotes inside an ASIN cannot break the query.
            placeholders = ", ".join("?" for _ in parent_asins)
            hf_rows = con.execute(
                f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})",
                parent_asins,
            ).fetchdf()
        finally:
            # Release the connection even if the query fails.
            con.close()

        # Build lookup: parent_asin -> metadata dict
        asin_to_metadata = {
            row["parent_asin"]: row.to_dict()
            for _, row in hf_rows.iterrows()
        }

    # 3. Merge the dataset row with per-hit values from the search result.
    enriched_results = []
    for doc, score in results:
        metadata_object = asin_to_metadata.get(
            doc.metadata.get("parent_asin"), {}
        ).copy()
        metadata_object['score'] = score
        metadata_object['total_reviews'] = doc.metadata.get("total_reviews")
        metadata_object["reviews"] = decode_ratings(doc.page_content)
        enriched_results.append(metadata_object)

    # 4. Round-trip through JSON so callers receive plain JSON-compatible
    #    objects; default=str stringifies anything json cannot encode.
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]