amazon_retriever / src /retrieval_helpers.py
Sarisha Das
fix all paths
e51a05a
import duckdb
import json, sys
import re
from pathlib import Path
ROOT_FOLDER = Path(__file__).resolve().parent.parent
sys.path.append(str(ROOT_FOLDER))
from src.semantic import semantic_search
def decode_ratings(page_content, max_reviews=3):
    """Parse star-rated review lines out of a document's text.

    A review line has the shape ``[5.0★] Title — Review text``: a one-digit
    rating ending in ``.0``, the star glyph, then a title and a body
    separated by an em dash.

    Args:
        page_content: Text to scan. ``None`` or an empty string is treated
            as containing no reviews.
        max_reviews: How many of the leading rating lines to consider
            (default 3).

    Returns:
        A list of ``{"rating": float, "title": str, "text": str}`` dicts,
        one for each considered line that has the full ``title — text``
        shape; it can therefore be shorter than ``max_reviews`` or empty.
        When the text holds no rating line at all, an empty dict is
        returned instead (kept as-is for backward compatibility).
    """
    if not page_content:
        return {}

    # "." does not cross newlines, so each hit is a single rating line.
    rating_lines = re.findall(r'\[\d\.0★\].*', page_content)
    if not rating_lines:
        return {}

    review_pattern = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')
    parsed = []
    for line in rating_lines[:max_reviews]:
        found = review_pattern.match(line)
        if found is None:
            # Rating marker without the "title — text" part: skip the line.
            continue
        rating, title, text = found.groups()
        parsed.append({
            'rating': float(rating),
            'title': title.strip(),
            'text': text.strip(),
        })
    return parsed
def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
    """Run a semantic search and enrich each hit with dataset metadata.

    Args:
        vector_store: Vector store handed to ``semantic_search``.
        query: Search query string.
        k: Number of results to request.
        hf_dataset: Dataset whose underlying PyArrow table is reachable as
            ``hf_dataset.data.table`` and has a ``parent_asin`` column.
            Only accessed when at least one hit carries a ``parent_asin``.

    Returns:
        One JSON-safe dict per hit, in search order: the dataset row for
        the hit's ``parent_asin`` (empty when there is none), plus
        ``score``, ``total_reviews`` (from the hit's metadata) and
        ``reviews`` (parsed from the hit's page content). Values that JSON
        cannot represent are stringified.
    """
    results = semantic_search(query, vector_store, k=k)

    # Distinct, non-empty ASINs in first-seen order; falsy ones cannot be
    # looked up.
    parent_asins = list(dict.fromkeys(
        str(doc.metadata.get("parent_asin"))
        for doc, _ in results
        if doc.metadata.get("parent_asin")
    ))

    asin_to_metadata = {}
    if parent_asins:
        # Skipped entirely when there is nothing to look up: an empty
        # "IN ()" list is not valid SQL.
        placeholders = ", ".join("?" for _ in parent_asins)
        con = duckdb.connect()
        try:
            con.register("hf_table", hf_dataset.data.table)
            # Values are bound as parameters so quotes in an ASIN cannot
            # break (or inject into) the statement.
            hf_rows = con.execute(
                f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})",
                parent_asins,
            ).fetchdf()
        finally:
            # Release the connection even if the query fails.
            con.close()
        asin_to_metadata = {
            row["parent_asin"]: row.to_dict()
            for _, row in hf_rows.iterrows()
        }

    enriched_results = []
    for doc, score in results:
        parent_asin = doc.metadata.get("parent_asin")
        # Copy so the per-hit additions never leak into the shared lookup.
        metadata_object = dict(asin_to_metadata.get(parent_asin, {}))
        metadata_object["score"] = score
        metadata_object["total_reviews"] = doc.metadata.get("total_reviews")
        # Reviews are parsed from the star-rated lines of the page content.
        metadata_object["reviews"] = decode_ratings(doc.page_content)
        enriched_results.append(metadata_object)

    # Round-trip through JSON so callers get plain, serialisable values.
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
    """Run a BM25 search and enrich each hit with dataset metadata.

    Args:
        retriever: Object exposing ``vectorizer.get_scores(tokens)`` (one
            score per document) and ``docs`` (documents in the same order).
        query: Search query string; tokenised on whitespace.
        k: Number of top-scoring documents to return.
        hf_dataset: Dataset whose underlying PyArrow table is reachable as
            ``hf_dataset.data.table`` and has a ``parent_asin`` column.
            Only accessed when at least one hit carries a ``parent_asin``.

    Returns:
        One JSON-safe dict per hit, best score first: the document's own
        metadata overlaid with the dataset row for its ``parent_asin``,
        plus ``score``. The ``top_reviews`` entry, if any, is renamed to
        ``reviews`` (an empty dict when missing or empty). Values that
        JSON cannot represent are stringified.
    """
    # NOTE(review): whitespace tokenisation is assumed to match how the
    # BM25 index was built -- confirm against the retriever's preprocessing.
    query_tokens = query.split()
    scores = retriever.vectorizer.get_scores(query_tokens)
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:k]
    results = [(retriever.docs[index], score) for index, score in ranked]

    # Distinct, non-empty ASINs in first-seen order; falsy ones cannot be
    # looked up.
    parent_asins = list(dict.fromkeys(
        str(doc.metadata.get("parent_asin"))
        for doc, _ in results
        if doc.metadata.get("parent_asin")
    ))

    asin_to_metadata = {}
    if parent_asins:
        # Skipped entirely when there is nothing to look up: an empty
        # "IN ()" list is not valid SQL.
        placeholders = ", ".join("?" for _ in parent_asins)
        con = duckdb.connect()
        try:
            con.register("hf_table", hf_dataset.data.table)
            # Values are bound as parameters so quotes in an ASIN cannot
            # break (or inject into) the statement.
            hf_rows = con.execute(
                f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})",
                parent_asins,
            ).fetchdf()
        finally:
            # Release the connection even if the query fails.
            con.close()
        asin_to_metadata = {
            row["parent_asin"]: row.to_dict()
            for _, row in hf_rows.iterrows()
        }

    enriched_results = []
    for doc, score in results:
        parent_asin = doc.metadata.get("parent_asin")
        # Later entries win: dataset columns override document metadata,
        # and the BM25 score overrides any existing "score" key.
        metadata_object = {
            **doc.metadata,
            **asin_to_metadata.get(parent_asin, {}),
            "score": score,
        }
        metadata_object["reviews"] = metadata_object.pop("top_reviews", {}) or {}
        enriched_results.append(metadata_object)

    # Round-trip through JSON so callers get plain, serialisable values.
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
def _format_docs(results, hf_dataset):
"""
Perform similarity search and enrich results with HuggingFace dataset metadata.
Args:
vector_store: LangChain vector store instance
query: Search query string
k: Number of results to return
filter: Filter dict for similarity search
hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)
Returns:
List of enriched metadata objects as dicts
"""
# 1. Extract parent_asins from metadata
parent_asins = [doc.metadata.get("parent_asin") for doc in results]
# 2. Query HuggingFace dataset via DuckDB
con = duckdb.connect()
arrow_table = hf_dataset.data.table # Get underlying PyArrow table
con.register("hf_table", arrow_table)
asin_list = ", ".join(f"'{asin}'" for asin in parent_asins if asin)
query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({asin_list})"
hf_rows = con.execute(query_sql).fetchdf()
# Build lookup: parent_asin -> metadata dict
asin_to_metadata = {
row["parent_asin"]: row.to_dict()
for _, row in hf_rows.iterrows()
}
enriched_results = []
for doc in results:
parent_asin = doc.metadata.get("parent_asin")
total_reviews = doc.metadata.get("total_reviews")
metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
metadata_object['total_reviews'] = total_reviews
# 3. Extract 3 lines after "Top Reviews\n" from page_content
page_content = doc.page_content
metadata_object["reviews"] = decode_ratings(page_content)
enriched_results.append(metadata_object)
con.close()
# 4. Return JSON metadata objects
return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]