"""Enrich retrieval results with product metadata from a HuggingFace dataset.

Each public function takes retrieved documents (vector search, BM25, or an
already-retrieved list), looks up the matching rows of the HuggingFace dataset
by ``parent_asin`` through DuckDB, and returns JSON-safe dicts.
"""

import json
import re
import sys
from pathlib import Path

import duckdb

# Make the repository root importable so that ``src`` resolves when this file
# is executed from inside its own sub-folder.
ROOT_FOLDER = Path(__file__).resolve().parent.parent
sys.path.append(str(ROOT_FOLDER))

from src.semantic import semantic_search  # noqa: E402  (needs sys.path tweak above)

# One review line looks like "[5.0★] Title — review text".
_REVIEW_LINE_RE = re.compile(r'\[\d\.0★\].*')
_REVIEW_PARTS_RE = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')
# Only the first few review lines of a document are decoded.
_MAX_REVIEWS = 3


def decode_ratings(page_content):
    """Parse the leading review lines embedded in a document's text.

    Args:
        page_content: Document text that may contain lines of the form
            ``[N.0★] title — text``.

    Returns:
        A list of ``{'rating': float, 'title': str, 'text': str}`` dicts for
        the first three review lines that match the full pattern. When the
        text is empty/``None`` or contains no review line at all, an empty
        dict is returned (kept for backward compatibility with callers).
    """
    if not page_content:
        return {}

    review_lines = _REVIEW_LINE_RE.findall(page_content)
    if not review_lines:
        return {}

    parsed = []
    for line in review_lines[:_MAX_REVIEWS]:
        match = _REVIEW_PARTS_RE.match(line)
        if match is None:
            # Line starts like a review but lacks the "title — text" part.
            continue
        rating, title, text = match.groups()
        parsed.append({
            'rating': float(rating),
            'title': title.strip(),
            'text': text.strip(),
        })
    return parsed


def _fetch_metadata_by_asin(parent_asins, hf_dataset):
    """Return ``{parent_asin: row_dict}`` for the requested ASINs.

    Args:
        parent_asins: Iterable of ASIN values; falsy entries are ignored.
        hf_dataset: HuggingFace dataset whose ``.data.table`` is registered
            with DuckDB as ``hf_table`` (presumably a PyArrow table).

    Returns:
        Dict mapping each found ``parent_asin`` to its row as a dict. Empty
        when no usable ASIN was supplied.
    """
    # De-duplicate while keeping order; str() mirrors the original behaviour
    # of comparing against quoted string literals.
    wanted = list(dict.fromkeys(str(asin) for asin in parent_asins if asin))
    if not wanted:
        # "IN ()" is not valid SQL, so skip the query entirely.
        return {}

    # Bound parameters instead of string interpolation: values containing
    # quotes can neither break nor alter the query.
    placeholders = ", ".join("?" for _ in wanted)
    query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})"

    con = duckdb.connect()
    try:
        con.register("hf_table", hf_dataset.data.table)
        hf_rows = con.execute(query_sql, wanted).fetchdf()
    finally:
        # Release the connection even when the query fails.
        con.close()

    return {
        row["parent_asin"]: row.to_dict()
        for _, row in hf_rows.iterrows()
    }


def _to_json_safe(objects):
    """Round-trip each object through JSON, stringifying unsupported values."""
    return [json.loads(json.dumps(obj, default=str)) for obj in objects]


def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
    """
    Perform similarity search and enrich results with HuggingFace dataset metadata.

    Args:
        vector_store: Vector store passed through to ``semantic_search``
        query: Search query string
        k: Number of results to return
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts. Each holds the dataset row
        for the document's ``parent_asin`` (if found) plus ``score``,
        ``total_reviews`` and the decoded ``reviews``.
    """
    results = semantic_search(query, vector_store, k=k)

    asin_to_metadata = _fetch_metadata_by_asin(
        (doc.metadata.get("parent_asin") for doc, _ in results),
        hf_dataset,
    )

    enriched_results = []
    for doc, score in results:
        parent_asin = doc.metadata.get("parent_asin")
        metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
        metadata_object['score'] = score
        # The document's own count overrides whatever the dataset row holds.
        metadata_object['total_reviews'] = doc.metadata.get("total_reviews")
        metadata_object["reviews"] = decode_ratings(doc.page_content)
        enriched_results.append(metadata_object)

    return _to_json_safe(enriched_results)


def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
    """
    Perform BM25 search and enrich results with HuggingFace dataset metadata.

    Args:
        retriever: LangChain BM25Retriever instance
        query: Search query string
        k: Number of results to return
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts. Each merges the document
        metadata with the dataset row (dataset values win on key clashes),
        adds ``score`` and exposes ``top_reviews`` under the key ``reviews``.
    """
    # Score every document against the whitespace-tokenised query, then keep
    # the k best (index, score) pairs.
    query_tokens = query.split()
    scores = retriever.vectorizer.get_scores(query_tokens)
    top_k_indices = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:k]
    results = [(retriever.docs[i], score) for i, score in top_k_indices]

    asin_to_metadata = _fetch_metadata_by_asin(
        (doc.metadata.get("parent_asin") for doc, _ in results),
        hf_dataset,
    )

    enriched_results = []
    for doc, score in results:
        parent_asin = doc.metadata.get("parent_asin")
        metadata_object = {
            **doc.metadata,
            **asin_to_metadata.get(parent_asin, {}),
            "score": score,
        }
        # Rename 'top_reviews' to 'reviews'; missing or empty becomes {}.
        metadata_object['reviews'] = metadata_object.pop('top_reviews', {}) or {}
        enriched_results.append(metadata_object)

    return _to_json_safe(enriched_results)


def _format_docs(results, hf_dataset):
    """
    Enrich already-retrieved documents with HuggingFace dataset metadata.

    Args:
        results: Iterable of documents exposing ``metadata`` and ``page_content``
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts. Each holds the dataset row
        for the document's ``parent_asin`` (if found) plus ``total_reviews``
        and the decoded ``reviews``. No ``score`` is added.
    """
    # Materialise once so a one-shot iterator can be walked twice.
    results = list(results)

    asin_to_metadata = _fetch_metadata_by_asin(
        (doc.metadata.get("parent_asin") for doc in results),
        hf_dataset,
    )

    enriched_results = []
    for doc in results:
        parent_asin = doc.metadata.get("parent_asin")
        metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
        # The document's own count overrides whatever the dataset row holds.
        metadata_object['total_reviews'] = doc.metadata.get("total_reviews")
        metadata_object["reviews"] = decode_ratings(doc.page_content)
        enriched_results.append(metadata_object)

    return _to_json_safe(enriched_results)