Spaces:
Sleeping
Sleeping
| import duckdb | |
| import json, sys | |
| import re | |
| from pathlib import Path | |
| ROOT_FOLDER = Path(__file__).resolve().parent.parent | |
| sys.path.append(str(ROOT_FOLDER)) | |
| from src.semantic import semantic_search | |
def decode_ratings(page_content):
    """
    Parse star-rated review lines out of a document's text.

    A review line is expected to look like ``[5.0★] Title — Review text``.
    Only the first three rating-tagged lines found are examined; any of those
    that lack the ``—`` separator are skipped.

    Args:
        page_content: Document text to scan for review lines

    Returns:
        List of dicts with ``rating`` (float), ``title`` and ``text`` keys
        when at least one rating-tagged line exists (the list may be empty if
        none of them parse), otherwise an empty dict
    """
    review_lines = re.findall(r'\[\d\.0★\].*', page_content)
    if not review_lines:
        # No rating-tagged line at all: the "no reviews" value is an empty dict.
        return {}

    line_re = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')
    reviews = []
    for line in review_lines[:3]:
        hit = line_re.match(line)
        if hit is None:
            continue
        stars, heading, body = hit.groups()
        reviews.append({
            'rating': float(stars),
            'title': heading.strip(),
            'text': body.strip()
        })
    return reviews
def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
    """
    Perform similarity search and enrich results with HuggingFace dataset metadata.

    Args:
        vector_store: LangChain vector store instance
        query: Search query string
        k: Number of results to return
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts, one per search hit, in
        the order the search returned them. Each carries the dataset row for
        the hit's ``parent_asin`` (when one is found) plus ``score``,
        ``total_reviews`` and ``reviews``.
    """
    results = semantic_search(query, vector_store, k=k)

    # 1. Collect the distinct, non-empty parent_asins from the hit metadata
    parent_asins = list(dict.fromkeys(
        asin
        for asin in (doc.metadata.get("parent_asin") for doc, _score in results)
        if asin
    ))

    # 2. Query HuggingFace dataset via DuckDB
    asin_to_metadata = {}
    if parent_asins:
        # Bind the ASINs as parameters instead of splicing them into the SQL
        # text, so quotes inside a value can neither break nor alter the query.
        # The lookup is skipped entirely when there is nothing to look up,
        # because "IN ()" is not valid SQL.
        placeholders = ", ".join("?" for _ in parent_asins)
        query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})"
        con = duckdb.connect()
        try:
            # Expose the dataset's underlying PyArrow table to DuckDB
            con.register("hf_table", hf_dataset.data.table)
            hf_rows = con.execute(query_sql, parent_asins).fetchdf()
        finally:
            # Release the connection even when the query raises
            con.close()
        # Build lookup: parent_asin -> metadata dict
        asin_to_metadata = {
            row["parent_asin"]: row.to_dict()
            for _, row in hf_rows.iterrows()
        }

    enriched_results = []
    for doc, score in results:
        parent_asin = doc.metadata.get("parent_asin")
        metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
        metadata_object['score'] = score
        metadata_object['total_reviews'] = doc.metadata.get("total_reviews")
        # 3. Parse the review lines embedded in the document text
        metadata_object["reviews"] = decode_ratings(doc.page_content)
        enriched_results.append(metadata_object)

    # 4. Round-trip through JSON so that values json cannot encode natively
    #    are converted with str()
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
    """
    Perform BM25 search and enrich results with HuggingFace dataset metadata.

    Args:
        retriever: LangChain BM25Retriever instance
        query: Search query string
        k: Number of results to return
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts, best score first. Each is
        the document's own metadata overlaid with the dataset row for its
        ``parent_asin`` (when one is found), plus ``score``; the
        ``top_reviews`` entry, if any, is exposed under ``reviews``.
    """
    # Score every indexed document against the whitespace-split query and
    # keep the k best (index, score) pairs.
    query_tokens = query.split()
    scores = retriever.vectorizer.get_scores(query_tokens)
    top_k = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:k]
    results = [(retriever.docs[i], score) for i, score in top_k]

    # 1. Collect the distinct, non-empty parent_asins from the hit metadata
    parent_asins = list(dict.fromkeys(
        asin
        for asin in (doc.metadata.get("parent_asin") for doc, _score in results)
        if asin
    ))

    # 2. Query HuggingFace dataset via DuckDB
    asin_to_metadata = {}
    if parent_asins:
        # Bind the ASINs as parameters instead of splicing them into the SQL
        # text, so quotes inside a value can neither break nor alter the query.
        # The lookup is skipped entirely when there is nothing to look up,
        # because "IN ()" is not valid SQL.
        placeholders = ", ".join("?" for _ in parent_asins)
        query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})"
        con = duckdb.connect()
        try:
            # Expose the dataset's underlying PyArrow table to DuckDB
            con.register("hf_table", hf_dataset.data.table)
            hf_rows = con.execute(query_sql, parent_asins).fetchdf()
        finally:
            # Release the connection even when the query raises
            con.close()
        # Build lookup: parent_asin -> metadata dict
        asin_to_metadata = {
            row["parent_asin"]: row.to_dict()
            for _, row in hf_rows.iterrows()
        }

    enriched_results = []
    for doc, score in results:
        parent_asin = doc.metadata.get("parent_asin")
        # Dataset columns win over same-named document metadata keys
        metadata_object = {
            **doc.metadata,
            **asin_to_metadata.get(parent_asin, {}),
            "score": score,
        }
        # 3. Rename top_reviews -> reviews; a missing or empty value becomes {}
        metadata_object['reviews'] = metadata_object.pop('top_reviews', {}) or {}
        enriched_results.append(metadata_object)

    # 4. Round-trip through JSON so that values json cannot encode natively
    #    are converted with str()
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
def _format_docs(results, hf_dataset):
    """
    Enrich already-retrieved documents with HuggingFace dataset metadata.

    Args:
        results: Iterable of documents exposing ``metadata`` (a dict) and
            ``page_content`` (a string); it is traversed twice
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts, one per document, in
        input order. Each carries the dataset row for the document's
        ``parent_asin`` (when one is found) plus ``total_reviews`` and
        ``reviews``.
    """
    # 1. Collect the distinct, non-empty parent_asins from the document metadata
    parent_asins = list(dict.fromkeys(
        asin
        for asin in (doc.metadata.get("parent_asin") for doc in results)
        if asin
    ))

    # 2. Query HuggingFace dataset via DuckDB
    asin_to_metadata = {}
    if parent_asins:
        # Bind the ASINs as parameters instead of splicing them into the SQL
        # text, so quotes inside a value can neither break nor alter the query.
        # The lookup is skipped entirely when there is nothing to look up,
        # because "IN ()" is not valid SQL.
        placeholders = ", ".join("?" for _ in parent_asins)
        query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})"
        con = duckdb.connect()
        try:
            # Expose the dataset's underlying PyArrow table to DuckDB
            con.register("hf_table", hf_dataset.data.table)
            hf_rows = con.execute(query_sql, parent_asins).fetchdf()
        finally:
            # Release the connection even when the query raises
            con.close()
        # Build lookup: parent_asin -> metadata dict
        asin_to_metadata = {
            row["parent_asin"]: row.to_dict()
            for _, row in hf_rows.iterrows()
        }

    enriched_results = []
    for doc in results:
        parent_asin = doc.metadata.get("parent_asin")
        metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
        metadata_object['total_reviews'] = doc.metadata.get("total_reviews")
        # 3. Parse the review lines embedded in the document text
        metadata_object["reviews"] = decode_ratings(doc.page_content)
        enriched_results.append(metadata_object)

    # 4. Round-trip through JSON so that values json cannot encode natively
    #    are converted with str()
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]