Spaces:
Sleeping
Sleeping
File size: 6,456 Bytes
b2ccf94 e51a05a b2ccf94 2bf862f 681ec3c 2bf862f 0bcbce0 2bf862f 681ec3c b2ccf94 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | import duckdb
import json, sys
import re
from pathlib import Path
# Directory two levels above this file. It is appended to sys.path before the
# `src.semantic` import that follows -- presumably so the `src` package
# resolves when this module is run from inside a sub-folder (verify layout).
ROOT_FOLDER = Path(__file__).resolve().parent.parent
sys.path.append(str(ROOT_FOLDER))
from src.semantic import semantic_search
def decode_ratings(page_content):
    """
    Parse the leading review lines embedded in a document's text.

    A review line starts with a bracketed star rating such as ``[5.0★]``,
    followed by a title, an em dash and the review text.

    Args:
        page_content: Document text to scan for review lines

    Returns:
        List of dicts with ``rating`` (float), ``title`` and ``text`` built
        from the first three review lines found; lines among those three that
        lack the "title — text" shape are skipped. An empty dict is returned
        when the text contains no review line at all.
    """
    review_lines = re.findall(r'\[\d\.0★\].*', page_content)
    if not review_lines:
        return {}

    line_re = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')
    reviews = []
    # Only the first three candidate lines are considered, parsed or not.
    for line in review_lines[:3]:
        hit = line_re.match(line)
        if hit is None:
            continue
        stars, headline, body = hit.groups()
        reviews.append({
            'rating': float(stars),
            'title': headline.strip(),
            'text': body.strip()
        })
    return reviews
def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
    """
    Perform similarity search and enrich results with HuggingFace dataset metadata.

    Args:
        vector_store: Vector store instance, passed through to ``semantic_search``
        query: Search query string
        k: Number of results to return
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts, one per search hit and in
        hit order. Each dict holds the dataset row matching the hit's
        ``parent_asin`` (if any) plus ``score``, ``total_reviews`` and
        ``reviews``.
    """
    results = semantic_search(query, vector_store, k=k)

    # 1. Extract parent_asins from metadata (missing values skipped, duplicates dropped)
    raw_asins = (doc.metadata.get("parent_asin") for doc, _ in results)
    parent_asins = list(dict.fromkeys(str(asin) for asin in raw_asins if asin))

    # 2. Query HuggingFace dataset via DuckDB
    asin_to_metadata = {}
    # An empty "IN ()" list is not valid SQL, so skip the lookup when there is
    # nothing to look up.
    if parent_asins:
        con = duckdb.connect()
        try:
            con.register("hf_table", hf_dataset.data.table)  # underlying PyArrow table
            # Bind the ASINs as parameters instead of splicing them into the
            # SQL text, so a quote inside an ASIN cannot break the statement.
            placeholders = ", ".join("?" for _ in parent_asins)
            hf_rows = con.execute(
                f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})",
                parent_asins,
            ).fetchdf()
            # Build lookup: parent_asin -> metadata dict
            asin_to_metadata = {
                row["parent_asin"]: row.to_dict()
                for _, row in hf_rows.iterrows()
            }
        finally:
            # Release the connection even when the query raises.
            con.close()

    enriched_results = []
    for doc, score in results:
        metadata_object = asin_to_metadata.get(doc.metadata.get("parent_asin"), {}).copy()
        metadata_object['score'] = score
        metadata_object['total_reviews'] = doc.metadata.get("total_reviews")
        # 3. Parse up to three review lines out of page_content
        metadata_object["reviews"] = decode_ratings(doc.page_content)
        enriched_results.append(metadata_object)

    # 4. Return JSON metadata objects: the round-trip leaves only JSON-native
    #    types; anything json cannot encode is stringified by default=str.
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
    """
    Perform BM25 search and enrich results with HuggingFace dataset metadata.

    Args:
        retriever: LangChain BM25Retriever instance (its ``vectorizer`` and
            ``docs`` attributes are used directly)
        query: Search query string
        k: Number of results to return
        hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)

    Returns:
        List of enriched metadata objects as dicts, best score first. Each
        dict merges the document metadata with the dataset row matching its
        ``parent_asin`` (dataset values win on key clashes), adds ``score``,
        and exposes the ``top_reviews`` entry under the key ``reviews``
        (``{}`` when absent or empty).
    """
    # Get BM25 scores via underlying rank_bm25 library
    # NOTE(review): whitespace tokenisation is assumed to match how the
    # retriever tokenised its corpus -- confirm against the retriever setup.
    query_tokens = query.split()
    scores = retriever.vectorizer.get_scores(query_tokens)  # one score per retriever doc
    top_k_indices = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:k]
    results = [(retriever.docs[i], score) for i, score in top_k_indices]

    # 1. Extract parent_asins from metadata (missing values skipped, duplicates dropped)
    raw_asins = (doc.metadata.get("parent_asin") for doc, _ in results)
    parent_asins = list(dict.fromkeys(str(asin) for asin in raw_asins if asin))

    # 2. Query HuggingFace dataset via DuckDB
    asin_to_metadata = {}
    # An empty "IN ()" list is not valid SQL, so skip the lookup when there is
    # nothing to look up.
    if parent_asins:
        con = duckdb.connect()
        try:
            con.register("hf_table", hf_dataset.data.table)  # underlying PyArrow table
            # Bind the ASINs as parameters instead of splicing them into the
            # SQL text, so a quote inside an ASIN cannot break the statement.
            placeholders = ", ".join("?" for _ in parent_asins)
            hf_rows = con.execute(
                f"SELECT * FROM hf_table WHERE parent_asin IN ({placeholders})",
                parent_asins,
            ).fetchdf()
            # Build lookup: parent_asin -> metadata dict
            asin_to_metadata = {
                row["parent_asin"]: row.to_dict()
                for _, row in hf_rows.iterrows()
            }
        finally:
            # Release the connection even when the query raises.
            con.close()

    enriched_results = []
    for doc, score in results:
        parent_asin = doc.metadata.get("parent_asin")
        # 3. Merge into a fresh dict so doc.metadata itself is never mutated
        metadata_object = {
            **doc.metadata,
            **asin_to_metadata.get(parent_asin, {}),
            "score": score,
        }
        metadata_object['reviews'] = metadata_object.pop('top_reviews', {}) or {}
        enriched_results.append(metadata_object)

    # 4. Return JSON metadata objects: the round-trip leaves only JSON-native
    #    types; anything json cannot encode is stringified by default=str.
    return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results]
def _format_docs(results, hf_dataset):
"""
Perform similarity search and enrich results with HuggingFace dataset metadata.
Args:
vector_store: LangChain vector store instance
query: Search query string
k: Number of results to return
filter: Filter dict for similarity search
hf_dataset: HuggingFace Arrow dataset (datasets.Dataset)
Returns:
List of enriched metadata objects as dicts
"""
# 1. Extract parent_asins from metadata
parent_asins = [doc.metadata.get("parent_asin") for doc in results]
# 2. Query HuggingFace dataset via DuckDB
con = duckdb.connect()
arrow_table = hf_dataset.data.table # Get underlying PyArrow table
con.register("hf_table", arrow_table)
asin_list = ", ".join(f"'{asin}'" for asin in parent_asins if asin)
query_sql = f"SELECT * FROM hf_table WHERE parent_asin IN ({asin_list})"
hf_rows = con.execute(query_sql).fetchdf()
# Build lookup: parent_asin -> metadata dict
asin_to_metadata = {
row["parent_asin"]: row.to_dict()
for _, row in hf_rows.iterrows()
}
enriched_results = []
for doc in results:
parent_asin = doc.metadata.get("parent_asin")
total_reviews = doc.metadata.get("total_reviews")
metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
metadata_object['total_reviews'] = total_reviews
# 3. Extract 3 lines after "Top Reviews\n" from page_content
page_content = doc.page_content
metadata_object["reviews"] = decode_ratings(page_content)
enriched_results.append(metadata_object)
con.close()
# 4. Return JSON metadata objects
return [json.loads(json.dumps(obj, default=str)) for obj in enriched_results] |