Sarisha Das committed on
Commit
0bcbce0
·
1 Parent(s): 468fa48

streamline app

Browse files
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  duckdb
2
  pandas
3
  streamlit
 
4
  sentence-transformers
5
  langchain
6
  langchain-community
 
1
  duckdb
2
  pandas
3
  streamlit
4
+ python-dotenv
5
  sentence-transformers
6
  langchain
7
  langchain-community
src/streamlit_app.py CHANGED
@@ -16,6 +16,9 @@ from utils.retrieval_helpers import enrich_search_results, enrich_bm25_search_re
16
  from utils.bm25 import load
17
  from utils.semantic import load_vector_store
18
 
 
 
 
19
  import warnings
20
  warnings.filterwarnings("ignore", category=UserWarning)
21
 
@@ -31,50 +34,10 @@ st.set_page_config(
31
  FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
32
  FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
33
 
34
- # ─── Load HF dataset (cached so it only runs once) ───────────────────────────
35
- from datasets import load_dataset
36
-
37
- @st.cache_resource
38
- def load_hf_dataset():
39
- return load_dataset(
40
- "McAuley-Lab/Amazon-Reviews-2023",
41
- "raw_meta_Grocery_and_Gourmet_Food",
42
- trust_remote_code=True,
43
- token=os.environ.get("HF_TOKEN")
44
- )
45
-
46
- HF_DATASET = load_hf_dataset()
47
-
48
- # ─── Download vector store from your HF dataset repo ─────────────────────────
49
- from huggingface_hub import hf_hub_download, snapshot_download, login
50
-
51
- VECTOR_STORE_DIR = ROOT / "embeddings" / "semantic_vector_store"
52
 
53
- @st.cache_resource
54
- def load_vector_store_cached():
55
- hf_token = os.environ.get("HF_TOKEN")
56
- if not hf_token:
57
- st.error("HF_TOKEN secret is not set. Go to Space Settings β†’ Secrets.")
58
- st.stop()
59
-
60
- login(token=hf_token, add_to_git_credential=False)
61
-
62
- VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
63
-
64
- snapshot_path = snapshot_download(
65
- repo_id="rishadaz/amazon_retriever-storage",
66
- repo_type="dataset",
67
- local_dir=str(VECTOR_STORE_DIR),
68
- token=hf_token,
69
- )
70
-
71
- mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index_mini.pkl"
72
- embeddings_dir = Path(snapshot_path) / "embeddings"
73
-
74
- vector_store = load_vector_store(embeddings_dir)
75
- bm25_retriever = load(mini_index_path)
76
-
77
- return vector_store, bm25_retriever
78
 
79
  # ─── Custom CSS ───────────────────────────────────────────────────────────────
80
  st.markdown(
@@ -146,15 +109,60 @@ st.markdown(
146
  unsafe_allow_html=True,
147
  )
148
 
149
- # ─── Placeholder retrieval functions ──────────────────────────────────────────
150
- # TODO: Replace with real imports once src/bm25.py and src/semantic.py are ready:
151
- # from src.bm25 import BM25Retriever
152
- # from src.semantic import SemanticRetriever
153
- #
154
- # Expected return format — list of dicts with keys:
155
- # asin (str), title (str), text (str), rating (float), score (float)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- vector_store, bm25_retriever = load_vector_store_cached()
158
 
159
  def bm25_search(query: str, top_k: int = 3) -> list[dict]:
160
  """
@@ -163,7 +171,8 @@ def bm25_search(query: str, top_k: int = 3) -> list[dict]:
163
  return retriever.search(query, top_k=top_k)
164
  Returns top_k review-level results (may include multiple reviews per ASIN).
165
  """
166
- results = enrich_bm25_search_results(bm25_retriever, query, top_k, HF_DATASET['full'])
 
167
  return results
168
 
169
 
@@ -216,7 +225,7 @@ def render_results(results: list[dict], mode: str, query: str) -> None:
216
  title = item["title"]
217
  avg_rating = item["average_rating"]
218
  n_reviews = len(reviews)
219
- total_reviews = item.get('total_reviews', n_reviews)
220
  rating_number = item.get('rating_number', 0)
221
  asin = item['parent_asin']
222
  review_word = "review" if n_reviews == 1 else "reviews"
@@ -247,7 +256,7 @@ def render_results(results: list[dict], mode: str, query: str) -> None:
247
  )
248
 
249
  # ── Reviews in collapsible expander ───────────────────────────────
250
- expander_label = f"πŸ“– View {n_reviews} of total {total_reviews} {review_word} "
251
  with st.expander(expander_label, expanded=(n_reviews == 1)):
252
  for j, rev in enumerate(reviews):
253
  st.markdown(
 
16
  from utils.bm25 import load
17
  from utils.semantic import load_vector_store
18
 
19
+ from dotenv import load_dotenv
20
+ load_dotenv()
21
+
22
  import warnings
23
  warnings.filterwarnings("ignore", category=UserWarning)
24
 
 
34
  FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
35
  FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
36
 
37
+ HF_TOKEN = os.getenv('HF_TOKEN')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ from datasets import load_dataset
40
+ from huggingface_hub import snapshot_download, login
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # ─── Custom CSS ───────────────────────────────────────────────────────────────
43
  st.markdown(
 
109
  unsafe_allow_html=True,
110
  )
111
 
112
+ @st.cache_resource
113
+ def load_hf_dataset():
114
+ return load_dataset(
115
+ "McAuley-Lab/Amazon-Reviews-2023",
116
+ "raw_meta_Grocery_and_Gourmet_Food",
117
+ trust_remote_code=True,
118
+ token=HF_TOKEN
119
+ )
120
+
121
+ VECTOR_STORE_DIR = ROOT / "data" / "processed"
122
+
123
+ @st.cache_resource
124
+ def load_vector_store_cached():
125
+ login(token=HF_TOKEN, add_to_git_credential=False)
126
+ VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
127
+
128
+ snapshot_path = snapshot_download(
129
+ repo_id="rishadaz/amazon_retriever-storage",
130
+ repo_type="dataset",
131
+ local_dir=str(VECTOR_STORE_DIR),
132
+ token=HF_TOKEN,
133
+ )
134
+
135
+ mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index.pkl"
136
+ embeddings_dir = Path(snapshot_path) / "embeddings"
137
+
138
+ vector_store = load_vector_store(embeddings_dir)
139
+ bm25_retriever = load(mini_index_path)
140
+
141
+ return vector_store, bm25_retriever
142
+
143
+ # ─── Get Data ──────────────────────────────────────────────────────────────
144
+ # The "local" option reads from your local directory; by default it
145
+ # loads the mini versions of the files we have provided in the repo.
146
+
147
+ data_source = "remote" #"remote" or "local"
148
+
149
+ # Note: "remote" uses the full generated corpus and
150
+ # embeddings, which can take a long time to download and
151
+ # can make the app heavy and slow down
152
+ # processing. For development, please use the smaller "local" corpus.
154
+ HF_DATASET = load_hf_dataset()
155
+
156
+ if data_source == 'local':
157
+ MINI_INDEX_PATH = ROOT / "data" / "processed" / "tokenisation" / "bm25_index_mini.pkl"
158
+
159
+ vector_store = load_vector_store(ROOT_FOLDER / 'data' / 'processed' / 'embeddings')
160
+ retriever = load(MINI_INDEX_PATH)
161
+ else:
162
+
163
+ vector_store, retriever = load_vector_store_cached()
164
+
165
 
 
166
 
167
  def bm25_search(query: str, top_k: int = 3) -> list[dict]:
168
  """
 
171
  return retriever.search(query, top_k=top_k)
172
  Returns top_k review-level results (may include multiple reviews per ASIN).
173
  """
174
+
175
+ results = enrich_bm25_search_results(retriever, query, top_k, HF_DATASET['full'])
176
  return results
177
 
178
 
 
225
  title = item["title"]
226
  avg_rating = item["average_rating"]
227
  n_reviews = len(reviews)
228
+ # total_reviews = item.get('total_reviews', n_reviews)
229
  rating_number = item.get('rating_number', 0)
230
  asin = item['parent_asin']
231
  review_word = "review" if n_reviews == 1 else "reviews"
 
256
  )
257
 
258
  # ── Reviews in collapsible expander ───────────────────────────────
259
+ expander_label = f"πŸ“– Viewing top {n_reviews} {review_word} "
260
  with st.expander(expander_label, expanded=(n_reviews == 1)):
261
  for j, rev in enumerate(reviews):
262
  st.markdown(
utils/retrieval_helpers.py CHANGED
@@ -27,28 +27,6 @@ def decode_ratings(page_content):
27
  return(parsed)
28
  else:
29
  return {}
30
-
31
- def decode_bm25_ratings(page_content):
32
- block_pattern = r'Review \(Rating:\s*\d+\.\d+\):.*'
33
- matches = re.findall(block_pattern, page_content)
34
-
35
- if matches:
36
- pattern = r'Review \(Rating:\s*(\d+\.\d+)\):\s*([^\.]+)\.\s*(.*)'
37
- parsed = []
38
-
39
- for r in matches[:3]:
40
- match = re.match(pattern, r)
41
- if match:
42
- rating, title, text = match.groups()
43
- parsed.append({
44
- 'rating': float(rating),
45
- 'title': title.strip(),
46
- 'text': text.strip()
47
- })
48
-
49
- return parsed
50
- else:
51
- return {}
52
 
53
  def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
54
  """
@@ -146,14 +124,14 @@ def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
146
 
147
  for doc, score in results:
148
  parent_asin = doc.metadata.get("parent_asin")
149
- total_reviews = doc.metadata.get("total_reviews")
150
- metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
151
- metadata_object['score'] = score
152
- metadata_object['total_reviews'] = total_reviews
153
 
154
- # 3. Extract reviews from page_content
155
- page_content = doc.page_content
156
- metadata_object["reviews"] = decode_ratings(page_content)
 
 
 
 
157
 
158
  enriched_results.append(metadata_object)
159
 
 
27
  return(parsed)
28
  else:
29
  return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
32
  """
 
124
 
125
  for doc, score in results:
126
  parent_asin = doc.metadata.get("parent_asin")
 
 
 
 
127
 
128
+ metadata_object = {
129
+ **doc.metadata,
130
+ **asin_to_metadata.get(parent_asin, {}),
131
+ "score": score,
132
+ }
133
+
134
+ metadata_object['reviews'] = metadata_object.pop('top_reviews', {}) or {}
135
 
136
  enriched_results.append(metadata_object)
137