Spaces:
Running
Running
Sarisha Das committed on
Commit ·
0bcbce0
1
Parent(s): 468fa48
streamline app
Browse files- requirements.txt +1 -0
- src/streamlit_app.py +63 -54
- utils/retrieval_helpers.py +7 -29
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
duckdb
|
| 2 |
pandas
|
| 3 |
streamlit
|
|
|
|
| 4 |
sentence-transformers
|
| 5 |
langchain
|
| 6 |
langchain-community
|
|
|
|
| 1 |
duckdb
|
| 2 |
pandas
|
| 3 |
streamlit
|
| 4 |
+
python-dotenv
|
| 5 |
sentence-transformers
|
| 6 |
langchain
|
| 7 |
langchain-community
|
src/streamlit_app.py
CHANGED
|
@@ -16,6 +16,9 @@ from utils.retrieval_helpers import enrich_search_results, enrich_bm25_search_re
|
|
| 16 |
from utils.bm25 import load
|
| 17 |
from utils.semantic import load_vector_store
|
| 18 |
|
|
|
|
|
|
|
|
|
|
| 19 |
import warnings
|
| 20 |
warnings.filterwarnings("ignore", category=UserWarning)
|
| 21 |
|
|
@@ -31,50 +34,10 @@ st.set_page_config(
|
|
| 31 |
FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
|
| 32 |
FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
|
| 33 |
|
| 34 |
-
|
| 35 |
-
from datasets import load_dataset
|
| 36 |
-
|
| 37 |
-
@st.cache_resource
|
| 38 |
-
def load_hf_dataset():
|
| 39 |
-
return load_dataset(
|
| 40 |
-
"McAuley-Lab/Amazon-Reviews-2023",
|
| 41 |
-
"raw_meta_Grocery_and_Gourmet_Food",
|
| 42 |
-
trust_remote_code=True,
|
| 43 |
-
token=os.environ.get("HF_TOKEN")
|
| 44 |
-
)
|
| 45 |
-
|
| 46 |
-
HF_DATASET = load_hf_dataset()
|
| 47 |
-
|
| 48 |
-
# ─── Download vector store from your HF dataset repo ─────────────────────────
|
| 49 |
-
from huggingface_hub import hf_hub_download, snapshot_download, login
|
| 50 |
-
|
| 51 |
-
VECTOR_STORE_DIR = ROOT / "embeddings" / "semantic_vector_store"
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
hf_token = os.environ.get("HF_TOKEN")
|
| 56 |
-
if not hf_token:
|
| 57 |
-
st.error("HF_TOKEN secret is not set. Go to Space Settings → Secrets.")
|
| 58 |
-
st.stop()
|
| 59 |
-
|
| 60 |
-
login(token=hf_token, add_to_git_credential=False)
|
| 61 |
-
|
| 62 |
-
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
|
| 63 |
-
|
| 64 |
-
snapshot_path = snapshot_download(
|
| 65 |
-
repo_id="rishadaz/amazon_retriever-storage",
|
| 66 |
-
repo_type="dataset",
|
| 67 |
-
local_dir=str(VECTOR_STORE_DIR),
|
| 68 |
-
token=hf_token,
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index_mini.pkl"
|
| 72 |
-
embeddings_dir = Path(snapshot_path) / "embeddings"
|
| 73 |
-
|
| 74 |
-
vector_store = load_vector_store(embeddings_dir)
|
| 75 |
-
bm25_retriever = load(mini_index_path)
|
| 76 |
-
|
| 77 |
-
return vector_store, bm25_retriever
|
| 78 |
|
| 79 |
# ─── Custom CSS ───────────────────────────────────────────────────────────────
|
| 80 |
st.markdown(
|
|
@@ -146,15 +109,60 @@ st.markdown(
|
|
| 146 |
unsafe_allow_html=True,
|
| 147 |
)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
-
vector_store, bm25_retriever = load_vector_store_cached()
|
| 158 |
|
| 159 |
def bm25_search(query: str, top_k: int = 3) -> list[dict]:
|
| 160 |
"""
|
|
@@ -163,7 +171,8 @@ def bm25_search(query: str, top_k: int = 3) -> list[dict]:
|
|
| 163 |
return retriever.search(query, top_k=top_k)
|
| 164 |
Returns top_k review-level results (may include multiple reviews per ASIN).
|
| 165 |
"""
|
| 166 |
-
|
|
|
|
| 167 |
return results
|
| 168 |
|
| 169 |
|
|
@@ -216,7 +225,7 @@ def render_results(results: list[dict], mode: str, query: str) -> None:
|
|
| 216 |
title = item["title"]
|
| 217 |
avg_rating = item["average_rating"]
|
| 218 |
n_reviews = len(reviews)
|
| 219 |
-
total_reviews = item.get('total_reviews', n_reviews)
|
| 220 |
rating_number = item.get('rating_number', 0)
|
| 221 |
asin = item['parent_asin']
|
| 222 |
review_word = "review" if n_reviews == 1 else "reviews"
|
|
@@ -247,7 +256,7 @@ def render_results(results: list[dict], mode: str, query: str) -> None:
|
|
| 247 |
)
|
| 248 |
|
| 249 |
# ── Reviews in collapsible expander ───────────────────────────────
|
| 250 |
-
expander_label = f"π
|
| 251 |
with st.expander(expander_label, expanded=(n_reviews == 1)):
|
| 252 |
for j, rev in enumerate(reviews):
|
| 253 |
st.markdown(
|
|
|
|
| 16 |
from utils.bm25 import load
|
| 17 |
from utils.semantic import load_vector_store
|
| 18 |
|
| 19 |
+
from dotenv import load_dotenv
|
| 20 |
+
load_dotenv()
|
| 21 |
+
|
| 22 |
import warnings
|
| 23 |
warnings.filterwarnings("ignore", category=UserWarning)
|
| 24 |
|
|
|
|
| 34 |
FEEDBACK_CSV = ROOT / "results" / "feedback.csv"
|
| 35 |
FEEDBACK_CSV.parent.mkdir(parents=True, exist_ok=True)
|
| 36 |
|
| 37 |
+
HF_TOKEN = os.getenv('HF_TOKEN')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
from datasets import load_dataset
|
| 40 |
+
from huggingface_hub import snapshot_download, login
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# ─── Custom CSS ───────────────────────────────────────────────────────────────
|
| 43 |
st.markdown(
|
|
|
|
| 109 |
unsafe_allow_html=True,
|
| 110 |
)
|
| 111 |
|
| 112 |
+
@st.cache_resource
|
| 113 |
+
def load_hf_dataset():
|
| 114 |
+
return load_dataset(
|
| 115 |
+
"McAuley-Lab/Amazon-Reviews-2023",
|
| 116 |
+
"raw_meta_Grocery_and_Gourmet_Food",
|
| 117 |
+
trust_remote_code=True,
|
| 118 |
+
token=HF_TOKEN
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
VECTOR_STORE_DIR = ROOT / "data" / "processed"
|
| 122 |
+
|
| 123 |
+
@st.cache_resource
|
| 124 |
+
def load_vector_store_cached():
|
| 125 |
+
login(token=HF_TOKEN, add_to_git_credential=False)
|
| 126 |
+
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
|
| 127 |
+
|
| 128 |
+
snapshot_path = snapshot_download(
|
| 129 |
+
repo_id="rishadaz/amazon_retriever-storage",
|
| 130 |
+
repo_type="dataset",
|
| 131 |
+
local_dir=str(VECTOR_STORE_DIR),
|
| 132 |
+
token=HF_TOKEN,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index.pkl"
|
| 136 |
+
embeddings_dir = Path(snapshot_path) / "embeddings"
|
| 137 |
+
|
| 138 |
+
vector_store = load_vector_store(embeddings_dir)
|
| 139 |
+
bm25_retriever = load(mini_index_path)
|
| 140 |
+
|
| 141 |
+
return vector_store, bm25_retriever
|
| 142 |
+
|
| 143 |
+
# ─── Get Data ──────────────────────────────────────────────────────────────
|
| 144 |
+
# The "local" option reads from your local directory; by default it
|
| 145 |
+
# reads the mini versions of the files provided in the repo.
|
| 146 |
+
|
| 147 |
+
data_source = "remote" #"remote" or "local"
|
| 148 |
+
|
| 149 |
+
# Note: "remote" uses the full generated corpus and
|
| 150 |
+
# embeddings, which can take a long time to download and
|
| 151 |
+
# can make the app heavy and slow down
|
| 152 |
+
# processing. For development, please use the smaller "local" corpus.
|
| 153 |
+
|
| 154 |
+
HF_DATASET = load_hf_dataset()
|
| 155 |
+
|
| 156 |
+
if data_source == 'local':
|
| 157 |
+
MINI_INDEX_PATH = ROOT / "data" / "processed" / "tokenisation" / "bm25_index_mini.pkl"
|
| 158 |
+
|
| 159 |
+
vector_store = load_vector_store(ROOT_FOLDER / 'data' / 'processed' / 'embeddings')
|
| 160 |
+
retriever = load(MINI_INDEX_PATH)
|
| 161 |
+
else:
|
| 162 |
+
|
| 163 |
+
vector_store, retriever = load_vector_store_cached()
|
| 164 |
+
|
| 165 |
|
|
|
|
| 166 |
|
| 167 |
def bm25_search(query: str, top_k: int = 3) -> list[dict]:
|
| 168 |
"""
|
|
|
|
| 171 |
return retriever.search(query, top_k=top_k)
|
| 172 |
Returns top_k review-level results (may include multiple reviews per ASIN).
|
| 173 |
"""
|
| 174 |
+
|
| 175 |
+
results = enrich_bm25_search_results(retriever, query, top_k, HF_DATASET['full'])
|
| 176 |
return results
|
| 177 |
|
| 178 |
|
|
|
|
| 225 |
title = item["title"]
|
| 226 |
avg_rating = item["average_rating"]
|
| 227 |
n_reviews = len(reviews)
|
| 228 |
+
# total_reviews = item.get('total_reviews', n_reviews)
|
| 229 |
rating_number = item.get('rating_number', 0)
|
| 230 |
asin = item['parent_asin']
|
| 231 |
review_word = "review" if n_reviews == 1 else "reviews"
|
|
|
|
| 256 |
)
|
| 257 |
|
| 258 |
# ── Reviews in collapsible expander ───────────────────────────────
|
| 259 |
+
expander_label = f"π Viewing top {n_reviews} {review_word} "
|
| 260 |
with st.expander(expander_label, expanded=(n_reviews == 1)):
|
| 261 |
for j, rev in enumerate(reviews):
|
| 262 |
st.markdown(
|
utils/retrieval_helpers.py
CHANGED
|
@@ -27,28 +27,6 @@ def decode_ratings(page_content):
|
|
| 27 |
return(parsed)
|
| 28 |
else:
|
| 29 |
return {}
|
| 30 |
-
|
| 31 |
-
def decode_bm25_ratings(page_content):
|
| 32 |
-
block_pattern = r'Review \(Rating:\s*\d+\.\d+\):.*'
|
| 33 |
-
matches = re.findall(block_pattern, page_content)
|
| 34 |
-
|
| 35 |
-
if matches:
|
| 36 |
-
pattern = r'Review \(Rating:\s*(\d+\.\d+)\):\s*([^\.]+)\.\s*(.*)'
|
| 37 |
-
parsed = []
|
| 38 |
-
|
| 39 |
-
for r in matches[:3]:
|
| 40 |
-
match = re.match(pattern, r)
|
| 41 |
-
if match:
|
| 42 |
-
rating, title, text = match.groups()
|
| 43 |
-
parsed.append({
|
| 44 |
-
'rating': float(rating),
|
| 45 |
-
'title': title.strip(),
|
| 46 |
-
'text': text.strip()
|
| 47 |
-
})
|
| 48 |
-
|
| 49 |
-
return parsed
|
| 50 |
-
else:
|
| 51 |
-
return {}
|
| 52 |
|
| 53 |
def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
|
| 54 |
"""
|
|
@@ -146,14 +124,14 @@ def enrich_bm25_search_results(retriever, query: str, k: int, hf_dataset):
|
|
| 146 |
|
| 147 |
for doc, score in results:
|
| 148 |
parent_asin = doc.metadata.get("parent_asin")
|
| 149 |
-
total_reviews = doc.metadata.get("total_reviews")
|
| 150 |
-
metadata_object = asin_to_metadata.get(parent_asin, {}).copy()
|
| 151 |
-
metadata_object['score'] = score
|
| 152 |
-
metadata_object['total_reviews'] = total_reviews
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
enriched_results.append(metadata_object)
|
| 159 |
|
|
|
|
| 27 |
return(parsed)
|
| 28 |
else:
|
| 29 |
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def enrich_search_results(vector_store, query: str, k: int, hf_dataset):
|
| 32 |
"""
|
|
|
|
| 124 |
|
| 125 |
for doc, score in results:
|
| 126 |
parent_asin = doc.metadata.get("parent_asin")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
+
metadata_object = {
|
| 129 |
+
**doc.metadata,
|
| 130 |
+
**asin_to_metadata.get(parent_asin, {}),
|
| 131 |
+
"score": score,
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
metadata_object['reviews'] = metadata_object.pop('top_reviews', {}) or {}
|
| 135 |
|
| 136 |
enriched_results.append(metadata_object)
|
| 137 |
|