github-actions[bot] committed on
Commit
2ef07e4
·
1 Parent(s): 05b9d1b

chore: sync app/ and src/ from GitHub

Browse files
Files changed (2) hide show
  1. app/app.py +10 -8
  2. src/semantic.py +18 -15
app/app.py CHANGED
@@ -55,15 +55,17 @@ def load_vector_store_cached():
55
  login(token=HF_TOKEN, add_to_git_credential=False)
56
  VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
57
 
58
- snapshot_path = snapshot_download(
59
- repo_id="rishadaz/amazon_retriever-storage",
60
- repo_type="dataset",
61
- local_dir=str(VECTOR_STORE_DIR),
62
- split='full',
63
- token=HF_TOKEN,
64
- )
 
 
65
 
66
- mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index_mini.pkl"
67
  embeddings_dir = Path(snapshot_path) / "embeddings"
68
 
69
  vector_store = load_vector_store(embeddings_dir)
 
55
  login(token=HF_TOKEN, add_to_git_credential=False)
56
  VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
57
 
58
+ if not any(VECTOR_STORE_DIR.iterdir()):
59
+ snapshot_path = Path(snapshot_download(
60
+ repo_id="rishadaz/amazon_retriever-storage",
61
+ repo_type="dataset",
62
+ local_dir=str(VECTOR_STORE_DIR),
63
+ token=HF_TOKEN,
64
+ ))
65
+ else:
66
+ snapshot_path = VECTOR_STORE_DIR
67
 
68
+ mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index.pkl"
69
  embeddings_dir = Path(snapshot_path) / "embeddings"
70
 
71
  vector_store = load_vector_store(embeddings_dir)
src/semantic.py CHANGED
@@ -17,6 +17,7 @@ Typical usage
17
 
18
  import logging
19
  from typing import Any
 
20
 
21
  import torch
22
  import json, os, sys
@@ -45,17 +46,19 @@ DEFAULT_TOP_REVIEWS = 5
45
  DEFAULT_TOP_K = 5
46
 
47
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
48
- EMBEDDINGS = HuggingFaceEmbeddings(
49
- model_name=DEFAULT_EMBEDDING_MODEL,
50
- model_kwargs={
51
- "device": DEVICE,
52
- "model_kwargs": {"torch_dtype": torch.float16},
53
- },
54
- encode_kwargs={
55
- "batch_size": 128 if DEVICE == 'cpu' else 512,
56
- "normalize_embeddings": True,
57
- },
58
- )
 
 
59
 
60
  # ---------------------------------------------------------------------------
61
  # Document construction
@@ -178,11 +181,11 @@ def build_vector_store(
178
 
179
  # --- Create new store if needed ---
180
  if existing_store is None:
181
- dim = len(EMBEDDINGS.embed_query("probe"))
182
  index = faiss.IndexFlatL2(dim)
183
 
184
  vector_store = FAISS(
185
- embedding_function=EMBEDDINGS,
186
  index=index,
187
  docstore=InMemoryDocstore(),
188
  index_to_docstore_id={},
@@ -208,7 +211,7 @@ def build_and_save_vector_store(
208
  # --- Resume / initialize ---
209
  if os.path.exists(os.path.join(save_path, "index.faiss")):
210
  vector_store = FAISS.load_local(
211
- save_path, EMBEDDINGS, allow_dangerous_deserialization=True
212
  )
213
  already_indexed = set(vector_store.index_to_docstore_id.values())
214
  print(f"Resuming — {len(already_indexed)} docs already indexed.")
@@ -297,6 +300,6 @@ def load_vector_store(
297
 
298
  return FAISS.load_local(
299
  load_path,
300
- embeddings=EMBEDDINGS,
301
  allow_dangerous_deserialization=True,
302
  )
 
17
 
18
  import logging
19
  from typing import Any
20
+ import streamlit as st
21
 
22
  import torch
23
  import json, os, sys
 
46
  DEFAULT_TOP_K = 5
47
 
48
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
49
+ @st.cache_resource(show_spinner=False)
50
+ def get_embeddings():
51
+ return HuggingFaceEmbeddings(
52
+ model_name=DEFAULT_EMBEDDING_MODEL,
53
+ model_kwargs={
54
+ "device": DEVICE,
55
+ "model_kwargs": {"torch_dtype": torch.float16},
56
+ },
57
+ encode_kwargs={
58
+ "batch_size": 128 if DEVICE == "cpu" else 512,
59
+ "normalize_embeddings": True,
60
+ },
61
+ )
62
 
63
  # ---------------------------------------------------------------------------
64
  # Document construction
 
181
 
182
  # --- Create new store if needed ---
183
  if existing_store is None:
184
+ dim = len(get_embeddings().embed_query("probe"))
185
  index = faiss.IndexFlatL2(dim)
186
 
187
  vector_store = FAISS(
188
+ embedding_function=get_embeddings(),
189
  index=index,
190
  docstore=InMemoryDocstore(),
191
  index_to_docstore_id={},
 
211
  # --- Resume / initialize ---
212
  if os.path.exists(os.path.join(save_path, "index.faiss")):
213
  vector_store = FAISS.load_local(
214
+ save_path, get_embeddings(), allow_dangerous_deserialization=True
215
  )
216
  already_indexed = set(vector_store.index_to_docstore_id.values())
217
  print(f"Resuming — {len(already_indexed)} docs already indexed.")
 
300
 
301
  return FAISS.load_local(
302
  load_path,
303
+ embeddings=get_embeddings(),
304
  allow_dangerous_deserialization=True,
305
  )