Sarisha Das committed on
Commit
a672427
·
1 Parent(s): 158a1b5

update snapshot download, fix error

Browse files
Files changed (1) hide show
  1. app/app.py +25 -13
app/app.py CHANGED
@@ -8,7 +8,7 @@ import markdown
8
  ROOT_FOLDER = Path(__file__).resolve().parent.parent
9
 
10
  sys.path.append(str(ROOT_FOLDER))
11
- import sys
12
  import os
13
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
14
  from src.retrieval_helpers import enrich_search_results,enrich_bm25_search_results
@@ -62,22 +62,34 @@ VECTOR_STORE_DIR = ROOT / "data" / "processed"
62
 
63
  @st.cache_resource
64
  def load_vector_store_cached():
65
- login(token=HF_TOKEN, add_to_git_credential=False)
 
 
66
  VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
67
 
68
- snapshot_path = snapshot_download(
69
- repo_id="rishadaz/amazon_retriever-storage",
70
- repo_type="dataset",
71
- local_dir=str(VECTOR_STORE_DIR),
72
- split='full',
73
- token=HF_TOKEN,
74
- )
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index_mini.pkl"
77
- embeddings_dir = Path(snapshot_path) / "embeddings"
78
 
79
- vector_store = load_vector_store(embeddings_dir)
80
- bm25_retriever = load(mini_index_path)
81
 
82
  return vector_store, bm25_retriever
83
 
 
8
  ROOT_FOLDER = Path(__file__).resolve().parent.parent
9
 
10
  sys.path.append(str(ROOT_FOLDER))
11
+ import pickle
12
  import os
13
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
14
  from src.retrieval_helpers import enrich_search_results,enrich_bm25_search_results
 
62
 
63
  @st.cache_resource
64
  def load_vector_store_cached():
65
+
66
+ hf_token = st.secrets["HF_TOKEN"]
67
+ login(token=hf_token, add_to_git_credential=False)
68
  VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
69
 
70
+ # Skip download if already cached on disk
71
+ if not any(VECTOR_STORE_DIR.iterdir()):
72
+ snapshot_path = Path(snapshot_download(
73
+ repo_id="rishadaz/amazon_retriever-storage",
74
+ repo_type="dataset",
75
+ local_dir=str(VECTOR_STORE_DIR),
76
+ token=hf_token,
77
+ ))
78
+ else:
79
+ snapshot_path = VECTOR_STORE_DIR
80
+
81
+ mini_index_path = snapshot_path / "tokenisation" / "bm25_index_mini.pkl"
82
+ embeddings_dir = snapshot_path / "embeddings"
83
+
84
+ if not mini_index_path.exists():
85
+ raise FileNotFoundError(f"BM25 index not found at {mini_index_path}")
86
+ if not embeddings_dir.exists():
87
+ raise FileNotFoundError(f"Embeddings dir not found at {embeddings_dir}")
88
 
89
+ with open(mini_index_path, "rb") as f:
90
+ bm25_retriever = pickle.load(f)
91
 
92
+ vector_store = load_vector_store(embeddings_dir)
 
93
 
94
  return vector_store, bm25_retriever
95