Spaces:
Sleeping
Sleeping
github-actions[bot] committed on
Commit ·
2ef07e4
1
Parent(s): 05b9d1b
chore: sync app/ and src/ from GitHub
Browse files
- app/app.py +10 -8
- src/semantic.py +18 -15
app/app.py
CHANGED
|
@@ -55,15 +55,17 @@ def load_vector_store_cached():
|
|
| 55 |
login(token=HF_TOKEN, add_to_git_credential=False)
|
| 56 |
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
mini_index_path = Path(snapshot_path) / "tokenisation" / "
|
| 67 |
embeddings_dir = Path(snapshot_path) / "embeddings"
|
| 68 |
|
| 69 |
vector_store = load_vector_store(embeddings_dir)
|
|
|
|
| 55 |
login(token=HF_TOKEN, add_to_git_credential=False)
|
| 56 |
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
|
| 57 |
|
| 58 |
+
if not any(VECTOR_STORE_DIR.iterdir()):
|
| 59 |
+
snapshot_path = Path(snapshot_download(
|
| 60 |
+
repo_id="rishadaz/amazon_retriever-storage",
|
| 61 |
+
repo_type="dataset",
|
| 62 |
+
local_dir=str(VECTOR_STORE_DIR),
|
| 63 |
+
token=HF_TOKEN,
|
| 64 |
+
))
|
| 65 |
+
else:
|
| 66 |
+
snapshot_path = VECTOR_STORE_DIR
|
| 67 |
|
| 68 |
+
mini_index_path = Path(snapshot_path) / "tokenisation" / "bm25_index.pkl"
|
| 69 |
embeddings_dir = Path(snapshot_path) / "embeddings"
|
| 70 |
|
| 71 |
vector_store = load_vector_store(embeddings_dir)
|
src/semantic.py
CHANGED
|
@@ -17,6 +17,7 @@ Typical usage
|
|
| 17 |
|
| 18 |
import logging
|
| 19 |
from typing import Any
|
|
|
|
| 20 |
|
| 21 |
import torch
|
| 22 |
import json, os, sys
|
|
@@ -45,17 +46,19 @@ DEFAULT_TOP_REVIEWS = 5
|
|
| 45 |
DEFAULT_TOP_K = 5
|
| 46 |
|
| 47 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
| 59 |
|
| 60 |
# ---------------------------------------------------------------------------
|
| 61 |
# Document construction
|
|
@@ -178,11 +181,11 @@ def build_vector_store(
|
|
| 178 |
|
| 179 |
# --- Create new store if needed ---
|
| 180 |
if existing_store is None:
|
| 181 |
-
dim = len(
|
| 182 |
index = faiss.IndexFlatL2(dim)
|
| 183 |
|
| 184 |
vector_store = FAISS(
|
| 185 |
-
embedding_function=
|
| 186 |
index=index,
|
| 187 |
docstore=InMemoryDocstore(),
|
| 188 |
index_to_docstore_id={},
|
|
@@ -208,7 +211,7 @@ def build_and_save_vector_store(
|
|
| 208 |
# --- Resume / initialize ---
|
| 209 |
if os.path.exists(os.path.join(save_path, "index.faiss")):
|
| 210 |
vector_store = FAISS.load_local(
|
| 211 |
-
save_path,
|
| 212 |
)
|
| 213 |
already_indexed = set(vector_store.index_to_docstore_id.values())
|
| 214 |
print(f"Resuming — {len(already_indexed)} docs already indexed.")
|
|
@@ -297,6 +300,6 @@ def load_vector_store(
|
|
| 297 |
|
| 298 |
return FAISS.load_local(
|
| 299 |
load_path,
|
| 300 |
-
embeddings=
|
| 301 |
allow_dangerous_deserialization=True,
|
| 302 |
)
|
|
|
|
| 17 |
|
| 18 |
import logging
|
| 19 |
from typing import Any
|
| 20 |
+
import streamlit as st
|
| 21 |
|
| 22 |
import torch
|
| 23 |
import json, os, sys
|
|
|
|
| 46 |
DEFAULT_TOP_K = 5
|
| 47 |
|
| 48 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 49 |
+
@st.cache_resource(show_spinner=False)
|
| 50 |
+
def get_embeddings():
|
| 51 |
+
return HuggingFaceEmbeddings(
|
| 52 |
+
model_name=DEFAULT_EMBEDDING_MODEL,
|
| 53 |
+
model_kwargs={
|
| 54 |
+
"device": DEVICE,
|
| 55 |
+
"model_kwargs": {"torch_dtype": torch.float16},
|
| 56 |
+
},
|
| 57 |
+
encode_kwargs={
|
| 58 |
+
"batch_size": 128 if DEVICE == "cpu" else 512,
|
| 59 |
+
"normalize_embeddings": True,
|
| 60 |
+
},
|
| 61 |
+
)
|
| 62 |
|
| 63 |
# ---------------------------------------------------------------------------
|
| 64 |
# Document construction
|
|
|
|
| 181 |
|
| 182 |
# --- Create new store if needed ---
|
| 183 |
if existing_store is None:
|
| 184 |
+
dim = len(get_embeddings().embed_query("probe"))
|
| 185 |
index = faiss.IndexFlatL2(dim)
|
| 186 |
|
| 187 |
vector_store = FAISS(
|
| 188 |
+
embedding_function=get_embeddings(),
|
| 189 |
index=index,
|
| 190 |
docstore=InMemoryDocstore(),
|
| 191 |
index_to_docstore_id={},
|
|
|
|
| 211 |
# --- Resume / initialize ---
|
| 212 |
if os.path.exists(os.path.join(save_path, "index.faiss")):
|
| 213 |
vector_store = FAISS.load_local(
|
| 214 |
+
save_path, get_embeddings(), allow_dangerous_deserialization=True
|
| 215 |
)
|
| 216 |
already_indexed = set(vector_store.index_to_docstore_id.values())
|
| 217 |
print(f"Resuming — {len(already_indexed)} docs already indexed.")
|
|
|
|
| 300 |
|
| 301 |
return FAISS.load_local(
|
| 302 |
load_path,
|
| 303 |
+
embeddings=get_embeddings(),
|
| 304 |
allow_dangerous_deserialization=True,
|
| 305 |
)
|