Spaces:
Runtime error
Runtime error
Commit ·
67e2fec
1
Parent(s): 2a49c3f
Refactor embedding model integration to use fastembed and update related dependencies
Browse files
- build_faiss_index.py +5 -20
- congress_bio_ids.pkl +1 -1
- congress_faiss.index +1 -1
- gradio_app.py +3 -4
- requirements.txt +1 -3
build_faiss_index.py
CHANGED
|
@@ -23,7 +23,7 @@ import pickle
|
|
| 23 |
import time
|
| 24 |
import os
|
| 25 |
from pathlib import Path
|
| 26 |
-
from sentence_transformers import SentenceTransformer
|
| 27 |
|
| 28 |
# Paths
|
| 29 |
SCRIPT_DIR = Path(__file__).parent.absolute()
|
|
@@ -43,20 +43,11 @@ def build_faiss_index():
|
|
| 43 |
print(" Run ingest_data.py first to create the database.")
|
| 44 |
return False
|
| 45 |
|
| 46 |
-
# Load
|
| 47 |
-
print("\n1. Loading
|
| 48 |
start = time.time()
|
| 49 |
|
| 50 |
-
|
| 51 |
-
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 52 |
-
os.environ['OMP_NUM_THREADS'] = '1'
|
| 53 |
-
os.environ['MKL_NUM_THREADS'] = '1'
|
| 54 |
-
os.environ['OPENBLAS_NUM_THREADS'] = '1'
|
| 55 |
-
|
| 56 |
-
import torch
|
| 57 |
-
torch.set_num_threads(1)
|
| 58 |
-
|
| 59 |
-
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 60 |
print(f" ✓ Model loaded in {time.time() - start:.3f}s")
|
| 61 |
|
| 62 |
# Load biographies from database
|
|
@@ -97,13 +88,7 @@ def build_faiss_index():
|
|
| 97 |
|
| 98 |
for i in range(0, len(texts), batch_size):
|
| 99 |
batch = texts[i:i + batch_size]
|
| 100 |
-
batch_embeddings = model.encode(
|
| 101 |
-
batch,
|
| 102 |
-
show_progress_bar=False,
|
| 103 |
-
convert_to_numpy=True,
|
| 104 |
-
normalize_embeddings=False,
|
| 105 |
-
device='cpu' # Explicit CPU to avoid issues
|
| 106 |
-
)
|
| 107 |
embeddings.extend(batch_embeddings)
|
| 108 |
|
| 109 |
# Progress update every 100 batches (~3200 texts)
|
|
|
|
| 23 |
import time
|
| 24 |
import os
|
| 25 |
from pathlib import Path
|
| 26 |
+
from fastembed import TextEmbedding
|
| 27 |
|
| 28 |
# Paths
|
| 29 |
SCRIPT_DIR = Path(__file__).parent.absolute()
|
|
|
|
| 43 |
print(" Run ingest_data.py first to create the database.")
|
| 44 |
return False
|
| 45 |
|
| 46 |
+
# Load embedding model
|
| 47 |
+
print("\n1. Loading embedding model (fastembed/ONNX)...")
|
| 48 |
start = time.time()
|
| 49 |
|
| 50 |
+
model = TextEmbedding('sentence-transformers/all-MiniLM-L6-v2')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
print(f" ✓ Model loaded in {time.time() - start:.3f}s")
|
| 52 |
|
| 53 |
# Load biographies from database
|
|
|
|
| 88 |
|
| 89 |
for i in range(0, len(texts), batch_size):
|
| 90 |
batch = texts[i:i + batch_size]
|
| 91 |
+
batch_embeddings = list(model.embed(batch))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
embeddings.extend(batch_embeddings)
|
| 93 |
|
| 94 |
# Progress update every 100 batches (~3200 texts)
|
congress_bio_ids.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 130521
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2df06881ad172440055bfdd84607704a18d0f1c05f2929bb9481ab5b0d9f2aa
|
| 3 |
size 130521
|
congress_faiss.index
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20040237
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b02b6f4ca7b3d8a9a59d42795d875badcea22b40031e03d8acc5dff21467fb08
|
| 3 |
size 20040237
|
gradio_app.py
CHANGED
|
@@ -11,7 +11,7 @@ import os
|
|
| 11 |
import warnings
|
| 12 |
from typing import List, Dict, Any
|
| 13 |
import numpy as np
|
| 14 |
-
from sentence_transformers import SentenceTransformer
|
| 15 |
import faiss
|
| 16 |
import pickle
|
| 17 |
from pathlib import Path
|
|
@@ -39,7 +39,7 @@ def initialize_search_index():
|
|
| 39 |
try:
|
| 40 |
if Path(FAISS_INDEX_PATH).exists() and Path(BIO_IDS_PATH).exists():
|
| 41 |
print(f"Loading FAISS index from: {FAISS_INDEX_PATH}")
|
| 42 |
-
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 43 |
faiss_index = faiss.read_index(FAISS_INDEX_PATH)
|
| 44 |
with open(BIO_IDS_PATH, "rb") as f:
|
| 45 |
bio_id_mapping = pickle.load(f)
|
|
@@ -227,8 +227,7 @@ def semantic_search_biography(query: str, top_k: int = 5) -> str:
|
|
| 227 |
top_k = min(max(1, top_k), 20)
|
| 228 |
|
| 229 |
# Encode query
|
| 230 |
-
query_embedding = model.encode(query)
|
| 231 |
-
query_embedding = query_embedding.reshape(1, -1)
|
| 232 |
faiss.normalize_L2(query_embedding)
|
| 233 |
|
| 234 |
# Search
|
|
|
|
| 11 |
import warnings
|
| 12 |
from typing import List, Dict, Any
|
| 13 |
import numpy as np
|
| 14 |
+
from fastembed import TextEmbedding
|
| 15 |
import faiss
|
| 16 |
import pickle
|
| 17 |
from pathlib import Path
|
|
|
|
| 39 |
try:
|
| 40 |
if Path(FAISS_INDEX_PATH).exists() and Path(BIO_IDS_PATH).exists():
|
| 41 |
print(f"Loading FAISS index from: {FAISS_INDEX_PATH}")
|
| 42 |
+
model = TextEmbedding('sentence-transformers/all-MiniLM-L6-v2')
|
| 43 |
faiss_index = faiss.read_index(FAISS_INDEX_PATH)
|
| 44 |
with open(BIO_IDS_PATH, "rb") as f:
|
| 45 |
bio_id_mapping = pickle.load(f)
|
|
|
|
| 227 |
top_k = min(max(1, top_k), 20)
|
| 228 |
|
| 229 |
# Encode query
|
| 230 |
+
query_embedding = np.array(next(model.embed([query])), dtype='float32').reshape(1, -1)
|
|
|
|
| 231 |
faiss.normalize_L2(query_embedding)
|
| 232 |
|
| 233 |
# Search
|
requirements.txt
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
-
# Requires Python 3.10-3.13 (NOT 3.14+ due to FAISS incompatibility)
|
| 2 |
mcp>=1.0.0
|
| 3 |
numpy>=1.24.0
|
| 4 |
-
|
| 5 |
-
torch>=2.0.0
|
| 6 |
faiss-cpu>=1.7.4
|
| 7 |
gradio>=5.0.0
|
|
|
|
|
|
|
| 1 |
mcp>=1.0.0
|
| 2 |
numpy>=1.24.0
|
| 3 |
+
fastembed>=0.3.0
|
|
|
|
| 4 |
faiss-cpu>=1.7.4
|
| 5 |
gradio>=5.0.0
|