stefanjwojcik committed on
Commit
67e2fec
·
1 Parent(s): 2a49c3f

Refactor embedding model integration to use fastembed and update related dependencies

Browse files
build_faiss_index.py CHANGED
@@ -23,7 +23,7 @@ import pickle
23
  import time
24
  import os
25
  from pathlib import Path
26
- from sentence_transformers import SentenceTransformer
27
 
28
  # Paths
29
  SCRIPT_DIR = Path(__file__).parent.absolute()
@@ -43,20 +43,11 @@ def build_faiss_index():
43
  print(" Run ingest_data.py first to create the database.")
44
  return False
45
 
46
- # Load sentence transformer model
47
- print("\n1. Loading sentence transformer model...")
48
  start = time.time()
49
 
50
- # Disable all parallelism to avoid Python 3.14 issues
51
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
52
- os.environ['OMP_NUM_THREADS'] = '1'
53
- os.environ['MKL_NUM_THREADS'] = '1'
54
- os.environ['OPENBLAS_NUM_THREADS'] = '1'
55
-
56
- import torch
57
- torch.set_num_threads(1)
58
-
59
- model = SentenceTransformer('all-MiniLM-L6-v2')
60
  print(f" ✓ Model loaded in {time.time() - start:.3f}s")
61
 
62
  # Load biographies from database
@@ -97,13 +88,7 @@ def build_faiss_index():
97
 
98
  for i in range(0, len(texts), batch_size):
99
  batch = texts[i:i + batch_size]
100
- batch_embeddings = model.encode(
101
- batch,
102
- show_progress_bar=False,
103
- convert_to_numpy=True,
104
- normalize_embeddings=False,
105
- device='cpu' # Explicit CPU to avoid issues
106
- )
107
  embeddings.extend(batch_embeddings)
108
 
109
  # Progress update every 100 batches (~3200 texts)
 
23
  import time
24
  import os
25
  from pathlib import Path
26
+ from fastembed import TextEmbedding
27
 
28
  # Paths
29
  SCRIPT_DIR = Path(__file__).parent.absolute()
 
43
  print(" Run ingest_data.py first to create the database.")
44
  return False
45
 
46
+ # Load embedding model
47
+ print("\n1. Loading embedding model (fastembed/ONNX)...")
48
  start = time.time()
49
 
50
+ model = TextEmbedding('sentence-transformers/all-MiniLM-L6-v2')
 
 
 
 
 
 
 
 
 
51
  print(f" ✓ Model loaded in {time.time() - start:.3f}s")
52
 
53
  # Load biographies from database
 
88
 
89
  for i in range(0, len(texts), batch_size):
90
  batch = texts[i:i + batch_size]
91
+ batch_embeddings = list(model.embed(batch))
 
 
 
 
 
 
92
  embeddings.extend(batch_embeddings)
93
 
94
  # Progress update every 100 batches (~3200 texts)
congress_bio_ids.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a8b5e408f8c8522843b8dbe7cc8141de36ef80dbbd73495c0efdc3ed443b8d2
3
  size 130521
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2df06881ad172440055bfdd84607704a18d0f1c05f2929bb9481ab5b0d9f2aa
3
  size 130521
congress_faiss.index CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1a1761d05411f7b501a70da6b561724d4ca1d25e21a818747efa6e1b3114444
3
  size 20040237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b02b6f4ca7b3d8a9a59d42795d875badcea22b40031e03d8acc5dff21467fb08
3
  size 20040237
gradio_app.py CHANGED
@@ -11,7 +11,7 @@ import os
11
  import warnings
12
  from typing import List, Dict, Any
13
  import numpy as np
14
- from sentence_transformers import SentenceTransformer
15
  import faiss
16
  import pickle
17
  from pathlib import Path
@@ -39,7 +39,7 @@ def initialize_search_index():
39
  try:
40
  if Path(FAISS_INDEX_PATH).exists() and Path(BIO_IDS_PATH).exists():
41
  print(f"Loading FAISS index from: {FAISS_INDEX_PATH}")
42
- model = SentenceTransformer('all-MiniLM-L6-v2')
43
  faiss_index = faiss.read_index(FAISS_INDEX_PATH)
44
  with open(BIO_IDS_PATH, "rb") as f:
45
  bio_id_mapping = pickle.load(f)
@@ -227,8 +227,7 @@ def semantic_search_biography(query: str, top_k: int = 5) -> str:
227
  top_k = min(max(1, top_k), 20)
228
 
229
  # Encode query
230
- query_embedding = model.encode([query])[0].astype('float32')
231
- query_embedding = query_embedding.reshape(1, -1)
232
  faiss.normalize_L2(query_embedding)
233
 
234
  # Search
 
11
  import warnings
12
  from typing import List, Dict, Any
13
  import numpy as np
14
+ from fastembed import TextEmbedding
15
  import faiss
16
  import pickle
17
  from pathlib import Path
 
39
  try:
40
  if Path(FAISS_INDEX_PATH).exists() and Path(BIO_IDS_PATH).exists():
41
  print(f"Loading FAISS index from: {FAISS_INDEX_PATH}")
42
+ model = TextEmbedding('sentence-transformers/all-MiniLM-L6-v2')
43
  faiss_index = faiss.read_index(FAISS_INDEX_PATH)
44
  with open(BIO_IDS_PATH, "rb") as f:
45
  bio_id_mapping = pickle.load(f)
 
227
  top_k = min(max(1, top_k), 20)
228
 
229
  # Encode query
230
+ query_embedding = np.array(next(model.embed([query])), dtype='float32').reshape(1, -1)
 
231
  faiss.normalize_L2(query_embedding)
232
 
233
  # Search
requirements.txt CHANGED
@@ -1,7 +1,5 @@
1
- # Requires Python 3.10-3.13 (NOT 3.14+ due to FAISS incompatibility)
2
  mcp>=1.0.0
3
  numpy>=1.24.0
4
- sentence-transformers>=2.2.0
5
- torch>=2.0.0
6
  faiss-cpu>=1.7.4
7
  gradio>=5.0.0
 
 
1
  mcp>=1.0.0
2
  numpy>=1.24.0
3
+ fastembed>=0.3.0
 
4
  faiss-cpu>=1.7.4
5
  gradio>=5.0.0