Anshul Prasad committed on
Commit
acb9fe6
·
1 Parent(s): aad84b7

chunking logic integration.

Browse files
Files changed (1) hide show
  1. api/embed_transcripts.py +37 -7
api/embed_transcripts.py CHANGED
@@ -1,17 +1,47 @@
1
  import faiss
 
2
  import logging
3
  from pathlib import Path
4
  from sentence_transformers import SentenceTransformer
5
 
 
 
6
  logger = logging.getLogger(__name__)
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- def embedding(transcripts: list[str], transcript_index: Path) -> None:
10
- model = SentenceTransformer("all-MiniLM-L6-v2")
11
- embeddings = model.encode(transcripts, show_progress_bar=True)
 
 
12
  dimension = embeddings.shape[1]
13
- index = faiss.IndexFlatL2(dimension)
14
  index.add(embeddings)
15
- transcript_index.mkdir(parents=True, exist_ok=True)
16
- faiss.write_index(index, str(transcript_index))
17
- logger.info("Embedding completed.\n")
 
 
 
 
 
 
1
  import faiss
2
+ import pickle
3
  import logging
4
  from pathlib import Path
5
  from sentence_transformers import SentenceTransformer
6
 
7
+ from utils.preprocess import chunk_text
8
+
9
  logger = logging.getLogger(__name__)
10
 
11
+ EMBED_MODEL = "BAAI/bge-small-en-v1.5" # better than all-MiniLM-L6-v2, same speed
12
+
13
+
14
def embedding(
    transcripts: list[str],
    transcript_index: str,  # path to the .faiss FILE (not a directory)
    chunks_pkl: str = "data/chunks.pkl",
) -> None:
    """Chunk every transcript, embed all chunks, and persist a FAISS index.

    Args:
        transcripts: Raw transcript texts; each one is split with
            ``chunk_text`` and the resulting chunks are concatenated in
            input order.
        transcript_index: Destination path of the FAISS index *file*. It is
            passed through ``str()``/``Path()``, so a path-like object also
            works. Missing parent directories are created.
        chunks_pkl: Destination path of the pickle holding the chunk strings.
            Missing parent directories are created.

    Saves:
        transcript_index: FAISS flat inner-product index (``IndexFlatIP``)
            built from L2-normalized embeddings.
        chunks_pkl: pickle of all chunk strings, in the same order as the
            vectors in the index, so a retrieved row id maps to its text.

    Raises:
        ValueError: If chunking produces no chunks (e.g. ``transcripts`` is
            empty); nothing is loaded or written in that case.
    """
    # 1. Chunk all transcripts, preserving input order.
    all_chunks: list[str] = [
        chunk for text in transcripts for chunk in chunk_text(text)
    ]
    if not all_chunks:
        # Fail fast with a clear message instead of loading the model and
        # crashing later on `embeddings.shape[1]` with an empty result.
        raise ValueError("No chunks to embed: transcripts produced no text chunks.")
    logger.info("Total chunks after splitting: %d", len(all_chunks))

    # 2. Embed. Vectors are normalized so that inner product == cosine similarity.
    model = SentenceTransformer(EMBED_MODEL)
    embeddings = model.encode(
        all_chunks,
        show_progress_bar=True,
        normalize_embeddings=True,
    )

    # 3. Build the FAISS index and write it to a file.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    # Create only the PARENT directory: `transcript_index` itself is a file,
    # and write_index does not create missing directories.
    Path(transcript_index).parent.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(transcript_index))

    # 4. Save chunks so retrieval can map an index row back to its text.
    Path(chunks_pkl).parent.mkdir(parents=True, exist_ok=True)
    with open(chunks_pkl, "wb") as f:
        pickle.dump(all_chunks, f)

    logger.info("Embedding completed. Index: %s Chunks: %s", transcript_index, chunks_pkl)