Anshul Prasad committed on
Commit ·
acb9fe6
1
Parent(s): aad84b7
chunking logic integration.
Browse files- api/embed_transcripts.py +37 -7
api/embed_transcripts.py
CHANGED
|
@@ -1,17 +1,47 @@
|
|
| 1 |
import faiss
|
|
|
|
| 2 |
import logging
|
| 3 |
from pathlib import Path
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
|
|
|
|
|
|
|
| 6 |
logger = logging.getLogger(__name__)
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
model = SentenceTransformer(
|
| 11 |
-
embeddings = model.encode(
|
|
|
|
|
|
|
| 12 |
dimension = embeddings.shape[1]
|
| 13 |
-
index = faiss.
|
| 14 |
index.add(embeddings)
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import faiss
|
| 2 |
+
import pickle
|
| 3 |
import logging
|
| 4 |
from pathlib import Path
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
|
| 7 |
+
from utils.preprocess import chunk_text
|
| 8 |
+
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
+
EMBED_MODEL = "BAAI/bge-small-en-v1.5" # better than all-MiniLM-L6-v2, same speed
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def embedding(
|
| 15 |
+
transcripts: list[str],
|
| 16 |
+
transcript_index: str, # path to .faiss FILE (not dir)
|
| 17 |
+
chunks_pkl: str = "data/chunks.pkl",
|
| 18 |
+
) -> None:
|
| 19 |
+
"""
|
| 20 |
+
Chunk every transcript, embed all chunks, build FAISS index.
|
| 21 |
+
|
| 22 |
+
Saves:
|
| 23 |
+
transcript_index — FAISS flat-L2 index file
|
| 24 |
+
chunks_pkl — pickle of all chunk strings (same order as index)
|
| 25 |
+
"""
|
| 26 |
+
# 1. Chunk all transcripts
|
| 27 |
+
all_chunks: list[str] = []
|
| 28 |
+
for text in transcripts:
|
| 29 |
+
all_chunks.extend(chunk_text(text))
|
| 30 |
+
logger.info("Total chunks after splitting: %d", len(all_chunks))
|
| 31 |
|
| 32 |
+
# 2. Embed
|
| 33 |
+
model = SentenceTransformer(EMBED_MODEL)
|
| 34 |
+
embeddings = model.encode(all_chunks, show_progress_bar=True, normalize_embeddings=True)
|
| 35 |
+
|
| 36 |
+
# 3. Build FAISS index (fix: write to FILE, not mkdir)
|
| 37 |
dimension = embeddings.shape[1]
|
| 38 |
+
index = faiss.IndexFlatIP(dimension) # inner-product works well with normalized embeddings
|
| 39 |
index.add(embeddings)
|
| 40 |
+
faiss.write_index(index, str(transcript_index)) # ← was: transcript_index.mkdir() — BUG FIXED
|
| 41 |
+
|
| 42 |
+
# 4. Save chunks so retrieval can map index → text
|
| 43 |
+
Path(chunks_pkl).parent.mkdir(parents=True, exist_ok=True)
|
| 44 |
+
with open(chunks_pkl, "wb") as f:
|
| 45 |
+
pickle.dump(all_chunks, f)
|
| 46 |
+
|
| 47 |
+
logger.info("Embedding completed. Index: %s Chunks: %s", transcript_index, chunks_pkl)
|