Anshul Prasad committed on
Commit
8b5035b
·
1 Parent(s): 9072187

minor improvements

Browse files
Files changed (1) hide show
  1. api/embed_transcripts.py +15 -22
api/embed_transcripts.py CHANGED
@@ -1,29 +1,22 @@
1
- import faiss, logging, os
 
 
2
  from sentence_transformers import SentenceTransformer
3
 
4
- logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)
5
 
6
 
7
- def embedding(transcripts, transcript_index):
8
- logging.info("Starting embedding of transcripts...")
9
- embedding_model = SentenceTransformer(
10
- "all-MiniLM-L6-v2"
11
- )
12
- logging.info("Loaded embedding model.")
13
-
14
- transcripts_embeddings = embedding_model.encode(
15
- transcripts, convert_to_tensor=False, show_progress_bar=True
16
- )
17
- logging.info(f"Generated embeddings for {len(transcripts)} transcripts.")
18
-
19
- dimension = transcripts_embeddings[0].shape[0]
20
  index = faiss.IndexFlatL2(dimension)
21
- logging.info("Initialized FAISS index.")
22
-
23
- index.add(transcripts_embeddings)
24
- logging.info("Added embeddings to FAISS index.")
25
 
 
 
26
 
27
- os.makedirs(os.path.dirname(transcript_index), exist_ok=True)
28
- faiss.write_index(index, transcript_index)
29
- logging.info(f"FAISS index written to {transcript_index}.\n")
 
1
+ import faiss
2
+ import logging
3
+ from pathlib import Path
4
  from sentence_transformers import SentenceTransformer
5
 
6
+ logger = logging.getLogger(__name__)
7
 
8
 
9
def embedding(
    transcripts: list[str],
    transcript_index: Path,
) -> None:
    """Embed *transcripts* and persist them as a flat L2 FAISS index.

    Args:
        transcripts: Texts to encode; must contain at least one item.
        transcript_index: Destination *file* path of the FAISS index. A plain
            string path is accepted as well and is converted to ``Path``.

    Raises:
        ValueError: If ``transcripts`` is empty (there would be no embedding
            from which to derive the index dimension).
    """
    # Fail fast, before the (slow) model load, instead of hitting an opaque
    # IndexError on ``embeddings.shape[1]`` further down.
    if len(transcripts) == 0:
        raise ValueError("transcripts must contain at least one item to embed")

    # Coerce so callers that still pass a str path keep working.
    index_path = Path(transcript_index)

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(transcripts, show_progress_bar=True)
    # Indexed as a 2-D (n_transcripts, dimension) array.
    # NOTE(review): FAISS expects float32 input; presumably encode() already
    # returns that by default -- confirm if the model/config changes.
    dimension = embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Create the *parent* directory only. Calling mkdir() on the index path
    # itself would create a directory where the index file has to be written,
    # making the write below fail.
    index_path.parent.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(index_path))

    logger.info("Embedding completed.\n")