Spaces:
Sleeping
Sleeping
| import faiss | |
| import numpy as np | |
| import re | |
def preprocess_transcript(text):
    """
    Strip bracketed annotations from a transcript and normalise whitespace.

    Every ``[...]`` span on a single line (for example a speaker/timestamp
    label such as ``[Speaker Guest-1 - 00:13]``) is deleted, then all runs
    of whitespace, including newlines, are collapsed to single spaces and
    leading/trailing whitespace is dropped.

    Example input:  "[Speaker Guest-1 - 00:13] This is a test."
    Example output: "This is a test."
    """
    # Non-greedy match so that each bracketed label is removed separately
    # instead of swallowing everything between the first '[' and last ']'.
    without_labels = re.sub(r'\[.*?\]', '', text)
    # split() with no arguments discards empty tokens, so re-joining with a
    # single space both trims the ends and squeezes internal whitespace.
    tokens = without_labels.split()
    return ' '.join(tokens)
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split a transcript into overlapping, word-based chunks.

    The text is first cleaned with ``preprocess_transcript`` and split on
    whitespace. Consecutive windows of at most ``chunk_size`` words are then
    emitted, each starting ``chunk_size - overlap`` words after the previous
    one, so neighbouring chunks share ``overlap`` words.

    Args:
        text: Raw transcript text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each a space-joined run of words.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` are out of range, or if the
            transcript is empty after preprocessing.
    """
    # A non-positive stride would either make range() raise an opaque error
    # (step == 0) or silently yield no chunks at all (step < 0); a negative
    # overlap would silently skip words between chunks. Reject them up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            "overlap must be non-negative and smaller than chunk_size."
        )

    # Preprocess the text to remove timestamps and speaker labels
    text = preprocess_transcript(text)
    if not text.strip():
        raise ValueError("Transcript is empty after preprocessing.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would be
        # made up solely of words already present in this chunk, so stop
        # instead of emitting a redundant tail chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
def embed_chunks(chunks, embedder):
    """
    Encode text chunks into a NumPy array of embeddings.

    Args:
        chunks: The chunk strings to embed.
        embedder: Any object exposing ``encode(chunks)``; its result is
            converted with ``np.array``.

    Returns:
        A ``(embeddings, chunks)`` tuple: the encoded vectors as a NumPy
        array and the same ``chunks`` object that was passed in.
    """
    print(f"Embedding {len(chunks)} chunks...")
    encoded = embedder.encode(chunks)
    vectors = np.array(encoded)
    return vectors, chunks
def create_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index containing the given embedding vectors.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``. It is
            converted to a C-contiguous float32 array before being indexed.

    Returns:
        A ``faiss.IndexFlatL2`` of dimensionality ``dim`` with all vectors
        added.

    Raises:
        ValueError: If ``embeddings`` is not 2-dimensional.
    """
    # FAISS works on C-contiguous float32 matrices. Coercing once here keeps
    # float64 or non-contiguous encoder output from depending on whatever
    # conversion (or assertion) the installed FAISS wrapper applies.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    # Check the rank explicitly so a malformed input gives a clear message
    # instead of an IndexError on shape[1].
    if vectors.ndim != 2:
        raise ValueError(
            f"Expected a 2-D array of embeddings, got shape {vectors.shape}."
        )

    print(f"Creating FAISS index with {vectors.shape[0]} embeddings...")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    print("FAISS index created successfully.")
    return index