Spaces:

Alimubariz124
/

chat-with-data

Sleeping

chat-with-data / transcript_handler.py

Update transcript_handler.py

eae9a85 verified 8 months ago

1.35 kB

	import faiss
	import numpy as np

	import re

	def preprocess_transcript(text):
	    """
	    Strip bracketed tags from a transcript and normalise its whitespace.

	    Every shortest ``[...]`` span on a single line is deleted, which covers
	    speaker/timestamp tags such as ``[Speaker Guest-1 - 00:13]`` (and any
	    other bracketed text). A bracket pair that spans a newline is left in
	    place because the pattern does not match across line breaks. Afterwards
	    all runs of whitespace collapse to single spaces and the ends are trimmed.

	    Example input: "[Speaker Guest-1 - 00:13] This is a test."
	    Example output: "This is a test."
	    """
	    # Drop the bracketed tags first, then let split()/join() squeeze the
	    # leftover whitespace (including newlines) down to single spaces.
	    without_tags = re.sub(r'\[.*?\]', '', text)
	    return ' '.join(without_tags.split())

	def chunk_text(text, chunk_size=300, overlap=50):
	    """
	    Split a transcript into overlapping, word-based chunks.

	    The text is first cleaned with ``preprocess_transcript`` and then cut
	    into windows of at most ``chunk_size`` words. Consecutive windows start
	    ``chunk_size - overlap`` words apart, so each shares ``overlap`` words
	    with its predecessor.

	    Args:
	        text: Raw transcript text.
	        chunk_size: Maximum number of words per chunk; must be positive.
	        overlap: Number of words shared by consecutive chunks; must satisfy
	            ``0 <= overlap < chunk_size``.

	    Returns:
	        A non-empty list of chunk strings, in transcript order.

	    Raises:
	        ValueError: If ``chunk_size``/``overlap`` are inconsistent, or if
	            nothing is left of the transcript after preprocessing.
	    """
	    # A zero step makes range() fail with an unrelated message, a negative
	    # step silently yields no chunks, and a negative overlap would skip
	    # words between chunks, so reject all of these up front.
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer.")
	    if not 0 <= overlap < chunk_size:
	        raise ValueError("overlap must be non-negative and smaller than chunk_size.")

	    # Preprocess the text to remove timestamps and speaker labels
	    text = preprocess_transcript(text)

	    words = text.split()
	    if not words:
	        raise ValueError("Transcript is empty after preprocessing.")

	    step = chunk_size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        chunks.append(' '.join(words[start:start + chunk_size]))
	        # Once a window reaches the last word, any further window would be
	        # fully contained in this one; stop instead of emitting a duplicate.
	        if start + chunk_size >= len(words):
	            break
	    return chunks


	def embed_chunks(chunks, embedder):
	    """
	    Encode text chunks with the given embedder.

	    Prints a progress line, calls ``embedder.encode(chunks)`` and wraps the
	    result in a NumPy array.

	    Returns:
	        A ``(embeddings, chunks)`` tuple: the encoded vectors as a NumPy
	        array, and the same ``chunks`` object that was passed in.
	    """
	    print(f"Embedding {len(chunks)} chunks...")
	    vectors = np.array(embedder.encode(chunks))
	    return vectors, chunks

	def create_faiss_index(embeddings):
	    """
	    Build a flat (exact) L2 FAISS index over the given embeddings.

	    Args:
	        embeddings: 2-D array-like with one embedding per row. It is
	            converted to a C-contiguous float32 array before indexing.

	    Returns:
	        A ``faiss.IndexFlatL2`` whose dimensionality is the number of
	        columns and which contains every row of ``embeddings``.

	    Raises:
	        ValueError: If ``embeddings`` is not two-dimensional.
	    """
	    # FAISS operates on C-contiguous float32 matrices; coerce here so that
	    # float64 or non-contiguous inputs are handled the same way regardless
	    # of what the installed FAISS wrapper does on its own.
	    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
	    if embeddings.ndim != 2:
	        # Without this check, shape[1] below fails with a bare IndexError.
	        raise ValueError(
	            f"Expected a 2-D array of embeddings, got shape {embeddings.shape}."
	        )

	    print(f"Creating FAISS index with {embeddings.shape[0]} embeddings...")
	    index = faiss.IndexFlatL2(embeddings.shape[1])
	    index.add(embeddings)
	    print("FAISS index created successfully.")
	    return index