File size: 1,895 Bytes
66d10f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class EmbeddingHandler:
    """In-memory, per-session TF-IDF search over transcript chunks.

    A transcript is split into overlapping word windows, a
    ``TfidfVectorizer`` is fitted on those windows, and queries are answered
    by ranking the windows by cosine similarity to the query vector.
    Everything lives in plain dicts keyed by ``session_id``; nothing is
    persisted.
    """

    def __init__(self):
        """Create a handler with no sessions stored."""
        print("TF-IDF Search ready — free, no download!")
        # The three dicts share the same keys (session ids) and are always
        # written and cleared together.
        self.chunks_store = {}  # session_id -> list of chunk strings
        self.vectorizers = {}   # session_id -> fitted TfidfVectorizer
        self.matrices = {}      # session_id -> TF-IDF matrix of the chunks

    def process_and_store(self, transcript: str, session_id: str) -> None:
        """Chunk *transcript*, fit a TF-IDF model on it and store it.

        An existing entry for *session_id* is overwritten.

        Args:
            transcript: Raw transcript text.
            session_id: Key under which the chunks, vectorizer and matrix
                are stored.

        Raises:
            ValueError: If the transcript contains no words. The vectorizer
                may also raise ``ValueError`` itself when it cannot build a
                vocabulary from the chunks.
        """
        chunks = self._chunk_transcript(transcript)
        if not chunks:
            # Fail early with a clear message instead of letting the
            # vectorizer choke on an empty document list.
            raise ValueError("Transcript is empty; nothing to index.")

        vectorizer = TfidfVectorizer(stop_words='english')
        matrix = vectorizer.fit_transform(chunks)

        # Store only after fitting succeeded so a failure leaves no
        # half-written session behind.
        self.chunks_store[session_id] = chunks
        self.vectorizers[session_id] = vectorizer
        self.matrices[session_id] = matrix
        print(f"Stored {len(chunks)} chunks for session {session_id}")

    def retrieve(self, query: str, session_id: str, top_k: int = 5) -> str:
        """Return the *top_k* chunks most similar to *query*.

        Args:
            query: Free-text query.
            session_id: Session previously passed to ``process_and_store``.
            top_k: Maximum number of chunks to return. Values of zero or
                less yield an empty string.

        Returns:
            The selected chunks, best match first, joined by
            ``"\\n\\n---\\n\\n"``.

        Raises:
            ValueError: If *session_id* has no stored data.
        """
        try:
            chunks = self.chunks_store[session_id]
            vectorizer = self.vectorizers[session_id]
            matrix = self.matrices[session_id]
        except KeyError:
            raise ValueError("Session not found.") from None

        # Guard: a slice of [-0:] would select *every* chunk, and a negative
        # top_k would select an arbitrary tail of the ranking.
        if top_k <= 0:
            return ""

        query_vec = vectorizer.transform([query])
        scores = cosine_similarity(query_vec, matrix).flatten()
        # argsort is ascending: take the last top_k and reverse for
        # best-first order.
        top_indices = scores.argsort()[-top_k:][::-1]
        return "\n\n---\n\n".join(chunks[i] for i in top_indices)

    def _chunk_transcript(self, transcript: str, chunk_size: int = 400, overlap: int = 50) -> list:
        """Split *transcript* into overlapping windows of whitespace-separated words.

        Args:
            transcript: Text to split.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by consecutive chunks; must
                satisfy ``0 <= overlap < chunk_size``.

        Returns:
            A list of chunk strings (words re-joined with single spaces);
            empty when the transcript contains no words.

        Raises:
            ValueError: If *chunk_size* or *overlap* is out of range.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer.")
        if not 0 <= overlap < chunk_size:
            # overlap >= chunk_size would give a zero or negative step;
            # a negative overlap would silently skip words.
            raise ValueError(
                "overlap must be non-negative and smaller than chunk_size."
            )

        words = transcript.split()
        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
            # Once a window reaches the end of the text, any further window
            # would only repeat a suffix of this one.
            if start + chunk_size >= len(words):
                break
        return chunks

    def cleanup_session(self, session_id: str) -> None:
        """Drop all data stored for *session_id*; unknown ids are ignored."""
        for store in (self.chunks_store, self.vectorizers, self.matrices):
            store.pop(session_id, None)