import altair as alt import numpy as np import pandas as pd import streamlit as st import streamlit as st import nltk import os nltk.download("punkt_tab") from dataclasses import dataclass from nltk.tokenize import sent_tokenize from llama_index.core import Settings, VectorStoreIndex from llama_index.core.schema import TextNode from llama_index.embeddings.huggingface import HuggingFaceEmbedding from llama_index.retrievers.bm25 import BM25Retriever from llama_index.core.retrievers import QueryFusionRetriever from llama_index.llms.openai import OpenAI from sentence_transformers import CrossEncoder @dataclass class Utterance: start: float end: float speaker: str text: str def ts_to_sec(ts): h, m, s = ts.split(":") return int(h) * 3600 + int(m) * 60 + float(s) def parse_webvtt(vtt_bytes): lines = vtt_bytes.decode("utf-8").splitlines() utterances = [] i = 0 while i < len(lines): if "-->" in lines[i]: start, end = map(str.strip, lines[i].split("-->")) start, end = ts_to_sec(start), ts_to_sec(end) i += 1 speaker, text = "UNKNOWN", "" if i < len(lines) and ":" in lines[i]: speaker, text = lines[i].split(":", 1) speaker, text = speaker.strip(), text.strip() elif i < len(lines): text = lines[i].strip() utterances.append(Utterance(start, end, speaker, text)) i += 1 return utterances def build_chunks(utterances, max_gap=25, sentences_per_chunk=3): blocks, current = [], [] last_end = None for u in utterances: if last_end and u.start - last_end > max_gap: blocks.append(current) current = [] current.append(u) last_end = u.end if current: blocks.append(current) chunks = [] for block in blocks: text = " ".join(u.text for u in block) sentences = sent_tokenize(text) for i in range(0, len(sentences), sentences_per_chunk): chunks.append({ "text": " ".join(sentences[i:i+sentences_per_chunk]), "start": block[0].start, "end": block[-1].end, "speakers": list(set(u.speaker for u in block)) }) return chunks TOPIC_RULES = { "gpu": ["gpu", "cuda", "vram", "nvidia"], "technical_issue": ["issue", "error", "problem", "challenge"], "use_case": ["use case", "real world", "industry"] } def tag_topics(text): text = text.lower() return [ topic for topic, kws in TOPIC_RULES.items() if any(k in text for k in kws) ] Settings.embed_model = HuggingFaceEmbedding( model_name="sentence-transformers/all-MiniLM-L6-v2" ) # Settings.llm = OpenAI( # api_key=, # api_base= # ) reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") def build_retriever(vtt_bytes): utterances = parse_webvtt(vtt_bytes) chunks = build_chunks(utterances) nodes = [ TextNode( text=c["text"], metadata={ "start": c["start"], "end": c["end"], "speakers": c["speakers"], "topics": tag_topics(c["text"]) } ) for c in chunks ] index = VectorStoreIndex(nodes) bm25 = BM25Retriever.from_defaults(nodes, similarity_top_k=20) vector = index.as_retriever(similarity_top_k=20) retriever = QueryFusionRetriever( retrievers=[bm25, vector], similarity_top_k=10, mode="reciprocal_rerank" ) return retriever def retrieve(query, retriever): nodes = retriever.retrieve(query) scores = reranker.predict([[query, n.text] for n in nodes]) ranked = sorted(zip(scores, nodes), reverse=True)[:5] results = [] for score, n in ranked: results.append( f"⏱️ {n.metadata['start']:.1f}s – {n.metadata['end']:.1f}s\n" f"👤 Speakers: {', '.join(n.metadata['speakers'])}\n" f"{n.text}" ) return "\n\n---\n\n".join(results) st.set_page_config(page_title="Transcript Hybrid RAG", layout="wide") st.title("🎙️ Transcript Search (Hybrid RAG)") st.write("Hybrid BM25 + Vector retrieval with cross-encoder reranking.") uploaded = st.file_uploader("Upload a transcript", type=None) if "retriever" not in st.session_state: st.session_state.retriever = None if uploaded and st.button("Build Index"): with st.spinner("Indexing transcript..."): st.session_state.retriever = build_retriever(uploaded.read()) st.success("Index built successfully!") query = st.text_input("Ask a question") if query and st.session_state.retriever: with st.spinner("Retrieving evidence..."): answer = retrieve(query, st.session_state.retriever) st.text_area("Retrieved Evidence", answer, height=400)