| import altair as alt |
| import numpy as np |
| import pandas as pd |
| import streamlit as st |
| import streamlit as st |
| import nltk |
| import os |
| nltk.download("punkt_tab") |
|
|
| from dataclasses import dataclass |
| from nltk.tokenize import sent_tokenize |
|
|
| from llama_index.core import Settings, VectorStoreIndex |
| from llama_index.core.schema import TextNode |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
| from llama_index.retrievers.bm25 import BM25Retriever |
| from llama_index.core.retrievers import QueryFusionRetriever |
| from llama_index.llms.openai import OpenAI |
|
|
| from sentence_transformers import CrossEncoder |
|
|
|
|
@dataclass
class Utterance:
    """One transcript cue: when it was spoken, by whom, and what was said."""

    # Cue start time in seconds.
    start: float
    # Cue end time in seconds.
    end: float
    # Speaker label taken from the cue text.
    speaker: str
    # Spoken text of the cue.
    text: str
|
|
|
|
def ts_to_sec(ts):
    """Convert a timestamp string to a number of seconds.

    Accepts ``HH:MM:SS.mmm`` and the hour-less ``MM:SS.mmm`` form that
    WebVTT also permits. A comma decimal separator (``00:00:07,250``) is
    treated like a dot.

    Args:
        ts: Timestamp text; surrounding whitespace is ignored.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If the text does not have two or three ``:``-separated
            fields, or a field is not numeric.
    """
    fields = ts.strip().replace(",", ".").split(":")
    if not 2 <= len(fields) <= 3:
        raise ValueError(f"Unrecognized timestamp: {ts!r}")

    *leading, seconds = fields
    # Fold hours (if present) and minutes into a whole number of minutes.
    minutes = 0
    for field in leading:
        minutes = minutes * 60 + int(field)
    return minutes * 60 + float(seconds)
|
|
|
|
def parse_webvtt(vtt_bytes):
    """Parse raw WebVTT bytes into a list of ``Utterance`` objects.

    Every line containing ``-->`` is treated as a cue timing line. The
    lines that follow it, up to the next blank line or timing line, form
    the cue payload. If the first payload line contains a colon, the part
    before the first colon is used as the speaker label; otherwise the
    speaker is ``"UNKNOWN"``.

    Args:
        vtt_bytes: UTF-8 encoded transcript contents.

    Returns:
        A list of ``Utterance`` in file order.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
        ValueError: If a timing line holds a timestamp ``ts_to_sec`` rejects.
    """
    # "utf-8-sig" decodes like "utf-8" but also drops a leading byte-order
    # mark, which would otherwise stick to the first timestamp of a
    # header-less file.
    lines = vtt_bytes.decode("utf-8-sig").splitlines()
    total = len(lines)
    utterances = []

    i = 0
    while i < total:
        timing = lines[i]
        i += 1
        if "-->" not in timing:
            continue

        start_raw, _, end_raw = timing.partition("-->")
        # Cue settings such as "align:start position:0%" may follow the end
        # timestamp on the same line; only the first token is the time.
        end_fields = end_raw.split(None, 1)
        start = ts_to_sec(start_raw.strip())
        end = ts_to_sec(end_fields[0] if end_fields else "")

        # A cue payload may span several lines; collect all of them.
        payload = []
        while i < total and lines[i].strip() and "-->" not in lines[i]:
            payload.append(lines[i].strip())
            i += 1

        speaker, text = "UNKNOWN", ""
        if payload:
            first = payload[0]
            if ":" in first:
                label, first = first.split(":", 1)
                speaker, first = label.strip(), first.strip()
            text = " ".join(part for part in [first, *payload[1:]] if part)

        utterances.append(Utterance(start, end, speaker, text))

    return utterances
|
|
|
|
def build_chunks(utterances, max_gap=25, sentences_per_chunk=3):
    """Group utterances into gap-delimited blocks and split them into chunks.

    Consecutive utterances stay in the same block until the pause between
    one utterance's end and the next one's start exceeds ``max_gap``. Each
    block's text is sentence-tokenized and emitted in runs of
    ``sentences_per_chunk`` sentences.

    Note that ``start``, ``end`` and ``speakers`` on every chunk describe
    the whole enclosing block, not only the sentences in that chunk.

    Args:
        utterances: Iterable of objects with ``start``, ``end``, ``speaker``
            and ``text`` attributes, in chronological order.
        max_gap: Pause length (same unit as ``start``/``end``) above which a
            new block begins.
        sentences_per_chunk: Number of sentences per emitted chunk.

    Returns:
        A list of dicts with keys ``"text"``, ``"start"``, ``"end"`` and
        ``"speakers"``.

    Raises:
        ValueError: If ``sentences_per_chunk`` is less than 1.
    """
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be at least 1")

    blocks, current = [], []
    last_end = None
    for u in utterances:
        # Compare against None explicitly: an end time of 0.0 is a real
        # value and must not disable the gap check.
        if last_end is not None and u.start - last_end > max_gap:
            blocks.append(current)
            current = []
        current.append(u)
        last_end = u.end
    if current:
        blocks.append(current)

    chunks = []
    for block in blocks:
        sentences = sent_tokenize(" ".join(u.text for u in block))
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so the speaker list is the same on every run.
        speakers = list(dict.fromkeys(u.speaker for u in block))
        start, end = block[0].start, block[-1].end

        for i in range(0, len(sentences), sentences_per_chunk):
            chunks.append({
                "text": " ".join(sentences[i:i + sentences_per_chunk]),
                "start": start,
                "end": end,
                # Copy so chunks never share one mutable list.
                "speakers": list(speakers),
            })

    return chunks
|
|
|
|
# Topic name -> lowercase keywords; a topic applies when any of its keywords
# occurs as a substring of the text.
TOPIC_RULES = {
    "gpu": ["gpu", "cuda", "vram", "nvidia"],
    "technical_issue": ["issue", "error", "problem", "challenge"],
    "use_case": ["use case", "real world", "industry"]
}


def tag_topics(text):
    """Return the ``TOPIC_RULES`` topics whose keywords appear in ``text``.

    Matching is case-insensitive and substring-based, so a keyword also
    matches inside longer words. Topics are returned in ``TOPIC_RULES``
    order, each at most once.
    """
    lowered = text.lower()
    matched = []
    for topic, keywords in TOPIC_RULES.items():
        for keyword in keywords:
            if keyword in lowered:
                matched.append(topic)
                break
    return matched
|
|
|
|
# Streamlit re-executes this script on every user interaction. Building the
# models through st.cache_resource keeps one instance per server process
# instead of reconstructing both models on each rerun.
# show_spinner=False keeps these calls from emitting any UI element, since
# they run before st.set_page_config further down in this file.
@st.cache_resource(show_spinner=False)
def _load_embed_model():
    """Build the sentence-embedding model used for vector retrieval."""
    return HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


@st.cache_resource(show_spinner=False)
def _load_reranker():
    """Build the cross-encoder used to rerank retrieved chunks."""
    return CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


Settings.embed_model = _load_embed_model()

reranker = _load_reranker()
|
|
|
|
def build_retriever(vtt_bytes):
    """Build a hybrid BM25 + vector retriever over a WebVTT transcript.

    The transcript is parsed, chunked, and wrapped into ``TextNode`` objects
    carrying ``start``, ``end``, ``speakers`` and ``topics`` metadata. A BM25
    retriever and a vector retriever over those nodes are then fused with
    reciprocal-rank fusion.

    Args:
        vtt_bytes: Raw bytes of the uploaded transcript.

    Returns:
        A ``QueryFusionRetriever`` returning up to 10 fused results.

    Raises:
        ValueError: If no transcript text could be extracted from the input.
    """
    chunks = build_chunks(parse_webvtt(vtt_bytes))
    if not chunks:
        # Fail early with a readable message instead of an obscure error
        # from indexing an empty corpus.
        raise ValueError(
            "No transcript text found; expected a WebVTT file with cues."
        )

    nodes = [
        TextNode(
            text=c["text"],
            metadata={
                "start": c["start"],
                "end": c["end"],
                "speakers": c["speakers"],
                "topics": tag_topics(c["text"]),
            },
        )
        for c in chunks
    ]

    # Never ask for more candidates than there are nodes; short transcripts
    # can have fewer than 20 chunks.
    # NOTE(review): some BM25 backends reject k > corpus size — confirm
    # against the installed llama-index-retrievers-bm25 version.
    candidates = min(20, len(nodes))

    index = VectorStoreIndex(nodes)

    # `nodes` must be passed by keyword: the first positional parameter of
    # BM25Retriever.from_defaults is `index`, not `nodes`.
    bm25 = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=candidates)
    vector = index.as_retriever(similarity_top_k=candidates)

    return QueryFusionRetriever(
        retrievers=[bm25, vector],
        similarity_top_k=10,
        mode="reciprocal_rerank",
    )
|
|
|
|
def retrieve(query, retriever):
    """Retrieve, rerank, and format the best transcript chunks for a query.

    Candidates from ``retriever`` are scored against the query by the
    module-level cross-encoder ``reranker``; the five highest-scoring ones
    are rendered as text.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query)`` returns nodes exposing
            ``text`` and ``metadata`` (``start``, ``end``, ``speakers``).

    Returns:
        One string with the formatted results separated by ``---`` rules,
        or an empty string when nothing was retrieved.
    """
    nodes = retriever.retrieve(query)
    if not nodes:
        # Nothing to score; skip the reranker entirely.
        return ""

    scores = reranker.predict([[query, n.text] for n in nodes])

    # Sort on the score alone. Sorting the (score, node) tuples directly
    # falls back to comparing the node objects whenever two scores tie,
    # and those objects do not define an ordering.
    ranked = sorted(
        zip(scores, nodes), key=lambda pair: pair[0], reverse=True
    )[:5]

    return "\n\n---\n\n".join(
        f"⏱️ {n.metadata['start']:.1f}s – {n.metadata['end']:.1f}s\n"
        f"👤 Speakers: {', '.join(n.metadata['speakers'])}\n"
        f"{n.text}"
        for _, n in ranked
    )
|
|
|
|
# ---- Streamlit page: upload a transcript, build the index, run queries ----
st.set_page_config(page_title="Transcript Hybrid RAG", layout="wide")

st.title("🎙️ Transcript Search (Hybrid RAG)")
st.write("Hybrid BM25 + Vector retrieval with cross-encoder reranking.")

uploaded = st.file_uploader("Upload a transcript", type=None)

# The retriever lives in session state so it survives script reruns.
if "retriever" not in st.session_state:
    st.session_state.retriever = None

if uploaded and st.button("Build Index"):
    try:
        with st.spinner("Indexing transcript..."):
            # getvalue() returns the whole upload regardless of the stream
            # position; read() yields nothing once the buffer has already
            # been consumed by an earlier click.
            st.session_state.retriever = build_retriever(uploaded.getvalue())
    except ValueError as exc:
        # Covers malformed timestamps and non-UTF-8 input
        # (UnicodeDecodeError is a ValueError subclass).
        st.error(f"Could not index this file: {exc}")
    else:
        st.success("Index built successfully!")

query = st.text_input("Ask a question")

if query and st.session_state.retriever is not None:
    with st.spinner("Retrieving evidence..."):
        answer = retrieve(query, st.session_state.retriever)
    st.text_area("Retrieved Evidence", answer, height=400)
elif query:
    st.info("Upload a transcript and click Build Index before searching.")
|
|
|
|