# transcript_insights/src/streamlit_app.py
# Repository page header (not code): sohchattglc11111 - "Update src/streamlit_app.py" - 7504df0 (verified)
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
import streamlit as st
import nltk
import os
nltk.download("punkt_tab")
from dataclasses import dataclass
from nltk.tokenize import sent_tokenize
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.llms.openai import OpenAI
from sentence_transformers import CrossEncoder
@dataclass
class Utterance:
    """One timed transcript cue attributed to a single speaker label."""

    # Cue start time in seconds.
    start: float
    # Cue end time in seconds.
    end: float
    # Speaker label for the cue.
    speaker: str
    # Text spoken during the cue.
    text: str
def ts_to_sec(ts):
    """Convert a colon-separated timestamp string to seconds.

    Accepts ``HH:MM:SS.mmm`` as well as the short ``MM:SS.mmm`` form, in
    which the hours field is omitted.

    Args:
        ts: Timestamp text such as ``"00:01:02.500"`` or ``"01:02.500"``.

    Returns:
        The timestamp expressed as a float number of seconds.

    Raises:
        ValueError: If the text does not have two or three colon-separated
            fields, or a field is not numeric.
    """
    parts = ts.strip().split(":")
    if len(parts) == 2:
        # Short form: no hours field was given.
        parts.insert(0, "0")
    if len(parts) != 3:
        raise ValueError(f"Unrecognized timestamp: {ts!r}")
    hours, minutes, seconds = parts
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
def parse_webvtt(vtt_bytes):
    """Parse raw WebVTT bytes into a list of ``Utterance`` records.

    Every line containing ``-->`` is treated as a cue timing line. The lines
    that follow it, up to the next blank line or the next timing line, form
    the cue payload and are joined with single spaces. If the first payload
    line contains a colon, the part before the first colon becomes the
    speaker label; otherwise the speaker is ``"UNKNOWN"``.

    Args:
        vtt_bytes: UTF-8 encoded file content (a leading BOM is tolerated).

    Returns:
        A list of ``Utterance`` objects in file order.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
        ValueError: If a timing line holds a timestamp ``ts_to_sec`` rejects.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark.
    lines = vtt_bytes.decode("utf-8-sig").splitlines()
    total = len(lines)
    utterances = []
    i = 0
    while i < total:
        if "-->" not in lines[i]:
            i += 1
            continue

        left, _, right = lines[i].partition("-->")
        # Cue settings (e.g. "align:start") may trail the end timestamp, so
        # only the first whitespace-separated token is the time itself.
        right_tokens = right.split()
        start = ts_to_sec(left.strip())
        end = ts_to_sec(right_tokens[0] if right_tokens else "")
        i += 1

        # Collect the whole payload, not just its first line.
        payload = []
        while i < total and lines[i].strip() and "-->" not in lines[i]:
            payload.append(lines[i].strip())
            i += 1

        speaker, text = "UNKNOWN", ""
        if payload:
            first, rest = payload[0], payload[1:]
            if ":" in first:
                label, _, remainder = first.partition(":")
                speaker, first = label.strip(), remainder.strip()
            text = " ".join([first, *rest]).strip()
        utterances.append(Utterance(start, end, speaker, text))
        # No extra increment here: i already points past the payload.
    return utterances
def build_chunks(utterances, max_gap=25, sentences_per_chunk=3):
    """Group utterances into time-contiguous blocks and split them into chunks.

    A new block starts whenever the silence between the previous utterance's
    end and the next utterance's start exceeds ``max_gap``. Each block's text
    is sentence-tokenized and emitted in groups of ``sentences_per_chunk``.

    Args:
        utterances: Iterable of objects with ``start``, ``end``, ``speaker``
            and ``text`` attributes, in chronological order.
        max_gap: Largest gap (same unit as the timestamps) that still keeps
            two utterances in the same block.
        sentences_per_chunk: Number of sentences per emitted chunk.

    Returns:
        A list of dicts with keys ``"text"``, ``"start"``, ``"end"`` and
        ``"speakers"``. ``start``/``end`` are those of the whole block, so all
        chunks cut from one block share them. ``speakers`` lists the block's
        distinct speakers in order of first appearance.
    """
    blocks, current = [], []
    last_end = None
    for u in utterances:
        # Compare against None explicitly: an end time of 0.0 is a valid
        # timestamp but is falsy, which would skip the gap check.
        if last_end is not None and u.start - last_end > max_gap:
            blocks.append(current)
            current = []
        current.append(u)
        last_end = u.end
    if current:
        blocks.append(current)

    chunks = []
    for block in blocks:
        sentences = sent_tokenize(" ".join(u.text for u in block))
        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (a plain set would order speakers arbitrarily between runs).
        speakers = list(dict.fromkeys(u.speaker for u in block))
        for i in range(0, len(sentences), sentences_per_chunk):
            chunks.append({
                "text": " ".join(sentences[i:i + sentences_per_chunk]),
                "start": block[0].start,
                "end": block[-1].end,
                # Copy so chunks never share one mutable list.
                "speakers": list(speakers),
            })
    return chunks
# Topic name -> keywords; a topic applies when any of its keywords occurs as
# a substring of the lower-cased text.
TOPIC_RULES = {
    "gpu": ["gpu", "cuda", "vram", "nvidia"],
    "technical_issue": ["issue", "error", "problem", "challenge"],
    "use_case": ["use case", "real world", "industry"],
}


def tag_topics(text):
    """Return the topics from ``TOPIC_RULES`` whose keywords appear in ``text``.

    Matching is case-insensitive and substring-based. Topics are returned in
    the order they are declared in ``TOPIC_RULES``.
    """
    lowered = text.lower()
    matched = []
    for topic, keywords in TOPIC_RULES.items():
        for keyword in keywords:
            if keyword in lowered:
                matched.append(topic)
                break  # one hit is enough for this topic
    return matched
@st.cache_resource(show_spinner=False)
def _load_embed_model():
    """Build the sentence-embedding model; cached so reruns reuse it."""
    return HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


@st.cache_resource(show_spinner=False)
def _load_reranker():
    """Build the cross-encoder used for reranking; cached so reruns reuse it."""
    return CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


# Streamlit re-executes this script on every interaction. Going through the
# cached loaders avoids rebuilding both models each time.
# show_spinner=False keeps the loaders from emitting UI before
# st.set_page_config runs further down.
Settings.embed_model = _load_embed_model()

# LLM configuration is currently disabled (credentials were left blank):
# Settings.llm = OpenAI(
#     api_key=,
#     api_base=
# )

reranker = _load_reranker()
def build_retriever(vtt_bytes):
    """Build a hybrid (BM25 + vector) fusion retriever from transcript bytes.

    The transcript is parsed with ``parse_webvtt``, chunked with
    ``build_chunks``, and each chunk becomes a ``TextNode`` carrying its
    start/end times, speakers and topic tags as metadata.

    Args:
        vtt_bytes: Raw bytes of the uploaded transcript.

    Returns:
        A ``QueryFusionRetriever`` combining a BM25 retriever and a vector
        retriever over the same nodes with reciprocal-rank fusion.

    Raises:
        ValueError: If no text chunks could be produced from the input.
    """
    utterances = parse_webvtt(vtt_bytes)
    chunks = build_chunks(utterances)
    if not chunks:
        # Fail early with a clear message instead of an obscure error from
        # indexing an empty corpus.
        raise ValueError(
            "No transcript text could be extracted from the uploaded file."
        )

    nodes = [
        TextNode(
            text=c["text"],
            metadata={
                "start": c["start"],
                "end": c["end"],
                "speakers": c["speakers"],
                "topics": tag_topics(c["text"]),
            },
        )
        for c in chunks
    ]

    # Never ask a retriever for more candidates than there are nodes.
    candidate_k = min(20, len(nodes))

    index = VectorStoreIndex(nodes)
    # Pass nodes by keyword: from_defaults also accepts an index or a
    # docstore, so a positional argument may bind to the wrong parameter.
    bm25 = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=candidate_k)
    vector = index.as_retriever(similarity_top_k=candidate_k)

    # NOTE(review): QueryFusionRetriever's default num_queries is believed to
    # be > 1, in which case it asks Settings.llm to generate query variants.
    # No LLM is configured in this file - confirm one is available, or pass
    # num_queries=1 to disable query generation.
    retriever = QueryFusionRetriever(
        retrievers=[bm25, vector],
        similarity_top_k=10,
        mode="reciprocal_rerank",
    )
    return retriever
def retrieve(query, retriever):
    """Retrieve, rerank and format the best transcript passages for a query.

    Candidates come from ``retriever.retrieve(query)`` and are rescored with
    the module-level cross-encoder ``reranker``. The five highest-scoring
    passages are rendered as text.

    Args:
        query: The user's question.
        retriever: Object exposing ``retrieve(query)`` that returns nodes with
            ``text`` and ``metadata`` (``start``, ``end``, ``speakers``).

    Returns:
        The formatted passages joined by a ``---`` separator, or an empty
        string when the retriever returns no candidates.
    """
    nodes = retriever.retrieve(query)
    if not nodes:
        # Nothing to rerank; skip the model call entirely.
        return ""

    scores = reranker.predict([[query, n.text] for n in nodes])
    # Sort on the score only. Without a key, tied scores fall through to
    # comparing the node objects themselves, which raises TypeError.
    ranked = sorted(
        zip(scores, nodes), key=lambda pair: pair[0], reverse=True
    )[:5]

    results = [
        f"⏱️ {n.metadata['start']:.1f}s – {n.metadata['end']:.1f}s\n"
        f"👤 Speakers: {', '.join(n.metadata['speakers'])}\n"
        f"{n.text}"
        for _, n in ranked
    ]
    return "\n\n---\n\n".join(results)
# ---- Streamlit page: upload a transcript, build the index, run queries ----
st.set_page_config(page_title="Transcript Hybrid RAG", layout="wide")
st.title("🎙️ Transcript Search (Hybrid RAG)")
st.write("Hybrid BM25 + Vector retrieval with cross-encoder reranking.")

uploaded = st.file_uploader("Upload a transcript", type=None)

# Keep the retriever in session state so it survives script reruns.
if "retriever" not in st.session_state:
    st.session_state.retriever = None

if uploaded and st.button("Build Index"):
    try:
        with st.spinner("Indexing transcript..."):
            # getvalue() returns the full content regardless of the current
            # stream position, unlike read().
            st.session_state.retriever = build_retriever(uploaded.getvalue())
    except ValueError as exc:
        # Covers malformed timestamps and non-UTF-8 input
        # (UnicodeDecodeError is a ValueError subclass).
        st.error(f"Could not index this file: {exc}")
    else:
        st.success("Index built successfully!")

query = st.text_input("Ask a question")
if query and st.session_state.retriever:
    with st.spinner("Retrieving evidence..."):
        answer = retrieve(query, st.session_state.retriever)
    st.text_area("Retrieved Evidence", answer, height=400)