# File size: 4,916 Bytes
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
import streamlit as st
import nltk
import os
nltk.download("punkt_tab")
from dataclasses import dataclass
from nltk.tokenize import sent_tokenize
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.llms.openai import OpenAI
from sentence_transformers import CrossEncoder
@dataclass
class Utterance:
    """A single timed transcript cue attributed to one speaker."""

    # Cue start time, in seconds from the beginning of the transcript.
    start: float
    # Cue end time, in seconds from the beginning of the transcript.
    end: float
    # Speaker label taken from the cue text; the parser uses "UNKNOWN" when
    # no label is present.
    speaker: str
    # Spoken text of the cue with the speaker label removed.
    text: str
def ts_to_sec(ts):
    """Convert a WebVTT timestamp string to seconds.

    Accepts both the full ``HH:MM:SS.mmm`` form and the short ``MM:SS.mmm``
    form (WebVTT allows the hours field to be omitted).

    Args:
        ts: Timestamp text such as ``"00:01:02.500"`` or ``"01:02.500"``.
            Surrounding whitespace is ignored.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If the text does not have two or three colon-separated
            fields, or a field is not numeric.
    """
    parts = ts.strip().split(":")
    if len(parts) == 2:
        # Short form without hours: treat it as zero hours.
        parts = ["0", *parts]
    if len(parts) != 3:
        raise ValueError(f"invalid WebVTT timestamp: {ts!r}")
    hours, minutes, seconds = parts
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
def parse_webvtt(vtt_bytes):
    """Parse UTF-8 WebVTT bytes into a list of ``Utterance`` objects.

    Every line containing ``-->`` is treated as a cue timing line. The lines
    that follow it, up to the next blank line or the next timing line, form
    the cue payload and are joined with single spaces. When the first payload
    line contains a colon, the text before the first colon is used as the
    speaker label; otherwise the speaker is ``"UNKNOWN"``.

    Args:
        vtt_bytes: Raw transcript content as UTF-8 encoded bytes.

    Returns:
        A list of ``Utterance`` in file order. A timing line with no payload
        yields an utterance with empty text.

    Raises:
        UnicodeDecodeError: If ``vtt_bytes`` is not valid UTF-8.
        ValueError: If a timing line's timestamps cannot be parsed.
    """
    lines = vtt_bytes.decode("utf-8").splitlines()
    utterances = []
    i = 0
    while i < len(lines):
        if "-->" not in lines[i]:
            i += 1
            continue

        start_raw, end_raw = lines[i].split("-->", 1)
        # Cue settings (e.g. "align:start position:0%") may trail the end
        # timestamp; only the first whitespace-delimited token is the time.
        end_fields = end_raw.split()
        if not end_fields:
            raise ValueError(f"malformed cue timing line: {lines[i]!r}")
        start = ts_to_sec(start_raw.strip())
        end = ts_to_sec(end_fields[0])
        i += 1

        # Gather every payload line of the cue, not just the first one, so
        # multi-line captions are not silently truncated.
        payload = []
        while i < len(lines) and lines[i].strip() and "-->" not in lines[i]:
            payload.append(lines[i].strip())
            i += 1

        speaker = "UNKNOWN"
        if payload and ":" in payload[0]:
            # Heuristic: "Name: text" on the first payload line names the
            # speaker. Any colon triggers it, including ones inside speech.
            speaker, first_text = payload[0].split(":", 1)
            speaker = speaker.strip()
            payload[0] = first_text.strip()
        text = " ".join(part for part in payload if part)
        utterances.append(Utterance(start, end, speaker, text))
    return utterances
def build_chunks(utterances, max_gap=25, sentences_per_chunk=3):
    """Group utterances into time-contiguous blocks and split them into chunks.

    Consecutive utterances stay in the same block until the silence between
    one utterance's end and the next one's start exceeds ``max_gap``. Each
    block's text is sentence-tokenized and emitted in groups of
    ``sentences_per_chunk`` sentences.

    Args:
        utterances: Iterable of objects with ``start``, ``end``, ``speaker``
            and ``text`` attributes, in chronological order.
        max_gap: Largest gap (same unit as ``start``/``end``) allowed inside
            a block.
        sentences_per_chunk: Number of sentences per emitted chunk.

    Returns:
        A list of dicts with keys ``"text"``, ``"start"``, ``"end"`` and
        ``"speakers"``. ``start``/``end`` are those of the whole enclosing
        block, not of the individual chunk, and ``speakers`` lists the
        block's distinct speakers in order of first appearance.
    """
    blocks, current = [], []
    last_end = None
    for u in utterances:
        # Compare against None explicitly: an end time of 0.0 is falsy and
        # would otherwise be mistaken for "no previous utterance".
        if last_end is not None and u.start - last_end > max_gap:
            blocks.append(current)
            current = []
        current.append(u)
        last_end = u.end
    if current:
        blocks.append(current)

    chunks = []
    for block in blocks:
        sentences = sent_tokenize(" ".join(u.text for u in block))
        # dict.fromkeys de-duplicates while keeping a deterministic order;
        # list(set(...)) would vary between interpreter runs.
        speakers = list(dict.fromkeys(u.speaker for u in block))
        for i in range(0, len(sentences), sentences_per_chunk):
            chunks.append({
                "text": " ".join(sentences[i:i + sentences_per_chunk]),
                "start": block[0].start,
                "end": block[-1].end,
                # Fresh list per chunk so callers can mutate one safely.
                "speakers": list(speakers),
            })
    return chunks
# Topic label -> lowercase keywords whose presence in a text assigns the label.
TOPIC_RULES = {
    "gpu": ["gpu", "cuda", "vram", "nvidia"],
    "technical_issue": ["issue", "error", "problem", "challenge"],
    "use_case": ["use case", "real world", "industry"],
}


def tag_topics(text):
    """Return the topic labels from ``TOPIC_RULES`` that apply to ``text``.

    Matching is case-insensitive plain substring search, so a keyword also
    matches inside longer words (e.g. "issue" inside "tissue"). Labels are
    returned in the order they are declared in ``TOPIC_RULES``.
    """
    lowered = text.lower()
    matched = []
    for label, keywords in TOPIC_RULES.items():
        for keyword in keywords:
            if keyword in lowered:
                matched.append(label)
                break
    return matched
# Streamlit re-executes this script on every user interaction. Loading the
# models through st.cache_resource keeps one instance alive across reruns
# instead of re-instantiating both models each time. show_spinner=False keeps
# these loaders from emitting UI elements before the page is configured.
@st.cache_resource(show_spinner=False)
def _load_embed_model():
    """Create the sentence-embedding model used for vector retrieval."""
    return HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


@st.cache_resource(show_spinner=False)
def _load_reranker():
    """Create the cross-encoder used to re-score retrieved passages."""
    return CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


Settings.embed_model = _load_embed_model()

# LLM configuration is intentionally left disabled (credentials are blank).
# Settings.llm = OpenAI(
#     api_key=,
#     api_base=
# )

reranker = _load_reranker()
def build_retriever(vtt_bytes):
    """Build a hybrid (BM25 + vector) fusion retriever from WebVTT bytes.

    The transcript is parsed, chunked, and turned into ``TextNode`` objects
    carrying ``start``, ``end``, ``speakers`` and ``topics`` metadata. A BM25
    retriever and a vector retriever over those nodes are combined with
    reciprocal-rank fusion.

    Args:
        vtt_bytes: Raw transcript content as UTF-8 encoded bytes.

    Returns:
        A ``QueryFusionRetriever`` returning up to 10 fused results.

    Raises:
        ValueError: If the transcript yields no text chunks, or if parsing
            fails (propagated from ``parse_webvtt``).
    """
    utterances = parse_webvtt(vtt_bytes)
    chunks = build_chunks(utterances)
    if not chunks:
        # Fail early with a clear message instead of an obscure error from
        # building retrievers over an empty corpus.
        raise ValueError("no transcript text found in the uploaded file")

    nodes = [
        TextNode(
            text=c["text"],
            metadata={
                "start": c["start"],
                "end": c["end"],
                "speakers": c["speakers"],
                "topics": tag_topics(c["text"]),
            },
        )
        for c in chunks
    ]

    # Never ask a retriever for more candidates than there are nodes; short
    # transcripts can have fewer than 20 chunks.
    candidate_k = min(20, len(nodes))

    index = VectorStoreIndex(nodes)
    # Pass nodes by keyword so they cannot be bound to a different leading
    # parameter of from_defaults.
    bm25 = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=candidate_k)
    vector = index.as_retriever(similarity_top_k=candidate_k)

    # NOTE(review): num_queries is left at the library default. If that
    # default is > 1 the fusion retriever presumably asks Settings.llm to
    # generate query variants - confirm an LLM is configured, or pass
    # num_queries=1 to disable query generation.
    retriever = QueryFusionRetriever(
        retrievers=[bm25, vector],
        similarity_top_k=10,
        mode="reciprocal_rerank",
    )
    return retriever
def retrieve(query, retriever, top_k=5):
    """Retrieve passages for ``query``, rerank them, and format the best ones.

    Candidates from ``retriever`` are re-scored against the query with the
    module-level cross-encoder ``reranker``; the ``top_k`` highest-scoring
    passages are rendered with their time range and speakers.

    Args:
        query: The user's question.
        retriever: Object exposing ``retrieve(query)`` that returns items
            with ``text`` and ``metadata`` (keys ``start``, ``end``,
            ``speakers``).
        top_k: Maximum number of passages to include in the output.

    Returns:
        The formatted passages joined by a ``---`` separator, or an empty
        string when nothing was retrieved.
    """
    nodes = retriever.retrieve(query)
    if not nodes:
        # Nothing to rerank; skip the cross-encoder call entirely.
        return ""

    scores = reranker.predict([[query, n.text] for n in nodes])
    # Sort on the score only. Sorting the bare (score, node) tuples falls
    # back to comparing node objects when two scores tie, which raises
    # TypeError (e.g. for duplicate chunks with identical text).
    ranked = sorted(
        zip(scores, nodes), key=lambda pair: pair[0], reverse=True
    )[:top_k]

    results = [
        f"⏱️ {n.metadata['start']:.1f}s – {n.metadata['end']:.1f}s\n"
        f"👤 Speakers: {', '.join(n.metadata['speakers'])}\n"
        f"{n.text}"
        for _, n in ranked
    ]
    return "\n\n---\n\n".join(results)
# --- Streamlit page: upload a transcript, build the index, run queries ---
st.set_page_config(page_title="Transcript Hybrid RAG", layout="wide")
st.title("🎙️ Transcript Search (Hybrid RAG)")
st.write("Hybrid BM25 + Vector retrieval with cross-encoder reranking.")

uploaded = st.file_uploader("Upload a transcript", type=None)

# The retriever lives in session state so it survives script reruns.
if "retriever" not in st.session_state:
    st.session_state.retriever = None

if uploaded and st.button("Build Index"):
    try:
        with st.spinner("Indexing transcript..."):
            # getvalue() returns the full upload regardless of the stream
            # position, unlike read(), which depends on earlier reads.
            st.session_state.retriever = build_retriever(uploaded.getvalue())
    except ValueError as exc:
        # Covers non-UTF-8 uploads (UnicodeDecodeError is a ValueError) and
        # malformed timestamps; drop any stale index from an earlier file.
        st.session_state.retriever = None
        st.error(f"Could not index this file: {exc}")
    else:
        st.success("Index built successfully!")

query = st.text_input("Ask a question")
if query:
    if st.session_state.retriever is None:
        # Tell the user why nothing happens instead of silently ignoring
        # the question.
        st.info("Upload a transcript and build the index before searching.")
    else:
        with st.spinner("Retrieving evidence..."):
            answer = retrieve(query, st.session_state.retriever)
        st.text_area("Retrieved Evidence", answer, height=400)
|