File size: 4,916 Bytes
0ebddf8
 
 
 
64127fe
 
08a4899
64127fe
 
 
 
 
 
 
 
 
 
906aaf8
64127fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7504df0
 
 
 
64127fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31a79b6
64127fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ebddf8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
import streamlit as st
import nltk
import os
nltk.download("punkt_tab")

from dataclasses import dataclass
from nltk.tokenize import sent_tokenize

from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.llms.openai import OpenAI

from sentence_transformers import CrossEncoder


@dataclass
class Utterance:
    """A single timed transcript cue attributed to one speaker."""

    # Cue start time in seconds (parse_webvtt fills this from ts_to_sec).
    start: float
    # Cue end time in seconds (parse_webvtt fills this from ts_to_sec).
    end: float
    # Speaker label; parse_webvtt uses "UNKNOWN" when the cue text has no
    # "name:" prefix.
    speaker: str
    # Spoken text of the cue with surrounding whitespace stripped.
    text: str


def ts_to_sec(ts):
    """Convert a WebVTT timestamp string to a number of seconds.

    Both the full ``hh:mm:ss.ttt`` form and the short ``mm:ss.ttt`` form
    (hours omitted, which WebVTT permits) are accepted. Surrounding
    whitespace is ignored.

    Args:
        ts: Timestamp text such as ``"00:01:02.500"`` or ``"01:02.500"``.

    Returns:
        The timestamp expressed in seconds as a float.

    Raises:
        ValueError: If ``ts`` does not have two or three colon-separated
            fields, or a field is not numeric.
    """
    fields = ts.strip().split(":")
    if len(fields) == 3:
        hours, minutes, seconds = fields
    elif len(fields) == 2:
        # Short form: the hours component is optional in WebVTT.
        hours = "0"
        minutes, seconds = fields
    else:
        raise ValueError(f"Unrecognised timestamp: {ts!r}")
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)


def parse_webvtt(vtt_bytes):
    """Parse raw WebVTT bytes into a list of ``Utterance`` objects.

    Every line containing ``-->`` is treated as a cue timing line. The
    non-blank lines that follow it (up to the next blank line or timing
    line) form the cue payload and are joined with single spaces. If the
    first payload line contains a colon, the part before the first colon
    is taken as the speaker name; otherwise the speaker is ``"UNKNOWN"``.

    Args:
        vtt_bytes: UTF-8 encoded contents of a WebVTT file.

    Returns:
        A list of ``Utterance`` in file order. A cue with no payload is
        kept with empty text.

    Raises:
        UnicodeDecodeError: If ``vtt_bytes`` is not valid UTF-8.
        ValueError: If a timing line has a missing or malformed timestamp.
    """
    # "utf-8-sig" drops a leading byte-order mark when one is present.
    lines = vtt_bytes.decode("utf-8-sig").splitlines()
    total = len(lines)
    utterances = []

    i = 0
    while i < total:
        line = lines[i]
        if "-->" not in line:
            i += 1
            continue

        start_raw, _, end_raw = line.partition("-->")
        # Cue settings (e.g. "align:start position:10%") may follow the end
        # timestamp on the same line; only the first token is the time.
        end_tokens = end_raw.split()
        if not end_tokens:
            raise ValueError(f"Malformed cue timing line: {line!r}")
        start = ts_to_sec(start_raw.strip())
        end = ts_to_sec(end_tokens[0])
        i += 1

        # Gather the whole payload so multi-line cues do not lose text.
        payload = []
        while i < total and lines[i].strip() and "-->" not in lines[i]:
            payload.append(lines[i].strip())
            i += 1

        speaker, text = "UNKNOWN", ""
        if payload:
            first, rest = payload[0], payload[1:]
            # Only the first payload line is checked for a "Speaker: text"
            # prefix, so colons in continuation lines stay part of the text.
            if ":" in first:
                speaker, first = (part.strip() for part in first.split(":", 1))
            text = " ".join([first, *rest]).strip()

        utterances.append(Utterance(start, end, speaker, text))

    return utterances


def build_chunks(utterances, max_gap=25, sentences_per_chunk=3):
    """Group utterances into time-contiguous blocks and split them into chunks.

    Consecutive utterances stay in the same block until the silence between
    one utterance's end and the next one's start exceeds ``max_gap``. Each
    block's text is sentence-tokenized and emitted in groups of
    ``sentences_per_chunk`` sentences.

    Args:
        utterances: Iterable of objects with ``start``, ``end``, ``speaker``
            and ``text`` attributes, in chronological order.
        max_gap: Largest gap, in the same unit as ``start``/``end``, that
            still keeps two utterances in one block.
        sentences_per_chunk: Number of sentences per emitted chunk.

    Returns:
        A list of dicts with keys ``"text"``, ``"start"``, ``"end"`` and
        ``"speakers"``. ``start``/``end`` are those of the enclosing block
        (not of the individual sentences), and ``speakers`` lists the
        block's distinct speakers in order of first appearance.
    """
    blocks, current = [], []
    last_end = None

    for u in utterances:
        # Compare against None explicitly: an end time of 0.0 is a valid
        # previous timestamp and must not disable the gap check.
        if last_end is not None and u.start - last_end > max_gap:
            blocks.append(current)
            current = []
        current.append(u)
        last_end = u.end

    if current:
        blocks.append(current)

    chunks = []
    for block in blocks:
        sentences = sent_tokenize(" ".join(u.text for u in block))
        # dict.fromkeys de-duplicates while keeping a deterministic order,
        # unlike set(), whose string ordering can differ between runs.
        speakers = list(dict.fromkeys(u.speaker for u in block))
        block_start, block_end = block[0].start, block[-1].end

        for i in range(0, len(sentences), sentences_per_chunk):
            chunks.append({
                "text": " ".join(sentences[i:i + sentences_per_chunk]),
                "start": block_start,
                "end": block_end,
                # Fresh list per chunk so callers can mutate one safely.
                "speakers": list(speakers),
            })

    return chunks


# Topic label -> lowercase keywords; a topic applies when any keyword occurs
# as a substring of the lowercased text.
TOPIC_RULES = {
    "gpu": ["gpu", "cuda", "vram", "nvidia"],
    "technical_issue": ["issue", "error", "problem", "challenge"],
    "use_case": ["use case", "real world", "industry"],
}


def tag_topics(text):
    """Return the topic labels from ``TOPIC_RULES`` that match ``text``.

    Matching is case-insensitive substring search. Labels are returned in
    the order they are declared in ``TOPIC_RULES``, each at most once.
    """
    lowered = text.lower()
    matched = []
    for label, keywords in TOPIC_RULES.items():
        for keyword in keywords:
            if keyword in lowered:
                matched.append(label)
                break  # one hit is enough for this topic
    return matched


# Streamlit re-executes this script on every interaction, so the model
# objects are built inside st.cache_resource functions and reused across
# reruns instead of being reconstructed each time. show_spinner=False keeps
# these calls from emitting any UI element before st.set_page_config runs.
@st.cache_resource(show_spinner=False)
def _load_embed_model():
    """Build the sentence-embedding model used for vector retrieval."""
    return HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


@st.cache_resource(show_spinner=False)
def _load_reranker():
    """Build the cross-encoder used to re-score retrieved passages."""
    return CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


Settings.embed_model = _load_embed_model()


# NOTE: no LLM is configured here; fill in and enable the block below to
# set one explicitly.
# Settings.llm = OpenAI(
#     api_key=,
#     api_base=
# )

reranker = _load_reranker()


def build_retriever(vtt_bytes):
    """Build a hybrid (BM25 + vector) retriever from raw WebVTT bytes.

    The transcript is parsed with ``parse_webvtt``, chunked with
    ``build_chunks``, and each chunk becomes a ``TextNode`` carrying its
    time span, speakers and rule-based topic tags as metadata. A BM25
    retriever and a vector retriever over the same nodes are combined with
    reciprocal-rank fusion.

    Args:
        vtt_bytes: UTF-8 encoded contents of a WebVTT transcript.

    Returns:
        A ``QueryFusionRetriever`` that yields up to 10 fused results.
    """
    utterances = parse_webvtt(vtt_bytes)
    chunks = build_chunks(utterances)

    nodes = [
        TextNode(
            text=c["text"],
            metadata={
                "start": c["start"],
                "end": c["end"],
                "speakers": c["speakers"],
                "topics": tag_topics(c["text"]),
            },
        )
        for c in chunks
    ]

    index = VectorStoreIndex(nodes)

    # NOTE(review): BM25 backends may reject a top-k larger than the corpus,
    # so clamp it for short transcripts — confirm against the installed
    # llama-index-retrievers-bm25 version.
    bm25_top_k = max(1, min(20, len(nodes)))
    # Pass the nodes by keyword: the first positional parameter of
    # BM25Retriever.from_defaults is the index, not the node list.
    bm25 = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=bm25_top_k,
    )
    vector = index.as_retriever(similarity_top_k=20)

    retriever = QueryFusionRetriever(
        retrievers=[bm25, vector],
        similarity_top_k=10,
        mode="reciprocal_rerank",
    )

    return retriever


def retrieve(query, retriever, top_n=5):
    """Retrieve passages for ``query`` and format the best ones as text.

    Candidates from ``retriever`` are re-scored against the query with the
    module-level cross-encoder ``reranker``; the ``top_n`` highest-scoring
    passages are rendered with their time span and speakers.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query)`` returns nodes exposing
            ``text`` and ``metadata`` (with ``start``, ``end``, ``speakers``).
        top_n: Maximum number of passages to include in the output.

    Returns:
        The formatted passages joined by a ``---`` separator, or an empty
        string when the retriever returns nothing.
    """
    nodes = retriever.retrieve(query)
    if not nodes:
        # Nothing to rerank; skip the cross-encoder call entirely.
        return ""

    scores = reranker.predict([[query, n.text] for n in nodes])
    # Sort on the score alone: without a key, tied scores would make sorted()
    # fall back to comparing the node objects, which is not supported.
    ranked = sorted(
        zip(scores, nodes),
        key=lambda pair: pair[0],
        reverse=True,
    )[:top_n]

    results = [
        f"⏱️ {n.metadata['start']:.1f}s – {n.metadata['end']:.1f}s\n"
        f"👤 Speakers: {', '.join(n.metadata['speakers'])}\n"
        f"{n.text}"
        for _, n in ranked
    ]

    return "\n\n---\n\n".join(results)


# --- Streamlit page: upload a transcript, build the index, then query it ---
st.set_page_config(page_title="Transcript Hybrid RAG", layout="wide")

st.title("🎙️ Transcript Search (Hybrid RAG)")
st.write("Hybrid BM25 + Vector retrieval with cross-encoder reranking.")

uploaded = st.file_uploader("Upload a transcript", type=None)

# The retriever lives in session state so it survives script reruns.
if "retriever" not in st.session_state:
    st.session_state.retriever = None

if uploaded and st.button("Build Index"):
    try:
        with st.spinner("Indexing transcript..."):
            # getvalue() returns the full contents regardless of the current
            # read position of the uploaded buffer.
            new_retriever = build_retriever(uploaded.getvalue())
    except ValueError as exc:
        # Covers undecodable bytes (UnicodeDecodeError is a ValueError) and
        # malformed timestamps; any previously built index is left in place.
        st.error(f"Could not index this file as a WebVTT transcript: {exc}")
    else:
        st.session_state.retriever = new_retriever
        st.success("Index built successfully!")

query = st.text_input("Ask a question")

if query and st.session_state.retriever:
    with st.spinner("Retrieving evidence..."):
        answer = retrieve(query, st.session_state.retriever)
    st.text_area("Retrieved Evidence", answer, height=400)
elif query:
    # Tell the user why nothing happened instead of staying silent.
    st.info("Upload a transcript and build the index before searching.")