# app.py
import os
import re
import gradio as gr
import numpy as np
import faiss
from youtube_transcript_api import (
YouTubeTranscriptApi,
TranscriptsDisabled,
NoTranscriptFound,
VideoUnavailable,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from huggingface_hub import InferenceClient
# ---------------------------------------------------------------------------
# Global state
# ---------------------------------------------------------------------------
# Sentence-embedding model, created once at import time and shared by
# process_video() (chunk embeddings) and retrieve_context() (query embeddings).
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# FAISS index over the current video's chunk embeddings; None until
# process_video() has built one.
faiss_index = None
# Transcript chunks, stored in the same order as their vectors were added to
# faiss_index so that a FAISS result id can be used as a list index.
chunk_store = []
# Full transcript text of the most recently processed video.
full_transcript = ""
# Hugging Face token read from the environment; empty string when unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Model id handed to the inference client for answer generation.
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# An empty token is passed as None rather than as an empty string.
inference_client = InferenceClient(model=LLM_MODEL, token=HF_TOKEN or None)
# ---------------------------------------------------------------------------
# Helper – extract video id
# ---------------------------------------------------------------------------
def _extract_video_id(url: str) -> str:
patterns = [
r"(?:v=)([A-Za-z0-9_-]{11})",
r"(?:youtu\.be/)([A-Za-z0-9_-]{11})",
r"(?:embed/)([A-Za-z0-9_-]{11})",
r"(?:shorts/)([A-Za-z0-9_-]{11})",
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
raise ValueError(f"Could not extract a valid video ID from: {url}")
# ---------------------------------------------------------------------------
# 1. Fetch transcript
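# In youtube-transcript-api releases before 1.0, get_transcript() and
# list_transcripts() are class-level methods, and transcripts come back as a
# list of dicts: [{"text": str, "start": float, "duration": float}], read with
# snippet["text"]. Releases from 1.0 on provide instance methods (fetch/list)
# whose snippets are objects read with snippet.text.
# NOTE(review): confirm which release is pinned for this app.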
# ---------------------------------------------------------------------------
def get_transcript(url: str) -> str:
    """Return the transcript of the YouTube video at *url* as one string.

    An English transcript ("en", "en-US", "en-GB") is requested first. If
    that fails for any reason other than the video being unavailable, all
    transcripts of the video are listed and the first one whose language code
    starts with "en" is fetched; if there is none, the first listed
    transcript is fetched instead.

    The static ``get_transcript`` / ``list_transcripts`` methods of
    ``YouTubeTranscriptApi`` are used when the installed library has them;
    otherwise the instance methods ``fetch`` / ``list`` are used. Snippets
    may be dicts with a ``"text"`` key or objects with a ``.text`` attribute.

    Raises:
        ValueError: if no video ID can be extracted from *url*, the video is
            unavailable, transcripts are disabled, or no transcript could be
            retrieved.
    """

    def _join_text(snippets) -> str:
        # Dict snippets and attribute-style snippets are both accepted so the
        # same code works regardless of which form the library returns.
        return " ".join(
            s["text"] if isinstance(s, dict) else s.text for s in snippets
        )

    video_id = _extract_video_id(url)

    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        fetch_direct = YouTubeTranscriptApi.get_transcript
        list_all = YouTubeTranscriptApi.list_transcripts
    else:
        # NOTE(review): relies on the instance API (fetch/list) documented
        # for youtube-transcript-api >= 1.0 - confirm against the pinned
        # release.
        api = YouTubeTranscriptApi()
        fetch_direct = api.fetch
        list_all = api.list

    # Primary: try English directly.
    try:
        return _join_text(
            fetch_direct(video_id, languages=["en", "en-US", "en-GB"])
        )
    except (NoTranscriptFound, TranscriptsDisabled):
        pass
    except VideoUnavailable as exc:
        raise ValueError("This video is unavailable or private.") from exc
    except Exception:
        # Deliberate best-effort: any other failure is retried through the
        # listing fallback below, which reports its own errors.
        pass

    # Fallback: list everything, prefer an English variant, else the first.
    try:
        available = list(list_all(video_id))
        transcript = next(
            (t for t in available if t.language_code.startswith("en")),
            available[0] if available else None,
        )
        if transcript is None:
            raise ValueError("No transcripts are available for this video.")
        return _join_text(transcript.fetch())
    except ValueError:
        raise
    except TranscriptsDisabled as exc:
        raise ValueError("Transcripts are disabled for this video.") from exc
    except Exception as exc:
        raise ValueError(f"Could not retrieve transcript: {exc}") from exc
# ---------------------------------------------------------------------------
# 2. Process video
# ---------------------------------------------------------------------------
def process_video(url: str):
    """Fetch, chunk, embed and index the transcript of the video at *url*.

    The module-level ``faiss_index``, ``chunk_store`` and ``full_transcript``
    are cleared on entry and refilled one by one as each stage succeeds.

    Returns:
        A ``(status, transcript)`` pair of strings. ``transcript`` is empty
        when no transcript text was obtained; ``status`` is either a success
        summary or a message describing what went wrong.
    """
    global faiss_index, chunk_store, full_transcript
    faiss_index, chunk_store, full_transcript = None, [], ""

    if url.strip() == "":
        return "⚠️ Please enter a YouTube URL.", ""

    try:
        text = get_transcript(url)
    except ValueError as exc:
        return f"❌ {exc}", ""
    except Exception as exc:
        return f"❌ Unexpected error: {exc}", ""

    if text.strip() == "":
        return "❌ Transcript is empty for this video.", ""
    full_transcript = text

    pieces = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    ).split_text(text)
    if len(pieces) == 0:
        return "❌ Could not split transcript into chunks.", text
    chunk_store = pieces

    # FAISS is fed float32 vectors, one row per chunk.
    vectors = np.array(
        embedding_model.encode(pieces, show_progress_bar=False),
        dtype="float32",
    )
    dim = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dim)
    flat_index.add(vectors)
    faiss_index = flat_index

    return (
        f"✅ Video processed successfully!\n"
        f" • Chunks created : {len(pieces)}\n"
        f" • Embedding dim : {dim}\n"
        f" • FAISS vectors : {flat_index.ntotal}\n\n"
        f"Switch to the 💬 Chat with Video tab to ask questions."
    ), text
# ---------------------------------------------------------------------------
# 3. Retrieve top-k chunks
# ---------------------------------------------------------------------------
def retrieve_context(query: str, top_k: int = 3) -> str:
    """Return the transcript chunks nearest to *query*, joined by blank lines.

    Args:
        query: Text to embed and search for.
        top_k: Maximum number of chunks to return; capped at the number of
            stored chunks.

    Returns:
        The retrieved chunks separated by two newlines, or an empty string
        when ``top_k`` is not positive or no video has been indexed yet.
    """
    # A non-positive k is not a valid search size, so nothing is requested.
    if top_k < 1:
        return ""
    if faiss_index is None or not chunk_store:
        return ""

    query_vec = np.array(
        embedding_model.encode([query], show_progress_bar=False),
        dtype="float32",
    )
    k = min(top_k, len(chunk_store))
    _, indices = faiss_index.search(query_vec, k)
    # Ids outside the stored range (FAISS reports -1 for a missing
    # neighbour) are dropped rather than used as list indexes.
    retrieved = [chunk_store[i] for i in indices[0] if 0 <= i < len(chunk_store)]
    return "\n\n".join(retrieved)
# ---------------------------------------------------------------------------
# 4. Generate answer
# ---------------------------------------------------------------------------
def generate_answer(query: str) -> str:
    """Answer *query* from the indexed transcript using the chat model.

    The top three chunks returned by ``retrieve_context`` are placed in the
    user prompt, and a system prompt tells the model to answer only from
    that context.

    Returns:
        The model's answer with surrounding whitespace removed, or a
        user-facing message when no video is indexed, no context was
        retrieved, the inference call failed, or the model returned no text.
    """
    if faiss_index is None:
        return (
            "⚠️ No video processed yet. "
            "Go to 📥 Process Video tab first."
        )
    context = retrieve_context(query, top_k=3)
    if not context:
        return "⚠️ Could not retrieve relevant context for your question."
    system_prompt = (
        "You are a helpful assistant that answers questions strictly "
        "based on the provided video transcript context. "
        "If the answer is not in the context, say: "
        "'I could not find this information in the video transcript.' "
        "Do NOT hallucinate or make up information."
    )
    user_prompt = (
        f"Context from the video transcript:\n"
        f"---\n{context}\n---\n\n"
        f"Question: {query}\n\n"
        f"Answer:"
    )
    try:
        response = inference_client.chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=512,
            temperature=0.2,
            top_p=0.9,
        )
        content = response.choices[0].message.content
    except Exception as exc:
        return (
            f"❌ Inference failed: {exc}\n"
            "Check that HF_TOKEN is set correctly as a Space secret."
        )
    # A missing or blank completion is not an inference/token failure, so it
    # gets its own message instead of the HF_TOKEN hint above.
    if not content or not content.strip():
        return "⚠️ The model returned an empty response. Please try asking again."
    return content.strip()
# ---------------------------------------------------------------------------
# 5. Chat helper
# The conversation is kept as a list of [user, bot] pairs (list of lists).
# NOTE(review): confirm the installed Gradio Chatbot accepts this pair format.
# ---------------------------------------------------------------------------
def chat(user_message: str, history: list):
    """Append one question/answer exchange to the conversation history.

    Args:
        user_message: The question typed by the user. ``None`` or blank text
            is treated as an empty question.
        history: Previous exchanges as ``[user, bot]`` pairs. ``None`` is
            treated as an empty conversation. The list passed in is never
            modified.

    Returns:
        ``(updated_history, "")``: the new history list, and an empty string
        that the UI uses to clear the question box.

    NOTE(review): the ``[user, bot]`` pair format must match what the
    installed Gradio Chatbot expects - confirm against the pinned version.
    """
    # Work on a copy so the caller's list stays untouched.
    updated = list(history or [])
    if not (user_message or "").strip():
        updated.append(["", "⚠️ Please enter a question."])
        return updated, ""
    updated.append([user_message, generate_answer(user_message)])
    return updated, ""
# ---------------------------------------------------------------------------
# 6. Gradio UI — fully compatible with Gradio 6.13
# ---------------------------------------------------------------------------
# Builds the two-tab UI: one tab to fetch and index a transcript, one tab to
# ask questions about it. Components appear in the order they are created.
with gr.Blocks(title="YouTube RAG Chatbot") as app:
    gr.Markdown(
        """
# 🎬 YouTube RAG Chatbot
**Fetch any YouTube transcript and chat with it using RAG + Mistral-7B.**
> 🔑 Add your `HF_TOKEN` in Space **Settings → Secrets** for the LLM to work.
"""
    )
    with gr.Tabs():
        # ── Tab 1: Process ─────────────────────────────────────────────────
        with gr.TabItem("📥 Process Video"):
            gr.Markdown("Enter a YouTube URL and click **Process** to index the transcript.")
            with gr.Row():
                url_input = gr.Textbox(
                    label="YouTube URL",
                    placeholder="https://www.youtube.com/watch?v=...",
                    scale=5,
                )
                process_btn = gr.Button("⚙️ Process", variant="primary", scale=1)
            # Read-only boxes filled from process_video()'s (status, transcript) result.
            status_output = gr.Textbox(
                label="Status",
                lines=6,
                interactive=False,
            )
            transcript_output = gr.Textbox(
                label="Transcript",
                lines=15,
                interactive=False,
            )
            process_btn.click(
                fn=process_video,
                inputs=[url_input],
                outputs=[status_output, transcript_output],
            )
        # ── Tab 2: Chat ────────────────────────────────────────────────────
        with gr.TabItem("💬 Chat with Video"):
            gr.Markdown("Ask questions about the video. Answers are grounded in the transcript.")
            # The Chatbot is fed the list of [user, bot] pairs returned by chat().
            # NOTE(review): confirm the installed Gradio Chatbot accepts this pair format.
            chatbot = gr.Chatbot(label="Conversation", height=450)
            with gr.Row():
                query_input = gr.Textbox(
                    label="Your question",
                    placeholder="What is the main topic of this video?",
                    scale=5,
                )
                send_btn = gr.Button("Send 🚀", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear", variant="secondary")
            # gr.State stores the history list between interactions
            chat_history = gr.State([])
            # chat() reads the stored history and writes the new history to the
            # Chatbot plus "" to the question box; the follow-up step copies the
            # Chatbot value back into chat_history for the next turn.
            send_btn.click(
                fn=chat,
                inputs=[query_input, chat_history],
                outputs=[chatbot, query_input],
            ).then(
                fn=lambda h: h,
                inputs=[chatbot],
                outputs=[chat_history],
            )
            # Submitting the textbox is wired exactly like the Send button.
            query_input.submit(
                fn=chat,
                inputs=[query_input, chat_history],
                outputs=[chatbot, query_input],
            ).then(
                fn=lambda h: h,
                inputs=[chatbot],
                outputs=[chat_history],
            )
            # Clearing resets both the visible conversation and the stored history.
            clear_btn.click(
                fn=lambda: ([], []),
                outputs=[chatbot, chat_history],
            )
# ---------------------------------------------------------------------------
# Launch — theme passed here in Gradio 6.x
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Start the Gradio app only when this file is executed as a script.
    app.launch()