# ============================================================
# YouTube RAG Q&A System – Production-Quality Colab Notebook
# Author   : Your Name
# Model    : Groq LLaMA-3.3-70B-Versatile (128K context)
# Embedder : all-MiniLM-L6-v2 (Sentence-Transformers, free)
# Vector DB: FAISS (Facebook AI, free, CPU)
# UI       : Gradio 4.x
# ============================================================
# ─────────────────────────────────────────────────────────────
# MODULE 0 ❯ INSTALLATION
# Run this cell once. Restart runtime after it finishes.
# ─────────────────────────────────────────────────────────────
# !pip install -q \
#     gradio \
#     youtube-transcript-api \
#     sentence-transformers \
#     faiss-cpu \
#     groq \
#     langchain-text-splitters \
#     python-dotenv
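# Note: this notebook assumes youtube-transcript-api 1.x, whose instance API
# is YouTubeTranscriptApi().fetch(video_id). If the fetch call later fails
# with an AttributeError, you are likely on an older release – upgrade with:
# !pip install -q -U youtube-transcript-api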
# ─────────────────────────────────────────────────────────────
# MODULE 1 ❯ IMPORTS & CONFIGURATION
# All third-party imports live here.
# API key is read from Colab Secrets (preferred) or env var.
# ─────────────────────────────────────────────────────────────
import os
import re
import logging
from typing import Optional
# ── UI framework ─────────────────────────────────────────────
import gradio as gr
# ── YouTube transcript (free, no API key required) ───────────
# The exception classes are re-exported at package level in 1.x,
# so we avoid importing from the private _errors module.
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)
# ── Embedding model (local, runs on CPU) ─────────────────────
from sentence_transformers import SentenceTransformer
# ── Text splitting ───────────────────────────────────────────
from langchain_text_splitters import RecursiveCharacterTextSplitter
# ── Numerical / vector DB ────────────────────────────────────
import numpy as np
import faiss
# ── Groq LLM client ──────────────────────────────────────────
from groq import Groq
# ── Logging – shows clean status in Colab output ─────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("rag")
# ── API key ──────────────────────────────────────────────────
# Option A (recommended in Colab): use Secrets panel (🔑 left sidebar)
#   key name → GROQ_API_KEY
try:
    from google.colab import userdata  # type: ignore
    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
except Exception:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    raise EnvironmentError(
        "⚠️ GROQ_API_KEY not found. "
        "Add it via Colab Secrets (🔑) or set os.environ['GROQ_API_KEY']."
    )
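# Option B (local / non-Colab runs): set the env var before this cell runs.
# The key below is a hypothetical placeholder – real keys come from
# console.groq.com and should never be committed to source control.
# os.environ["GROQ_API_KEY"] = "gsk_your_key_here"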
# ── Model identifiers ────────────────────────────────────────
GROQ_MODEL  = "llama-3.3-70b-versatile"  # 128K-token context window
EMBED_MODEL = "all-MiniLM-L6-v2"         # 384-dim, fast, free, CPU-friendly
CHUNK_SIZE     = 500   # characters per chunk (the splitter measures with len)
CHUNK_OVERLAP  = 50    # overlap to preserve context across chunks
TOP_K          = 4     # how many chunks to retrieve per query
MAX_NEW_TOKENS = 1024  # LLM answer budget
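# Rough budget check: TOP_K * CHUNK_SIZE ≈ 2,000 characters of retrieved
# context per query (about 500 tokens at a ~4 chars/token heuristic), far
# below the model's 128K window, so context overflow is not a concern here.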
# ─────────────────────────────────────────────────────────────
# MODULE 2 ❯ MODEL INITIALISATION
# Load embedding model once at startup so every call is fast.
# Groq client is stateless – one instance is enough.
# ─────────────────────────────────────────────────────────────
log.info("Loading embedding model …")
embedding_model = SentenceTransformer(EMBED_MODEL)
log.info("Embedding model ready ✅")
groq_client = Groq(api_key=GROQ_API_KEY)
# ── Global vector store ──────────────────────────────────────
# These are module-level globals so every Gradio callback
# can read/write them without passing objects around.
vector_store: Optional[faiss.IndexFlatIP] = None  # FAISS index (inner product)
chunks_store: list[str] = []                      # parallel list of text chunks
current_video_title: str = ""                     # reserved for the UI (currently unused)
# ─────────────────────────────────────────────────────────────
# MODULE 3 ❯ YOUTUBE TRANSCRIPT FETCHER
# ─────────────────────────────────────────────────────────────
def extract_video_id(url: str) -> str:
    """
    Extract the YouTube video ID from any common URL format.
    Handles:
        https://www.youtube.com/watch?v=VIDEO_ID
        https://youtu.be/VIDEO_ID
        https://youtube.com/shorts/VIDEO_ID
        https://www.youtube.com/embed/VIDEO_ID
    """
    patterns = [
        r"(?:v=)([A-Za-z0-9_-]{11})",
        r"youtu\.be/([A-Za-z0-9_-]{11})",
        r"shorts/([A-Za-z0-9_-]{11})",
        r"embed/([A-Za-z0-9_-]{11})",
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    raise ValueError(f"Could not extract video ID from URL: {url}")
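# Quick self-test for the URL patterns above (pure regex, safe to run;
# uses a well-known public video ID purely as example input):
assert extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert extract_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert extract_video_id("https://youtube.com/shorts/dQw4w9WgXcQ") == "dQw4w9WgXcQ"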
def get_transcript(url: str) -> tuple[str, str]:
    """
    Fetch the transcript for a YouTube video.

    Returns
    -------
    (transcript_text, status_message)
    On error: (empty string, error description)
    """
    try:
        video_id = extract_video_id(url)
        log.info(f"Fetching transcript for video ID: {video_id}")
        api = YouTubeTranscriptApi()
        # .fetch() returns a FetchedTranscript object (1.x API)
        transcript_data = api.fetch(video_id)
        # Join all text segments into one continuous string
        full_text = " ".join(
            segment.text.strip()
            for segment in transcript_data
            if segment.text.strip()
        )
        word_count = len(full_text.split())
        log.info(f"Transcript fetched – {word_count:,} words")
        return full_text, f"✅ Transcript fetched ({word_count:,} words)"
    except VideoUnavailable:
        return "", "❌ Video is unavailable or private."
    except TranscriptsDisabled:
        return "", "❌ Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "", "❌ No transcript found. Try a video with auto-generated captions."
    except ValueError as e:
        return "", f"❌ Invalid URL – {e}"
    except Exception as e:
        log.exception("Unexpected error fetching transcript")
        return "", f"❌ Unexpected error: {e}"
# ─────────────────────────────────────────────────────────────
# MODULE 4 ❯ VECTOR DATABASE BUILDER
# Splits transcript → chunks → embeddings → FAISS index
# ─────────────────────────────────────────────────────────────
def build_vector_store(transcript: str) -> str:
    """
    Convert a raw transcript into a FAISS vector index.

    Steps
    -----
    1. Split text into overlapping chunks via RecursiveCharacterTextSplitter
    2. Encode each chunk with the embedding model
    3. Build a FAISS IndexFlatIP and add the vectors
    4. Store everything in module-level globals

    Returns
    -------
    Status message string.
    """
    global vector_store, chunks_store
    # ── Step 1: Chunk ────────────────────────────────────────
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,  # character-based length
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_text(transcript)
    log.info(f"Created {len(chunks)} chunks")
    if not chunks:
        return "❌ No chunks created – transcript may be too short."
    # ── Step 2: Embed ────────────────────────────────────────
    log.info("Encoding chunks …")
    embeddings = embedding_model.encode(
        chunks,
        show_progress_bar=False,
        batch_size=64,
        normalize_embeddings=True,  # cosine similarity via inner product
    )
    # ── Step 3: Index ────────────────────────────────────────
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # inner product → cosine on normalised vecs
    index.add(np.array(embeddings, dtype=np.float32))
    # ── Step 4: Persist to globals ───────────────────────────
    vector_store = index
    chunks_store = chunks
    log.info(f"FAISS index built – {index.ntotal} vectors, dim={dimension}")
    return f"✅ Indexed {len(chunks)} chunks into FAISS (dim={dimension})"
# ─────────────────────────────────────────────────────────────
# MODULE 5 ❯ RETRIEVER
# Similarity search: query → top-k relevant chunks
# ─────────────────────────────────────────────────────────────
def retrieve_context(query: str, top_k: int = TOP_K) -> str:
    """
    Retrieve the most semantically relevant chunks for a given query.

    Parameters
    ----------
    query : user's natural-language question
    top_k : number of chunks to return

    Returns
    -------
    String of concatenated retrieved chunks, separated by `---` dividers.
    """
    if vector_store is None or not chunks_store:
        return ""
    # Embed and normalise the query (same preprocessing as the chunks)
    query_vec = embedding_model.encode(
        [query],
        normalize_embeddings=True,
    )
    # FAISS inner-product search (cosine on normalised vectors)
    scores, indices = vector_store.search(
        np.array(query_vec, dtype=np.float32), top_k
    )
    retrieved = []
    for score, idx in zip(scores[0], indices[0]):
        if idx == -1:  # FAISS returns -1 for empty slots
            continue
        retrieved.append(f"[Relevance: {score:.3f}]\n{chunks_store[idx]}")
    log.info(f"Retrieved {len(retrieved)} chunks for query: '{query[:60]}…'")
    return "\n\n---\n\n".join(retrieved)
# ─────────────────────────────────────────────────────────────
# MODULE 6 ❯ LLM – GROQ LLAMA 3.3-70B
# Augment + Generate step of RAG
# ─────────────────────────────────────────────────────────────
SYSTEM_PROMPT = """\
You are a precise, helpful AI assistant that answers questions about YouTube videos \
based strictly on the provided transcript context.
Rules:
- Answer ONLY from the context provided.
- If the context does not contain enough information, say so clearly.
- Be concise but complete.
- Use bullet points for lists or steps.
- Never fabricate information not present in the context.
"""
def generate_answer(query: str) -> str:
    """
    Full RAG generate step:
    1. Retrieve relevant context from FAISS
    2. Build an augmented prompt
    3. Send to Groq LLaMA-3.3-70B
    4. Return the model's response

    Parameters
    ----------
    query : user's question

    Returns
    -------
    The model's answer as a string.
    """
    context = retrieve_context(query)
    if not context:
        return "⚠️ No relevant context found in the transcript for your question."
    user_message = f"""\
Context from the video transcript:
{context}
---
Question: {query}
Answer:"""
    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
            max_tokens=MAX_NEW_TOKENS,
            temperature=0.2,  # low temp → factual, grounded answers
            top_p=0.9,
        )
        answer = response.choices[0].message.content.strip()
        log.info("LLM response received")
        return answer
    except Exception as e:
        log.exception("Groq API error")
        return f"❌ LLM error: {e}"
# ─────────────────────────────────────────────────────────────
# MODULE 7 ❯ ORCHESTRATION PIPELINE
# Ties transcript fetch + vector store build together.
# Called by the Gradio "Process Video" button.
# ─────────────────────────────────────────────────────────────
def process_video(url: str) -> tuple[str, str, str]:
    """
    Full ingestion pipeline triggered by the UI.

    Returns
    -------
    (transcript_preview, index_status, combined_status)
    suitable for Gradio outputs.
    """
    if not url or not url.strip():
        return "", "", "⚠️ Please enter a YouTube URL."
    # ── Phase 1: Fetch transcript ────────────────────────────
    transcript, fetch_status = get_transcript(url.strip())
    if not transcript:
        return "", "", fetch_status
    # ── Phase 2: Build vector store ──────────────────────────
    index_status = build_vector_store(transcript)
    # ── Phase 3: Summary line for UI ─────────────────────────
    combined = f"{fetch_status}\n{index_status}\n\n🎬 Video is ready – switch to the Chat tab!"
    # Show first 2000 chars in the transcript preview box
    preview = transcript[:2000] + (" …[truncated]" if len(transcript) > 2000 else "")
    return preview, index_status, combined
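# The same pipeline can be driven headlessly, e.g. for batch experiments
# (the URL below is a placeholder, not a real video):
# preview, idx_status, status = process_video("https://www.youtube.com/watch?v=VIDEO_ID")
# print(status)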
# ─────────────────────────────────────────────────────────────
# MODULE 8 ❯ CHAT HANDLER
# Called on every user message in the Chat tab.
# ─────────────────────────────────────────────────────────────
def chat_with_video(
    user_query: str,
    history: list[tuple[str, str]],
) -> tuple[list[tuple[str, str]], str]:
    """
    Handle a single chat turn.

    Parameters
    ----------
    user_query : the question typed by the user
    history    : Gradio chat history (list of (user, assistant) pairs)

    Returns
    -------
    Updated history, empty string (clears the input box)
    """
    if not user_query.strip():
        return history, ""
    if vector_store is None:
        history.append((user_query, "⚠️ Please process a video first on the **Process Video** tab."))
        return history, ""
    answer = generate_answer(user_query)
    history.append((user_query, answer))
    return history, ""
# ─────────────────────────────────────────────────────────────
# MODULE 9 ❯ GRADIO USER INTERFACE
# Professional two-tab layout:
#   Tab 1 – Process Video (URL input, status, transcript preview)
#   Tab 2 – Chat (conversation window + input)
# ─────────────────────────────────────────────────────────────
CSS = """
/* ── Global ── */
#app-header { text-align: center; margin-bottom: 0.5rem; }
#status-box textarea {
    font-size: 0.85rem;
    color: var(--body-text-color);
    background: var(--input-background-fill);
}
#transcript-box textarea { font-size: 0.8rem; }
#chat-window { height: 480px; }
/* ── Chat input (Enter submits; see chat_input.submit wiring below) ── */
#chat-input textarea { resize: none; }
"""
with gr.Blocks(
    title="YouTube RAG Q&A",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
    ),
    css=CSS,
) as app:
    # ── Header ───────────────────────────────────────────────
    gr.Markdown(
        """
        # 🎥 YouTube RAG Q&A
        **Paste any YouTube URL → transcribe → chat with the video using AI**

        *Powered by [Groq](https://groq.com) · LLaMA 3.3-70B · FAISS · Sentence-Transformers*
        """,
        elem_id="app-header",
    )
    # ── Tab 1: Process Video ─────────────────────────────────
    with gr.Tab("🎥 Process Video", id="tab-process"):
        with gr.Row():
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                scale=4,
            )
            process_btn = gr.Button(
                "▶ Transcribe & Index",
                variant="primary",
                scale=1,
                min_width=180,
            )
        status_output = gr.Textbox(
            label="Pipeline Status",
            interactive=False,
            lines=4,
            elem_id="status-box",
        )
        with gr.Accordion("📜 Transcript Preview (first 2000 chars)", open=False):
            transcript_output = gr.Textbox(
                label="Raw transcript",
                interactive=False,
                lines=12,
                elem_id="transcript-box",
            )
        # ── Wiring ───────────────────────────────────────────
        # Hidden component absorbs the standalone index status that
        # process_video returns alongside the combined message.
        index_status_sink = gr.Textbox(visible=False)
        process_btn.click(
            fn=process_video,
            inputs=url_input,
            outputs=[transcript_output, index_status_sink, status_output],
        )
    # ── Tab 2: Chat ──────────────────────────────────────────
    with gr.Tab("💬 Chat with Video", id="tab-chat"):
        chatbot = gr.Chatbot(
            label="Conversation",
            bubble_full_width=False,
            height=480,
            elem_id="chat-window",
        )
        with gr.Row():
            chat_input = gr.Textbox(
                placeholder="Ask anything about the video…",
                label="",
                scale=5,
                elem_id="chat-input",
                autofocus=True,
            )
            send_btn = gr.Button("Send ➤", variant="primary", scale=1, min_width=100)
        clear_btn = gr.Button("🗑 Clear conversation", variant="secondary", size="sm")
        # ── Wiring ───────────────────────────────────────────
        # Submit on button click or Enter key
        send_btn.click(
            fn=chat_with_video,
            inputs=[chat_input, chatbot],
            outputs=[chatbot, chat_input],
        )
        chat_input.submit(
            fn=chat_with_video,
            inputs=[chat_input, chatbot],
            outputs=[chatbot, chat_input],
        )
        clear_btn.click(fn=lambda: [], outputs=chatbot)
    # ── Footer ───────────────────────────────────────────────
    gr.Markdown(
        "<center style='font-size:0.75rem; color:#888;'>"
        "Open-source · No data stored · Embeddings computed locally"
        "</center>"
    )
# ─────────────────────────────────────────────────────────────
# MODULE 10 ❯ LAUNCH
# ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    app.launch(
        debug=True,       # shows tracebacks in output
        share=True,       # creates a public gradio.live link (great for demos)
        show_error=True,
    )