Spaces:

ShahbazAhmad-Lab
/

RagYoutube

Sleeping

App Files Files Community

RagYoutube / app.py

ShahbazAhmad-Lab

Update app.py

8e37bbd verified about 1 month ago

raw

history blame contribute delete

11.7 kB

	# ============================================================
	# YouTube RAG QA System — app.py
	# Transcript: Supadata API (works on HuggingFace, no SSL block)
	# LLM: Groq LLaMA 3.3-70B
	# Vector DB: FAISS + sentence-transformers
	# UI: Gradio 5
	# ============================================================

	import os
	import re
	import requests
	import numpy as np
	import faiss
	import gradio as gr
	from sentence_transformers import SentenceTransformer
	from groq import Groq

	# ─── GLOBAL STATE ────────────────────────────────────────────
	# Module-level singletons shared by every handler in this file.
	# NOTE(review): these are process-wide, so presumably all visitors of the
	# app share the same indexed video — confirm that this is intended.
	# Sentence-transformers encoder; created on first use by get_embed_model().
	_embed_model = None
	# FAISS index for the most recently processed video; None until
	# process_video() succeeds (and reset to None when it fails).
	_faiss_index = None
	# Transcript chunks; position i is the text for vector i in _faiss_index.
	_chunks = []
	# Groq API client; created on first use by get_groq_client().
	_groq_client = None


	# ─── LAZY CLIENTS ────────────────────────────────────────────
	def get_groq_client():
	    """Return the shared Groq client, constructing it on first call.

	    The API key is read from the ``GROQ_API_KEY`` environment variable with
	    surrounding whitespace stripped. Once built, the client is cached in the
	    module-level ``_groq_client`` and reused.

	    Raises:
	        ValueError: if ``GROQ_API_KEY`` is unset or blank.
	    """
	    global _groq_client
	    if _groq_client is None:
	        key = os.environ.get("GROQ_API_KEY", "").strip()
	        if key == "":
	            raise ValueError(
	                "GROQ_API_KEY not set!\n"
	                "Space → Settings → Variables and secrets → New secret\n"
	                "Name: GROQ_API_KEY Value: gsk_xxxxxxxxxx"
	            )
	        _groq_client = Groq(api_key=key)
	    return _groq_client


	def get_embed_model():
	    """Return the shared sentence-transformers encoder, loading it lazily.

	    The ``all-MiniLM-L6-v2`` model is instantiated on the first call and
	    cached in the module-level ``_embed_model`` for later calls.
	    """
	    global _embed_model
	    if _embed_model is not None:
	        return _embed_model
	    _embed_model = SentenceTransformer("all-MiniLM-L6-v2")
	    return _embed_model


	# ─── MODULE 1: Transcript Fetcher (via Supadata REST API) ────
	def extract_video_id(url: str) -> str:
	    """Extract the 11-character YouTube video ID from a video URL.

	    Recognised forms:
	      * ``.../watch?v=<id>`` (also ``&v=<id>`` after other query parameters)
	      * ``youtu.be/<id>``
	      * ``.../shorts/<id>``, ``.../embed/<id>``, ``.../live/<id>``, ``.../v/<id>``

	    Args:
	        url: The URL as entered by the user.

	    Returns:
	        The 11-character video ID.

	    Raises:
	        ValueError: if no video ID can be found in ``url``.
	    """
	    # The query-parameter pattern requires a leading '?' or '&' so that only
	    # a real ``v`` parameter matches, and path patterns name the segment that
	    # precedes the ID so arbitrary 11-character path parts are not mistaken
	    # for a video ID.
	    patterns = (
	        r"[?&]v=([0-9A-Za-z_-]{11})",
	        r"youtu\.be/([0-9A-Za-z_-]{11})",
	        r"/(?:shorts|embed|live|v)/([0-9A-Za-z_-]{11})",
	    )
	    for pattern in patterns:
	        match = re.search(pattern, url)
	        if match:
	            return match.group(1)
	    raise ValueError(f"Cannot extract video ID from: {url}")


	def fetch_transcript(url: str) -> str:
	    """Download a video's transcript as plain text through the Supadata API.

	    The request goes to Supadata's REST endpoint instead of YouTube itself
	    (per the original author, this is what lets it work on HuggingFace).
	    The key is read from the ``SUPADATA_API_KEY`` environment variable.

	    Args:
	        url: A YouTube URL; its video ID is taken via ``extract_video_id``.

	    Returns:
	        The transcript text. A list-of-segments response is joined with
	        single spaces.

	    Raises:
	        ValueError: if the API key is missing, the video ID cannot be
	            extracted, the API answers with a non-200 status, or the
	            response body has an unexpected shape.
	    """
	    api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
	    if api_key == "":
	        raise ValueError(
	            "SUPADATA_API_KEY not set!\n"
	            "1. Go to https://supadata.ai → Sign up (free, no credit card)\n"
	            "2. Copy your API key\n"
	            "3. Space → Settings → Variables and secrets → New secret\n"
	            " Name: SUPADATA_API_KEY Value: your_key_here"
	        )

	    resp = requests.get(
	        "https://api.supadata.ai/v1/youtube/transcript",
	        params={"videoId": extract_video_id(url), "text": "true"},
	        headers={"x-api-key": api_key},
	        timeout=30,
	    )

	    # Map the failure statuses to user-readable errors.
	    status = resp.status_code
	    if status == 401:
	        raise ValueError("Invalid SUPADATA_API_KEY — check your key at supadata.ai")
	    if status == 404:
	        raise ValueError("No transcript found for this video (may be private or have no captions)")
	    if status != 200:
	        raise ValueError(f"Supadata API error {status}: {resp.text}")

	    payload = resp.json()
	    content = payload.get("content")

	    # With text=true the transcript arrives as one plain string.
	    if isinstance(content, str):
	        return content

	    # Otherwise fall back to joining the text of each segment.
	    if isinstance(content, list):
	        return " ".join(segment.get("text", "") for segment in content)

	    raise ValueError(f"Unexpected Supadata response: {payload}")


	# ─── MODULE 2: Text Chunker ───────────────────────────────────
	def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list:
	    """Split text into overlapping chunks measured in whitespace-separated words.

	    Consecutive chunks share ``overlap`` words; the final chunk may be
	    shorter than ``chunk_size``.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of words per chunk; must be positive.
	        overlap: Number of words repeated at the start of the next chunk;
	            must satisfy ``0 <= overlap < chunk_size``.

	    Returns:
	        A list of chunk strings (empty when ``text`` contains no words).

	    Raises:
	        ValueError: if ``chunk_size`` or ``overlap`` is out of range.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    if not 0 <= overlap < chunk_size:
	        # A step of zero or less would never advance through the words.
	        raise ValueError(
	            f"overlap must be in [0, chunk_size), got overlap={overlap}, "
	            f"chunk_size={chunk_size}"
	        )

	    words = text.split()
	    step = chunk_size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        chunks.append(" ".join(words[start:start + chunk_size]))
	        # Stop once a chunk reaches the last word, so no trailing chunk
	        # made only of already-covered overlap words is emitted.
	        if start + chunk_size >= len(words):
	            break
	    return chunks


	# ─── MODULE 3: Vector Store (FAISS) ──────────────────────────
	def build_faiss_index(chunks: list):
	    """Embed the chunks and load them into a new flat L2 FAISS index.

	    Args:
	        chunks: Non-empty list of text chunks. Vector ``i`` in the returned
	            index corresponds to ``chunks[i]``.

	    Returns:
	        A ``faiss.IndexFlatL2`` containing one float32 vector per chunk.

	    Raises:
	        ValueError: if ``chunks`` is empty.
	    """
	    if not chunks:
	        # Without this guard the failure surfaces later as a cryptic shape
	        # error when reading the embedding dimension.
	        raise ValueError("Cannot build an index from an empty list of chunks")

	    embeddings = get_embed_model().encode(chunks, show_progress_bar=False).astype("float32")
	    # The index dimensionality comes from the encoder's output width.
	    index = faiss.IndexFlatL2(embeddings.shape[1])
	    index.add(embeddings)
	    return index


	def retrieve_chunks(query: str, index, chunks: list, top_k: int = 4) -> list:
	    """Return the chunks whose embeddings are nearest to the query.

	    Args:
	        query: The user's question.
	        index: FAISS index whose vector ``i`` corresponds to ``chunks[i]``.
	        chunks: The chunk texts that were indexed.
	        top_k: Maximum number of chunks to return.

	    Returns:
	        Up to ``top_k`` chunk strings in the order FAISS returns them; an
	        empty list when ``chunks`` is empty or ``top_k`` is not positive.
	    """
	    # Never ask for more neighbours than there are chunks: FAISS pads the
	    # missing results with -1.
	    k = min(top_k, len(chunks))
	    if k <= 0:
	        return []

	    q_vec = get_embed_model().encode([query]).astype("float32")
	    _, idxs = index.search(q_vec, k)
	    # The lower bound matters: a -1 padding label would otherwise pass the
	    # upper-bound check and silently select the last chunk.
	    return [chunks[i] for i in idxs[0] if 0 <= i < len(chunks)]


	# ─── MODULE 4: LLM via Groq ───────────────────────────────────
	def ask_llm(question: str, context_chunks: list) -> str:
	    """Answer a question from the retrieved chunks using Groq's chat API.

	    The chunks are numbered and placed into a single user message that tells
	    the model to answer only from that context. The request uses the
	    ``llama-3.3-70b-versatile`` model.

	    Args:
	        question: The user's question.
	        context_chunks: Retrieved transcript chunks, most relevant first.

	    Returns:
	        The model's reply with surrounding whitespace stripped.
	    """
	    numbered = [
	        f"[Chunk {pos + 1}]:\n{chunk}" for pos, chunk in enumerate(context_chunks)
	    ]
	    context = "\n\n".join(numbered)
	    prompt = (
	        "You are a helpful assistant. Answer ONLY from the transcript context below.\n"
	        "If the answer is not in the context, say: 'I could not find that in the video.'\n\n"
	        f"CONTEXT:\n{context}\n\nQUESTION: {question}\n\nANSWER:"
	    )
	    client = get_groq_client()
	    completion = client.chat.completions.create(
	        model="llama-3.3-70b-versatile",
	        messages=[{"role": "user", "content": prompt}],
	        max_tokens=1024,
	        temperature=0.3,
	    )
	    first_choice = completion.choices[0]
	    return first_choice.message.content.strip()


	# ─── HANDLER: Process Video ───────────────────────────────────
	def process_video(url: str):
	    """Fetch, chunk, embed and index a video, yielding progress messages.

	    This is a generator: each yielded string is the next status text for the
	    UI. On success the module-level ``_chunks`` and ``_faiss_index`` are
	    replaced with the new video's data. On any failure both are reset and a
	    single error message is yielded.

	    Args:
	        url: The YouTube URL entered by the user.

	    Yields:
	        Human-readable status strings.
	    """
	    global _faiss_index, _chunks

	    if not url or not url.strip():
	        yield "⚠️ Please enter a YouTube URL first."
	        return

	    # Check keys before starting
	    if not os.environ.get("SUPADATA_API_KEY", "").strip():
	        yield (
	            "❌ SUPADATA_API_KEY is missing!\n\n"
	            "Steps to fix:\n"
	            "1. Go to https://supadata.ai → Sign up FREE (no credit card)\n"
	            "2. Get your API key from dashboard\n"
	            "3. HuggingFace Space → Settings → Variables and secrets\n"
	            "4. Click 'New secret'\n"
	            " Name: SUPADATA_API_KEY\n"
	            " Value: your_supadata_key_here\n"
	            "5. Save → Space will restart → Try again!"
	        )
	        return

	    try:
	        yield "⏳ [1/4] Fetching transcript via Supadata API..."
	        transcript = fetch_transcript(url.strip())
	        yield f"✅ [1/4] Transcript fetched! ({len(transcript.split()):,} words)\n⏳ [2/4] Splitting into chunks..."

	        # Work on locals so the chat handler never sees new chunks paired
	        # with the previous video's index while embedding is in progress.
	        new_chunks = chunk_text(transcript)
	        if not new_chunks:
	            raise ValueError("Transcript is empty — nothing to index.")
	        yield f"✅ [2/4] {len(new_chunks)} chunks created\n⏳ [3/4] Generating embeddings (30-60 sec on CPU)..."

	        new_index = build_faiss_index(new_chunks)

	        # Publish chunks and index together once both are ready.
	        _chunks, _faiss_index = new_chunks, new_index
	        yield (
	            f"✅ [3/4] Embeddings generated\n"
	            f"✅ [4/4] FAISS index ready!\n\n"
	            f"🎉 Done! {len(new_chunks)} chunks indexed.\n"
	            f"👉 Switch to '💬 Chat with Video' tab and ask your questions!"
	        )

	    except Exception as e:
	        # UI boundary: report every failure in the status box and clear the
	        # index so the chat tab asks for a video to be processed again.
	        _faiss_index = None
	        _chunks = []
	        yield f"❌ Error: {e}"


	# ─── HANDLER: Chat ────────────────────────────────────────────
	def chat_fn(message: str, history: list):
	    """Handle one chat turn: retrieve context, ask the LLM, record the reply.

	    Args:
	        message: The text the user submitted.
	        history: The chat log as a list of ``{"role", "content"}`` dicts;
	            it is appended to in place.

	    Returns:
	        ``(history, "")`` — the updated chat log and an empty string for the
	        input box. A blank message returns the history unchanged.
	    """
	    if message.strip() == "":
	        return history, ""

	    def respond(reply):
	        # Every outcome records the user's message followed by one reply.
	        history.append({"role": "user", "content": message})
	        history.append({"role": "assistant", "content": reply})
	        return history, ""

	    # Nothing has been indexed yet: point the user to the processing tab.
	    if _faiss_index is None or not _chunks:
	        return respond(
	            "⚠️ No video processed yet!\n\n"
	            "1. Go to '📹 Process Video' tab\n"
	            "2. Paste a YouTube URL\n"
	            "3. Click 🚀 Process Video\n"
	            "4. Wait for ✅ success\n"
	            "5. Come back here to chat!"
	        )

	    try:
	        if not os.environ.get("GROQ_API_KEY", "").strip():
	            return respond(
	                "❌ GROQ_API_KEY is missing!\n\n"
	                "Space → Settings → Variables and secrets → New secret\n"
	                "Name: GROQ_API_KEY Value: gsk_xxxxxxxxxx"
	            )

	        relevant = retrieve_chunks(message, _faiss_index, _chunks)
	        reply = ask_llm(message, relevant)
	    except Exception as e:
	        # Retrieval or LLM failures are shown in the chat, not raised.
	        reply = f"❌ Error: {e}"

	    return respond(reply)


	# ─── GRADIO UI ────────────────────────────────────────────────
	# Build the two-tab interface: one tab to index a video, one to chat about it.
	# The resulting Blocks app is bound to `demo`.
	with gr.Blocks(title="YouTube RAG QA", theme=gr.themes.Soft()) as demo:

	gr.Markdown("""
	# 🎬 YouTube RAG QA System
	### Kisi bhi YouTube video se sawaal poochho!
	Powered by: Supadata · FAISS · sentence-transformers · Groq LLaMA 3.3-70B · Gradio 5

	Step 1 → URL daalo + Process karo     Step 2 → Chat tab mein sawaal karo
	""")

	with gr.Tabs():

	# ── Tab 1: Process Video ──────────────────────────────
	with gr.Tab("📹 Process Video"):
	gr.Markdown("YouTube URL paste karo. Transcript fetch → chunk → embed → FAISS index.")
	with gr.Row():
	url_box = gr.Textbox(
	label="🔗 YouTube URL",
	placeholder="https://www.youtube.com/watch?v=... ya https://youtu.be/...",
	scale=4,
	)
	process_btn = gr.Button("🚀 Process Video", variant="primary", scale=1)

	# Read-only box that displays the status strings yielded by process_video.
	status_box = gr.Textbox(
	label="📊 Live Processing Status",
	interactive=False,
	lines=9,
	)
	# process_video is a generator; its yielded strings are sent to status_box.
	process_btn.click(process_video, inputs=[url_box], outputs=[status_box])

	# ── Tab 2: Chat ───────────────────────────────────────
	with gr.Tab("💬 Chat with Video"):
	gr.Markdown("Video process hone ke baad yahan sawaal poochho.")
	# type="messages" matches the {"role", "content"} dicts chat_fn appends.
	chatbot = gr.Chatbot(type="messages", height=430, label="Chat")
	with gr.Row():
	msg_box = gr.Textbox(
	placeholder="Sawaal likho aur Enter dabao...",
	label="Your Question",
	scale=5,
	)
	send_btn = gr.Button("Send ➤", variant="primary", scale=1)
	clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

	# The Send button and submitting the textbox both run chat_fn, which
	# returns (history, "") — the second value empties the question box.
	send_btn.click(chat_fn, [msg_box, chatbot], [chatbot, msg_box])
	msg_box.submit(chat_fn, [msg_box, chatbot], [chatbot, msg_box])
	# Clearing resets the chat log and the question box; the indexed video is kept.
	clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg_box])

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    # Listen on all interfaces at port 7860.
	    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    demo.launch(**launch_options)