# Source: Hugging Face Space by wahab5763 — "Create app.py" (commit 1c34698, verified)
# app.py - YouTube Video RAG Q&A for Hugging Face Spaces
import os
import pickle
import re
import tempfile
from typing import Dict, List, Optional, Tuple

import faiss
import gradio as gr
import groq
import numpy as np
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
# ============================================
# Configuration - Optimized for Token Limits
# ============================================
GROQ_API_KEY = os.getenv("GROQ_API_KEY") # Get from Hugging Face Secrets
EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # sentence-transformers model used for chunk embeddings
CHUNK_SIZE = 300  # target chunk length in CHARACTERS (not tokens)
MAX_CONTEXT_TOKENS = 1500  # rough prompt-context budget; converted to chars at ~4 chars/token
MAX_RETRIEVAL_CHUNKS = 2  # top-k chunks retrieved per question
# ============================================
# YouTube Transcript Extraction
# ============================================
class YouTubeTranscriptProcessor:
    """Extracts, formats, and chunks YouTube video transcripts.

    Uses the instance-based youtube_transcript_api v1.x ``fetch`` API.
    All methods are stateless and exposed as staticmethods.
    """

    @staticmethod
    def extract_transcript(youtube_url: str) -> Tuple[Optional[List[Dict]], Optional[str]]:
        """Fetch the transcript for a YouTube video.

        Returns:
            (transcript, error) — exactly one side is None. Each transcript
            entry is a dict with 'text', 'start' and 'duration' keys.
        """
        try:
            video_id = YouTubeTranscriptProcessor.extract_video_id(youtube_url)
            if not video_id:
                return None, "Invalid YouTube URL"
            print(f"Processing video ID: {video_id}")
            ytt_api = YouTubeTranscriptApi()
            try:
                # Prefer an English transcript when one exists.
                fetched_transcript = ytt_api.fetch(video_id, languages=['en'])
                print("Found English transcript")
            except NoTranscriptFound:
                # Bug fix: was a bare `except:` that swallowed everything
                # (including KeyboardInterrupt/SystemExit). Only a missing
                # English transcript should trigger the any-language fallback.
                print("English transcript not found, trying any available language...")
                fetched_transcript = ytt_api.fetch(video_id)
                print(f"Found transcript in language: {fetched_transcript.language}")
            # Normalize API snippet objects into plain dicts for downstream code.
            formatted_transcript = [
                {'text': snippet.text, 'start': snippet.start, 'duration': snippet.duration}
                for snippet in fetched_transcript.snippets
            ]
            print(f"Successfully extracted {len(formatted_transcript)} transcript entries")
            return formatted_transcript, None
        except Exception as e:
            # Covers TranscriptsDisabled, network failures, bad video IDs, etc.
            return None, f"Error extracting transcript: {str(e)}"

    @staticmethod
    def extract_video_id(url: str) -> Optional[str]:
        """Extract the video ID from common YouTube URL shapes; None if no match."""
        patterns = [
            r'(?:youtube\.com\/watch\?v=)([\w-]+)',
            r'(?:youtu\.be\/)([\w-]+)',
            r'(?:youtube\.com\/embed\/)([\w-]+)',
            r'(?:youtube\.com\/v\/)([\w-]+)',
            r'(?:youtube\.com\/shorts\/)([\w-]+)'
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    @staticmethod
    def get_full_transcript_text(transcript: List[Dict]) -> str:
        """Join transcript entries into plain text wrapped at ~100 chars per line."""
        full_text = " ".join(entry['text'] for entry in transcript)
        # Collapse whitespace runs left over from joining caption fragments.
        full_text = re.sub(r'\s+', ' ', full_text).strip()
        # Greedy word-wrap; a word longer than the width gets its own line.
        lines = []
        current_line = []
        current_length = 0
        for word in full_text.split():
            if current_length + len(word) + 1 <= 100:
                current_line.append(word)
                current_length += len(word) + 1
            else:
                # Bug fix: guard against emitting a leading empty line when the
                # very first word already exceeds the wrap width.
                if current_line:
                    lines.append(" ".join(current_line))
                current_line = [word]
                current_length = len(word)
        if current_line:
            lines.append(" ".join(current_line))
        return "\n".join(lines)

    @staticmethod
    def chunk_transcript(transcript: List[Dict]) -> List[Dict]:
        """Split the transcript into ~CHUNK_SIZE-char chunks with sentence overlap.

        Consecutive chunks share the trailing (up to two) sentences of the
        previous chunk so retrieval keeps context across chunk boundaries.
        Each chunk is {'text': str, 'chunk_id': int}.
        """
        full_text = " ".join(entry['text'] for entry in transcript)
        # Split on sentence-ending punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', full_text)
        chunks = []
        current_chunk = []
        current_length = 0
        for sentence in sentences:
            sentence_length = len(sentence)
            if current_length + sentence_length <= CHUNK_SIZE:
                current_chunk.append(sentence)
                current_length += sentence_length
            else:
                if current_chunk:
                    chunks.append({
                        'text': " ".join(current_chunk),
                        'chunk_id': len(chunks)
                    })
                # Carry the tail of the finished chunk forward as overlap.
                overlap_text = " ".join(current_chunk[-2:]) if len(current_chunk) > 2 else " ".join(current_chunk)
                current_chunk = [overlap_text, sentence] if overlap_text else [sentence]
                current_length = len(overlap_text) + sentence_length if overlap_text else sentence_length
        if current_chunk:
            chunks.append({
                'text': " ".join(current_chunk),
                'chunk_id': len(chunks)
            })
        print(f"Created {len(chunks)} chunks from transcript")
        return chunks
# ============================================
# Vector Database Management
# ============================================
class VectorDatabase:
    """FAISS L2 index over sentence-transformer embeddings with temp-file persistence."""

    def __init__(self):
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer(EMBEDDING_MODEL)
        self.index = None  # faiss.IndexFlatL2 once built
        self.chunks: List[Dict] = []  # chunk dicts aligned with index row order
        # Use temp files for Hugging Face Spaces (only /tmp is reliably writable).
        # NOTE: NamedTemporaryFile(delete=False) creates the files immediately,
        # so they exist but are EMPTY until save() first runs — load() must not
        # rely on mere existence.
        self.index_path = tempfile.NamedTemporaryFile(delete=False, suffix='.bin').name
        self.chunks_path = tempfile.NamedTemporaryFile(delete=False, suffix='.pkl').name

    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed `texts` in batches of 32; returns an (n, dim) float array."""
        print(f"Creating embeddings for {len(texts)} chunks...")
        batch_size = 32
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            all_embeddings.append(self.embedding_model.encode(batch, show_progress_bar=True))
        return np.vstack(all_embeddings)

    def build_index(self, chunks: List[Dict]) -> bool:
        """Build a fresh FAISS index over `chunks` and persist it."""
        self.chunks = chunks
        embeddings = self.create_embeddings([chunk['text'] for chunk in chunks])
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(embeddings.astype('float32'))
        self.save()
        return True

    def search(self, query: str, k: int = MAX_RETRIEVAL_CHUNKS) -> List[Tuple[str, float]]:
        """Return up to k (chunk_text, l2_distance) pairs nearest to `query`."""
        if self.index is None or not self.chunks:
            return []
        query_embedding = self.embedding_model.encode([query])
        distances, indices = self.index.search(query_embedding.astype('float32'), k)
        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads with -1 when fewer than k vectors exist.
            if idx != -1 and idx < len(self.chunks):
                results.append((self.chunks[idx]['text'], float(distances[0][i])))
        return results

    def save(self):
        """Persist the index and chunk list to the temp files."""
        # Bug fix: use an identity check instead of truthiness on the index object.
        if self.index is not None:
            faiss.write_index(self.index, self.index_path)
            with open(self.chunks_path, 'wb') as f:
                pickle.dump(self.chunks, f)
            print("Database saved successfully")

    def load(self) -> bool:
        """Load a previously saved index; False when nothing usable is on disk.

        Bug fix: __init__ pre-creates both temp files, so os.path.exists() was
        always True and faiss.read_index() raised on the empty .bin file at
        startup. Require non-empty files and tolerate unreadable/corrupt data.
        """
        try:
            if os.path.getsize(self.index_path) > 0 and os.path.getsize(self.chunks_path) > 0:
                self.index = faiss.read_index(self.index_path)
                with open(self.chunks_path, 'rb') as f:
                    self.chunks = pickle.load(f)
                print(f"Loaded database with {len(self.chunks)} chunks")
                return True
        except (OSError, RuntimeError, pickle.UnpicklingError, EOFError):
            # Corrupt or partially written files: start empty rather than crash.
            self.index = None
            self.chunks = []
        return False

    def clear(self):
        """Drop the in-memory index and delete the persisted files."""
        self.index = None
        self.chunks = []
        for path in (self.index_path, self.chunks_path):
            if os.path.exists(path):
                os.remove(path)
        print("Database cleared")
# ============================================
# RAG Question Answering
# ============================================
class RAGQA:
    """RAG pipeline: retrieve transcript chunks, then answer via the Groq API."""

    def __init__(self):
        # Retrieval backend plus an optional Groq client; the client stays
        # None whenever no API key is configured in the environment.
        self.vector_db = VectorDatabase()
        self.client = groq.Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
        self.current_transcript_text = ""
        self.vector_db.load()

    def truncate_context(self, context: str, max_tokens: int = MAX_CONTEXT_TOKENS) -> str:
        """Clip `context` to ~max_tokens using the rough 4-chars-per-token rule."""
        limit = max_tokens * 4
        return context if len(context) <= limit else context[:limit] + "..."

    def process_video(self, youtube_url: str) -> Tuple[str, str, bool]:
        """Fetch, chunk, and index a video transcript.

        Returns (status_message, full_transcript_text, success_flag).
        """
        transcript, error = YouTubeTranscriptProcessor.extract_transcript(youtube_url)
        if error:
            return error, "", False
        if not transcript:
            return "No transcript data found", "", False
        # Human-readable transcript for the UI pane.
        full_text = YouTubeTranscriptProcessor.get_full_transcript_text(transcript)
        self.current_transcript_text = full_text
        # Overlapping chunks feed the vector index for retrieval.
        chunks = YouTubeTranscriptProcessor.chunk_transcript(transcript)
        if not chunks:
            return "No content to process", full_text, False
        self.vector_db.build_index(chunks)
        status = f"✅ Successfully processed {len(chunks)} chunks from video!"
        return status, full_text, True

    def ask_question(self, question: str) -> str:
        """Answer `question` from retrieved transcript context using Groq."""
        if not GROQ_API_KEY:
            return "⚠️ Please set your Groq API key in Hugging Face Secrets."
        if self.vector_db.index is None or not self.vector_db.chunks:
            return "⚠️ Please load a video transcript first (click 'Get Transcript') before asking questions."
        hits = self.vector_db.search(question, k=MAX_RETRIEVAL_CHUNKS)
        if not hits:
            return "❓ No relevant information found in the transcript. Please try a different question."
        # Concatenate retrieved chunks, then clip to the token budget.
        ctx = self.truncate_context(
            "\n\n---\n\n".join(text for text, _score in hits),
            MAX_CONTEXT_TOKENS,
        )
        messages = [
            {"role": "system",
             "content": "Answer questions based ONLY on the provided transcript context. Be brief (2-3 sentences max). If the answer isn't in the context, say so."},
            {"role": "user",
             "content": f"Context: {ctx}\n\nQuestion: {question}\n\nAnswer:"},
        ]
        try:
            response = self.client.chat.completions.create(
                messages=messages,
                model="llama-3.1-8b-instant",
                temperature=0.3,
                max_tokens=150
            )
            return response.choices[0].message.content
        except Exception as err:
            detail = str(err)
            if "rate_limit_exceeded" in detail or "too large" in detail:
                return "⚠️ Context too large. Please ask a more specific question."
            return f"❌ Error: {detail}"

    def clear_database(self) -> str:
        """Drop the index and cached transcript; report success to the UI."""
        self.vector_db.clear()
        self.current_transcript_text = ""
        return "🗑️ Database cleared successfully!"
# ============================================
# Gradio UI Application
# ============================================
# Initialize RAG system
# Module-level singleton shared by every UI callback; constructing it loads
# the embedding model once at import time (slow first start, fast thereafter).
rag_system = RAGQA()
def process_youtube_url(youtube_url):
    """Gradio callback: process a URL, returning (status, qa_status, transcript)."""
    # Reject an empty or whitespace-only URL box without touching the backend.
    if not youtube_url or not youtube_url.strip():
        return "❌ Please enter a YouTube URL", "⚠️ Waiting for video...", ""
    message, transcript_text, success = rag_system.process_video(youtube_url)
    if not success:
        return message, "❌ Failed to process video", ""
    return message, "✅ Ready for questions!", transcript_text
def answer_question(question, history):
    """Gradio callback: append a (question, answer) pair to the chat history."""
    # Blank submissions leave the history exactly as it was.
    if not question or not question.strip():
        return history
    reply = rag_system.ask_question(question)
    chat = history if history else []
    chat.append((question, reply))
    return chat
def clear_everything():
    """Gradio callback: reset database, both status boxes, transcript pane, and chat."""
    status = rag_system.clear_database()
    return status, "⚠️ Waiting for video...", "", []
# Create Gradio interface
with gr.Blocks(title="🎥 YouTube Video RAG Q&A", theme=gr.themes.Soft()) as demo:
    # Header / usage instructions.
    gr.Markdown("""
    # 📚 YouTube Video Q&A with RAG
    ### Extract transcript and ask questions about any YouTube video!
    **How it works:**
    1. Enter a YouTube URL
    2. Click "Get Transcript" to extract and process the video transcript
    3. Ask questions about the video content
    4. Get accurate answers based solely on the transcript
    **Note:** Make sure the video has captions/transcripts enabled.
    """)
    # URL entry plus the process button.
    with gr.Row():
        with gr.Column(scale=3):
            youtube_url = gr.Textbox(
                label="🔗 YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                lines=1
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("🎬 Get Transcript", variant="primary", size="lg")
    # Status readouts: processing result and Q&A readiness.
    with gr.Row():
        status_text = gr.Textbox(label="📊 Status", interactive=False, lines=2)
        qa_status = gr.Textbox(label="QA Status", interactive=False, lines=1, value="⚠️ Waiting for video...")
    gr.Markdown("---")
    # Main layout: full transcript on the left, chat on the right.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Complete Transcript")
            transcript_display = gr.Textbox(
                label="",
                interactive=False,
                lines=25,
                max_lines=25,
                placeholder="Transcript will appear here after processing..."
            )
        with gr.Column(scale=1):
            gr.Markdown("### 💬 Ask Questions")
            # NOTE(review): tuple-style chat history and `bubble_full_width`
            # are deprecated in newer Gradio releases — confirm the pinned
            # gradio version still supports them.
            chatbot = gr.Chatbot(
                label="Chat",
                height=400,
                bubble_full_width=False,
                avatar_images=(None, "🤖")
            )
            with gr.Row():
                question = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask about the video...",
                    lines=2,
                    scale=4
                )
                submit_btn = gr.Button("Ask", variant="primary", scale=1)
            with gr.Row():
                clear_chat_btn = gr.Button("🗑️ Clear Chat", variant="secondary", size="sm")
                clear_all_btn = gr.Button("🔄 Clear All", variant="stop", size="sm")
    # Event handlers
    # Process the URL and populate status boxes + transcript pane.
    process_btn.click(
        process_youtube_url,
        inputs=[youtube_url],
        outputs=[status_text, qa_status, transcript_display]
    )
    # Ask via the button; the .then() clears the question box afterwards.
    submit_btn.click(
        answer_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    ).then(
        lambda: "", None, [question]
    )
    # Clear only the chat pane.
    clear_chat_btn.click(
        lambda: [], None, [chatbot]
    )
    # Full reset: database, statuses, transcript, and chat.
    clear_all_btn.click(
        clear_everything,
        outputs=[status_text, qa_status, transcript_display, chatbot]
    )
    # Pressing Enter in the question box mirrors the Ask button.
    question.submit(
        answer_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    ).then(
        lambda: "", None, [question]
    )
# Launch the app
if __name__ == "__main__":
    demo.launch()