Spaces:

agnixcode
/

youtube-rag-chat

Runtime error

App Files Community

agnixcode committed 30 days ago

Commit

bcdefd2

verified ·

1 Parent(s): 7462c22

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -27

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import re
 import gradio as gr
 import numpy as np
 import faiss
 from youtube_transcript_api import YouTubeTranscriptApi
 from sentence_transformers import SentenceTransformer
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -12,7 +13,7 @@ from groq import Groq
 # CONFIG & INITIALIZATION
 # ===============================
-# Get API Key from Environment Variables (Set this in HF Settings)
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
@@ -28,21 +29,27 @@ chunks_store = []
 # ===============================
 def extract_video_id(url):
-    """Extracts the 11-character YouTube video ID."""
-    regex = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
     match = re.search(regex, url)
     if match:
         return match.group(1)
     return None
 def get_transcript(url):
     try:
         video_id = extract_video_id(url)
         if not video_id:
-            return "ERROR: Invalid YouTube URL."
-        transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
-        full_text = " ".join([item['text'] for item in transcript_data])
         return full_text
     except Exception as e:
         return f"ERROR: Could not retrieve transcript. (Details: {str(e)})"
@@ -50,32 +57,53 @@ def get_transcript(url):
 def process_transcript(transcript):
     global vector_store, chunks_store
     splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=60)
     chunks = splitter.split_text(transcript)
     embeddings = embedding_model.encode(chunks)
     dimension = embeddings.shape[1]
     index = faiss.IndexFlatL2(dimension)
     index.add(np.array(embeddings).astype('float32'))
     vector_store = index
     chunks_store = chunks
 def retrieve_context(query, top_k=3):
     query_embedding = embedding_model.encode([query])
     distances, indices = vector_store.search(np.array(query_embedding).astype('float32'), top_k)
     retrieved_chunks = [chunks_store[i] for i in indices[0] if i != -1]
     return "\n\n".join(retrieved_chunks)
 def generate_answer(query):
     if not groq_client:
-        return "API Key not configured. Please add GROQ_API_KEY to Space Secrets."
     context = retrieve_context(query)
-    prompt = f"""You are a helpful AI assistant. Use ONLY the context below to answer.
-    Context: {context}
-    Question: {query}
-    Answer:"""
     response = groq_client.chat.completions.create(
         model="llama-3.3-70b-versatile",
@@ -84,23 +112,27 @@ def generate_answer(query):
     return response.choices[0].message.content
 # ===============================
-# GRADIO PIPELINE FUNCTIONS
 # ===============================
 def process_video_ui(url):
     if not url:
-        return "Please enter a URL", "❌ No URL provided"
     transcript = get_transcript(url)
     if transcript.startswith("ERROR"):
-        return transcript, "❌ Failed"
     process_transcript(transcript)
-    return transcript[:1000] + "...", "✅ Video processed! Start chatting."
 def chat_with_video_ui(user_query, history):
     if vector_store is None:
-        history.append((user_query, "⚠️ Please process a video in the first tab first."))
         return history, ""
     answer = generate_answer(user_query)
@@ -108,28 +140,28 @@ def chat_with_video_ui(user_query, history):
     return history, ""
 # ===============================
-# GRADIO UI DESIGN
 # ===============================
 with gr.Blocks(theme=gr.themes.Soft()) as app:
-    gr.Markdown("# 🎥 YouTube RAG: Chat with any Video")
-    gr.Markdown("Paste a YouTube link to transcribe it, then ask questions about the content using Llama 3.3 via Groq.")
     with gr.Tabs():
-        with gr.Tab("1. Process Video"):
-            url_input = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
-            process_btn = gr.Button("Transcribe & Index", variant="primary")
             with gr.Row():
                 status_output = gr.Textbox(label="Status")
-                transcript_preview = gr.Textbox(label="Transcript Preview (First 1000 chars)", lines=5)
             process_btn.click(process_video_ui, inputs=url_input, outputs=[transcript_preview, status_output])
-        with gr.Tab("2. Chat"):
-            chatbot = gr.Chatbot(height=450)
             with gr.Row():
-                msg = gr.Textbox(label="Ask a question...", placeholder="What is this video about?", scale=4)
-                submit = gr.Button("Send", variant="primary", scale=1)
             submit.click(chat_with_video_ui, inputs=[msg, chatbot], outputs=[chatbot, msg])
             msg.submit(chat_with_video_ui, inputs=[msg, chatbot], outputs=[chatbot, msg])

 import gradio as gr
 import numpy as np
 import faiss
+# Import the library
 from youtube_transcript_api import YouTubeTranscriptApi
 from sentence_transformers import SentenceTransformer
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 # CONFIG & INITIALIZATION
 # ===============================
+# Get API Key from Environment Variables (Set this in HF Space Secrets)
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
 # ===============================
 def extract_video_id(url):
+    """Extracts the 11-character YouTube video ID from various URL formats."""
+    regex = r"(?:v=|\/|be\/)([0-9A-Za-z_-]{11}).*"
     match = re.search(regex, url)
     if match:
         return match.group(1)
     return None
 def get_transcript(url):
+    """
+    Fetch transcript using the correct static method.
+    """
     try:
         video_id = extract_video_id(url)
         if not video_id:
+            return "ERROR: Invalid YouTube URL. Could not find Video ID."
+        # FIX: Calling the static method directly on the class
+        # NOTE: with no `languages` argument this requests the English transcript only; there is no fallback to other languages
+        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+        full_text = " ".join([item['text'] for item in transcript_list])
         return full_text
     except Exception as e:
         return f"ERROR: Could not retrieve transcript. (Details: {str(e)})"
 def process_transcript(transcript):
     global vector_store, chunks_store
+    # Split text into manageable chunks
     splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=60)
     chunks = splitter.split_text(transcript)
+    # Create embeddings
     embeddings = embedding_model.encode(chunks)
+    # Initialize FAISS Index
     dimension = embeddings.shape[1]
     index = faiss.IndexFlatL2(dimension)
     index.add(np.array(embeddings).astype('float32'))
+    # Store globally for retrieval
     vector_store = index
     chunks_store = chunks
 def retrieve_context(query, top_k=3):
+    if vector_store is None:
+        return ""
     query_embedding = embedding_model.encode([query])
     distances, indices = vector_store.search(np.array(query_embedding).astype('float32'), top_k)
+    # Fetch matching chunks
     retrieved_chunks = [chunks_store[i] for i in indices[0] if i != -1]
     return "\n\n".join(retrieved_chunks)
 def generate_answer(query):
     if not groq_client:
+        return "Error: Groq API Key is not set in Hugging Face Secrets."
     context = retrieve_context(query)
+    if not context:
+        return "I don't have any context from the video yet. Please process a video first."
+    prompt = f"""
+You are a professional AI Assistant. Use the provided context from a YouTube video to answer the user's question.
+If the answer isn't in the context, say you don't know based on the video.
+Context:
+{context}
+Question:
+{query}
+Answer:
+"""
     response = groq_client.chat.completions.create(
         model="llama-3.3-70b-versatile",
     return response.choices[0].message.content
 # ===============================
+# UI LOGIC
 # ===============================
 def process_video_ui(url):
     if not url:
+        return "Please enter a valid URL", "❌ No URL"
     transcript = get_transcript(url)
     if transcript.startswith("ERROR"):
+        return transcript, "❌ Failed to fetch transcript"
     process_transcript(transcript)
+    return transcript[:1500] + "...", "✅ Video processed! You can now chat."
 def chat_with_video_ui(user_query, history):
+    if not user_query:
+        return history, ""
     if vector_store is None:
+        history.append((user_query, "⚠️ Please process a video in the first tab before chatting."))
         return history, ""
     answer = generate_answer(user_query)
     return history, ""
 # ===============================
+# GRADIO INTERFACE
 # ===============================
 with gr.Blocks(theme=gr.themes.Soft()) as app:
+    gr.Markdown("# 🎥 YouTube RAG AI Expert")
+    gr.Markdown("Transcribe any YouTube video and chat with its content using Llama 3.3 & FAISS.")
     with gr.Tabs():
+        with gr.Tab("1. Load Video"):
+            url_input = gr.Textbox(label="YouTube Link", placeholder="https://www.youtube.com/watch?v=...")
+            process_btn = gr.Button("Transcribe & Index Video", variant="primary")
             with gr.Row():
                 status_output = gr.Textbox(label="Status")
+                transcript_preview = gr.Textbox(label="Transcript Preview", lines=8)
             process_btn.click(process_video_ui, inputs=url_input, outputs=[transcript_preview, status_output])
+        with gr.Tab("2. Chat with AI"):
+            chatbot = gr.Chatbot(height=500)
             with gr.Row():
+                msg = gr.Textbox(label="Your Question", placeholder="What are the key takeaways?", scale=4)
+                submit = gr.Button("Ask", variant="primary", scale=1)
             submit.click(chat_with_video_ui, inputs=[msg, chatbot], outputs=[chatbot, msg])
             msg.submit(chat_with_video_ui, inputs=[msg, chatbot], outputs=[chatbot, msg])