Spaces:

agnixcode
/

youtube-rag-chat

Runtime error

File size: 4,709 Bytes

7bd0e02
d46d52c
7bd0e02
 
 
7462c22
587e2e0
7462c22
 
 
 
670c1c5
7462c22
 
587e2e0
7bd0e02
 
 
587e2e0
7462c22
 
670c1c5
7462c22
 
 
 
670c1c5
7462c22
 
 
670c1c5
587e2e0
 
7bd0e02
670c1c5
7462c22
 
587e2e0
670c1c5
 
 
7bd0e02
 
587e2e0
 
 
 
7bd0e02
587e2e0
7bd0e02
670c1c5
 
7462c22
 
670c1c5
7bd0e02
670c1c5
7462c22
670c1c5
 
bcdefd2
670c1c5
7462c22
 
7bd0e02
7462c22
7bd0e02
670c1c5
 
 
 
7bd0e02
670c1c5
 
 
 
7bd0e02
587e2e0
 
 
670c1c5
 
 
7bd0e02
 
670c1c5
 
 
 
 
7bd0e02
670c1c5
7bd0e02
 
 
 
 
670c1c5
7462c22
 
670c1c5
7462c22
670c1c5
 
7bd0e02
670c1c5
 
587e2e0
 
7462c22
670c1c5
 
7bd0e02
d46d52c
 
7bd0e02
d46d52c
 
670c1c5
 
d46d52c
7bd0e02
670c1c5
 
 
 
 
7bd0e02
670c1c5
7bd0e02
670c1c5
 
 
 
7bd0e02
670c1c5
7bd0e02
 
670c1c5

import os
import re
import gradio as gr
import numpy as np
import faiss
from youtube_transcript_api import YouTubeTranscriptApi
from sentence_transformers import Transformer, SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from groq import Groq

# ===============================
# CONFIGURATION
# ===============================

# Groq API key comes from the environment (Hugging Face Secrets). Without a
# key no client is created and `groq_client` stays None.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if GROQ_API_KEY:
    groq_client = Groq(api_key=GROQ_API_KEY)
else:
    groq_client = None

# Sentence-embedding model shared by indexing and querying
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# State for the currently loaded video: the FAISS index and the text chunks
# whose positions the index rows refer to
vector_store = None
chunks_store = []

# ===============================
# CORE FUNCTIONS
# ===============================

def extract_video_id(url):
    """Return the 11-character YouTube video ID found in *url*, or None.

    Recognised forms are the ``v=`` query parameter (``watch?v=...``),
    ``youtu.be/<id>`` share links, and the ``/embed/``, ``/shorts/``,
    ``/live/`` and ``/v/`` path styles. The host name is not otherwise
    validated.

    Args:
        url: Text entered by the user. Non-string or blank values yield None.

    Returns:
        The video ID string, or None when no ID is recognised.
    """
    if not isinstance(url, str) or not url.strip():
        return None

    # The ID must follow a known marker. A bare "/" is deliberately not
    # accepted: it would turn the first 11 characters of any long path
    # segment (e.g. "/channel/UC...") into a bogus ID. The trailing lookahead
    # rejects tokens longer than 11 characters instead of truncating them.
    pattern = (
        r"(?:[?&]v=|youtu\.be/|/(?:embed|shorts|live|v)/)"
        r"([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])"
    )
    match = re.search(pattern, url)
    return match.group(1) if match else None

def get_transcript(url):
    """Fetch the transcript of the video at *url* as one string.

    Supports both youtube-transcript-api call styles: the instance method
    ``YouTubeTranscriptApi().fetch(...)`` when the class offers it, and the
    older static ``YouTubeTranscriptApi.get_transcript(...)`` otherwise.

    Args:
        url: YouTube URL; the video ID is obtained via ``extract_video_id``.

    Returns:
        The transcript segments joined by single spaces. On any failure
        (unrecognised URL, retrieval error, empty transcript) a message
        starting with ``"ERROR"`` is returned instead of raising.
    """
    video_id = extract_video_id(url)
    if not video_id:
        return "ERROR: Invalid YouTube URL."

    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            # Newer releases: instance API yielding snippet objects with a
            # ``text`` attribute.
            fetched = YouTubeTranscriptApi().fetch(video_id)
            pieces = [snippet.text for snippet in fetched]
        else:
            # Older releases: static helper returning a list of dicts.
            segments = YouTubeTranscriptApi.get_transcript(video_id)
            pieces = [segment["text"] for segment in segments]
    except Exception as e:
        return f"ERROR: Could not retrieve transcript. (Details: {str(e)})"

    text = " ".join(pieces)
    if not text.strip():
        # An empty transcript cannot be chunked or indexed downstream, so
        # report it through the same "ERROR" channel as other failures.
        return "ERROR: Could not retrieve transcript. (Details: transcript is empty)"
    return text

def build_vector_index(text):
    """Chunk *text*, embed the chunks and store them in a FAISS index.

    On success the module-level ``vector_store`` (FAISS index) and
    ``chunks_store`` (list of chunk strings) are replaced together, so the
    index rows always correspond to the stored chunks.

    Args:
        text: Full transcript text to index.

    Raises:
        ValueError: If splitting *text* yields no chunks. The stored state
            is cleared in that case so a previous video is not queried by
            mistake.
    """
    global vector_store, chunks_store

    # 1. Chunking
    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=60)
    new_chunks = splitter.split_text(text)

    if not new_chunks:
        vector_store = None
        chunks_store = []
        raise ValueError("Transcript produced no text chunks to index.")

    # 2. Embedding (FAISS is fed a contiguous float32 matrix)
    vectors = np.ascontiguousarray(
        embedding_model.encode(new_chunks), dtype=np.float32
    )

    # 3. Indexing with FAISS
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # Publish both globals only after every step succeeded; a failure above
    # leaves the previously loaded video fully intact.
    chunks_store = new_chunks
    vector_store = index

def get_ai_response(user_query):
    """Answer *user_query* from the indexed transcript via the Groq chat API.

    The query is embedded, the nearest transcript chunks (at most three) are
    looked up in ``vector_store``, and those chunks are sent as context with
    the question to the ``llama-3.3-70b-versatile`` model.

    Args:
        user_query: The user's question.

    Returns:
        The model's answer, or a plain-text message when no video is loaded,
        the Groq client is not configured, the question is blank, or the API
        call fails.
    """
    if vector_store is None or not chunks_store:
        return "Please load a video first."
    if groq_client is None:
        # Without this guard the failure surfaces as a cryptic
        # "'NoneType' object has no attribute 'chat'" message.
        return "Error: Groq API Key missing in Secrets."
    if not (user_query or "").strip():
        return "Please enter a question."

    # Search for relevant chunks; never ask for more neighbours than exist.
    top_k = min(3, len(chunks_store))
    query_vector = np.asarray(
        embedding_model.encode([user_query]), dtype=np.float32
    )
    _, neighbours = vector_store.search(query_vector, k=top_k)
    # Skip the -1 "no result" marker and any index outside the chunk list.
    context = "\n".join(
        chunks_store[int(idx)]
        for idx in neighbours[0]
        if 0 <= int(idx) < len(chunks_store)
    )

    prompt = f"""Use the following video transcript context to answer the question. 
    If the answer isn't in the context, say you don't know based on the video.
    
    Context: {context}
    Question: {user_query}
    Answer:"""

    try:
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}]
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"AI Error: {str(e)}"

# ===============================
# UI LOGIC
# ===============================

def process_video_step(url):
    """Fetch and index the video at *url* for the "Setup Video" tab.

    Args:
        url: YouTube URL typed by the user.

    Returns:
        A ``(preview, status)`` pair. On success, ``preview`` is the first
        1000 characters of the transcript (followed by "..." only when the
        transcript is longer). On failure, ``preview`` carries the error
        message and ``status`` is "❌ Failed".
    """
    transcript = get_transcript(url)
    if transcript.startswith("ERROR"):
        return transcript, "❌ Failed"

    try:
        build_vector_index(transcript)
    except Exception as e:
        # Report indexing problems in the UI instead of letting the handler
        # crash with an unhandled exception.
        return f"ERROR: Could not index transcript. (Details: {str(e)})", "❌ Failed"

    preview_limit = 1000
    preview_text = transcript[:preview_limit]
    if len(transcript) > preview_limit:
        # Mark truncation only when something was actually cut off.
        preview_text += "..."
    return preview_text, "✅ Video Indexed! Go to Chat tab."

def chat_step(message, history):
    """Handle one chat submission and extend the conversation history.

    Args:
        message: The text the user submitted.
        history: List of ``(user_message, bot_reply)`` pairs shown in the
            chatbot; ``None`` is treated as an empty conversation.

    Returns:
        A ``(history, "")`` pair: the history with the new exchange appended
        and an empty string that clears the input box.
    """
    if history is None:
        # The chatbot value can arrive as None (e.g. after it was cleared);
        # start a fresh list rather than failing on ``None.append``.
        history = []

    if not GROQ_API_KEY:
        reply = "Error: Groq API Key missing in Secrets."
    else:
        reply = get_ai_response(message)

    history.append((message, reply))
    return history, ""

# ===============================
# GRADIO INTERFACE
# ===============================

# Two-tab layout: the first tab loads and indexes a video, the second chats
# about it.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📺 YouTube AI Expert (RAG)")

    with gr.Tabs():
        with gr.Tab("1. Setup Video"):
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
            )
            process_btn = gr.Button("Process Video", variant="primary")
            status = gr.Textbox(label="Status")
            preview = gr.Textbox(
                label="Transcript Preview (First 1000 chars)",
                lines=5,
            )

            # The handler returns (preview text, status text), in that order.
            process_btn.click(
                fn=process_video_step,
                inputs=url_input,
                outputs=[preview, status],
            )

        with gr.Tab("2. Chat with Video"):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(label="Ask anything about the video...")
            clear = gr.ClearButton(components=[msg, chatbot])

            # Submitting returns the updated history plus an empty string
            # for the input box.
            msg.submit(
                fn=chat_step,
                inputs=[msg, chatbot],
                outputs=[chatbot, msg],
            )

if __name__ == "__main__":
    demo.launch()