Spaces:

agnixcode
/

youtube_chatbot_transcriber

Sleeping

App Files Files Community

agnixcode committed on Apr 22

Commit

23e75f0

verified ·

1 Parent(s): e34d257

Update app.py

Browse files

Files changed (1) hide show

app.py +335 -177

app.py CHANGED Viewed

@@ -1,192 +1,350 @@
-# ================================
-# INSTALL DEPENDENCIES
-# ================================
-# pip install sentence-transformers faiss-cpu gradio groq requests
-# ================================
-# IMPORTS
-# ================================
-import requests
-from sentence_transformers import SentenceTransformer
-import faiss
-import numpy as np
-import gradio as gr
-from groq import Groq
-import re
-import os
-# ================================
-# CONFIG
-# ================================
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-SUPADATA_API_KEY = os.getenv("SUPADATA_API_KEY")
-client = Groq(api_key=GROQ_API_KEY)
-embed_model = SentenceTransformer("all-MiniLM-L6-v2")
-# Global store
-vector_store = None
-stored_chunks = []
-# ================================
-# UTIL: EXTRACT VIDEO ID
-# ================================
-def extract_video_id(url):
-    match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
-    return match.group(1) if match else None
-# ================================
-# STEP 1: GET TRANSCRIPT
-# Using Supadata API — works from any cloud server (no IP blocks)
-# ================================
-def get_transcript(url):
-    video_id = extract_video_id(url)
-    if not video_id:
-        return "❌ Invalid YouTube URL"
     try:
-        response = requests.get(
-            "https://api.supadata.ai/v1/youtube/transcript",
-            params={"videoId": video_id, "text": "true"},
-            headers={"x-api-key": SUPADATA_API_KEY},
-            timeout=30
-        )
-        if response.status_code == 401:
-            return "❌ Invalid Supadata API key. Check your HF secret: SUPADATA_API_KEY"
-        if response.status_code == 404:
-            return "❌ No transcript found for this video (it may have captions disabled)"
-        if response.status_code != 200:
-            return f"❌ Supadata API error {response.status_code}: {response.text}"
-        data = response.json()
-        # text=true returns content as a plain string
-        content = data.get("content", "")
-        if not content:
-            return "❌ Transcript is empty"
-        return content
-    except Exception as e:
-        return f"❌ Transcript Error: {str(e)}"
-# ================================
-# STEP 2: CHUNKING
-# ================================
-def chunk_text(text, chunk_size=300):
-    words = text.split()
-    chunks = []
-    for i in range(0, len(words), chunk_size):
-        chunk = " ".join(words[i:i + chunk_size])
-        chunks.append(chunk)
-    return chunks
-# ================================
-# STEP 3: VECTOR STORE
-# ================================
-def create_vector_store(chunks):
-    global vector_store, stored_chunks
-    embeddings = embed_model.encode(chunks)
     dim = embeddings.shape[1]
     index = faiss.IndexFlatL2(dim)
-    index.add(np.array(embeddings))
-    vector_store = index
-    stored_chunks = chunks
-# ================================
-# STEP 4: RETRIEVAL
-# ================================
-def retrieve(query, top_k=3):
-    query_embedding = embed_model.encode([query])
-    distances, indices = vector_store.search(np.array(query_embedding), top_k)
-    results = [stored_chunks[i] for i in indices[0]]
-    return "\n".join(results)
-# ================================
-# STEP 5: LLM (GROQ)
-# ================================
-def generate_answer(query, context):
-    prompt = f"""You are a helpful assistant.
-Use ONLY the context below to answer the question.
-Context:
-{context}
-Question:
-{query}
-Answer:"""
-    response = client.chat.completions.create(
-        model="llama-3.3-70b-versatile",
-        messages=[{"role": "user", "content": prompt}],
-        temperature=0.3
     )
-    return response.choices[0].message.content
-# ================================
-# HANDLERS
-# ================================
-def handle_process(url):
-    transcript = get_transcript(url)
-    if transcript.startswith("❌"):
-        return transcript, "", []
-    chunks = chunk_text(transcript)
-    create_vector_store(chunks)
-    preview = transcript[:500]
-    return "✅ Video processed successfully!", preview, []
-def handle_chat(query, chat_history):
-    if vector_store is None:
-        return "", chat_history + [(query, "❌ Process a video first")]
-    context = retrieve(query)
-    answer = generate_answer(query, context)
-    chat_history.append((query, answer))
-    return "", chat_history
-# ================================
-# UI
-# ================================
-with gr.Blocks(theme=gr.themes.Soft()) as app:
-    gr.Markdown("# 🎥 YouTube Video Assistant")
-    gr.Markdown("Paste a YouTube link → process → chat with the video")
-    with gr.Row():
-        url_input = gr.Textbox(label="🔗 YouTube URL", scale=4)
-        process_btn = gr.Button("🚀 Process", scale=1)
-    status_output = gr.Markdown("")
-    transcript_preview = gr.Textbox(
-        label="📄 Transcript Preview",
-        lines=5,
-        interactive=False
     )
-    gr.Markdown("---")
-    chatbot = gr.Chatbot(label="💬 Chat with Video")
-    with gr.Row():
-        query_input = gr.Textbox(
-            placeholder="Ask something about the video...",
-            scale=4
         )
-        send_btn = gr.Button("Send", scale=1)
-    process_btn.click(
-        handle_process,
-        inputs=url_input,
-        outputs=[status_output, transcript_preview, chatbot]
-    )
-    send_btn.click(
-        handle_chat,
-        inputs=[query_input, chatbot],
-        outputs=[query_input, chatbot]
     )
-app.launch()

+# app.py
+import os
+import re
+import gradio as gr
+import numpy as np
+import faiss
+import torch
+from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import InferenceClient
+# ---------------------------------------------------------------------------
+# Global state
+# ---------------------------------------------------------------------------
+# Sentence-embedding model, loaded once at import time and shared by
+# process_video() (chunk embeddings) and retrieve_context() (query embedding).
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+# None until process_video() has built an index for a video.
+faiss_index: faiss.IndexFlatL2 | None = None
+chunk_store: list[str] = []          # parallel list of text chunks: row i of faiss_index embeds chunk_store[i]
+full_transcript: str = ""            # raw transcript for display; written by process_video()
+# HF Inference API – set HF_TOKEN as a Space secret or environment variable.
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"   # swap freely
+# An unset/empty HF_TOKEN is passed as None rather than as an empty string.
+inference_client = InferenceClient(model=LLM_MODEL, token=HF_TOKEN or None)
+# ---------------------------------------------------------------------------
+# Helper – extract video id from various YouTube URL formats
+# ---------------------------------------------------------------------------
+def _extract_video_id(url: str) -> str:
+    """Return the 11-character YouTube video ID contained in *url*.
+
+    Recognised forms are ``?v=<id>`` / ``&v=<id>`` (the ``v`` query parameter
+    at any position), ``youtu.be/<id>``, ``embed/<id>``, ``shorts/<id>`` and
+    ``live/<id>``. Patterns are tried in that order and the first hit wins.
+
+    Raises:
+        ValueError: if none of the recognised forms occurs in *url*.
+    """
+    patterns = [
+        # Anchor on the start of the string or a ``?``/``&`` separator so that
+        # only the real ``v`` parameter matches, not the tail of another
+        # parameter name such as ``rev=`` or ``nv=``.
+        r"(?:^|[?&])v=([A-Za-z0-9_-]{11})",     # ?v=xxxx / &v=xxxx
+        r"(?:youtu\.be/)([A-Za-z0-9_-]{11})",  # short link
+        r"(?:embed/)([A-Za-z0-9_-]{11})",       # embed link
+        r"(?:shorts/)([A-Za-z0-9_-]{11})",      # shorts
+        r"(?:live/)([A-Za-z0-9_-]{11})",        # live stream link
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    raise ValueError(f"Could not extract a valid video ID from URL: {url}")
+# ---------------------------------------------------------------------------
+# 1. Fetch transcript
+# ---------------------------------------------------------------------------
+def get_transcript(url: str) -> str:
+    """Fetch the transcript of a YouTube video as one space-joined string.
+
+    English captions (``en``, ``en-US``, ``en-GB``) are requested first. When
+    the video has none, the first available transcript that is marked as
+    translatable is fetched in its English translation instead.
+
+    Raises:
+        ValueError: with a human-readable message when *url* holds no video
+            ID, transcripts are disabled, no usable transcript exists, or the
+            lookup fails for any other reason. The original exception is
+            attached as ``__cause__``.
+    """
+    english_codes = ["en", "en-US", "en-GB"]
+    video_id = _extract_video_id(url)
     try:
+        transcript_list = YouTubeTranscriptApi.get_transcript(
+            video_id, languages=english_codes
+        )
+    except TranscriptsDisabled as exc:
+        raise ValueError("Transcripts are disabled for this video.") from exc
+    except NoTranscriptFound as exc:
+        # No English captions: fall back to any transcript that can be
+        # translated and request its English version.
+        try:
+            available = YouTubeTranscriptApi.list_transcripts(video_id)
+            source = next((t for t in available if t.is_translatable), None)
+            transcript_list = (
+                source.translate("en").fetch() if source is not None else None
+            )
+        except Exception as inner_exc:
+            raise ValueError(
+                f"No transcript found for this video. Details: {inner_exc}"
+            ) from inner_exc
+        if transcript_list is None:
+            # Nothing translatable either; report the original lookup failure.
+            raise ValueError(
+                f"No transcript found for this video. Details: {exc}"
+            ) from exc
+    except Exception as exc:
+        raise ValueError(f"Failed to retrieve transcript: {exc}") from exc
+    # Concatenate all segments into a single string
+    return " ".join(seg["text"] for seg in transcript_list)
+# ---------------------------------------------------------------------------
+# 2. Process video – build FAISS index
+# ---------------------------------------------------------------------------
+def process_video(url: str):
+    """Ingest one video and rebuild the module-level retrieval state.
+
+    Stages: fetch the transcript, split it into overlapping chunks (500
+    characters, 50 overlap), embed the chunks, and load the vectors into a
+    fresh FAISS L2 index. ``faiss_index``, ``chunk_store`` and
+    ``full_transcript`` are cleared before anything else happens.
+
+    Returns:
+        ``(status_message, transcript_text)`` for the Gradio UI; the
+        transcript is ``""`` when it could not be fetched.
+    """
+    global faiss_index, chunk_store, full_transcript
+    # Drop whatever the previous run left behind.
+    faiss_index, chunk_store, full_transcript = None, [], ""
+    # -- Step 1: transcript --------------------------------------------------
+    try:
+        full_transcript = get_transcript(url)
+    except ValueError as err:
+        return str(err), ""
+    # -- Step 2: chunking ----------------------------------------------------
+    chunks = RecursiveCharacterTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+        length_function=len,
+    ).split_text(full_transcript)
+    if not chunks:
+        return "Transcript was fetched but produced no text chunks.", full_transcript
+    chunk_store = chunks
+    # -- Step 3: embeddings (float32 array for FAISS) -------------------------
+    embeddings = np.array(
+        embedding_model.encode(chunks, show_progress_bar=False),
+        dtype="float32",
+    )
+    # -- Step 4: FAISS index -------------------------------------------------
     dim = embeddings.shape[1]
     index = faiss.IndexFlatL2(dim)
+    index.add(embeddings)
+    faiss_index = index
+    status = "\n".join(
+        [
+            "✅ Video processed successfully!",
+            f"   • Chunks created : {len(chunks)}",
+            f"   • Embedding dim  : {dim}",
+            f"   • FAISS vectors  : {index.ntotal}",
+            "",
+            "Switch to the **Chat with Video** tab to start asking questions.",
+        ]
     )
+    return status, full_transcript
+# ---------------------------------------------------------------------------
+# 3. Retrieve top-k chunks
+# ---------------------------------------------------------------------------
+def retrieve_context(query: str, top_k: int = 3) -> str:
+    """Return the transcript chunks closest to *query*, joined by blank lines.
+
+    The query is embedded with ``embedding_model`` and searched against
+    ``faiss_index``; at most ``top_k`` chunks (never more than are stored)
+    are returned.
+
+    Returns:
+        The retrieved chunks separated by ``"\\n\\n"``, or ``""`` when no
+        index has been built, no chunks are stored, or ``top_k`` is not
+        positive.
+    """
+    # A non-positive k is not a valid search size; treat it as "no context".
+    if faiss_index is None or not chunk_store or top_k <= 0:
+        return ""
+    query_vec = embedding_model.encode([query], show_progress_bar=False)
+    query_vec = np.array(query_vec, dtype="float32")
+    k = min(top_k, len(chunk_store))
+    _, indices = faiss_index.search(query_vec, k)
+    # FAISS reports a missing neighbour as -1. Without the lower bound that
+    # value would index chunk_store from the end and return the last chunk.
+    retrieved = [chunk_store[i] for i in indices[0] if 0 <= i < len(chunk_store)]
+    return "\n\n".join(retrieved)
+# ---------------------------------------------------------------------------
+# 4. Generate answer via HF Inference API (RAG prompt)
+# ---------------------------------------------------------------------------
+def generate_answer(query: str) -> str:
+    """Answer *query* from the processed transcript via the hosted LLM.
+
+    Looks up the three most relevant chunks with ``retrieve_context`` and
+    sends them, together with the question, to ``inference_client``. The
+    system message tells the model to answer only from that context.
+
+    Returns:
+        The model's reply with surrounding whitespace removed, or a
+        user-facing warning/error string when no video is loaded, no
+        context was retrieved, or the inference call raised.
+    """
+    if faiss_index is None:
+        return (
+            "⚠️ No video has been processed yet. "
+            "Please go to the **Process Video** tab and load a YouTube URL first."
+        )
+    context = retrieve_context(query, top_k=3)
+    if not context:
+        return "⚠️ Could not retrieve any relevant context for your question."
+    # Grounding instructions for the instruction-tuned model.
+    instructions = (
+        "You are a helpful assistant that answers questions strictly based on "
+        "the provided transcript context. "
+        "If the answer is not contained in the context, say: "
+        "'I could not find this information in the video transcript.' "
+        "Do NOT make up information."
     )
+    question_block = (
+        f"Context from the video transcript:\n---\n{context}\n---\n\n"
+        f"Question: {query}\n\nAnswer:"
+    )
+    try:
+        completion = inference_client.chat_completion(
+            messages=[
+                {"role": "system", "content": instructions},
+                {"role": "user", "content": question_block},
+            ],
+            max_tokens=512,
+            temperature=0.2,  # low temperature keeps the reply close to the context
+            top_p=0.9,
+        )
+        return completion.choices[0].message.content.strip()
+    except Exception as exc:
+        # Surface the failure in the chat window instead of raising.
+        return (
+            f"❌ Model inference failed: {exc}\n\n"
+            "Make sure HF_TOKEN is set and the model endpoint is available."
         )
+# ---------------------------------------------------------------------------
+# 5. Gradio chat helper (maintains history list)
+# ---------------------------------------------------------------------------
+def chat(user_message: str, history: list[list[str]]):
+    """Handle one chat turn for the Gradio UI.
+
+    Appends ``[user_message, reply]`` to *history* in place. The reply is a
+    fixed prompt to enter a question when the message is blank, otherwise
+    the result of ``generate_answer``.
+
+    Returns:
+        ``(history, "")`` — the same list object that was passed in, plus an
+        empty string used to clear the question box.
+    """
+    is_blank = not user_message.strip()
+    reply = "Please enter a question." if is_blank else generate_answer(user_message)
+    history.append([user_message, reply])
+    return history, ""
+# ---------------------------------------------------------------------------
+# 6. Gradio UI
+# ---------------------------------------------------------------------------
+# Two-tab layout: the first tab runs process_video() on a URL, the second
+# sends questions through chat() and shows the running conversation.
+with gr.Blocks(
+    title="YouTube RAG Chatbot",
+    theme=gr.themes.Soft(),
+) as app:
+    gr.Markdown(
+        """
+        # 🎬 YouTube RAG Chatbot
+        **Process any YouTube video and chat with its transcript using Retrieval-Augmented Generation.**
+        > **Note:** Set your `HF_TOKEN` environment variable (Space secret) so the LLM inference works.
+        """
     )
+    with gr.Tabs():
+        # ------------------------------------------------------------------ #
+        # Tab 1 – Process Video
+        # ------------------------------------------------------------------ #
+        with gr.TabItem("📥 Process Video"):
+            gr.Markdown(
+                "Paste a YouTube URL below and click **Process**. "
+                "The transcript will be fetched, chunked, embedded, and indexed."
+            )
+            with gr.Row():
+                url_input = gr.Textbox(
+                    label="YouTube URL",
+                    placeholder="https://www.youtube.com/watch?v=...",
+                    scale=5,
+                )
+                process_btn = gr.Button("⚙️ Process", variant="primary", scale=1)
+            status_output = gr.Textbox(
+                label="Status",
+                lines=6,
+                interactive=False,
+            )
+            transcript_output = gr.Textbox(
+                label="Transcript (read-only)",
+                lines=15,
+                interactive=False,
+                show_copy_button=True,
+            )
+            # process_video() returns (status, transcript), mapped in that
+            # order onto the two read-only textboxes.
+            process_btn.click(
+                fn=process_video,
+                inputs=[url_input],
+                outputs=[status_output, transcript_output],
+            )
+        # ------------------------------------------------------------------ #
+        # Tab 2 – Chat with Video
+        # ------------------------------------------------------------------ #
+        with gr.TabItem("💬 Chat with Video"):
+            gr.Markdown(
+                "Ask any question about the processed video. "
+                "The bot retrieves the most relevant transcript segments "
+                "and generates a grounded answer."
+            )
+            chatbot = gr.Chatbot(
+                label="Conversation",
+                height=450,
+                bubble_full_width=False,
+            )
+            with gr.Row():
+                query_input = gr.Textbox(
+                    label="Your question",
+                    placeholder="What is the main topic discussed in this video?",
+                    scale=5,
+                )
+                send_btn = gr.Button("Send 🚀", variant="primary", scale=1)
+            clear_btn = gr.Button("🗑️ Clear conversation", variant="secondary")
+            # Shared state for conversation history; this is the list that
+            # chat() receives and appends each question/answer pair to.
+            chat_history = gr.State([])
+            # chat() returns (history, ""): the history is rendered in the
+            # chatbot and the empty string clears the question box. The
+            # follow-up step copies the chatbot value back into chat_history.
+            send_btn.click(
+                fn=chat,
+                inputs=[query_input, chat_history],
+                outputs=[chatbot, query_input],
+            ).then(
+                fn=lambda h: h,
+                inputs=[chatbot],
+                outputs=[chat_history],
+            )
+            # Submitting the question textbox runs the same two-step chain
+            # as the Send button.
+            query_input.submit(
+                fn=chat,
+                inputs=[query_input, chat_history],
+                outputs=[chatbot, query_input],
+            ).then(
+                fn=lambda h: h,
+                inputs=[chatbot],
+                outputs=[chat_history],
+            )
+            # Reset both the visible conversation and the stored history to
+            # empty lists.
+            clear_btn.click(
+                fn=lambda: ([], []),
+                outputs=[chatbot, chat_history],
+            )
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+# Start the Gradio app only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    app.launch()