Spaces:

agnixcode
/

YoutubeTranscribevideochatbot

Sleeping

File size: 22,449 Bytes

f93afb8

# ============================================================
# YouTube RAG Q&A System — Production-Quality Colab Notebook
# Author  : Your Name
# Model   : Groq LLaMA-3.3-70B-Versatile (128K context)
# Embedder: all-MiniLM-L6-v2  (Sentence-Transformers, free)
# Vector DB: FAISS (Facebook AI, free, CPU)
# UI      : Gradio 4.x
# ============================================================


# ─────────────────────────────────────────────────────────────
# MODULE 0 ❯  INSTALLATION
# Run this cell once.  Restart runtime after it finishes.
# ─────────────────────────────────────────────────────────────

# !pip install -q \
#   gradio \
#   youtube-transcript-api \
#   sentence-transformers \
#   faiss-cpu \
#   groq \
#   langchain-text-splitters \
#   python-dotenv


# ─────────────────────────────────────────────────────────────
# MODULE 1 ❯  IMPORTS & CONFIGURATION
# All third-party imports live here.
# API key is read from Colab Secrets (preferred) or env var.
# ─────────────────────────────────────────────────────────────

import os
import re
import logging
from typing import Optional

# ── UI framework ─────────────────────────────────────────────
import gradio as gr

# ── YouTube transcript (free, no API key required) ───────────
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)

# ── Embedding model (local, runs on CPU) ─────────────────────
from sentence_transformers import SentenceTransformer

# ── Text splitting ────────────────────────────────────────────
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ── Numerical / vector DB ─────────────────────────────────────
import numpy as np
import faiss

# ── Groq LLM client ───────────────────────────────────────────
from groq import Groq

# ── Logging — shows clean status in Colab output ──────────────
# Shared logger for the whole pipeline; every module-level function reports
# progress and errors through it.
log = logging.getLogger("rag")

# Root logging configuration: INFO and above, one compact line per record
# with an HH:MM:SS timestamp.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# ── API key ────────────────────────────────────────────────────
# Option A (recommended in Colab): use Secrets panel (🔑 left sidebar)
#   key name → GROQ_API_KEY
# Resolve the Groq API key.
#   1. Colab Secrets are tried first.
#   2. The GROQ_API_KEY environment variable is used when that lookup is
#      unavailable, fails, or yields an empty value.
try:
    from google.colab import userdata  # type: ignore
    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
except Exception:
    # Deliberately broad: covers a missing google.colab package as well as
    # any error raised by the secret lookup itself.
    GROQ_API_KEY = None

if not GROQ_API_KEY:
    # Previously this fallback only ran when the Colab branch raised, so an
    # empty Colab secret masked a perfectly valid environment variable.
    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

# Fail fast at import time: nothing below can work without a key.
if not GROQ_API_KEY:
    raise EnvironmentError(
        "⚠️  GROQ_API_KEY not found. "
        "Add it via Colab Secrets (🔑) or set os.environ['GROQ_API_KEY']."
    )

# ── Model identifiers ──────────────────────────────────────────
GROQ_MODEL      = "llama-3.3-70b-versatile"   # 128K context, best OSS on Groq 2025
EMBED_MODEL     = "all-MiniLM-L6-v2"          # 384-dim, fast, free, CPU-friendly
CHUNK_SIZE      = 500                          # characters per chunk (the splitter measures length with len, not tokens)
CHUNK_OVERLAP   = 50                           # characters of overlap to preserve context across chunks
TOP_K           = 4                            # how many chunks to retrieve per query
MAX_NEW_TOKENS  = 1024                         # LLM answer budget (passed as max_tokens)


# ─────────────────────────────────────────────────────────────
# MODULE 2 ❯  MODEL INITIALISATION
# Load embedding model once at startup so every call is fast.
# Groq client is stateless — one instance is enough.
# ─────────────────────────────────────────────────────────────

# Load the sentence-embedding model once at import time; the same instance
# encodes both transcript chunks and user queries.
log.info("Loading embedding model …")
embedding_model = SentenceTransformer(EMBED_MODEL)
log.info("Embedding model ready ✓")

# Single shared Groq client used for every chat-completion request.
groq_client = Groq(api_key=GROQ_API_KEY)

# ── Global vector store ────────────────────────────────────────
# These are module-level globals so every Gradio callback
# can read/write them without passing objects around.
vector_store: Optional[faiss.IndexFlatIP] = None   # FAISS inner-product index; None until a video is indexed
chunks_store: list[str] = []                        # parallel list of text chunks (position i ↔ vector i)
current_video_title: str = ""                       # placeholder for a video title; not assigned anywhere in the visible code


# ─────────────────────────────────────────────────────────────
# MODULE 3 ❯  YOUTUBE TRANSCRIPT FETCHER
# ─────────────────────────────────────────────────────────────

def extract_video_id(url: str) -> str:
    """
    Extract the 11-character YouTube video ID from a URL or a bare ID.

    Handles:
      https://www.youtube.com/watch?v=VIDEO_ID
      https://youtu.be/VIDEO_ID
      https://youtube.com/shorts/VIDEO_ID
      https://www.youtube.com/embed/VIDEO_ID
      https://www.youtube.com/live/VIDEO_ID
      VIDEO_ID  (bare ID, surrounding whitespace ignored)

    Parameters
    ----------
    url : the text entered by the user

    Returns
    -------
    The video ID as a string.

    Raises
    ------
    ValueError if no video ID can be located in ``url``.
    """
    # A bare ID contains none of "=", "/" or ".", so accepting it cannot
    # shadow any of the URL patterns below.
    candidate = url.strip()
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
        return candidate

    patterns = [
        # Query parameter named exactly "v". The lookbehind keeps "v=" from
        # matching the tail of another parameter name such as "prev=", which
        # would otherwise win over the path-based patterns and yield the
        # wrong ID.
        r"(?<![A-Za-z0-9_-])v=([A-Za-z0-9_-]{11})",
        r"youtu\.be/([A-Za-z0-9_-]{11})",
        r"shorts/([A-Za-z0-9_-]{11})",
        r"embed/([A-Za-z0-9_-]{11})",
        r"live/([A-Za-z0-9_-]{11})",
    ]
    # Patterns are tried in priority order; the first hit wins.
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    raise ValueError(f"Could not extract video ID from URL: {url}")


def get_transcript(url: str) -> tuple[str, str]:
    """
    Download a video's transcript and flatten it into one string.

    Parameters
    ----------
    url : YouTube URL, resolved to a video ID via extract_video_id

    Returns
    -------
    (transcript_text, status_message)
    Failures never raise: they come back as ("", <error description>).
    """
    try:
        video_id = extract_video_id(url)
        log.info(f"Fetching transcript for video ID: {video_id}")

        # Instance-based API: .fetch() yields segment objects exposing .text
        fetched = YouTubeTranscriptApi().fetch(video_id)

        # Trim every segment, discard the blank ones, join with single spaces
        trimmed = (segment.text.strip() for segment in fetched)
        full_text = " ".join(piece for piece in trimmed if piece)

        n_words = len(full_text.split())
        log.info(f"Transcript fetched — {n_words:,} words")
        return full_text, f"✅ Transcript fetched ({n_words:,} words)"

    # Known transcript-API failures map to fixed user-facing messages.
    except VideoUnavailable:
        return "", "❌ Video is unavailable or private."
    except TranscriptsDisabled:
        return "", "❌ Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "", "❌ No transcript found. Try a video with auto-generated captions."
    # Any ValueError raised inside the try body is reported as a bad URL.
    except ValueError as url_error:
        return "", f"❌ Invalid URL — {url_error}"
    # Last resort: log the traceback and surface the message to the UI.
    except Exception as unexpected:
        log.exception("Unexpected error fetching transcript")
        return "", f"❌ Unexpected error: {unexpected}"


# ─────────────────────────────────────────────────────────────
# MODULE 4 ❯  VECTOR DATABASE BUILDER
# Splits transcript → chunks → embeddings → FAISS index
# ─────────────────────────────────────────────────────────────

def build_vector_store(transcript: str) -> str:
    """
    Turn a raw transcript into a searchable FAISS index.

    Steps
    -----
    1. Cut the text into overlapping chunks (RecursiveCharacterTextSplitter,
       lengths measured in characters)
    2. Encode the chunks into normalised embedding vectors
    3. Add the vectors to a fresh FAISS IndexFlatIP
    4. Publish the index and the chunk list through the module-level
       globals ``vector_store`` and ``chunks_store``

    Returns
    -------
    Status message string. When no chunks are produced the globals are left
    untouched and an error message is returned.
    """
    global vector_store, chunks_store

    # ── Step 1: Chunk ──────────────────────────────────────────
    pieces = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len,           # lengths are counted in characters
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    ).split_text(transcript)
    log.info(f"Created {len(pieces)} chunks")

    if not pieces:
        return "❌ No chunks created — transcript may be too short."

    # ── Step 2: Embed ──────────────────────────────────────────
    log.info("Encoding chunks …")
    vectors = embedding_model.encode(
        pieces,
        batch_size=64,
        normalize_embeddings=True,     # unit vectors → inner product equals cosine
        show_progress_bar=False,
    )

    # ── Step 3: Index ──────────────────────────────────────────
    dim = vectors.shape[1]
    flat_index = faiss.IndexFlatIP(dim)
    flat_index.add(np.array(vectors, dtype=np.float32))

    # ── Step 4: Persist to globals ─────────────────────────────
    vector_store, chunks_store = flat_index, pieces

    log.info(f"FAISS index built — {flat_index.ntotal} vectors, dim={dim}")
    return f"✅ Indexed {len(pieces)} chunks into FAISS (dim={dim})"


# ─────────────────────────────────────────────────────────────
# MODULE 5 ❯  RETRIEVER
# Similarity search: query → top-k relevant chunks
# ─────────────────────────────────────────────────────────────

def retrieve_context(query: str, top_k: int = TOP_K) -> str:
    """
    Look up the transcript chunks most similar to a question.

    Parameters
    ----------
    query  : user's natural-language question
    top_k  : number of nearest chunks requested from the index

    Returns
    -------
    The matching chunks, each prefixed with its relevance score and joined by
    "---" separators. An empty string when no index has been built yet.
    """
    # Nothing to search until a video has been indexed
    if vector_store is None or not chunks_store:
        return ""

    # Encode the question with the same normalisation used for the chunks
    query_matrix = np.array(
        embedding_model.encode([query], normalize_embeddings=True),
        dtype=np.float32,
    )

    # One query row in → one row of scores and one row of chunk positions out
    scores, indices = vector_store.search(query_matrix, top_k)

    # Positions equal to -1 are FAISS padding for unfilled result slots
    hits = [
        f"[Relevance: {score:.3f}]\n{chunks_store[position]}"
        for score, position in zip(scores[0], indices[0])
        if position != -1
    ]

    log.info(f"Retrieved {len(hits)} chunks for query: '{query[:60]}…'")
    return "\n\n---\n\n".join(hits)


# ─────────────────────────────────────────────────────────────
# MODULE 6 ❯  LLM — GROQ LLAMA 3.3-70B
# Augment + Generate step of RAG
# ─────────────────────────────────────────────────────────────

# System message sent as the first chat message of every request in
# generate_answer. It restricts the model to the retrieved transcript context.
# The trailing backslashes are line continuations inside the string literal.
SYSTEM_PROMPT = """\
You are a precise, helpful AI assistant that answers questions about YouTube videos \
based strictly on the provided transcript context.

Rules:
- Answer ONLY from the context provided.
- If the context does not contain enough information, say so clearly.
- Be concise but complete.
- Use bullet points for lists or steps.
- Never fabricate information not present in the context.
"""

def generate_answer(query: str) -> str:
    """
    Full RAG generate step:
      1. Retrieve relevant context from FAISS
      2. Build an augmented prompt
      3. Send to Groq LLaMA-3.3-70B
      4. Return the model's response

    Parameters
    ----------
    query : user's question

    Returns
    -------
    The model's answer as a string. Failures are returned as user-facing
    messages rather than raised:
      - no retrieved context      → "⚠️ No relevant context …"
      - API / client error        → "❌ LLM error: …"
      - empty or missing content  → "⚠️ The model returned an empty response …"
    """
    context = retrieve_context(query)

    if not context:
        return "⚠️ No relevant context found in the transcript for your question."

    user_message = f"""\
Context from the video transcript:

{context}

---

Question: {query}

Answer:"""

    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user",   "content": user_message},
            ],
            max_tokens=MAX_NEW_TOKENS,
            temperature=0.2,       # low temp → factual, grounded answers
            top_p=0.9,
        )
        # The message content may be None; normalise it to "" so a missing
        # body is not misreported as an API error via AttributeError.
        content = response.choices[0].message.content
        answer = (content or "").strip()
    except Exception as e:
        # Boundary handler: log the traceback, show a short message in the UI.
        log.exception("Groq API error")
        return f"❌ LLM error: {e}"

    if not answer:
        # Avoid posting a blank bubble into the chat window.
        log.warning("LLM returned an empty response")
        return "⚠️ The model returned an empty response. Please try rephrasing your question."

    log.info("LLM response received")
    return answer


# ─────────────────────────────────────────────────────────────
# MODULE 7 ❯  ORCHESTRATION PIPELINE
# Ties transcript fetch + vector store build together.
# Called by the Gradio "Process Video" button.
# ─────────────────────────────────────────────────────────────

def process_video(url: str) -> tuple[str, str, str]:
    """
    Full ingestion pipeline triggered by the UI.

    Parameters
    ----------
    url : text from the "YouTube URL" box; blank or missing input is rejected

    Returns
    -------
    (transcript_preview, index_status, combined_status)
    suitable for Gradio outputs.

    - Blank URL or failed transcript fetch → ("", "", <message>)
    - Indexing failure → preview and both status lines, without the
      "Video is ready" hint
    - Success → preview, index status, and the combined status ending with
      the hint to switch to the Chat tab
    """
    if not url or not url.strip():
        return "", "", "⚠️ Please enter a YouTube URL."

    # ── Phase 1: Fetch transcript ──────────────────────────────
    transcript, fetch_status = get_transcript(url.strip())
    if not transcript:
        return "", "", fetch_status

    # ── Phase 2: Build vector store ───────────────────────────
    index_status = build_vector_store(transcript)

    # Show first 2000 chars in the transcript preview box
    preview = transcript[:2000] + (" …[truncated]" if len(transcript) > 2000 else "")

    # build_vector_store signals failure with a "❌"-prefixed message; in that
    # case the video is NOT ready, so do not tell the user to start chatting.
    if index_status.startswith("❌"):
        return preview, index_status, f"{fetch_status}\n{index_status}"

    # ── Phase 3: Summary line for UI ──────────────────────────
    combined = f"{fetch_status}\n{index_status}\n\n💬 Video is ready — switch to the Chat tab!"

    return preview, index_status, combined


# ─────────────────────────────────────────────────────────────
# MODULE 8 ❯  CHAT HANDLER
# Called on every user message in the Chat tab.
# ─────────────────────────────────────────────────────────────

def chat_with_video(
    user_query: str,
    history: list[tuple[str, str]],
) -> tuple[list[tuple[str, str]], str]:
    """
    Handle a single chat turn.

    Parameters
    ----------
    user_query : the question typed by the user; None or blank is ignored
    history    : Gradio chat history (list of (user, assistant) pairs);
                 None is treated as an empty conversation

    Returns
    -------
    Updated history, empty string (clears the input box).
    The history is returned as a new list; the list passed in is not mutated.
    """
    # Work on a copy so the caller's list is never modified in place, and
    # normalise a missing history to an empty conversation.
    history = list(history) if history else []

    # Ignore empty submissions (including a None query) without adding a turn.
    if not user_query or not user_query.strip():
        return history, ""

    if vector_store is None:
        history.append((user_query, "⚠️ Please process a video first on the **Process Video** tab."))
        return history, ""

    answer = generate_answer(user_query)
    history.append((user_query, answer))
    return history, ""


# ─────────────────────────────────────────────────────────────
# MODULE 9 ❯  GRADIO USER INTERFACE
# Professional two-tab layout:
#   Tab 1 — Process Video (URL input, status, transcript preview)
#   Tab 2 — Chat         (conversation window + input)
# ─────────────────────────────────────────────────────────────

# Custom stylesheet handed to gr.Blocks(css=...). Each selector targets an
# elem_id assigned to a component in the UI definition: app-header,
# status-box, transcript-box, chat-window and chat-input.
# NOTE(review): the "Send on Enter" label inside the stylesheet sits above a
# rule that only disables textarea resizing; Enter-to-send itself is wired
# through chat_input.submit in the UI definition.
CSS = """
/* ── Global ── */
#app-header { text-align: center; margin-bottom: 0.5rem; }
#status-box textarea {
    font-size: 0.85rem;
    color: var(--body-text-color);
    background: var(--input-background-fill);
}
#transcript-box textarea { font-size: 0.8rem; }
#chat-window { height: 480px; }
/* ── Send on Enter ── */
#chat-input textarea { resize: none; }
"""

# Build the whole UI inside one Blocks context; the resulting object is bound
# to `app` and launched at the bottom of the file.
with gr.Blocks(
    title="YouTube RAG Q&A",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
    ),
    css=CSS,
) as app:

    # ── Header ─────────────────────────────────────────────────
    gr.Markdown(
        """
        # 🎥 YouTube RAG Q&A
        **Paste any YouTube URL → transcribe → chat with the video using AI**

        *Powered by [Groq](https://groq.com) · LLaMA 3.3-70B · FAISS · Sentence-Transformers*
        """,
        elem_id="app-header",
    )

    # ── Tab 1: Process Video ────────────────────────────────────
    with gr.Tab("📥  Process Video", id="tab-process"):

        # URL box and action button share one row (4:1 width ratio via scale)
        with gr.Row():
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                scale=4,
            )
            process_btn = gr.Button(
                "▶  Transcribe & Index",
                variant="primary",
                scale=1,
                min_width=180,
            )

        # Read-only box that receives process_video's combined status text
        status_output = gr.Textbox(
            label="Pipeline Status",
            interactive=False,
            lines=4,
            elem_id="status-box",
        )

        # Collapsed by default; receives process_video's transcript preview
        with gr.Accordion("📄  Transcript Preview (first 2000 chars)", open=False):
            transcript_output = gr.Textbox(
                label="Raw transcript",
                interactive=False,
                lines=12,
                elem_id="transcript-box",
            )

        # ── Wiring ────────────────────────────────────────────
        # process_video returns three values. The middle one (index_status)
        # is routed to an unnamed hidden Textbox created inline, so it is not
        # displayed on its own; it already appears inside the combined status.
        process_btn.click(
            fn=process_video,
            inputs=url_input,
            outputs=[transcript_output, gr.Textbox(visible=False), status_output],
        )

    # ── Tab 2: Chat ─────────────────────────────────────────────
    with gr.Tab("💬  Chat with Video", id="tab-chat"):

        # Conversation window; its value is the history list passed to and
        # returned from chat_with_video
        chatbot = gr.Chatbot(
            label="Conversation",
            bubble_full_width=False,
            height=480,
            elem_id="chat-window",
        )

        with gr.Row():
            chat_input = gr.Textbox(
                placeholder="Ask anything about the video…",
                label="",
                scale=5,
                elem_id="chat-input",
                autofocus=True,
            )
            send_btn = gr.Button("Send  ➤", variant="primary", scale=1, min_width=100)

        clear_btn = gr.Button("🗑  Clear conversation", variant="secondary", size="sm")

        # ── Wiring ────────────────────────────────────────────
        # Submit on button click or Enter key
        # Both events call the same handler; its second return value ("")
        # is written back to chat_input, which clears the box.
        send_btn.click(
            fn=chat_with_video,
            inputs=[chat_input, chatbot],
            outputs=[chatbot, chat_input],
        )
        chat_input.submit(
            fn=chat_with_video,
            inputs=[chat_input, chatbot],
            outputs=[chatbot, chat_input],
        )
        # Returning an empty list resets the chatbot's history
        clear_btn.click(fn=lambda: [], outputs=chatbot)

    # ── Footer ──────────────────────────────────────────────────
    # NOTE(review): generate_answer sends retrieved transcript chunks to the
    # Groq API, so "Transcript processed locally" may overstate what stays on
    # this machine — confirm the wording.
    gr.Markdown(
        "<center style='font-size:0.75rem; color:#888;'>"
        "Open-source · No data stored · Transcript processed locally"
        "</center>"
    )


# ─────────────────────────────────────────────────────────────
# MODULE 10 ❯  LAUNCH
# ─────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Start the Gradio server only when the file is executed directly.
    launch_options = {
        "debug": True,        # shows tracebacks in output
        "share": True,        # creates a public gradio.live link (great for demos)
        "show_error": True,
    }
    app.launch(**launch_options)