# ============================================================
# YouTube RAG QA System — app.py
# Transcript: Supadata API (works on HuggingFace, no SSL block)
# LLM: Groq LLaMA 3.3-70B
# Vector DB: FAISS + sentence-transformers
# UI: Gradio 5
# ============================================================

import os
import re
import requests
import numpy as np
import faiss
import gradio as gr
from sentence_transformers import SentenceTransformer
from groq import Groq

# ─── GLOBAL STATE ────────────────────────────────────────────
# Module-level state shared by the handlers and lazy getters below.
# Sentence-embedding model; stays None until get_embed_model() loads it.
_embed_model = None
# FAISS index for the most recently processed video; None until one is built.
_faiss_index = None
# Transcript chunks; position i is the text whose vector is row i of _faiss_index.
_chunks      = []
# Groq API client; stays None until get_groq_client() creates it.
_groq_client = None


# ─── LAZY CLIENTS ────────────────────────────────────────────
def get_groq_client():
    """
    Return the shared Groq client, creating it on the first call.

    The API key is read from the GROQ_API_KEY environment variable and the
    resulting client is cached in the module-level `_groq_client`, so later
    calls reuse the same object.

    Raises:
        ValueError: if GROQ_API_KEY is unset or contains only whitespace.
    """
    global _groq_client
    if _groq_client is None:
        key = os.environ.get("GROQ_API_KEY", "").strip()
        if key == "":
            raise ValueError(
                "GROQ_API_KEY not set!\n"
                "Space → Settings → Variables and secrets → New secret\n"
                "Name: GROQ_API_KEY   Value: gsk_xxxxxxxxxx"
            )
        _groq_client = Groq(api_key=key)
    return _groq_client


def get_embed_model():
    """Return the shared "all-MiniLM-L6-v2" encoder, loading it on first call."""
    global _embed_model
    if _embed_model is not None:
        # Already loaded by an earlier call: reuse it.
        return _embed_model
    _embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    return _embed_model


# ─── MODULE 1: Transcript Fetcher (via Supadata REST API) ────
def extract_video_id(url: str) -> str:
    """
    Extract the 11-character YouTube video ID from a URL.

    Recognised shapes, tried in this order:
      * ``...?v=<id>`` / ``...&v=<id>``   (watch URLs)
      * ``youtu.be/<id>``                 (short links)
      * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``, ``/v/<id>``
      * fallback: the first ``v=<id>`` or ``/<id>`` found *after* the host

    Args:
        url: The URL text to inspect.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: if no candidate ID can be found.
    """
    video_id = r"([0-9A-Za-z_-]{11})"

    # Specific shapes go first. A bare "/<11 chars>" rule applied to the whole
    # URL would also match the start of a dot-free host name
    # (e.g. "//youtube-nocookie.com" -> "youtube-noc").
    specific_patterns = (
        r"[?&]v=" + video_id,
        r"youtu\.be/" + video_id,
        r"/(?:shorts|embed|live|v)/" + video_id,
    )
    for pat in specific_patterns:
        m = re.search(pat, url)
        if m:
            return m.group(1)

    # Loose fallback for other path layouts: drop "scheme://host" first so
    # only the path/query part of the URL can supply the ID.
    after_host = re.sub(r"^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//[^/?#]*", "", url)
    m = re.search(r"(?:v=|/)" + video_id, after_host)
    if m:
        return m.group(1)

    raise ValueError(f"Cannot extract video ID from: {url}")


def fetch_transcript(url: str) -> str:
    """
    Fetch a video's transcript as plain text through the Supadata REST API.

    The request goes to Supadata rather than to YouTube directly, which is
    why it works on HuggingFace (no direct YouTube SSL connection needed).
    Free tier: 100 requests/month — get key at supadata.ai

    Args:
        url: YouTube URL; the video ID is pulled out with extract_video_id().

    Returns:
        The transcript text. When the API answers with a list of segments
        instead of a single string, the segments' "text" fields are joined
        with spaces.

    Raises:
        ValueError: if SUPADATA_API_KEY is unset/blank, the video ID cannot be
            extracted, the API answers with a non-200 status, or the response
            body is not the expected JSON shape.
    """
    supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
    if not supadata_key:
        raise ValueError(
            "SUPADATA_API_KEY not set!\n"
            "1. Go to https://supadata.ai → Sign up (free, no credit card)\n"
            "2. Copy your API key\n"
            "3. Space → Settings → Variables and secrets → New secret\n"
            "   Name: SUPADATA_API_KEY   Value: your_key_here"
        )

    video_id = extract_video_id(url)
    response = requests.get(
        "https://api.supadata.ai/v1/youtube/transcript",
        params={"videoId": video_id, "text": "true"},
        headers={"x-api-key": supadata_key},
        timeout=30,
    )

    if response.status_code == 401:
        raise ValueError("Invalid SUPADATA_API_KEY — check your key at supadata.ai")
    if response.status_code == 404:
        raise ValueError("No transcript found for this video (may be private or have no captions)")
    if response.status_code != 200:
        raise ValueError(f"Supadata API error {response.status_code}: {response.text}")

    try:
        data = response.json()
    except ValueError as err:
        # A 200 with a non-JSON body: report it in this function's own terms
        # instead of leaking a bare decode error.
        raise ValueError(f"Unexpected Supadata response: {response.text}") from err

    # Guard against a JSON body that is not an object (e.g. a list or string),
    # which would otherwise fail on .get() with AttributeError.
    content = data.get("content") if isinstance(data, dict) else None

    # text=true returns plain string in data["content"]
    if isinstance(content, str):
        return content

    # fallback: join segment list (entries that are not dicts are skipped)
    if isinstance(content, list):
        return " ".join(seg.get("text", "") for seg in content if isinstance(seg, dict))

    raise ValueError(f"Unexpected Supadata response: {data}")


# ─── MODULE 2: Text Chunker ───────────────────────────────────
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list:
    """
    Split a transcript into overlapping word-based chunks.

    Words are obtained with ``str.split()`` (any whitespace). Each chunk holds
    up to ``chunk_size`` words and consecutive chunks start
    ``chunk_size - overlap`` words apart. The final chunk may be shorter.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (empty when ``text`` contains no words).

    Raises:
        ValueError: if ``chunk_size <= 0`` or ``overlap >= chunk_size``.
            Either would make the window step non-positive, so the split
            would never advance.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the last word; a further window would
        # only repeat the overlapping tail.
        if start + chunk_size >= len(words):
            break
    return chunks


# ─── MODULE 3: Vector Store (FAISS) ──────────────────────────
def build_faiss_index(chunks: list):
    """
    Encode chunks with the MiniLM model and build a FAISS L2 index over them.

    Vectors are added in the order of ``chunks``, so row ``i`` of the index
    corresponds to ``chunks[i]``.

    Args:
        chunks: Non-empty list of text chunks to embed.

    Returns:
        A ``faiss.IndexFlatL2`` containing one float32 vector per chunk.

    Raises:
        ValueError: if ``chunks`` is empty.
    """
    # With no chunks there is no embedding matrix to read the vector
    # dimension from (emb.shape[1] below), so fail early with a clear message.
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty chunk list")

    emb = get_embed_model().encode(chunks, show_progress_bar=False).astype("float32")
    index = faiss.IndexFlatL2(emb.shape[1])
    index.add(emb)
    return index


def retrieve_chunks(query: str, index, chunks: list, top_k: int = 4) -> list:
    """
    Return the chunks whose vectors are closest to the query embedding.

    Args:
        query: The question text to embed and search with.
        index: FAISS index whose row ``i`` corresponds to ``chunks[i]``.
        chunks: The chunk texts that were indexed.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings, in the order FAISS returns them.
        Empty when ``chunks`` is empty or ``top_k`` is not positive.
    """
    if not chunks or top_k <= 0:
        return []

    q_vec = get_embed_model().encode([query]).astype("float32")
    _, idxs = index.search(q_vec, top_k)
    # FAISS pads the result with label -1 when the index holds fewer than
    # top_k vectors. -1 must be rejected explicitly, otherwise chunks[-1]
    # (the last chunk) would be returned as if it were a match.
    return [chunks[i] for i in idxs[0] if 0 <= i < len(chunks)]


# ─── MODULE 4: LLM via Groq ───────────────────────────────────
def ask_llm(question: str, context_chunks: list) -> str:
    """
    Build the RAG prompt and ask Groq's "llama-3.3-70b-versatile" model.

    Each chunk is labelled "[Chunk N]:" (numbered from 1) and the chunks are
    placed in a CONTEXT section ahead of the question. The model is told to
    answer only from that context.

    Args:
        question: The user's question.
        context_chunks: Retrieved transcript chunks to ground the answer.

    Returns:
        The first completion choice's message text, stripped of surrounding
        whitespace.
    """
    labelled = []
    for number, chunk in enumerate(context_chunks, start=1):
        labelled.append(f"[Chunk {number}]:\n{chunk}")
    context = "\n\n".join(labelled)

    instructions = (
        "You are a helpful assistant. Answer ONLY from the transcript context below.\n"
        "If the answer is not in the context, say: 'I could not find that in the video.'\n\n"
    )
    prompt = instructions + f"CONTEXT:\n{context}\n\nQUESTION: {question}\n\nANSWER:"

    client = get_groq_client()
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1024,
        temperature=0.3,
    )
    answer = completion.choices[0].message.content
    return answer.strip()


# ─── HANDLER: Process Video ───────────────────────────────────
def process_video(url: str):
    """
    Generator — yields live status messages to Textbox.

    Pipeline: fetch transcript → split into chunks → embed → build FAISS
    index. Each ``yield`` is the full status text to display at that point.

    The module-level ``_chunks`` and ``_faiss_index`` are replaced only after
    the new index has been built, and both in the same statement. On any
    failure both are cleared, so the chat handler reports that no video is
    loaded.

    Args:
        url: YouTube URL typed by the user (may be empty or None).

    Yields:
        Status strings; the last one is either the success summary or an
        "❌ Error: ..." message.
    """
    global _faiss_index, _chunks

    if not url or not url.strip():
        yield "⚠️ Please enter a YouTube URL first."
        return

    # Check keys before starting
    if not os.environ.get("SUPADATA_API_KEY", "").strip():
        yield (
            "❌ SUPADATA_API_KEY is missing!\n\n"
            "Steps to fix:\n"
            "1. Go to https://supadata.ai → Sign up FREE (no credit card)\n"
            "2. Get your API key from dashboard\n"
            "3. HuggingFace Space → Settings → Variables and secrets\n"
            "4. Click 'New secret'\n"
            "   Name:  SUPADATA_API_KEY\n"
            "   Value: your_supadata_key_here\n"
            "5. Save → Space will restart → Try again!"
        )
        return

    try:
        yield "⏳ [1/4] Fetching transcript via Supadata API..."
        transcript = fetch_transcript(url.strip())
        yield f"✅ [1/4] Transcript fetched! ({len(transcript.split()):,} words)\n⏳ [2/4] Splitting into chunks..."

        # Work on locals: while this generator is suspended at a yield, a chat
        # request must never see new chunks paired with the previous index.
        new_chunks = chunk_text(transcript)
        if not new_chunks:
            # An empty transcript cannot be embedded; say so directly instead
            # of letting the index builder fail with an unrelated error.
            raise ValueError("Transcript is empty — nothing to index.")
        yield f"✅ [2/4] {len(new_chunks)} chunks created\n⏳ [3/4] Generating embeddings (30-60 sec on CPU)..."

        new_index = build_faiss_index(new_chunks)

        # Publish both pieces of state together, only once the index exists.
        _chunks, _faiss_index = new_chunks, new_index
        yield (
            f"✅ [3/4] Embeddings generated\n"
            f"✅ [4/4] FAISS index ready!\n\n"
            f"🎉 Done! {len(new_chunks)} chunks indexed.\n"
            f"👉 Switch to '💬 Chat with Video' tab and ask your questions!"
        )

    except Exception as e:
        # Top-level UI boundary: surface any failure as a status message and
        # drop the shared state.
        _faiss_index = None
        _chunks      = []
        yield f"❌ Error: {e}"


# ─── HANDLER: Chat ────────────────────────────────────────────
def chat_fn(message: str, history: list):
    """
    Answer one chat message with the RAG pipeline: retrieve → augment → LLM.

    Args:
        message: The user's question from the input box.
        history: Chat history as a list of {"role", "content"} dicts; it is
            extended in place with the user turn and the assistant reply.

    Returns:
        ``(history, "")`` — the updated history and an empty string for the
        input box. A blank message returns the history untouched.
    """
    if not message.strip():
        return history, ""

    if _faiss_index is None or not _chunks:
        # Nothing indexed yet: walk the user through processing a video.
        reply = (
            "⚠️ No video processed yet!\n\n"
            "1. Go to '📹 Process Video' tab\n"
            "2. Paste a YouTube URL\n"
            "3. Click 🚀 Process Video\n"
            "4. Wait for ✅ success\n"
            "5. Come back here to chat!"
        )
    elif not os.environ.get("GROQ_API_KEY", "").strip():
        reply = (
            "❌ GROQ_API_KEY is missing!\n\n"
            "Space → Settings → Variables and secrets → New secret\n"
            "Name: GROQ_API_KEY   Value: gsk_xxxxxxxxxx"
        )
    else:
        try:
            relevant = retrieve_chunks(message, _faiss_index, _chunks)
            reply = ask_llm(message, relevant)
        except Exception as e:
            # Show retrieval/LLM failures in the chat instead of crashing the UI.
            reply = f"❌ Error: {e}"

    # Every non-blank message produces exactly one user turn and one reply.
    history.append({"role": "user",      "content": message})
    history.append({"role": "assistant", "content": reply})
    return history, ""


# ─── GRADIO UI ────────────────────────────────────────────────
# Build the two-tab interface. Components are laid out in the order they are
# created inside each context manager, so statement order here is the layout.
with gr.Blocks(title="YouTube RAG QA", theme=gr.themes.Soft()) as demo:

    # Page header shown above the tabs (user-facing copy mixes romanized
    # Hindi and English).
    gr.Markdown("""
    # 🎬 YouTube RAG QA System
    ### Kisi bhi YouTube video se sawaal poochho!
    **Powered by:** Supadata · FAISS · sentence-transformers · Groq LLaMA 3.3-70B · Gradio 5

    **Step 1 →** URL daalo + Process karo &nbsp;&nbsp;&nbsp; **Step 2 →** Chat tab mein sawaal karo
    """)

    with gr.Tabs():

        # ── Tab 1: Process Video ──────────────────────────────
        with gr.Tab("📹 Process Video"):
            gr.Markdown("YouTube URL paste karo. Transcript fetch → chunk → embed → FAISS index.")
            # URL input and the trigger button share one row (4:1 width ratio).
            with gr.Row():
                url_box     = gr.Textbox(
                    label="🔗 YouTube URL",
                    placeholder="https://www.youtube.com/watch?v=...  ya  https://youtu.be/...",
                    scale=4,
                )
                process_btn = gr.Button("🚀 Process Video", variant="primary", scale=1)

            # Read-only box that displays the status strings from process_video.
            status_box = gr.Textbox(
                label="📊 Live Processing Status",
                interactive=False,
                lines=9,
            )
            # process_video is a generator: each yielded string is sent to
            # status_box as the pipeline advances.
            process_btn.click(process_video, inputs=[url_box], outputs=[status_box])

        # ── Tab 2: Chat ───────────────────────────────────────
        with gr.Tab("💬 Chat with Video"):
            gr.Markdown("Video process hone ke baad yahan sawaal poochho.")
            # type="messages": history is a list of {"role", "content"} dicts,
            # the same shape chat_fn appends.
            chatbot  = gr.Chatbot(type="messages", height=430, label="Chat")
            with gr.Row():
                msg_box  = gr.Textbox(
                    placeholder="Sawaal likho aur Enter dabao...",
                    label="Your Question",
                    scale=5,
                )
                send_btn = gr.Button("Send ➤", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

            # Clicking Send and pressing Enter both run chat_fn, which returns
            # the updated history plus "" to empty the input box.
            send_btn.click(chat_fn, [msg_box, chatbot], [chatbot, msg_box])
            msg_box.submit(chat_fn, [msg_box, chatbot], [chatbot, msg_box])
            # Reset the visible chat history and the input box.
            clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg_box])

# Start the server only when run as a script: bind to all interfaces on port
# 7860 (presumably the port the hosting Space expects — confirm).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)