# ============================================================
# YouTube RAG Q&A System – Production-Quality Colab Notebook
# Author   : Your Name
# Model    : Groq LLaMA-3.3-70B-Versatile (128K context)
# Embedder : all-MiniLM-L6-v2 (Sentence-Transformers, free)
# Vector DB: FAISS (Facebook AI, free, CPU)
# UI       : Gradio 4.x
# ============================================================
# ─────────────────────────────────────────────────────────────
# MODULE 0 ❯ INSTALLATION
# Run this cell once. Restart runtime after it finishes.
# ─────────────────────────────────────────────────────────────
# !pip install -q \
#     gradio \
#     youtube-transcript-api \
#     sentence-transformers \
#     faiss-cpu \
#     groq \
#     langchain-text-splitters \
#     python-dotenv
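# Note: this notebook assumes youtube-transcript-api 1.x, whose instance API
# is YouTubeTranscriptApi().fetch(video_id). If the fetch call later fails
# with an AttributeError, you are likely on an older release – upgrade with:
# !pip install -q -U youtube-transcript-api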
# ─────────────────────────────────────────────────────────────
# MODULE 1 ❯ IMPORTS & CONFIGURATION
# All third-party imports live here.
# API key is read from Colab Secrets (preferred) or env var.
# ─────────────────────────────────────────────────────────────
import os
import re
import logging
from typing import Optional
# ── UI framework ─────────────────────────────────────────────
import gradio as gr
# ── YouTube transcript (free, no API key required) ───────────
# The exception classes are re-exported at package level in 1.x,
# so we avoid importing from the private _errors module.
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)
# ── Embedding model (local, runs on CPU) ─────────────────────
from sentence_transformers import SentenceTransformer
# ── Text splitting ───────────────────────────────────────────
from langchain_text_splitters import RecursiveCharacterTextSplitter
# ── Numerical / vector DB ────────────────────────────────────
import numpy as np
import faiss
# ── Groq LLM client ──────────────────────────────────────────
from groq import Groq
# ── Logging – shows clean status in Colab output ─────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("rag")
# ── API key ──────────────────────────────────────────────────
# Option A (recommended in Colab): use Secrets panel (🔑 left sidebar)
#   key name → GROQ_API_KEY
try:
    from google.colab import userdata  # type: ignore
    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
except Exception:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    raise EnvironmentError(
        "⚠️ GROQ_API_KEY not found. "
        "Add it via Colab Secrets (🔑) or set os.environ['GROQ_API_KEY']."
    )
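# Option B (local / non-Colab runs): set the env var before this cell runs.
# The key below is a hypothetical placeholder – real keys come from
# console.groq.com and should never be committed to source control.
# os.environ["GROQ_API_KEY"] = "gsk_your_key_here"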
# ── Model identifiers ────────────────────────────────────────
GROQ_MODEL  = "llama-3.3-70b-versatile"  # 128K-token context window
EMBED_MODEL = "all-MiniLM-L6-v2"         # 384-dim, fast, free, CPU-friendly
CHUNK_SIZE     = 500   # characters per chunk (the splitter measures with len)
CHUNK_OVERLAP  = 50    # overlap to preserve context across chunks
TOP_K          = 4     # how many chunks to retrieve per query
MAX_NEW_TOKENS = 1024  # LLM answer budget
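# Rough budget check: TOP_K * CHUNK_SIZE ≈ 2,000 characters of retrieved
# context per query (about 500 tokens at a ~4 chars/token heuristic), far
# below the model's 128K window, so context overflow is not a concern here.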
# ─────────────────────────────────────────────────────────────
# MODULE 2 ❯ MODEL INITIALISATION
# Load embedding model once at startup so every call is fast.
# Groq client is stateless – one instance is enough.
# ─────────────────────────────────────────────────────────────
log.info("Loading embedding model …")
embedding_model = SentenceTransformer(EMBED_MODEL)
log.info("Embedding model ready ✅")
groq_client = Groq(api_key=GROQ_API_KEY)
# ── Global vector store ──────────────────────────────────────
# These are module-level globals so every Gradio callback
# can read/write them without passing objects around.
vector_store: Optional[faiss.IndexFlatIP] = None  # FAISS index (inner product)
chunks_store: list[str] = []                      # parallel list of text chunks
current_video_title: str = ""                     # reserved for the UI (currently unused)
# ─────────────────────────────────────────────────────────────
# MODULE 3 ❯ YOUTUBE TRANSCRIPT FETCHER
# ─────────────────────────────────────────────────────────────
def extract_video_id(url: str) -> str:
    """
    Extract the YouTube video ID from any common URL format.
    Handles:
        https://www.youtube.com/watch?v=VIDEO_ID
        https://youtu.be/VIDEO_ID
        https://youtube.com/shorts/VIDEO_ID
        https://www.youtube.com/embed/VIDEO_ID
    """
    patterns = [
        r"(?:v=)([A-Za-z0-9_-]{11})",
        r"youtu\.be/([A-Za-z0-9_-]{11})",
        r"shorts/([A-Za-z0-9_-]{11})",
        r"embed/([A-Za-z0-9_-]{11})",
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    raise ValueError(f"Could not extract video ID from URL: {url}")
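# Quick self-test for the URL patterns above (pure regex, safe to run;
# uses a well-known public video ID purely as example input):
assert extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert extract_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert extract_video_id("https://youtube.com/shorts/dQw4w9WgXcQ") == "dQw4w9WgXcQ"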
def get_transcript(url: str) -> tuple[str, str]:
    """
    Fetch the transcript for a YouTube video.

    Returns
    -------
    (transcript_text, status_message)
    On error: (empty string, error description)
    """
    try:
        video_id = extract_video_id(url)
        log.info(f"Fetching transcript for video ID: {video_id}")
        api = YouTubeTranscriptApi()
        # .fetch() returns a FetchedTranscript object (1.x API)
        transcript_data = api.fetch(video_id)
        # Join all text segments into one continuous string
        full_text = " ".join(
            segment.text.strip()
            for segment in transcript_data
            if segment.text.strip()
        )
        word_count = len(full_text.split())
        log.info(f"Transcript fetched – {word_count:,} words")
        return full_text, f"✅ Transcript fetched ({word_count:,} words)"
    except VideoUnavailable:
        return "", "❌ Video is unavailable or private."
    except TranscriptsDisabled:
        return "", "❌ Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "", "❌ No transcript found. Try a video with auto-generated captions."
    except ValueError as e:
        return "", f"❌ Invalid URL – {e}"
    except Exception as e:
        log.exception("Unexpected error fetching transcript")
        return "", f"❌ Unexpected error: {e}"
# ─────────────────────────────────────────────────────────────
# MODULE 4 ❯ VECTOR DATABASE BUILDER
# Splits transcript → chunks → embeddings → FAISS index
# ─────────────────────────────────────────────────────────────
def build_vector_store(transcript: str) -> str:
    """
    Convert a raw transcript into a FAISS vector index.

    Steps
    -----
    1. Split text into overlapping chunks via RecursiveCharacterTextSplitter
    2. Encode each chunk with the embedding model
    3. Build a FAISS IndexFlatIP and add the vectors
    4. Store everything in module-level globals

    Returns
    -------
    Status message string.
    """
    global vector_store, chunks_store
    # ── Step 1: Chunk ────────────────────────────────────────
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,  # character-based length
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_text(transcript)
    log.info(f"Created {len(chunks)} chunks")
    if not chunks:
        return "❌ No chunks created – transcript may be too short."
    # ── Step 2: Embed ────────────────────────────────────────
    log.info("Encoding chunks …")
    embeddings = embedding_model.encode(
        chunks,
        show_progress_bar=False,
        batch_size=64,
        normalize_embeddings=True,  # cosine similarity via inner product
    )
    # ── Step 3: Index ────────────────────────────────────────
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # inner product → cosine on normalised vecs
    index.add(np.array(embeddings, dtype=np.float32))
    # ── Step 4: Persist to globals ───────────────────────────
    vector_store = index
    chunks_store = chunks
    log.info(f"FAISS index built – {index.ntotal} vectors, dim={dimension}")
    return f"✅ Indexed {len(chunks)} chunks into FAISS (dim={dimension})"
# ─────────────────────────────────────────────────────────────
# MODULE 5 ❯ RETRIEVER
# Similarity search: query → top-k relevant chunks
# ─────────────────────────────────────────────────────────────
def retrieve_context(query: str, top_k: int = TOP_K) -> str:
    """
    Retrieve the most semantically relevant chunks for a given query.

    Parameters
    ----------
    query : user's natural-language question
    top_k : number of chunks to return

    Returns
    -------
    String of concatenated retrieved chunks, separated by `---` dividers.
    """
    if vector_store is None or not chunks_store:
        return ""
    # Embed and normalise the query (same preprocessing as the chunks)
    query_vec = embedding_model.encode(
        [query],
        normalize_embeddings=True,
    )
    # FAISS inner-product search (cosine on normalised vectors)
    scores, indices = vector_store.search(
        np.array(query_vec, dtype=np.float32), top_k
    )
    retrieved = []
    for score, idx in zip(scores[0], indices[0]):
        if idx == -1:  # FAISS returns -1 for empty slots
            continue
        retrieved.append(f"[Relevance: {score:.3f}]\n{chunks_store[idx]}")
    log.info(f"Retrieved {len(retrieved)} chunks for query: '{query[:60]}…'")
    return "\n\n---\n\n".join(retrieved)
# ─────────────────────────────────────────────────────────────
# MODULE 6 ❯ LLM – GROQ LLAMA 3.3-70B
# Augment + Generate step of RAG
# ─────────────────────────────────────────────────────────────
SYSTEM_PROMPT = """\
You are a precise, helpful AI assistant that answers questions about YouTube videos \
based strictly on the provided transcript context.
Rules:
- Answer ONLY from the context provided.
- If the context does not contain enough information, say so clearly.
- Be concise but complete.
- Use bullet points for lists or steps.
- Never fabricate information not present in the context.
"""
def generate_answer(query: str) -> str:
    """
    Full RAG generate step:
    1. Retrieve relevant context from FAISS
    2. Build an augmented prompt
    3. Send to Groq LLaMA-3.3-70B
    4. Return the model's response

    Parameters
    ----------
    query : user's question

    Returns
    -------
    The model's answer as a string.
    """
    context = retrieve_context(query)
    if not context:
        return "⚠️ No relevant context found in the transcript for your question."
    user_message = f"""\
Context from the video transcript:
{context}
---
Question: {query}
Answer:"""
    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
            max_tokens=MAX_NEW_TOKENS,
            temperature=0.2,  # low temp → factual, grounded answers
            top_p=0.9,
        )
        answer = response.choices[0].message.content.strip()
        log.info("LLM response received")
        return answer
    except Exception as e:
        log.exception("Groq API error")
        return f"❌ LLM error: {e}"
# ─────────────────────────────────────────────────────────────
# MODULE 7 ❯ ORCHESTRATION PIPELINE
# Ties transcript fetch + vector store build together.
# Called by the Gradio "Process Video" button.
# ─────────────────────────────────────────────────────────────
def process_video(url: str) -> tuple[str, str, str]:
    """
    Full ingestion pipeline triggered by the UI.

    Returns
    -------
    (transcript_preview, index_status, combined_status)
    suitable for Gradio outputs.
    """
    if not url or not url.strip():
        return "", "", "⚠️ Please enter a YouTube URL."
    # ── Phase 1: Fetch transcript ────────────────────────────
    transcript, fetch_status = get_transcript(url.strip())
    if not transcript:
        return "", "", fetch_status
    # ── Phase 2: Build vector store ──────────────────────────
    index_status = build_vector_store(transcript)
    # ── Phase 3: Summary line for UI ─────────────────────────
    combined = f"{fetch_status}\n{index_status}\n\n🎬 Video is ready – switch to the Chat tab!"
    # Show first 2000 chars in the transcript preview box
    preview = transcript[:2000] + (" …[truncated]" if len(transcript) > 2000 else "")
    return preview, index_status, combined
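# The same pipeline can be driven headlessly, e.g. for batch experiments
# (the URL below is a placeholder, not a real video):
# preview, idx_status, status = process_video("https://www.youtube.com/watch?v=VIDEO_ID")
# print(status)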
# ─────────────────────────────────────────────────────────────
# MODULE 8 ❯ CHAT HANDLER
# Called on every user message in the Chat tab.
# ─────────────────────────────────────────────────────────────
def chat_with_video(
    user_query: str,
    history: list[tuple[str, str]],
) -> tuple[list[tuple[str, str]], str]:
    """
    Handle a single chat turn.

    Parameters
    ----------
    user_query : the question typed by the user
    history    : Gradio chat history (list of (user, assistant) pairs)

    Returns
    -------
    Updated history, empty string (clears the input box)
    """
    if not user_query.strip():
        return history, ""
    if vector_store is None:
        history.append((user_query, "⚠️ Please process a video first on the **Process Video** tab."))
        return history, ""
    answer = generate_answer(user_query)
    history.append((user_query, answer))
    return history, ""
# ─────────────────────────────────────────────────────────────
# MODULE 9 ❯ GRADIO USER INTERFACE
# Professional two-tab layout:
#   Tab 1 – Process Video (URL input, status, transcript preview)
#   Tab 2 – Chat (conversation window + input)
# ─────────────────────────────────────────────────────────────
CSS = """
/* ── Global ── */
#app-header { text-align: center; margin-bottom: 0.5rem; }
#status-box textarea {
    font-size: 0.85rem;
    color: var(--body-text-color);
    background: var(--input-background-fill);
}
#transcript-box textarea { font-size: 0.8rem; }
#chat-window { height: 480px; }
/* ── Chat input (Enter submits; see chat_input.submit wiring below) ── */
#chat-input textarea { resize: none; }
"""
with gr.Blocks(
    title="YouTube RAG Q&A",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
    ),
    css=CSS,
) as app:
    # ── Header ───────────────────────────────────────────────
    gr.Markdown(
        """
        # 🎥 YouTube RAG Q&A
        **Paste any YouTube URL → transcribe → chat with the video using AI**

        *Powered by [Groq](https://groq.com) · LLaMA 3.3-70B · FAISS · Sentence-Transformers*
        """,
        elem_id="app-header",
    )
    # ── Tab 1: Process Video ─────────────────────────────────
    with gr.Tab("🎥 Process Video", id="tab-process"):
        with gr.Row():
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                scale=4,
            )
            process_btn = gr.Button(
                "▶ Transcribe & Index",
                variant="primary",
                scale=1,
                min_width=180,
            )
        status_output = gr.Textbox(
            label="Pipeline Status",
            interactive=False,
            lines=4,
            elem_id="status-box",
        )
        with gr.Accordion("📜 Transcript Preview (first 2000 chars)", open=False):
            transcript_output = gr.Textbox(
                label="Raw transcript",
                interactive=False,
                lines=12,
                elem_id="transcript-box",
            )
        # ── Wiring ───────────────────────────────────────────
        # Hidden component absorbs the standalone index status that
        # process_video returns alongside the combined message.
        index_status_sink = gr.Textbox(visible=False)
        process_btn.click(
            fn=process_video,
            inputs=url_input,
            outputs=[transcript_output, index_status_sink, status_output],
        )
    # ── Tab 2: Chat ──────────────────────────────────────────
    with gr.Tab("💬 Chat with Video", id="tab-chat"):
        chatbot = gr.Chatbot(
            label="Conversation",
            bubble_full_width=False,
            height=480,
            elem_id="chat-window",
        )
        with gr.Row():
            chat_input = gr.Textbox(
                placeholder="Ask anything about the video…",
                label="",
                scale=5,
                elem_id="chat-input",
                autofocus=True,
            )
            send_btn = gr.Button("Send ➤", variant="primary", scale=1, min_width=100)
        clear_btn = gr.Button("🗑 Clear conversation", variant="secondary", size="sm")
        # ── Wiring ───────────────────────────────────────────
        # Submit on button click or Enter key
        send_btn.click(
            fn=chat_with_video,
            inputs=[chat_input, chatbot],
            outputs=[chatbot, chat_input],
        )
        chat_input.submit(
            fn=chat_with_video,
            inputs=[chat_input, chatbot],
            outputs=[chatbot, chat_input],
        )
        clear_btn.click(fn=lambda: [], outputs=chatbot)
    # ── Footer ───────────────────────────────────────────────
    gr.Markdown(
        "<center style='font-size:0.75rem; color:#888;'>"
        "Open-source · No data stored · Embeddings computed locally"
        "</center>"
    )
# ─────────────────────────────────────────────────────────────
# MODULE 10 ❯ LAUNCH
# ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    app.launch(
        debug=True,       # shows tracebacks in output
        share=True,       # creates a public gradio.live link (great for demos)
        show_error=True,
    )