"""Gradio app: load a YouTube video's transcript and answer questions about it.

Pipeline: YouTube URL -> video ID -> transcript text -> chunked FAISS index
-> top-k similarity search -> Groq chat completion grounded in those chunks.
"""

import os
from urllib.parse import parse_qs, urlparse

import gradio as gr
from groq import Groq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi

# ── Setup ──────────────────────────────────────────────
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")  # Set this in HF Space Secrets
client = Groq(api_key=GROQ_API_KEY)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Module-level index for the most recently loaded video; replaced by
# load_video() and read by chat().
# NOTE(review): being a module global, this looks like it is shared by every
# browser session of the app -- confirm that is acceptable, or move to gr.State.
vectorstore = None

# Path prefixes of the form /<prefix>/<video_id>.
_PATH_PREFIXES = ("live", "shorts", "embed")
_SHORT_HOSTS = ("youtu.be", "www.youtu.be")


# ── URL Parsing ────────────────────────────────────────
def extract_video_id(url: str) -> str:
    """Return the video ID contained in a YouTube URL.

    Supported formats (the scheme is optional):
    - https://www.youtube.com/watch?v=ID
    - https://youtu.be/ID
    - https://www.youtube.com/live/ID
    - https://www.youtube.com/shorts/ID
    - https://www.youtube.com/embed/ID

    Raises:
        ValueError: if no non-empty video ID can be found in ``url``.
    """
    url = url.strip()
    parsed = urlparse(url)
    if not parsed.netloc:
        # Without a scheme, urlparse puts "youtu.be/ID" entirely into .path,
        # so the host would never be recognised. Re-parse with a scheme.
        parsed = urlparse("https://" + url.lstrip("/"))

    host = parsed.hostname or ""
    segments = [part for part in parsed.path.split("/") if part]

    video_id = ""
    if host in _SHORT_HOSTS:
        # youtu.be/ID -- first path segment, so a trailing "/" is harmless.
        if segments:
            video_id = segments[0]
    else:
        # /watch?v=ID takes precedence over path-based formats.
        v_values = parse_qs(parsed.query).get("v")
        if v_values:
            video_id = v_values[0]
        elif len(segments) >= 2 and segments[0] in _PATH_PREFIXES:
            # /live/ID, /shorts/ID or /embed/ID
            video_id = segments[1]

    if not video_id:
        raise ValueError(
            f"Could not extract video ID from: {url}\n"
            "Supported formats: /watch?v=ID, youtu.be/ID, /live/ID, "
            "/shorts/ID, /embed/ID"
        )
    return video_id


# ── Core Functions ─────────────────────────────────────
def get_transcript(video_url: str) -> str:
    """Fetch the transcript for ``video_url`` and return it as one string.

    The ``text`` of every transcript entry is joined with single spaces.
    Any exception raised by URL parsing or by the transcript fetch propagates.
    """
    video_id = extract_video_id(video_url)
    transcript = YouTubeTranscriptApi().fetch(video_id)
    return " ".join(entry.text for entry in transcript)


def build_vectorstore(text: str):
    """Split ``text`` into overlapping chunks and index them in FAISS.

    Chunks are at most 500 characters with a 50-character overlap and are
    embedded with the module-level ``embeddings`` model.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.create_documents([text])
    return FAISS.from_documents(chunks, embeddings)


def answer_query(vs, query: str) -> str:
    """Answer ``query`` using the three chunks of ``vs`` most similar to it.

    The retrieved chunks are placed in the system prompt as context and the
    question is sent as the user message; the model's reply text is returned.
    """
    docs = vs.similarity_search(query, k=3)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = f"""You are a helpful assistant that answers questions based ONLY on the provided video transcript context.
If the answer is not in the context, say 'This topic was not covered in the video.'

Context:
{context}"""
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content


# ── Gradio Handlers ────────────────────────────────────
def load_video(url):
    """Load a video's transcript into the global ``vectorstore``.

    Returns a ``(status_message, button_update)`` pair matching the
    ``[status_box, load_btn]`` outputs wired up in the UI below.

    The button update always keeps the Load button interactive: disabling it
    on an empty URL or a failed load would leave the user unable to retry.
    On failure the previously loaded index (if any) is left untouched.
    """
    global vectorstore
    keep_enabled = gr.update(interactive=True)

    if not url or not url.strip():
        return "⚠️ Please enter a YouTube URL.", keep_enabled

    try:
        transcript = get_transcript(url)
        # Assign only after both steps succeed so a failure never leaves a
        # half-built index behind.
        vectorstore = build_vectorstore(transcript)
    except Exception as e:  # UI boundary: surface any failure as a status message
        return f"❌ Error: {str(e)}", keep_enabled
    return "✅ Video loaded! You can now ask questions below.", keep_enabled


def chat(query, history):
    """ChatInterface callback: answer ``query`` from the loaded video.

    ``history`` is accepted because ChatInterface passes it, but is not used;
    each question is answered independently of earlier turns.
    """
    if vectorstore is None:
        return "⚠️ Please load a YouTube video first using the URL field above."
    if not query.strip():
        return "Please enter a question."
    return answer_query(vectorstore, query)


# ── UI ─────────────────────────────────────────────────
with gr.Blocks(title="YouTube RAG Chatbot") as app:
    gr.Markdown(
        "# 🎥 YouTube RAG Chatbot\n"
        "Paste any YouTube URL and ask questions about the video!"
    )

    with gr.Row():
        url_input = gr.Textbox(
            placeholder="https://www.youtube.com/watch?v=...",
            label="YouTube URL",
            scale=4,
        )
        load_btn = gr.Button("▶ Load Video", variant="primary", scale=1)

    status_box = gr.Textbox(label="Status", interactive=False)

    load_btn.click(
        fn=load_video,
        inputs=[url_input],
        outputs=[status_box, load_btn],
    )

    gr.ChatInterface(
        fn=chat,
        examples=[
            "What is neuroplasticity?",
            "How can we change our brain?",
            "What role does behavior play in learning?",
            "What did the speaker say about stroke patients?",
        ],
        title="",
    )

if __name__ == "__main__":
    app.launch()  # No share=True needed on HF Spaces