Spaces:

Daksh0505
/

Youtube-Chatbot

Sleeping

App Files Files Community

Daksh0505 committed on Oct 5, 2025

Commit

aa7e489

verified ·

1 Parent(s): b0b5ef6

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -90

app.py CHANGED Viewed

@@ -3,101 +3,83 @@ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingF
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain.prompts import PromptTemplate
 import os
 import requests
 api_key = os.getenv("HF_API_KEY")
 RAPIDAPI_KEY = (os.getenv("RAPIDAPI_KEY") or "").strip()
 if not RAPIDAPI_KEY:
     st.error("❌ RAPIDAPI_KEY not set. Please add it in your environment variables.")
-# 📼 Transcript Fetcher using RapidAPI
 @st.cache_data
 def get_transcript(video_id, language_code="en"):
     url = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
-    querystring = {"videoId": video_id, "lang": language_code}  # note videoId
     headers = {
         "x-rapidapi-key": RAPIDAPI_KEY,
         "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
     }
     try:
         response = requests.get(url, headers=headers, params=querystring, timeout=10)
-        st.write("Status Code:", response.status_code)
-        st.write("Response JSON:", response.text)
         if response.status_code != 200:
             st.error(f"API Error: {response.status_code}")
             return None
         data = response.json()
-        # Handle transcript properly
-        if isinstance(data, dict) and data.get("success") and "transcript" in data:
-            transcript_list = data["transcript"]
-            return ' '.join([item.get('text', '') for item in transcript_list])
-        elif isinstance(data, dict) and "message" in data:
-            st.error(f"API returned message: {data['message']}")
-            return None
         else:
             st.warning("Unexpected API response format")
             return None
     except Exception as e:
         st.error(f"Error: {str(e)}")
         return None
-# 📼 Get Available Languages (simplified - try common ones)
-def get_available_languages():
-    return [
-        ("en", "English"),
-        ("es", "Spanish"),
-        ("fr", "French"),
-        ("de", "German"),
-        ("hi", "Hindi"),
-        ("zh", "Chinese"),
-        ("ja", "Japanese"),
-        ("ko", "Korean"),
-        ("pt", "Portuguese"),
-        ("ru", "Russian")
-    ]
-# 🧠 Embedding Loader
 @st.cache_resource
-def load_embeddings():
-    return HuggingFaceEmbeddings(
-        model_name="intfloat/multilingual-e5-base",
-        model_kwargs={"device": "cpu"}
     )
-# 🧱 Vector Store Builder
-@st.cache_data
-def create_vector_store(transcript):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    docs = splitter.create_documents([transcript])
-    return FAISS.from_documents(docs, load_embeddings())
-# 🤖 Model Builder
-# 🤖 Model Builder (with free model option)
-def build_model(model_choice, temperature):
     if model_choice == "DeepSeek":
         repo_id = "deepseek-ai/DeepSeek-V3.2-Exp"  # paid
     elif model_choice == "OpenAI":
-        repo_id = "openai/gpt-oss-20b"             # paid
     else:
-        # Free Hugging Face model
-        repo_id = "bigscience/bloom-560m"         # free, smaller model
-    llm = HuggingFaceEndpoint(
-        repo_id=repo_id,
-        huggingfacehub_api_token=api_key,
-        task="text-generation"
-    )
-    return ChatHuggingFace(llm=llm, temperature=temperature)
 # 🧾 Prompt Template
@@ -108,59 +90,38 @@ prompt_template = PromptTemplate(
         "If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
         "Then, based on your own knowledge, try to answer the question.\n"
         "If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
-        "Keep the answer format neat, clean, and human-readable.\n\n"
         "Context:\n{context}\n\n"
         "Question:\n{question}"
     ),
     input_variables=["context", "question"]
 )
-# 🚀 App UI
-st.title("🎥 YouTube Transcript Chatbot")
-with st.sidebar:
-    st.subheader("⚙️ API Setup")
-    st.info("Using RapidAPI for transcripts")
-    st.markdown("[Get your free API key](https://rapidapi.com/ytjar/api/youtube-transcript3)")
-video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I",
-                          help="Example: dQw4w9WgXcQ from youtube.com/watch?v=dQw4w9WgXcQ")
-langs = get_available_languages()
-lang_options = [f"{name} ({code})" for code, name in langs]
-selected_lang = st.selectbox("Transcript Language", lang_options)
-language_code = selected_lang.split("(")[-1].strip(")")
 query = st.text_area("Your Query", value="What is RAG?")
-model_choice = st.radio("Model to Use", ["DeepSeek", "OpenAI", "Free HF Model"])
-temperature = st.slider("Temperature", 0, 100, value=50)
 if st.button("🚀 Run Chatbot"):
     if not video_id or not query:
         st.warning("Please fill in all fields.")
     else:
         with st.spinner("Fetching transcript..."):
-            transcript = get_transcript(video_id, language_code)
             if not transcript:
-                st.error("Could not fetch transcript. Make sure the video ID is correct and has captions.")
             else:
                 st.success(f"✅ Transcript fetched! ({len(transcript)} characters)")
                 with st.spinner("Generating response..."):
-                    retriever = create_vector_store(transcript).as_retriever(
-                        search_type="mmr",
-                        search_kwargs={"k": 5}
-                    )
                     relevant_docs = retriever.invoke(query)
                     context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
-                    prompt = prompt_template.invoke({
-                        "context": context_text,
-                        "question": query
-                    })
-                    model = build_model(model_choice, temperature / 100.0)
-                    response = model.invoke(prompt)
-                    st.text_area("Model Response", value=response.content, height=400)

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain.prompts import PromptTemplate
+from langchain.llms import HuggingFacePipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import torch
 import os
 import requests
+# Environment variables: HF token for the hosted endpoints, RapidAPI key for transcripts ("" when unset or blank)
 api_key = os.getenv("HF_API_KEY")
 RAPIDAPI_KEY = (os.getenv("RAPIDAPI_KEY") or "").strip()
 if not RAPIDAPI_KEY:
     st.error("❌ RAPIDAPI_KEY not set. Please add it in your environment variables.")
+# 📼 Transcript Fetcher
 @st.cache_data
 def get_transcript(video_id, language_code="en"):
     url = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
+    querystring = {"videoId": video_id, "lang": language_code}
     headers = {
         "x-rapidapi-key": RAPIDAPI_KEY,
         "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
     }
     try:
         response = requests.get(url, headers=headers, params=querystring, timeout=10)
         if response.status_code != 200:
             st.error(f"API Error: {response.status_code}")
             return None
         data = response.json()
+        if data.get("success") and "transcript" in data:
+            return ' '.join([item.get('text', '') for item in data["transcript"]])
         else:
             st.warning("Unexpected API response format")
             return None
     except Exception as e:
         st.error(f"Error: {str(e)}")
         return None
+# 🧱 Vector Store
+@st.cache_data
+def create_vector_store(transcript):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    docs = splitter.create_documents([transcript])
+    embeddings = HuggingFaceEmbeddings(
+        model_name="intfloat/multilingual-e5-base",
+        model_kwargs={"device": "cpu"}
+    )
+    return FAISS.from_documents(docs, embeddings)
+# 🤖 Load Free BLOOM locally
 @st.cache_resource
+def load_bloom():
+    model_name = "bigscience/bloom-560m"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        device=0 if torch.cuda.is_available() else -1
     )
+    return HuggingFacePipeline(pipeline=pipe)
+# 🧩 Build model (handles endpoints + free local model)
+def build_model(model_choice, temperature=0.7):
     if model_choice == "DeepSeek":
         repo_id = "deepseek-ai/DeepSeek-V3.2-Exp"  # paid
+        llm = HuggingFaceEndpoint(repo_id=repo_id, huggingfacehub_api_token=api_key, task="text-generation")
+        return ChatHuggingFace(llm=llm, temperature=temperature)
     elif model_choice == "OpenAI":
+        repo_id = "openai/gpt-oss-20b"  # paid
+        llm = HuggingFaceEndpoint(repo_id=repo_id, huggingfacehub_api_token=api_key, task="text-generation")
+        return ChatHuggingFace(llm=llm, temperature=temperature)
     else:
+        return load_bloom()  # free local BLOOM
 # 🧾 Prompt Template
         "If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
         "Then, based on your own knowledge, try to answer the question.\n"
         "If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
         "Context:\n{context}\n\n"
         "Question:\n{question}"
     ),
     input_variables=["context", "question"]
 )
+# 🚀 Streamlit UI
+st.title("🎥 YouTube Transcript Chatbot (Hybrid: Free + Paid)")
+video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I")
 query = st.text_area("Your Query", value="What is RAG?")
+model_choice = st.radio("Model to Use", ["DeepSeek", "OpenAI", "Free BLOOM"])
+temperature = st.slider("Temperature", 0, 100, value=50) / 100.0
 if st.button("🚀 Run Chatbot"):
     if not video_id or not query:
         st.warning("Please fill in all fields.")
     else:
         with st.spinner("Fetching transcript..."):
+            transcript = get_transcript(video_id)
             if not transcript:
+                st.error("Could not fetch transcript.")
             else:
                 st.success(f"✅ Transcript fetched! ({len(transcript)} characters)")
                 with st.spinner("Generating response..."):
+                    retriever = create_vector_store(transcript).as_retriever(search_type="mmr", search_kwargs={"k": 5})
                     relevant_docs = retriever.invoke(query)
                     context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
+                    prompt = prompt_template.format(context=context_text, question=query)
+                    model = build_model(model_choice, temperature)
+                    response = model.invoke(prompt) if model_choice != "Free BLOOM" else model(prompt)
+                    st.text_area("Model Response", value=response if isinstance(response, str) else response.content, height=400)