Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ from langchain_community.vectorstores import FAISS
|
|
| 5 |
from langchain.prompts import PromptTemplate
|
| 6 |
from langchain.llms import HuggingFacePipeline
|
| 7 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
|
|
|
|
| 8 |
import torch
|
| 9 |
import os
|
| 10 |
import requests
|
|
@@ -16,28 +17,39 @@ if not RAPIDAPI_KEY:
|
|
| 16 |
st.error("β RAPIDAPI_KEY not set. Please add it in your environment variables.")
|
| 17 |
|
| 18 |
|
| 19 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
@st.cache_data
|
| 21 |
def get_transcript(video_id, language_code="en"):
|
| 22 |
-
|
| 23 |
-
querystring = {"videoId": video_id, "lang": language_code}
|
| 24 |
-
headers = {
|
| 25 |
-
"x-rapidapi-key": RAPIDAPI_KEY,
|
| 26 |
-
"x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
|
| 27 |
-
}
|
| 28 |
try:
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
data = response.json()
|
| 34 |
-
if data.get("success") and "transcript" in data:
|
| 35 |
-
return ' '.join([item.get('text', '') for item in data["transcript"]])
|
| 36 |
-
else:
|
| 37 |
-
st.warning("Unexpected API response format")
|
| 38 |
-
return None
|
| 39 |
except Exception as e:
|
| 40 |
-
st.error(f"Error: {str(e)}")
|
| 41 |
return None
|
| 42 |
|
| 43 |
|
|
@@ -53,7 +65,7 @@ def create_vector_store(transcript):
|
|
| 53 |
return FAISS.from_documents(docs, embeddings)
|
| 54 |
|
| 55 |
|
| 56 |
-
# π€ Load Free Flan-T5 locally
|
| 57 |
@st.cache_resource
|
| 58 |
def load_flan_t5():
|
| 59 |
model_name = "google/flan-t5-base"
|
|
@@ -71,10 +83,10 @@ def load_flan_t5():
|
|
| 71 |
return HuggingFacePipeline(pipeline=pipe)
|
| 72 |
|
| 73 |
|
| 74 |
-
# π§© Build model
|
| 75 |
def build_model(model_choice, temperature=0.7):
|
| 76 |
if model_choice == "DeepSeek":
|
| 77 |
-
repo_id = "deepseek-ai/DeepSeek-V3.2-Exp"
|
| 78 |
llm = HuggingFaceEndpoint(
|
| 79 |
repo_id=repo_id,
|
| 80 |
huggingfacehub_api_token=api_key,
|
|
@@ -83,7 +95,7 @@ def build_model(model_choice, temperature=0.7):
|
|
| 83 |
)
|
| 84 |
return ChatHuggingFace(llm=llm, temperature=temperature)
|
| 85 |
elif model_choice == "OpenAI":
|
| 86 |
-
repo_id = "openai/gpt-oss-20b"
|
| 87 |
llm = HuggingFaceEndpoint(
|
| 88 |
repo_id=repo_id,
|
| 89 |
huggingfacehub_api_token=api_key,
|
|
@@ -92,10 +104,10 @@ def build_model(model_choice, temperature=0.7):
|
|
| 92 |
)
|
| 93 |
return ChatHuggingFace(llm=llm, temperature=temperature)
|
| 94 |
else:
|
| 95 |
-
return load_flan_t5()
|
| 96 |
|
| 97 |
|
| 98 |
-
# π§Ύ Prompt Template
|
| 99 |
prompt_template = PromptTemplate(
|
| 100 |
template=(
|
| 101 |
"Answer the question based on the context below.\n\n"
|
|
@@ -108,19 +120,48 @@ prompt_template = PromptTemplate(
|
|
| 108 |
|
| 109 |
|
| 110 |
# π Streamlit UI
|
| 111 |
-
st.title("π₯ YouTube Transcript Chatbot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
|
| 114 |
-
query = st.text_area("Your Query", value="What is RAG?")
|
|
|
|
|
|
|
| 115 |
model_choice = st.radio("Model to Use", ["Flan-T5 (Free)", "DeepSeek", "OpenAI"])
|
| 116 |
-
temperature = st.slider("Temperature", 0, 100, value=50) / 100.0
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
if st.button("π Run Chatbot"):
|
| 119 |
if not video_id or not query:
|
| 120 |
st.warning("Please fill in all fields.")
|
| 121 |
else:
|
| 122 |
with st.spinner("Fetching transcript..."):
|
| 123 |
-
transcript = get_transcript(video_id)
|
| 124 |
if not transcript:
|
| 125 |
st.error("Could not fetch transcript.")
|
| 126 |
else:
|
|
@@ -144,4 +185,14 @@ if st.button("π Run Chatbot"):
|
|
| 144 |
response_obj = model.invoke(prompt)
|
| 145 |
response = response_obj.content if hasattr(response_obj, 'content') else str(response_obj)
|
| 146 |
|
| 147 |
-
st.text_area("Model Response", value=response, height=400)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from langchain.prompts import PromptTemplate
|
| 6 |
from langchain.llms import HuggingFacePipeline
|
| 7 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
|
| 8 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 9 |
import torch
|
| 10 |
import os
|
| 11 |
import requests
|
|
|
|
| 17 |
st.error("β RAPIDAPI_KEY not set. Please add it in your environment variables.")
|
| 18 |
|
| 19 |
|
| 20 |
+
# List every transcript language available for a video.
@st.cache_data
def list_available_languages(video_id):
    """Return (language_code, display_label) pairs for each transcript on the video.

    Falls back to a single default English entry when the listing fails, so the
    caller always receives a non-empty list.
    """
    try:
        api = YouTubeTranscriptApi()
        available = api.list(video_id)

        options = []
        for t in available:
            # Label distinguishes auto-generated captions from manually uploaded ones.
            suffix = " - Auto-generated" if t.is_generated else " - Manual"
            options.append((t.language_code, f"{t.language} ({t.language_code})" + suffix))

        return options
    except Exception as e:
        st.warning(f"Could not fetch available languages: {e}")
        return [("en", "English (en) - Default")]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Transcript fetcher built on youtube_transcript_api.
@st.cache_data
def get_transcript(video_id, language_code="en"):
    """Return the video's transcript as one space-joined string, or None on failure.

    language_code selects which transcript track to fetch (default "en").
    Errors are surfaced to the UI via st.error rather than raised.
    """
    try:
        fetched = YouTubeTranscriptApi().fetch(video_id, languages=[language_code])
        return ' '.join(snippet.text for snippet in fetched)
    except Exception as e:
        st.error(f"Error fetching transcript: {str(e)}")
        return None
|
| 54 |
|
| 55 |
|
|
|
|
| 65 |
return FAISS.from_documents(docs, embeddings)
|
| 66 |
|
| 67 |
|
| 68 |
+
# π€ Load Free Flan-T5 locally
|
| 69 |
@st.cache_resource
|
| 70 |
def load_flan_t5():
|
| 71 |
model_name = "google/flan-t5-base"
|
|
|
|
| 83 |
return HuggingFacePipeline(pipeline=pipe)
|
| 84 |
|
| 85 |
|
| 86 |
+
# π§© Build model
|
| 87 |
def build_model(model_choice, temperature=0.7):
|
| 88 |
if model_choice == "DeepSeek":
|
| 89 |
+
repo_id = "deepseek-ai/DeepSeek-V3.2-Exp"
|
| 90 |
llm = HuggingFaceEndpoint(
|
| 91 |
repo_id=repo_id,
|
| 92 |
huggingfacehub_api_token=api_key,
|
|
|
|
| 95 |
)
|
| 96 |
return ChatHuggingFace(llm=llm, temperature=temperature)
|
| 97 |
elif model_choice == "OpenAI":
|
| 98 |
+
repo_id = "openai/gpt-oss-20b"
|
| 99 |
llm = HuggingFaceEndpoint(
|
| 100 |
repo_id=repo_id,
|
| 101 |
huggingfacehub_api_token=api_key,
|
|
|
|
| 104 |
)
|
| 105 |
return ChatHuggingFace(llm=llm, temperature=temperature)
|
| 106 |
else:
|
| 107 |
+
return load_flan_t5()
|
| 108 |
|
| 109 |
|
| 110 |
+
# π§Ύ Prompt Template
|
| 111 |
prompt_template = PromptTemplate(
|
| 112 |
template=(
|
| 113 |
"Answer the question based on the context below.\n\n"
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
# π Streamlit UI
|
| 123 |
+
st.title("π₯ YouTube Transcript Chatbot")

# Video ID input
video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I", help="Enter the video ID from YouTube URL")

# Language selection: default to English, then offer whatever the video actually has.
language_code = "en"
if video_id:
    with st.spinner("Checking available transcripts..."):
        transcripts = list_available_languages(video_id)

    if transcripts:
        st.success(f"Found {len(transcripts)} available transcript(s)")

        # Map display label -> language code so the selectbox shows friendly names.
        label_to_code = {label: code for code, label in transcripts}
        choice = st.selectbox(
            "Select Transcript Language",
            options=list(label_to_code.keys()),
            help="Choose from available transcripts for this video"
        )
        language_code = label_to_code[choice]
    else:
        # NOTE(review): list_available_languages always returns a non-empty list
        # (it falls back to a default entry on error), so this branch appears
        # unreachable — confirm the intended fallback behavior.
        st.info("Using default English transcript")
        language_code = "en"

# Query input
query = st.text_area("Your Query", value="What is RAG?", help="Ask a question about the video content")

# Model selection
model_choice = st.radio("Model to Use", ["Flan-T5 (Free)", "DeepSeek", "OpenAI"])

# Temperature slider (widget operates in 0-100; models expect 0.0-1.0)
temperature = st.slider("Temperature", 0, 100, value=50, help="Higher = more creative, Lower = more focused") / 100.0
|
| 157 |
+
|
| 158 |
+
# Run button
|
| 159 |
if st.button("π Run Chatbot"):
|
| 160 |
if not video_id or not query:
|
| 161 |
st.warning("Please fill in all fields.")
|
| 162 |
else:
|
| 163 |
with st.spinner("Fetching transcript..."):
|
| 164 |
+
transcript = get_transcript(video_id, language_code)
|
| 165 |
if not transcript:
|
| 166 |
st.error("Could not fetch transcript.")
|
| 167 |
else:
|
|
|
|
| 185 |
response_obj = model.invoke(prompt)
|
| 186 |
response = response_obj.content if hasattr(response_obj, 'content') else str(response_obj)
|
| 187 |
|
| 188 |
+
st.text_area("Model Response", value=response, height=400)
|
| 189 |
+
|
| 190 |
+
# Sidebar: static "about" panel describing the app's capabilities.
with st.sidebar:
    st.header("βΉοΈ About")
    st.write("This chatbot analyzes YouTube videos using their transcripts.")
    st.write("**Features:**")
    for feature in (
        "- Auto-detect available languages",
        "- RAG-based Q&A",
        "- Multiple model options",
        "- Cached for performance",
    ):
        st.write(feature)
|