Daksh0505 committed on
Commit
cac610a
·
verified ·
1 Parent(s): 7406deb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -68
app.py CHANGED
@@ -4,48 +4,109 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.prompts import PromptTemplate
6
  from youtube_transcript_api import YouTubeTranscriptApi
 
7
  import os
8
 
9
- # Environment variables
 
10
  api_key = os.getenv("HF_API_KEY")
 
 
11
 
12
- # 📋 List Available Languages
13
  @st.cache_data
14
  def list_available_languages(video_id):
15
- """Fetch all available transcript languages for a video"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  try:
17
- api = YouTubeTranscriptApi()
18
- transcript_list = api.list(video_id)
19
-
20
- languages = []
21
  for transcript in transcript_list:
22
  lang_code = transcript.language_code
23
  lang_name = transcript.language
24
  is_generated = transcript.is_generated
25
- label = f"{lang_name} ({lang_code})" + (" - Auto-generated" if is_generated else " - Manual")
26
  languages.append((lang_code, label))
27
-
28
- return languages
29
- except Exception as e:
30
- st.warning(f"Could not fetch available languages: {e}")
31
- return [("en", "English (en) - Default")]
 
 
32
 
33
 
34
- # 📼 Transcript Fetcher (Using youtube_transcript_api)
35
  @st.cache_data
36
- def get_transcript(video_id, language_code="en"):
37
- """Fetch transcript using youtube_transcript_api"""
38
  try:
39
- api = YouTubeTranscriptApi()
40
- transcript_list = api.fetch(video_id, languages=[language_code])
41
  transcript = ' '.join([snippet.text for snippet in transcript_list])
42
  return transcript
43
  except Exception as e:
44
- st.error(f"Error fetching transcript: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  return None
46
 
47
 
48
- # 🧱 Vector Store (keeping multilingual-e5-base as requested)
49
  @st.cache_data
50
  def create_vector_store(transcript):
51
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
@@ -57,7 +118,7 @@ def create_vector_store(transcript):
57
  return FAISS.from_documents(docs, embeddings)
58
 
59
 
60
- # 🧩 Build model (ALL using Inference API now)
61
  def build_model(model_choice, temperature=0.7):
62
  if model_choice == "Flan-T5 (Free)":
63
  llm = HuggingFaceEndpoint(
@@ -67,6 +128,7 @@ def build_model(model_choice, temperature=0.7):
67
  temperature=temperature
68
  )
69
  return ChatHuggingFace(llm=llm)
 
70
  elif model_choice == "DeepSeek":
71
  llm = HuggingFaceEndpoint(
72
  repo_id="deepseek-ai/DeepSeek-V3.2-Exp",
@@ -75,6 +137,7 @@ def build_model(model_choice, temperature=0.7):
75
  max_new_tokens=500
76
  )
77
  return ChatHuggingFace(llm=llm, temperature=temperature)
 
78
  elif model_choice == "OpenAI":
79
  llm = HuggingFaceEndpoint(
80
  repo_id="openai/gpt-oss-20b",
@@ -100,73 +163,67 @@ prompt_template = PromptTemplate(
100
  # 🚀 Streamlit UI
101
  st.title("πŸŽ₯ YouTube Transcript Chatbot")
102
 
103
- # Video ID input
104
- video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I", help="Enter the video ID from YouTube URL")
 
 
 
 
 
 
 
105
 
106
- # Language selection
107
- language_code = "en"
108
  if video_id:
109
- with st.spinner("Checking available transcripts..."):
110
  available_langs = list_available_languages(video_id)
111
-
112
  if available_langs:
113
  st.success(f"Found {len(available_langs)} available transcript(s)")
114
-
115
- # Create dropdown with available languages
116
  lang_options = {label: code for code, label in available_langs}
117
- selected_label = st.selectbox(
118
- "Select Transcript Language",
119
- options=list(lang_options.keys()),
120
- help="Choose from available transcripts for this video"
121
- )
122
  language_code = lang_options[selected_label]
123
  else:
124
- st.info("Using default English transcript")
125
- language_code = "en"
126
-
127
- # Query input
128
- query = st.text_area("Your Query", value="What is RAG?", help="Ask a question about the video content")
129
 
130
- # Model selection
131
- model_choice = st.radio("Model to Use", ["Flan-T5 (Free)", "DeepSeek", "OpenAI"])
132
-
133
- # Temperature slider
134
- temperature = st.slider("Temperature", 0, 100, value=50, help="Higher = more creative, Lower = more focused") / 100.0
135
-
136
- # Run button
137
  if st.button("πŸš€ Run Chatbot"):
138
- if not video_id or not query:
139
- st.warning("Please fill in all fields.")
140
  else:
141
- with st.spinner("Fetching transcript..."):
142
- transcript = get_transcript(video_id, language_code)
 
 
 
 
 
 
 
 
 
143
  if not transcript:
144
- st.error("Could not fetch transcript.")
145
  else:
146
- st.success(f"βœ… Transcript fetched! ({len(transcript)} characters)")
147
 
148
- with st.spinner("Generating response..."):
149
- retriever = create_vector_store(transcript).as_retriever(
150
- search_type="mmr",
151
- search_kwargs={"k": 5}
152
- )
153
  relevant_docs = retriever.invoke(query)
154
  context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
155
  prompt = prompt_template.format(context=context_text, question=query)
156
 
157
  model = build_model(model_choice, temperature)
158
  response = model.invoke(prompt)
159
-
160
- # Extract content from response
161
  response_text = response.content if hasattr(response, 'content') else str(response)
162
- st.text_area("Model Response", value=response_text, height=400)
163
 
164
- # Sidebar with info
165
  with st.sidebar:
166
- st.header("ℹ️ About")
167
- st.write("This chatbot analyzes YouTube videos using their transcripts.")
168
- st.write("**Features:**")
169
- st.write("- Auto-detect available languages")
170
- st.write("- RAG-based Q&A")
171
- st.write("- Multiple model options")
172
- st.write("- Cached for performance")
 
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.prompts import PromptTemplate
6
  from youtube_transcript_api import YouTubeTranscriptApi
7
+ import requests
8
  import os
9
 
10
+
11
+ # 🔑 Environment variables
12
  api_key = os.getenv("HF_API_KEY")
13
+ RAPIDAPI_KEY = (os.getenv("RAPIDAPI_KEY") or "").strip()
14
+
15
 
16
+ # 📋 List Available Languages (RapidAPI → fallback YouTube)
17
  @st.cache_data
18
  def list_available_languages(video_id):
19
+ """Try RapidAPI first to list languages, fallback to YouTubeTranscriptApi."""
20
+ languages = []
21
+
22
+ # --- Try RapidAPI ---
23
+ try:
24
+ if not RAPIDAPI_KEY:
25
+ raise ValueError("RapidAPI key missing")
26
+
27
+ url = "https://youtube-transcript3.p.rapidapi.com/"
28
+ querystring = {"video_id": video_id}
29
+ headers = {
30
+ "x-rapidapi-key": RAPIDAPI_KEY,
31
+ "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
32
+ }
33
+
34
+ response = requests.get(url, headers=headers, params=querystring, timeout=15)
35
+ response.raise_for_status()
36
+ data = response.json()
37
+
38
+ if "languages" in data and isinstance(data["languages"], list):
39
+ for lang in data["languages"]:
40
+ lang_code = lang.get("code", "unknown")
41
+ lang_name = lang.get("name", lang_code)
42
+ label = f"{lang_name} ({lang_code})"
43
+ languages.append((lang_code, label))
44
+ elif "availableLanguages" in data:
45
+ for lang in data["availableLanguages"]:
46
+ code = lang.get("language_code", "unknown")
47
+ name = lang.get("language_name", code)
48
+ languages.append((code, f"{name} ({code})"))
49
+
50
+ if languages:
51
+ return languages
52
+ else:
53
+ st.info("RapidAPI did not return language list; using YouTubeTranscriptApi fallback.")
54
+
55
+ except Exception as e:
56
+ st.info(f"RapidAPI language fetch failed: {e}")
57
+
58
+ # --- Fallback: YouTubeTranscriptApi ---
59
  try:
60
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
 
 
 
61
  for transcript in transcript_list:
62
  lang_code = transcript.language_code
63
  lang_name = transcript.language
64
  is_generated = transcript.is_generated
65
+ label = f"{lang_name} ({lang_code})" + (" - Auto-generated" if is_generated else "")
66
  languages.append((lang_code, label))
67
+ if languages:
68
+ return languages
69
+ except Exception as e2:
70
+ st.warning(f"YouTubeTranscriptApi also failed: {e2}")
71
+
72
+ # --- Final fallback ---
73
+ return [("en", "English (en) - Default")]
74
 
75
 
76
+ # 📼 Transcript Fetchers (two sources)
77
  @st.cache_data
78
+ def get_transcript_youtube(video_id, language_code):
79
+ """Fetch transcript using YouTubeTranscriptApi."""
80
  try:
81
+ transcript_list = YouTubeTranscriptApi().fetch(video_id, languages=[language_code])
 
82
  transcript = ' '.join([snippet.text for snippet in transcript_list])
83
  return transcript
84
  except Exception as e:
85
+ st.warning(f"YouTubeTranscriptApi failed: {e}")
86
+ return None
87
+
88
+
89
@st.cache_data
def get_transcript_rapidapi(video_id, language_code):
    """Fetch a transcript through the RapidAPI youtube-transcript3 endpoint.

    Args:
        video_id: YouTube video ID.
        language_code: Language code sent as the ``lang`` query parameter.

    Returns:
        The ``text`` fields of the response's ``transcript`` items joined
        with single spaces, or ``None`` when the key is missing, the request
        fails, or no transcript text comes back. Failures are reported with
        ``st.error``.
    """
    try:
        # Same guard as list_available_languages: RAPIDAPI_KEY is "" when the
        # environment variable is unset, and sending a request with an empty
        # x-rapidapi-key header is a wasted round-trip.
        if not RAPIDAPI_KEY:
            raise ValueError("RapidAPI key missing")

        url = "https://youtube-transcript3.p.rapidapi.com/"
        querystring = {"video_id": video_id, "lang": language_code}
        headers = {
            "x-rapidapi-key": RAPIDAPI_KEY,
            "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com",
        }
        response = requests.get(url, headers=headers, params=querystring, timeout=20)
        response.raise_for_status()
        data = response.json()
        transcript = " ".join(item["text"] for item in data.get("transcript", []))
        # An empty join means the response had no transcript items; report
        # that as "nothing fetched" so callers can fall back to another source.
        return transcript if transcript else None
    except Exception as e:
        st.error(f"RapidAPI transcript fetch failed: {e}")
        return None
107
 
108
 
109
+ # 🧱 Vector Store
110
  @st.cache_data
111
  def create_vector_store(transcript):
112
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
 
118
  return FAISS.from_documents(docs, embeddings)
119
 
120
 
121
+ # 🧩 Build Model
122
  def build_model(model_choice, temperature=0.7):
123
  if model_choice == "Flan-T5 (Free)":
124
  llm = HuggingFaceEndpoint(
 
128
  temperature=temperature
129
  )
130
  return ChatHuggingFace(llm=llm)
131
+
132
  elif model_choice == "DeepSeek":
133
  llm = HuggingFaceEndpoint(
134
  repo_id="deepseek-ai/DeepSeek-V3.2-Exp",
 
137
  max_new_tokens=500
138
  )
139
  return ChatHuggingFace(llm=llm, temperature=temperature)
140
+
141
  elif model_choice == "OpenAI":
142
  llm = HuggingFaceEndpoint(
143
  repo_id="openai/gpt-oss-20b",
 
163
  # 🚀 Streamlit UI
164
  st.title("πŸŽ₯ YouTube Transcript Chatbot")
165
 
166
+ video_id = st.text_input("🎬 YouTube Video ID", value="lv1_-RER4_I")
167
+ query = st.text_area("πŸ’¬ Your Query", value="What is RAG?")
168
+ model_choice = st.radio("🧠 Model to Use", ["Flan-T5 (Free)", "DeepSeek", "OpenAI"])
169
+ temperature = st.slider("πŸ”₯ Temperature", 0, 100, value=50) / 100.0
170
+
171
+ source_choice = st.radio(
172
+ "πŸ“œ Transcript Source",
173
+ ["Auto (Try RapidAPI, then YouTubeTranscriptApi)", "RapidAPI", "YouTubeTranscriptApi"]
174
+ )
175
 
 
 
176
  if video_id:
177
+ with st.spinner("πŸ”Ž Checking available transcript languages..."):
178
  available_langs = list_available_languages(video_id)
 
179
  if available_langs:
180
  st.success(f"Found {len(available_langs)} available transcript(s)")
 
 
181
  lang_options = {label: code for code, label in available_langs}
182
+ selected_label = st.selectbox("🌐 Select Transcript Language", options=list(lang_options.keys()))
 
 
 
 
183
  language_code = lang_options[selected_label]
184
  else:
185
+ st.warning("No transcripts found for this video.")
186
+ language_code = None
187
+ else:
188
+ language_code = None
 
189
 
 
 
 
 
 
 
 
190
  if st.button("πŸš€ Run Chatbot"):
191
+ if not video_id or not query or not language_code:
192
+ st.warning("Please provide video ID, query, and select a language.")
193
  else:
194
+ with st.spinner("🧾 Fetching transcript..."):
195
+ transcript = None
196
+ if source_choice == "RapidAPI":
197
+ transcript = get_transcript_rapidapi(video_id, language_code)
198
+ elif source_choice == "YouTubeTranscriptApi":
199
+ transcript = get_transcript_youtube(video_id, language_code)
200
+ else: # Auto mode
201
+ transcript = get_transcript_rapidapi(video_id, language_code)
202
+ if not transcript:
203
+ transcript = get_transcript_youtube(video_id, language_code)
204
+
205
  if not transcript:
206
+ st.error("❌ Could not fetch transcript from any source.")
207
  else:
208
+ st.success(f"βœ… Transcript fetched successfully ({len(transcript)} characters).")
209
 
210
+ with st.spinner("βš™οΈ Generating response..."):
211
+ retriever = create_vector_store(transcript).as_retriever(search_type="mmr", search_kwargs={"k": 5})
 
 
 
212
  relevant_docs = retriever.invoke(query)
213
  context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
214
  prompt = prompt_template.format(context=context_text, question=query)
215
 
216
  model = build_model(model_choice, temperature)
217
  response = model.invoke(prompt)
 
 
218
  response_text = response.content if hasattr(response, 'content') else str(response)
219
+ st.text_area("🧩 Model Response", value=response_text, height=400)
220
 
221
+ # πŸ“˜ Sidebar Info
222
  with st.sidebar:
223
+ st.header("ℹ️ About this App")
224
+ st.write("""
225
+ - Uses both **RapidAPI** and **YouTubeTranscriptApi**
226
+ - Detects transcript languages dynamically (RapidAPI first)
227
+ - RAG-based Q&A powered by Hugging Face models
228
+ - Models supported: Flan-T5 (Free), DeepSeek, OpenAI (via HF)
229
+ """)