MahatirTusher committed on
Commit
4535d8e
·
verified ·
1 Parent(s): 8b4678f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -23
app.py CHANGED
@@ -14,12 +14,16 @@ from bs4 import SoupStrainer
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
 
 
17
 
18
  # Load environment variables (optional)
19
  load_dotenv()
20
 
21
  # Hardcoded Groq API key
22
  GROQ_API_KEY = "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"
 
 
23
 
24
  # Custom CSS
25
  st.markdown("""
@@ -189,12 +193,49 @@ def fetch_youtube_transcript(video_id):
189
  return " ".join([item['text'] for item in translated_transcript])
190
  return None
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  # Function to extract subtitles using yt-dlp with cookies
193
  def extract_subtitles_with_ytdlp(video_url):
194
  ydl_opts = {
195
  'writesubtitles': True,
196
  'writeautomaticsub': True,
197
- 'subtitleslangs': ['all', '-live_chat'], # Fetch all languages, exclude live chat
198
  'skip_download': True,
199
  'subtitlesformat': 'vtt',
200
  'outtmpl': 'subtitle.%(ext)s',
@@ -205,17 +246,16 @@ def extract_subtitles_with_ytdlp(video_url):
205
  'Referer': 'https://www.youtube.com/',
206
  'Origin': 'https://www.youtube.com',
207
  },
208
- 'cookiefile': 'cookies.txt', # Path to cookies.txt
209
  'retries': 10,
210
  'retry_sleep': 5,
211
- 'no_check_certificate': True, # Bypass certificate checks
212
- 'geo_bypass': True, # Attempt to bypass geo-restrictions
213
  'force_generic_extractor': True,
214
- 'quiet': False, # Allow verbose output for debugging
215
- 'verbose': True, # Enable verbose logging for debugging
216
  }
217
  try:
218
- # Check if cookies.txt exists
219
  if not os.path.exists('cookies.txt'):
220
  st.error(
221
  "cookies.txt file not found. Please upload a valid cookies.txt file to the root directory of your Space. "
@@ -228,17 +268,14 @@ def extract_subtitles_with_ytdlp(video_url):
228
  return None
229
 
230
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
231
- # Redirect yt-dlp output to Streamlit for debugging
232
  ydl.params['logger'] = st
233
  info = ydl.extract_info(video_url, download=False)
234
  available_subs = info.get('subtitles', {})
235
  auto_subs = info.get('automatic_captions', {})
236
 
237
- # Log available subtitles for debugging
238
  st.text(f"Available subtitles: {list(available_subs.keys())}")
239
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
240
 
241
- # Use any available language if English isn't available
242
  subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
243
  if not subtitle_langs:
244
  return None
@@ -246,7 +283,6 @@ def extract_subtitles_with_ytdlp(video_url):
246
  ydl.params['subtitleslangs'] = subtitle_langs
247
  ydl.download([video_url])
248
 
249
- # Look for the subtitle file
250
  subtitle_file = None
251
  for lang in subtitle_langs:
252
  possible_file = f"subtitle.{lang}.vtt"
@@ -257,14 +293,11 @@ def extract_subtitles_with_ytdlp(video_url):
257
  if not subtitle_file:
258
  return None
259
 
260
- # Read and parse the subtitle file
261
  with open(subtitle_file, 'r', encoding='utf-8') as f:
262
  subtitle_text = f.read()
263
 
264
- # Clean up the subtitle file
265
  os.remove(subtitle_file)
266
 
267
- # Extract text from VTT format, removing timestamps and metadata
268
  lines = subtitle_text.split('\n')
269
  text_lines = []
270
  for line in lines:
@@ -283,12 +316,11 @@ def process_content(text, embeddings, source):
283
  chunk_overlap=200,
284
  separators=["\n\n", "\n", ".", " ", ""]
285
  )
286
- # Create documents with source metadata
287
  docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
288
- # Debug: Check metadata of the first document
289
  if docs:
290
  st.text(f"Document metadata: {docs[0].metadata}")
291
  vectorstore = FAISS.from_documents(docs, embeddings)
 
292
  return vectorstore
293
 
294
  # Function to create QA chain
@@ -300,9 +332,10 @@ def create_qa_chain(vectorstore, llm):
300
  chain_type="stuff",
301
  chain_type_kwargs={
302
  "prompt": qa_prompt,
303
- "document_variable_name": "context" # Match the variable name in the prompt
304
  }
305
  )
 
306
  return qa_chain
307
 
308
  # Process Web URL
@@ -325,13 +358,11 @@ if process_url_clicked:
325
  st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
326
  st.stop()
327
 
328
- # Store content for summarization
329
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
330
  embeddings = st.session_state.embeddings
331
- # Pass the URL as the source metadata
332
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
333
  st.session_state.index_created = True
334
- st.session_state.qa_chain = None # Clear cached QA chain
335
  st.text("Content processed successfully! βœ…βœ…βœ…")
336
  except Exception as e:
337
  st.error(f"Error processing URL: {str(e)}")
@@ -358,6 +389,10 @@ if process_youtube_clicked:
358
  st.text("Fetching Closed Captions...Started...βœ…βœ…βœ…")
359
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
360
 
 
 
 
 
361
  if not transcript_text:
362
  st.error(
363
  "No transcripts or closed captions available. "
@@ -376,13 +411,11 @@ if process_youtube_clicked:
376
  st.error("Transcript or captions are empty. Try a different video.")
377
  st.stop()
378
 
379
- # Process the transcript
380
  st.session_state.url_content = transcript_text
381
  embeddings = st.session_state.embeddings
382
- # Pass the YouTube URL as the source metadata
383
  st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
384
  st.session_state.index_created = True
385
- st.session_state.qa_chain = None # Clear cached QA chain
386
  st.text("YouTube video processed successfully! βœ…βœ…βœ…")
387
  except Exception as e:
388
  st.error(f"Error processing YouTube video: {str(e)}")
 
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
17
+ from googleapiclient.discovery import build
18
+ from googleapiclient.errors import HttpError
19
 
20
  # Load environment variables (optional)
21
  load_dotenv()
22
 
23
  # Hardcoded Groq API key
24
  GROQ_API_KEY = "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"
25
+ # YouTube API key (to be set in Hugging Face Spaces secrets)
26
+ YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
27
 
28
  # Custom CSS
29
  st.markdown("""
 
193
  return " ".join([item['text'] for item in translated_transcript])
194
  return None
195
 
196
+ # Function to fetch captions using YouTube Data API (limited to listing with API key)
197
+ def fetch_youtube_captions_api(video_id, api_key):
198
+ if not api_key:
199
+ st.warning("YOUTUBE_API_KEY not set. Skipping YouTube Data API fallback.")
200
+ return None
201
+ try:
202
+ youtube = build('youtube', 'v3', developerKey=api_key)
203
+ captions = youtube.captions().list(
204
+ part='snippet',
205
+ videoId=video_id
206
+ ).execute()
207
+
208
+ caption_id = None
209
+ for item in captions.get('items', []):
210
+ if item['snippet']['language'] == 'en':
211
+ caption_id = item['id']
212
+ break
213
+ elif item['snippet']['language'] in ['en-US', 'en-GB']:
214
+ caption_id = item['id']
215
+ break
216
+
217
+ if not caption_id:
218
+ st.warning("No English captions found via YouTube Data API.")
219
+ return None
220
+
221
+ # Note: Downloading captions requires OAuth 2.0 authentication
222
+ st.warning(
223
+ "Downloading captions requires OAuth 2.0 authentication, which is not supported in this environment. "
224
+ "English captions are available but cannot be fetched with an API key alone. "
225
+ "Consider setting up OAuth 2.0 for full functionality (see documentation)."
226
+ )
227
+ return None
228
+
229
+ except HttpError as e:
230
+ st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
231
+ return None
232
+
233
  # Function to extract subtitles using yt-dlp with cookies
234
  def extract_subtitles_with_ytdlp(video_url):
235
  ydl_opts = {
236
  'writesubtitles': True,
237
  'writeautomaticsub': True,
238
+ 'subtitleslangs': ['all', '-live_chat'],
239
  'skip_download': True,
240
  'subtitlesformat': 'vtt',
241
  'outtmpl': 'subtitle.%(ext)s',
 
246
  'Referer': 'https://www.youtube.com/',
247
  'Origin': 'https://www.youtube.com',
248
  },
249
+ 'cookiefile': 'cookies.txt',
250
  'retries': 10,
251
  'retry_sleep': 5,
252
+ 'no_check_certificate': True,
253
+ 'geo_bypass': True,
254
  'force_generic_extractor': True,
255
+ 'quiet': False,
256
+ 'verbose': True,
257
  }
258
  try:
 
259
  if not os.path.exists('cookies.txt'):
260
  st.error(
261
  "cookies.txt file not found. Please upload a valid cookies.txt file to the root directory of your Space. "
 
268
  return None
269
 
270
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
 
271
  ydl.params['logger'] = st
272
  info = ydl.extract_info(video_url, download=False)
273
  available_subs = info.get('subtitles', {})
274
  auto_subs = info.get('automatic_captions', {})
275
 
 
276
  st.text(f"Available subtitles: {list(available_subs.keys())}")
277
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
278
 
 
279
  subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
280
  if not subtitle_langs:
281
  return None
 
283
  ydl.params['subtitleslangs'] = subtitle_langs
284
  ydl.download([video_url])
285
 
 
286
  subtitle_file = None
287
  for lang in subtitle_langs:
288
  possible_file = f"subtitle.{lang}.vtt"
 
293
  if not subtitle_file:
294
  return None
295
 
 
296
  with open(subtitle_file, 'r', encoding='utf-8') as f:
297
  subtitle_text = f.read()
298
 
 
299
  os.remove(subtitle_file)
300
 
 
301
  lines = subtitle_text.split('\n')
302
  text_lines = []
303
  for line in lines:
 
316
  chunk_overlap=200,
317
  separators=["\n\n", "\n", ".", " ", ""]
318
  )
 
319
  docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
 
320
  if docs:
321
  st.text(f"Document metadata: {docs[0].metadata}")
322
  vectorstore = FAISS.from_documents(docs, embeddings)
323
+ st.text(f"Vector store created with {len(docs)} documents.")
324
  return vectorstore
325
 
326
  # Function to create QA chain
 
332
  chain_type="stuff",
333
  chain_type_kwargs={
334
  "prompt": qa_prompt,
335
+ "document_variable_name": "context"
336
  }
337
  )
338
+ st.text("QA chain created successfully.")
339
  return qa_chain
340
 
341
  # Process Web URL
 
358
  st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
359
  st.stop()
360
 
 
361
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
362
  embeddings = st.session_state.embeddings
 
363
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
364
  st.session_state.index_created = True
365
+ st.session_state.qa_chain = None
366
  st.text("Content processed successfully! βœ…βœ…βœ…")
367
  except Exception as e:
368
  st.error(f"Error processing URL: {str(e)}")
 
389
  st.text("Fetching Closed Captions...Started...βœ…βœ…βœ…")
390
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
391
 
392
+ if not transcript_text and YOUTUBE_API_KEY:
393
+ st.text("Fetching Captions via YouTube Data API...Started...βœ…βœ…βœ…")
394
+ transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
395
+
396
  if not transcript_text:
397
  st.error(
398
  "No transcripts or closed captions available. "
 
411
  st.error("Transcript or captions are empty. Try a different video.")
412
  st.stop()
413
 
 
414
  st.session_state.url_content = transcript_text
415
  embeddings = st.session_state.embeddings
 
416
  st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
417
  st.session_state.index_created = True
418
+ st.session_state.qa_chain = None
419
  st.text("YouTube video processed successfully! βœ…βœ…βœ…")
420
  except Exception as e:
421
  st.error(f"Error processing YouTube video: {str(e)}")