MahatirTusher committed on
Commit
74d84fb
·
verified ·
1 Parent(s): ea40c8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -61
app.py CHANGED
@@ -14,16 +14,12 @@ from bs4 import SoupStrainer
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
17
- from googleapiclient.discovery import build
18
- from googleapiclient.errors import HttpError
19
 
20
  # Load environment variables (optional)
21
  load_dotenv()
22
 
23
  # Hardcoded Groq API key
24
  GROQ_API_KEY = "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"
25
- # YouTube API key (to be set in Hugging Face Spaces secrets)
26
- YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
27
 
28
  # Custom CSS
29
  st.markdown("""
@@ -121,7 +117,7 @@ if "vectorstore" not in st.session_state:
121
  if "summary" not in st.session_state:
122
  st.session_state.summary = None
123
  if "qa_chain" not in st.session_state:
124
- st.session_state.qa_chain = None # Clear any cached QA chain
125
 
126
  # Initialize embeddings once at the start
127
  if "embeddings" not in st.session_state:
@@ -193,65 +189,32 @@ def fetch_youtube_transcript(video_id):
193
  return " ".join([item['text'] for item in translated_transcript])
194
  return None
195
 
196
- # Function to fetch captions using YouTube Data API
197
- def fetch_youtube_captions_api(video_id, api_key):
198
- if not api_key:
199
- return None
200
- try:
201
- youtube = build('youtube', 'v3', developerKey=api_key)
202
- captions = youtube.captions().list(
203
- part='snippet',
204
- videoId=video_id
205
- ).execute()
206
-
207
- caption_id = None
208
- for item in captions.get('items', []):
209
- if item['snippet']['language'] == 'en':
210
- caption_id = item['id']
211
- break
212
- elif item['snippet']['language'] in ['en-US', 'en-GB']:
213
- caption_id = item['id']
214
- break
215
-
216
- if not caption_id:
217
- return None
218
-
219
- caption_content = youtube.captions().download(
220
- id=caption_id,
221
- tfmt='srt'
222
- ).execute()
223
-
224
- # Parse SRT content
225
- caption_text = caption_content.decode('utf-8')
226
- lines = caption_text.split('\n')
227
- text_lines = []
228
- for line in lines:
229
- if not line.strip().isdigit() and not re.match(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line) and line.strip():
230
- text_lines.append(line.strip())
231
-
232
- return " ".join(text_lines)
233
- except HttpError as e:
234
- st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
235
- return None
236
-
237
  # Function to extract subtitles using yt-dlp with cookies
238
  def extract_subtitles_with_ytdlp(video_url):
239
  ydl_opts = {
240
  'writesubtitles': True,
241
  'writeautomaticsub': True,
242
- 'subtitleslangs': ['all'],
243
  'skip_download': True,
244
  'subtitlesformat': 'vtt',
245
  'outtmpl': 'subtitle.%(ext)s',
246
  'http_headers': {
247
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
248
- 'Accept-Language': 'en-US,en;q=0.9',
 
 
249
  },
250
  'cookiefile': 'cookies.txt', # Path to cookies.txt
251
- 'retries': 3,
252
- 'retry_sleep': 5,
 
253
  }
254
  try:
 
 
 
 
 
255
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
256
  info = ydl.extract_info(video_url, download=False)
257
  available_subs = info.get('subtitles', {})
@@ -261,17 +224,22 @@ def extract_subtitles_with_ytdlp(video_url):
261
  st.text(f"Available subtitles: {list(available_subs.keys())}")
262
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
263
 
264
- # Download the first available subtitle or auto-caption
265
  subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
266
  if not subtitle_langs:
267
  return None
268
 
269
- ydl.params['subtitleslangs'] = subtitle_langs
 
 
 
 
 
270
  ydl.download([video_url])
271
 
272
  # Look for the subtitle file
273
  subtitle_file = None
274
- for lang in subtitle_langs:
275
  possible_file = f"subtitle.{lang}.vtt"
276
  if os.path.exists(possible_file):
277
  subtitle_file = possible_file
@@ -341,8 +309,7 @@ if process_url_clicked:
341
  loader = WebBaseLoader(
342
  web_paths=[url.strip()],
343
  bs_kwargs={"parse_only": parse_only},
344
- requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}}
345
- )
346
  data = loader.load()
347
 
348
  if not data or all(len(doc.page_content.strip()) == 0 for doc in data):
@@ -382,13 +349,12 @@ if process_youtube_clicked:
382
  st.text("Fetching Closed Captions...Started...✅✅✅")
383
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
384
 
385
- # Fallback to YouTube Data API if yt-dlp fails
386
- if not transcript_text and YOUTUBE_API_KEY:
387
- st.text("Fetching Captions via YouTube Data API...Started...✅✅✅")
388
- transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
389
-
390
  if not transcript_text:
391
- st.error("No transcripts or closed captions available in any language. Please try a different video, or ensure captions are enabled for this video.")
 
 
 
 
392
  st.stop()
393
 
394
  if not transcript_text.strip():
 
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
 
 
17
 
18
  # Load environment variables (optional)
19
  load_dotenv()
20
 
21
  # Hardcoded Groq API key
22
  GROQ_API_KEY = "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"
 
 
23
 
24
  # Custom CSS
25
  st.markdown("""
 
117
  if "summary" not in st.session_state:
118
  st.session_state.summary = None
119
  if "qa_chain" not in st.session_state:
120
+ st.session_state.qa_chain = None
121
 
122
  # Initialize embeddings once at the start
123
  if "embeddings" not in st.session_state:
 
189
  return " ".join([item['text'] for item in translated_transcript])
190
  return None
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  # Function to extract subtitles using yt-dlp with cookies
193
  def extract_subtitles_with_ytdlp(video_url):
194
  ydl_opts = {
195
  'writesubtitles': True,
196
  'writeautomaticsub': True,
197
+ 'subtitleslangs': ['all', '-live_chat'],
198
  'skip_download': True,
199
  'subtitlesformat': 'vtt',
200
  'outtmpl': 'subtitle.%(ext)s',
201
  'http_headers': {
202
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
203
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
204
+ 'Accept-Language': 'en-US,en;q=0.5',
205
+ 'Referer': 'https://www.youtube.com/',
206
  },
207
  'cookiefile': 'cookies.txt', # Path to cookies.txt
208
+ 'retries': 5,
209
+ 'retry_sleep': 3,
210
+ 'force_generic_extractor': True,
211
  }
212
  try:
213
+ # Check if cookies.txt exists
214
+ if not os.path.exists('cookies.txt'):
215
+ st.error("cookies.txt file not found. Please upload a valid cookies.txt file to the root directory of your Space. See instructions on how to generate it using a browser extension like 'Export Cookies' for Chrome.")
216
+ return None
217
+
218
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
219
  info = ydl.extract_info(video_url, download=False)
220
  available_subs = info.get('subtitles', {})
 
224
  st.text(f"Available subtitles: {list(available_subs.keys())}")
225
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
226
 
227
+ # Prioritize English subtitles or auto-captions
228
  subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
229
  if not subtitle_langs:
230
  return None
231
 
232
+ # Filter for English or related languages
233
+ target_langs = [lang for lang in subtitle_langs if lang.startswith('en')]
234
+ if not target_langs:
235
+ target_langs = subtitle_langs # Fallback to any language
236
+
237
+ ydl.params['subtitleslangs'] = target_langs
238
  ydl.download([video_url])
239
 
240
  # Look for the subtitle file
241
  subtitle_file = None
242
+ for lang in target_langs:
243
  possible_file = f"subtitle.{lang}.vtt"
244
  if os.path.exists(possible_file):
245
  subtitle_file = possible_file
 
309
  loader = WebBaseLoader(
310
  web_paths=[url.strip()],
311
  bs_kwargs={"parse_only": parse_only},
312
+ requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}})
 
313
  data = loader.load()
314
 
315
  if not data or all(len(doc.page_content.strip()) == 0 for doc in data):
 
349
  st.text("Fetching Closed Captions...Started...✅✅✅")
350
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
351
 
 
 
 
 
 
352
  if not transcript_text:
353
+ st.error(
354
+ "No transcripts or closed captions available in any language. "
355
+ "Please ensure captions are enabled for this video, or try a different video (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ). "
356
+ "If the issue persists, ensure your cookies.txt file is up-to-date."
357
+ )
358
  st.stop()
359
 
360
  if not transcript_text.strip():