Ahmed Mostafa committed on
Commit
d842b52
·
1 Parent(s): 9201bb0

feat: implement multi-provider YouTube transcript downloader and notes API module

Browse files
Files changed (2) hide show
  1. src/api/downloader.py +65 -8
  2. src/api/notes_routes.py +2 -1
src/api/downloader.py CHANGED
@@ -34,12 +34,12 @@ class YouTubeDownloader:
34
  def __init__(self):
35
  self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
36
  self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
 
 
37
  self._strategy = settings.youtube_transcript_strategy
38
 
39
  if self._strategy == "cookies_required":
40
- logger.info(
41
- "Transcript strategy 'cookies_required' currently follows YouTube-first ordering until cookie support is added."
42
- )
43
 
44
  def get_transcript(self, url: str) -> str:
45
  video_id = self._extract_video_id(url)
@@ -185,6 +185,7 @@ class YouTubeDownloader:
185
  'no_warnings': True,
186
  'extract_flat': False,
187
  }
 
188
 
189
  try:
190
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
@@ -253,12 +254,18 @@ class YouTubeDownloader:
253
  }
254
  ],
255
  }
 
256
 
257
- try:
258
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
259
- ydl.extract_info(url, download=True)
260
- except Exception as exc:
261
- raise RuntimeError(f"Audio extraction failed: {exc}") from exc
 
 
 
 
 
262
 
263
  if expected_audio_path.exists() and expected_audio_path.stat().st_size > 0:
264
  logger.info("Audio extracted for deep scan: %s", expected_audio_path)
@@ -272,6 +279,56 @@ class YouTubeDownloader:
272
 
273
  raise RuntimeError("Audio extraction completed but no audio file was produced.")
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  def cleanup(self, path=None):
276
  if path is None:
277
  return
 
34
  def __init__(self):
35
  self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
36
  self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
37
+ self._youtube_cookies = os.environ.get("YOUTUBE_COOKIES", "").strip()
38
+ self._youtube_cookies_file = os.environ.get("YOUTUBE_COOKIES_FILE", "").strip()
39
  self._strategy = settings.youtube_transcript_strategy
40
 
41
  if self._strategy == "cookies_required":
42
+ logger.info("Transcript strategy 'cookies_required' enabled.")
 
 
43
 
44
  def get_transcript(self, url: str) -> str:
45
  video_id = self._extract_video_id(url)
 
185
  'no_warnings': True,
186
  'extract_flat': False,
187
  }
188
+ self._apply_cookie_options(ydl_opts)
189
 
190
  try:
191
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
 
254
  }
255
  ],
256
  }
257
+ self._apply_cookie_options(ydl_opts)
258
 
259
+ failures: List[str] = []
260
+ for provider_name, provider in self._build_audio_download_plan(ydl_opts):
261
+ try:
262
+ provider(url, safe_stem)
263
+ break
264
+ except Exception as exc:
265
+ failures.append(f"{provider_name}: {exc}")
266
+ logger.warning("%s audio extraction failed: %s", provider_name, exc)
267
+ else:
268
+ raise RuntimeError(f"Audio extraction failed. {' | '.join(failures)}")
269
 
270
  if expected_audio_path.exists() and expected_audio_path.stat().st_size > 0:
271
  logger.info("Audio extracted for deep scan: %s", expected_audio_path)
 
279
 
280
  raise RuntimeError("Audio extraction completed but no audio file was produced.")
281
 
282
def _build_audio_download_plan(self, ydl_opts: dict) -> List[Tuple[str, Callable[[str, str], None]]]:
    """Return the ordered audio download providers to try.

    Each entry is a ``(provider_name, downloader)`` pair, and every
    downloader is invoked as ``downloader(url, safe_stem)``. yt-dlp comes
    first, pytubefix second.

    Args:
        ydl_opts: Options dict handed to yt-dlp when its provider runs.

    Returns:
        The list of ``(name, callable)`` pairs in attempt order.
    """

    def run_ytdlp(url: str, _safe_stem: str) -> None:
        # The yt-dlp provider is driven by ydl_opts; the stem argument is
        # accepted only so both providers share one call signature.
        return self._download_audio_via_ytdlp(url, ydl_opts)

    plan = [("yt-dlp", run_ytdlp)]
    plan.append(("pytubefix", self._download_audio_via_pytubefix))
    return plan
287
+
288
def _download_audio_via_ytdlp(self, url: str, ydl_opts: dict) -> None:
    """Download ``url`` through yt-dlp using the given options.

    Args:
        url: Video URL passed to ``extract_info``.
        ydl_opts: Options dict used to construct the ``YoutubeDL`` instance.

    Any exception raised by yt-dlp propagates to the caller unchanged.
    """
    # Imported lazily, inside the method, as in the original code.
    from yt_dlp import YoutubeDL

    with YoutubeDL(ydl_opts) as downloader:
        downloader.extract_info(url, download=True)
293
+
294
def _download_audio_via_pytubefix(self, url: str, safe_stem: str) -> None:
    """Download the highest-bitrate audio-only stream of ``url`` via pytubefix.

    The file is saved in ``settings.temp_dir`` as ``<safe_stem>.<subtype>``,
    using ``mp4`` as the extension when the stream reports no subtype.

    Args:
        url: Video URL handed to ``pytubefix.YouTube``.
        safe_stem: File name stem (without extension) for the saved audio.

    Raises:
        RuntimeError: If no audio stream is available or any pytubefix
            call fails; the original exception is chained as the cause.
    """
    from pytubefix import YouTube

    try:
        video = YouTube(url, use_oauth=False, allow_oauth_cache=False)
        audio_only = video.streams.filter(only_audio=True)
        # Sort by audio bitrate, best first, and take the top candidate.
        best = audio_only.order_by("abr").desc().first()
        if best is None:
            raise RuntimeError("No audio stream returned by pytubefix.")
        target_dir = str(settings.temp_dir)
        extension = best.subtype or 'mp4'
        best.download(output_path=target_dir, filename=f"{safe_stem}.{extension}")
    except Exception as exc:
        raise RuntimeError(f"pytubefix failed: {exc}") from exc
308
+
309
def _apply_cookie_options(self, ydl_opts: dict) -> None:
    """Set ``ydl_opts["cookiefile"]`` in place when a cookie file resolves.

    Args:
        ydl_opts: yt-dlp options dict to mutate. It is left untouched when
            ``_resolve_cookie_file`` returns nothing.
    """
    resolved = self._resolve_cookie_file()
    if not resolved:
        return
    ydl_opts["cookiefile"] = str(resolved)
313
+
314
def _resolve_cookie_file(self) -> Path | None:
    """Return the path of the cookie file to use, or ``None`` if none is configured.

    Resolution order:

    1. ``self._youtube_cookies_file`` when it is set and the path exists.
       If it is set but missing, a warning is logged and resolution
       continues with step 2.
    2. ``self._youtube_cookies`` (inline cookie text). Literal ``\\n``
       sequences are turned into real newlines, a trailing newline is
       ensured, and the text is written to
       ``settings.temp_dir / "youtube_cookies.txt"``.

    The generated file holds session credentials, so it is created with
    owner-only permissions (0o600) rather than the process default.

    Returns:
        Path to the cookie file, or ``None`` when neither source is usable.
    """
    if self._youtube_cookies_file:
        cookie_path = Path(self._youtube_cookies_file)
        if cookie_path.exists():
            return cookie_path
        logger.warning("YOUTUBE_COOKIES_FILE is set but does not exist: %s", cookie_path)

    if not self._youtube_cookies:
        return None

    settings.temp_dir.mkdir(parents=True, exist_ok=True)
    cookie_path = settings.temp_dir / "youtube_cookies.txt"
    # Environment variables often carry newlines as the two characters "\n".
    cookie_text = self._youtube_cookies.replace("\\n", "\n")
    if not cookie_text.endswith("\n"):
        cookie_text += "\n"

    # Create the file owner-readable only, so the secret is never exposed
    # through umask-dependent default permissions.
    fd = os.open(cookie_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as handle:
        handle.write(cookie_text)
    # The mode passed to os.open only applies on creation; tighten a file
    # left over from an earlier run as well.
    os.chmod(cookie_path, 0o600)
    return cookie_path
331
+
332
  def cleanup(self, path=None):
333
  if path is None:
334
  return
src/api/notes_routes.py CHANGED
@@ -356,7 +356,8 @@ def _transcribe_audio_fallback(
356
  except Exception as exc:
357
  raise RuntimeError(
358
  "Deep scan failed: audio extraction or transcription could not be completed. "
359
- "The video may be private, restricted, DRM-protected, or unavailable."
 
360
  ) from exc
361
  finally:
362
  if audio_path is not None:
 
356
  except Exception as exc:
357
  raise RuntimeError(
358
  "Deep scan failed: audio extraction or transcription could not be completed. "
359
+ "The video may be private, restricted, DRM-protected, unavailable, "
360
+ "or YouTube may require YOUTUBE_COOKIES for this Space."
361
  ) from exc
362
  finally:
363
  if audio_path is not None: