Spaces:

ATInc1
/

AIdea-Server

Running

App Files Files Community

Ahmed Mostafa committed 17 days ago

Commit

d842b52

1 Parent(s): 9201bb0

feat: implement multi-provider YouTube transcript downloader and notes API module

Browse files

Files changed (2) hide show

src/api/downloader.py +65 -8
src/api/notes_routes.py +2 -1

src/api/downloader.py CHANGED Viewed

@@ -34,12 +34,12 @@ class YouTubeDownloader:
     def __init__(self):
         self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
         self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
         self._strategy = settings.youtube_transcript_strategy
         if self._strategy == "cookies_required":
-            logger.info(
-                "Transcript strategy 'cookies_required' currently follows YouTube-first ordering until cookie support is added."
-            )
     def get_transcript(self, url: str) -> str:
         video_id = self._extract_video_id(url)
@@ -185,6 +185,7 @@ class YouTubeDownloader:
             'no_warnings': True,
             'extract_flat': False,
         }
         try:
             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
@@ -253,12 +254,18 @@ class YouTubeDownloader:
                 }
             ],
         }
-        try:
-            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                ydl.extract_info(url, download=True)
-        except Exception as exc:
-            raise RuntimeError(f"Audio extraction failed: {exc}") from exc
         if expected_audio_path.exists() and expected_audio_path.stat().st_size > 0:
             logger.info("Audio extracted for deep scan: %s", expected_audio_path)
@@ -272,6 +279,56 @@ class YouTubeDownloader:
         raise RuntimeError("Audio extraction completed but no audio file was produced.")
     def cleanup(self, path=None):
         if path is None:
             return

     def __init__(self):
         self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
         self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
+        self._youtube_cookies = os.environ.get("YOUTUBE_COOKIES", "").strip()
+        self._youtube_cookies_file = os.environ.get("YOUTUBE_COOKIES_FILE", "").strip()
         self._strategy = settings.youtube_transcript_strategy
         if self._strategy == "cookies_required":
+            logger.info("Transcript strategy 'cookies_required' enabled.")
     def get_transcript(self, url: str) -> str:
         video_id = self._extract_video_id(url)
             'no_warnings': True,
             'extract_flat': False,
         }
+        self._apply_cookie_options(ydl_opts)
         try:
             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                 }
             ],
         }
+        self._apply_cookie_options(ydl_opts)
+        failures: List[str] = []
+        for provider_name, provider in self._build_audio_download_plan(ydl_opts):
+            try:
+                provider(url, safe_stem)
+                break
+            except Exception as exc:
+                failures.append(f"{provider_name}: {exc}")
+                logger.warning("%s audio extraction failed: %s", provider_name, exc)
+        else:
+            raise RuntimeError(f"Audio extraction failed. {' | '.join(failures)}")
         if expected_audio_path.exists() and expected_audio_path.stat().st_size > 0:
             logger.info("Audio extracted for deep scan: %s", expected_audio_path)
         raise RuntimeError("Audio extraction completed but no audio file was produced.")
+    def _build_audio_download_plan(self, ydl_opts: dict) -> List[Tuple[str, Callable[[str, str], None]]]:
+        return [
+            ("yt-dlp", lambda url, _safe_stem: self._download_audio_via_ytdlp(url, ydl_opts)),
+            ("pytubefix", self._download_audio_via_pytubefix),
+        ]
+    def _download_audio_via_ytdlp(self, url: str, ydl_opts: dict) -> None:
+        import yt_dlp
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.extract_info(url, download=True)
+    def _download_audio_via_pytubefix(self, url: str, safe_stem: str) -> None:
+        from pytubefix import YouTube
+        try:
+            yt = YouTube(url, use_oauth=False, allow_oauth_cache=False)
+            stream = yt.streams.filter(only_audio=True).order_by("abr").desc().first()
+            if stream is None:
+                raise RuntimeError("No audio stream returned by pytubefix.")
+            stream.download(
+                output_path=str(settings.temp_dir),
+                filename=f"{safe_stem}.{stream.subtype or 'mp4'}",
+            )
+        except Exception as exc:
+            raise RuntimeError(f"pytubefix failed: {exc}") from exc
+    def _apply_cookie_options(self, ydl_opts: dict) -> None:
+        cookie_file = self._resolve_cookie_file()
+        if cookie_file:
+            ydl_opts["cookiefile"] = str(cookie_file)
+    def _resolve_cookie_file(self) -> Path | None:
+        if self._youtube_cookies_file:
+            cookie_path = Path(self._youtube_cookies_file)
+            if cookie_path.exists():
+                return cookie_path
+            logger.warning("YOUTUBE_COOKIES_FILE is set but does not exist: %s", cookie_path)
+        if not self._youtube_cookies:
+            return None
+        settings.temp_dir.mkdir(parents=True, exist_ok=True)
+        cookie_path = settings.temp_dir / "youtube_cookies.txt"
+        cookie_text = self._youtube_cookies.replace("\\n", "\n")
+        if not cookie_text.endswith("\n"):
+            cookie_text += "\n"
+        cookie_path.write_text(cookie_text, encoding="utf-8")
+        return cookie_path
     def cleanup(self, path=None):
         if path is None:
             return

src/api/notes_routes.py CHANGED Viewed

@@ -356,7 +356,8 @@ def _transcribe_audio_fallback(
     except Exception as exc:
         raise RuntimeError(
             "Deep scan failed: audio extraction or transcription could not be completed. "
-            "The video may be private, restricted, DRM-protected, or unavailable."
         ) from exc
     finally:
         if audio_path is not None:

     except Exception as exc:
         raise RuntimeError(
             "Deep scan failed: audio extraction or transcription could not be completed. "
+            "The video may be private, restricted, DRM-protected, unavailable, "
+            "or YouTube may require YOUTUBE_COOKIES for this Space."
         ) from exc
     finally:
         if audio_path is not None: