Spaces:
Running
Running
Ahmed Mostafa committed on
Commit ·
d842b52
1
Parent(s): 9201bb0
feat: implement multi-provider YouTube transcript downloader and notes API module
Browse files
- src/api/downloader.py +65 -8
- src/api/notes_routes.py +2 -1
src/api/downloader.py
CHANGED
|
@@ -34,12 +34,12 @@ class YouTubeDownloader:
|
|
| 34 |
def __init__(self):
|
| 35 |
self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
|
| 36 |
self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
|
|
|
|
|
|
|
| 37 |
self._strategy = settings.youtube_transcript_strategy
|
| 38 |
|
| 39 |
if self._strategy == "cookies_required":
|
| 40 |
-
logger.info(
|
| 41 |
-
"Transcript strategy 'cookies_required' currently follows YouTube-first ordering until cookie support is added."
|
| 42 |
-
)
|
| 43 |
|
| 44 |
def get_transcript(self, url: str) -> str:
|
| 45 |
video_id = self._extract_video_id(url)
|
|
@@ -185,6 +185,7 @@ class YouTubeDownloader:
|
|
| 185 |
'no_warnings': True,
|
| 186 |
'extract_flat': False,
|
| 187 |
}
|
|
|
|
| 188 |
|
| 189 |
try:
|
| 190 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
@@ -253,12 +254,18 @@ class YouTubeDownloader:
|
|
| 253 |
}
|
| 254 |
],
|
| 255 |
}
|
|
|
|
| 256 |
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
if expected_audio_path.exists() and expected_audio_path.stat().st_size > 0:
|
| 264 |
logger.info("Audio extracted for deep scan: %s", expected_audio_path)
|
|
@@ -272,6 +279,56 @@ class YouTubeDownloader:
|
|
| 272 |
|
| 273 |
raise RuntimeError("Audio extraction completed but no audio file was produced.")
|
| 274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
def cleanup(self, path=None):
|
| 276 |
if path is None:
|
| 277 |
return
|
|
|
|
| 34 |
def __init__(self):
|
| 35 |
self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
|
| 36 |
self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
|
| 37 |
+
self._youtube_cookies = os.environ.get("YOUTUBE_COOKIES", "").strip()
|
| 38 |
+
self._youtube_cookies_file = os.environ.get("YOUTUBE_COOKIES_FILE", "").strip()
|
| 39 |
self._strategy = settings.youtube_transcript_strategy
|
| 40 |
|
| 41 |
if self._strategy == "cookies_required":
|
| 42 |
+
logger.info("Transcript strategy 'cookies_required' enabled.")
|
|
|
|
|
|
|
| 43 |
|
| 44 |
def get_transcript(self, url: str) -> str:
|
| 45 |
video_id = self._extract_video_id(url)
|
|
|
|
| 185 |
'no_warnings': True,
|
| 186 |
'extract_flat': False,
|
| 187 |
}
|
| 188 |
+
self._apply_cookie_options(ydl_opts)
|
| 189 |
|
| 190 |
try:
|
| 191 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
|
|
| 254 |
}
|
| 255 |
],
|
| 256 |
}
|
| 257 |
+
self._apply_cookie_options(ydl_opts)
|
| 258 |
|
| 259 |
+
failures: List[str] = []
|
| 260 |
+
for provider_name, provider in self._build_audio_download_plan(ydl_opts):
|
| 261 |
+
try:
|
| 262 |
+
provider(url, safe_stem)
|
| 263 |
+
break
|
| 264 |
+
except Exception as exc:
|
| 265 |
+
failures.append(f"{provider_name}: {exc}")
|
| 266 |
+
logger.warning("%s audio extraction failed: %s", provider_name, exc)
|
| 267 |
+
else:
|
| 268 |
+
raise RuntimeError(f"Audio extraction failed. {' | '.join(failures)}")
|
| 269 |
|
| 270 |
if expected_audio_path.exists() and expected_audio_path.stat().st_size > 0:
|
| 271 |
logger.info("Audio extracted for deep scan: %s", expected_audio_path)
|
|
|
|
| 279 |
|
| 280 |
raise RuntimeError("Audio extraction completed but no audio file was produced.")
|
| 281 |
|
| 282 |
+
def _build_audio_download_plan(self, ydl_opts: dict) -> List[Tuple[str, Callable[[str, str], None]]]:
|
| 283 |
+
return [
|
| 284 |
+
("yt-dlp", lambda url, _safe_stem: self._download_audio_via_ytdlp(url, ydl_opts)),
|
| 285 |
+
("pytubefix", self._download_audio_via_pytubefix),
|
| 286 |
+
]
|
| 287 |
+
|
| 288 |
+
def _download_audio_via_ytdlp(self, url: str, ydl_opts: dict) -> None:
|
| 289 |
+
import yt_dlp
|
| 290 |
+
|
| 291 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 292 |
+
ydl.extract_info(url, download=True)
|
| 293 |
+
|
| 294 |
+
def _download_audio_via_pytubefix(self, url: str, safe_stem: str) -> None:
|
| 295 |
+
from pytubefix import YouTube
|
| 296 |
+
|
| 297 |
+
try:
|
| 298 |
+
yt = YouTube(url, use_oauth=False, allow_oauth_cache=False)
|
| 299 |
+
stream = yt.streams.filter(only_audio=True).order_by("abr").desc().first()
|
| 300 |
+
if stream is None:
|
| 301 |
+
raise RuntimeError("No audio stream returned by pytubefix.")
|
| 302 |
+
stream.download(
|
| 303 |
+
output_path=str(settings.temp_dir),
|
| 304 |
+
filename=f"{safe_stem}.{stream.subtype or 'mp4'}",
|
| 305 |
+
)
|
| 306 |
+
except Exception as exc:
|
| 307 |
+
raise RuntimeError(f"pytubefix failed: {exc}") from exc
|
| 308 |
+
|
| 309 |
+
def _apply_cookie_options(self, ydl_opts: dict) -> None:
|
| 310 |
+
cookie_file = self._resolve_cookie_file()
|
| 311 |
+
if cookie_file:
|
| 312 |
+
ydl_opts["cookiefile"] = str(cookie_file)
|
| 313 |
+
|
| 314 |
+
def _resolve_cookie_file(self) -> Path | None:
|
| 315 |
+
if self._youtube_cookies_file:
|
| 316 |
+
cookie_path = Path(self._youtube_cookies_file)
|
| 317 |
+
if cookie_path.exists():
|
| 318 |
+
return cookie_path
|
| 319 |
+
logger.warning("YOUTUBE_COOKIES_FILE is set but does not exist: %s", cookie_path)
|
| 320 |
+
|
| 321 |
+
if not self._youtube_cookies:
|
| 322 |
+
return None
|
| 323 |
+
|
| 324 |
+
settings.temp_dir.mkdir(parents=True, exist_ok=True)
|
| 325 |
+
cookie_path = settings.temp_dir / "youtube_cookies.txt"
|
| 326 |
+
cookie_text = self._youtube_cookies.replace("\\n", "\n")
|
| 327 |
+
if not cookie_text.endswith("\n"):
|
| 328 |
+
cookie_text += "\n"
|
| 329 |
+
cookie_path.write_text(cookie_text, encoding="utf-8")
|
| 330 |
+
return cookie_path
|
| 331 |
+
|
| 332 |
def cleanup(self, path=None):
|
| 333 |
if path is None:
|
| 334 |
return
|
src/api/notes_routes.py
CHANGED
|
@@ -356,7 +356,8 @@ def _transcribe_audio_fallback(
|
|
| 356 |
except Exception as exc:
|
| 357 |
raise RuntimeError(
|
| 358 |
"Deep scan failed: audio extraction or transcription could not be completed. "
|
| 359 |
-
"The video may be private, restricted, DRM-protected,
|
|
|
|
| 360 |
) from exc
|
| 361 |
finally:
|
| 362 |
if audio_path is not None:
|
|
|
|
| 356 |
except Exception as exc:
|
| 357 |
raise RuntimeError(
|
| 358 |
"Deep scan failed: audio extraction or transcription could not be completed. "
|
| 359 |
+
"The video may be private, restricted, DRM-protected, unavailable, "
|
| 360 |
+
"or YouTube may require YOUTUBE_COOKIES for this Space."
|
| 361 |
) from exc
|
| 362 |
finally:
|
| 363 |
if audio_path is not None:
|