Spaces:

samir72
/

AudioChatTranscriber

Sleeping

AudioChatTranscriber / extract /app /Youtubeextraction.py

samir72

Resolve cookies expiry issue

5c45d3a 5 months ago

6.14 kB

	import os, tempfile, subprocess, re, json, shutil, time
	from fastapi import FastAPI, HTTPException
	from pathlib import Path
	from typing import Optional, Callable, Any
	import yt_dlp
	# from utils.storage import upload_and_sign # To remove circular import issue
	from extract.utils.storage import upload_and_sign # To remove circular import issue
	from extract.utils.retrieve_filepath import retrieve_file_path # To get the file path of cookies.txt
	from extract.utils.cookies_refresher import start_cookies_refresher # To refresh cookies.txt periodically

	app = FastAPI()

	def ensure_ffmpeg():
	"""
	Verify that ffmpeg is available in PATH.
	Raises RuntimeError with helpful guidance if missing.
	Prints ffmpeg version to logs if found.
	"""
	ffmpeg_path = shutil.which("ffmpeg")
	if ffmpeg_path is None:
	raise RuntimeError(
	"FFmpeg not found in PATH.\n\n"
	"👉 For Hugging Face Spaces:\n"
	" • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
	" • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
	"Without ffmpeg, yt-dlp cannot extract/convert audio."
	)

	try:
	result = subprocess.run(
	["ffmpeg", "-version"],
	stdout=subprocess.PIPE,
	stderr=subprocess.STDOUT,
	text=True,
	check=False,
	)
	print("✅ ffmpeg found at:", ffmpeg_path)
	print(result.stdout.splitlines()[0]) # show first line of version info
	except Exception as e:
	raise RuntimeError(f"ffmpeg was found at {ffmpeg_path} but could not run: {e}")

	class YTDLPError(RuntimeError):
	pass

	def _require(bin_name: str):
	if shutil.which(bin_name) is None:
	raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")


	@app.get("/health")
	def health():
	return {"ok": True}

	@app.post("/extract")
	def extract(
	youtube_url: str,
	out_dir: Optional[str] = None,
	target_sr: int = 16000,
	target_channels: int = 1,
	quiet: bool = True,
	keep_intermediate: bool = False,
	progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
	) -> str:
	"""
	Download YouTube audio via yt_dlp's Python API, extract to WAV,
	and post-process with ffmpeg to 16 kHz mono. Returns path to the final WAV.

	Args
	----
	youtube_url : str
	out_dir : Optional[str] Directory for outputs (temp dir if None).
	target_sr : int Sample rate for final WAV (default 16000).
	target_channels : int Channels for final WAV (default 1 = mono).
	quiet : bool Suppress yt-dlp logs if True.
	keep_intermediate : bool Keep the pre-downsampled WAV if True.
	progress_hook : callable Optional yt-dlp progress hook.

	Raises
	------
	YTDLPError on failure.
	"""
	if not youtube_url or not isinstance(youtube_url, str):
	raise ValueError("youtube_url must be a non-empty string.")

	_require("ffmpeg") # we call ffmpeg ourselves
	# yt-dlp bundles ffmpeg via postprocessors, but we still run ffmpeg explicitly

	work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
	work_dir.mkdir(parents=True, exist_ok=True)

	# First stage: let yt-dlp extract WAV (whatever SR/channels)
	out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
	hooks = [progress_hook] if progress_hook else []
	### Use cookies.txt if available
	#cookies_path = retrieve_file_path("cookies.txt")
	#cookies_path = "./app/utils/cookies.txt"
	# Call the cookies refresher to start refreshing cookies in background
	start_cookies_refresher()
	cookies_path = os.getenv("COOKIES_PATH")
	print(f"cookies_path value: {cookies_path}")
	if not cookies_path:
	cookies_path = None
	print("Cookie file NOT found in container!")
	return f"User authentication cookie file NOT found in container! Please try again later."

	ydl_opts = {
	"cookiefile": cookies_path,
	"format": "bestaudio/best",
	"outtmpl": out_template,
	"noplaylist": True,
	"postprocessors": [
	{
	"key": "FFmpegExtractAudio",
	"preferredcodec": "wav",
	"preferredquality": "0",
	}
	],
	"quiet": quiet,
	"verbose": not quiet,
	"no_warnings": quiet,
	"progress_hooks": hooks,
	}

	try:
	with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	ydl.extract_info(youtube_url, download=True)
	except Exception as e:
	#raise YTDLPError(f"yt-dlp API failed: {e}") from e
	return f"yt-dlp API failed: {e}"

	# Locate the produced WAV (pre-downsampled)
	pre_wavs = list(work_dir.glob("*.wav"))
	if not pre_wavs:
	#raise YTDLPError("yt-dlp completed but no WAV was found.")
	return "yt-dlp completed but no WAV was found."
	pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

	# Second stage: force 16 kHz mono via ffmpeg
	final_wav = pre_wav.with_name(pre_wav.stem + f".{target_sr}Hz.{target_channels}ch.wav")
	try:
	subprocess.run(
	[
	"ffmpeg", "-y",
	"-i", str(pre_wav),
	"-ac", str(target_channels),
	"-ar", str(target_sr),
	str(final_wav),
	],
	check=True,
	stdout=subprocess.PIPE if quiet else None,
	stderr=subprocess.PIPE if quiet else None,
	text=True,
	)
	except subprocess.CalledProcessError as e:
	#raise YTDLPError(f"ffmpeg failed to resample: {e.stderr or e.stdout}") from e
	return f"ffmpeg failed to resample: {e.stderr or e.stdout}"

	# 3) upload + sign (short-lived)
	signed = upload_and_sign(final_wav, ttl_minutes=45)

	# Clean up intermediates if desired
	if not keep_intermediate:
	try:
	if pre_wav.exists() and pre_wav != final_wav:
	pre_wav.unlink()
	except Exception:
	pass

	return signed