# Spaces: ATInc1 / AIdea-Server (Running)
# Last commit: 3d4b8b5 by Ahmed Mostafa - "fix RapidAPI v1.2" (8 days ago)
# File size: 36.4 kB

	import json
	import base64
	import binascii
	import html
	import http.cookiejar
	import inspect
	import logging
	import os
	import re
	import time
	import requests
	from curl_cffi import requests as curl_requests
	from pathlib import Path
	from typing import Callable, List, Tuple

	from src.utils.config import settings


	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# Substrings searched for in an exception's text to decide that an SSL/TLS
	# failure should not be retried (see _is_fast_fail_ssl_error).
	# NOTE(review): the bare "TLS" entry already matches every message that
	# contains "TLS connect error", and any other text mentioning TLS - confirm
	# that this breadth is intended.
	_FAST_FAIL_SSL_MARKERS = (
	    "UNEXPECTED_EOF_WHILE_READING",
	    "SSLEOFError",
	    "EOF occurred in violation of protocol",
	    "TLS",
	    "TLS connect error",
	    "invalid library",
	)


	def _is_fast_fail_ssl_error(exc: Exception) -> bool:
	    """Tell whether the exception text contains a fast-fail SSL/TLS marker.

	    Args:
	        exc: The exception whose ``str()`` form is inspected.

	    Returns:
	        True when any entry of ``_FAST_FAIL_SSL_MARKERS`` occurs as a
	        substring of the exception text, False otherwise.
	    """
	    message = str(exc)
	    for marker in _FAST_FAIL_SSL_MARKERS:
	        if marker in message:
	            return True
	    return False


	class TranscriptProviderError(RuntimeError):
	    """Signals that one transcript provider could not produce usable transcript text."""


	class YouTubeDownloader:
	def __init__(self):
	    """Load provider credentials and network settings from the environment.

	    Every environment value is whitespace-stripped. The proxy URL is taken
	    from ``PROXY_URL`` and falls back to ``YOUTUBE_PROXY``. After loading,
	    the Invidious instance list is resolved, the transcript strategy is read
	    from ``settings`` and the proxy environment is configured.
	    """

	    def read_env(name: str, default: str = "") -> str:
	        # Single place for the "get, then strip" pattern used for every key.
	        return os.environ.get(name, default).strip()

	    self._assemblyai_key = read_env("ASSEMBLYAI_API_KEY")
	    self._supadata_key = read_env("SUPADATA_API_KEY")
	    self._rapidapi_key = read_env("RAPIDAPI_KEY")
	    self._rapidapi_host = read_env("RAPIDAPI_HOST", "youtube-transcriptor.p.rapidapi.com")
	    self._youtube_cookies = read_env("YOUTUBE_COOKIES")
	    self._youtube_cookies_b64 = read_env("YOUTUBE_COOKIES_B64")
	    self._youtube_cookies_path = read_env("YOUTUBE_COOKIES_PATH")
	    self._youtube_cookies_file = read_env("YOUTUBE_COOKIES_FILE")
	    self._youtube_po_token = read_env("YOUTUBE_PO_TOKEN")
	    self._youtube_po_token_client = read_env("YOUTUBE_PO_TOKEN_CLIENT", "web")
	    self._youtube_po_token_context = read_env("YOUTUBE_PO_TOKEN_CONTEXT", "gvs")
	    self._proxy_url = read_env("PROXY_URL") or read_env("YOUTUBE_PROXY")
	    self._invidious_instances = self._load_invidious_instances()
	    self._strategy = settings.youtube_transcript_strategy
	    self._configure_proxy_environment()

	    if self._strategy == "cookies_required":
	        logger.info("Transcript strategy 'cookies_required' enabled.")

	def get_transcript(self, url: str) -> str:
	    """Return transcript text for a YouTube URL, trying each provider in order.

	    Providers come from ``_build_provider_plan``. The first one that returns
	    non-empty text wins; a provider that raises or returns empty text is
	    recorded as a failure and the next one is tried.

	    Args:
	        url: A YouTube URL from which the video ID is extracted.

	    Returns:
	        The transcript text of the first successful provider.

	    Raises:
	        RuntimeError: When every provider failed. The message lists each
	            provider's failure, separated by " | ".
	    """
	    video_id = self._extract_video_id(url)
	    logger.info("Transcript pipeline for video ID %s using strategy=%s", video_id, self._strategy)

	    failures: List[str] = []
	    providers = self._build_provider_plan()
	    provider_count = len(providers)

	    for index, (provider_name, provider) in enumerate(providers, start=1):
	        try:
	            logger.info("Trying transcript strategy: %s", provider_name)
	            transcript = provider(video_id)
	            if transcript:
	                return transcript
	            # Treat an empty result like any other provider failure.
	            raise TranscriptProviderError(f"{provider_name} returned empty transcript text.")
	        except Exception as exc:
	            failures.append(f"{provider_name}: {exc}")
	            if index < provider_count:
	                logger.info("%s transcript provider unavailable, trying next fallback.", provider_name)
	            else:
	                logger.error("All transcript providers failed for %s.", video_id)

	    # Build the summary outside the f-string: a backslash inside an f-string
	    # expression is a syntax error before Python 3.12.
	    failure_summary = " | ".join(failures)
	    raise RuntimeError(
	        f"All transcript strategies exhausted for {video_id}. "
	        f"Failures: {failure_summary}"
	    )

	def _build_provider_plan(self) -> List[Tuple[str, Callable[[str], str]]]:
	return [
	("RapidAPI", self._get_transcript_via_rapidapi),
	("RapidAPI-v2", self._get_transcript_via_rapidapi_v2),
	("Supadata", self._get_transcript_via_supadata),
	("YouTube Transcript API", self._get_transcript_via_youtube),
	("yt-dlp", self._get_transcript_via_ytdlp),
	("pytubefix captions", self._get_transcript_via_pytubefix),
	]

	def _get_transcript_via_youtube(self, video_id: str) -> str:
	    """Fetch a transcript through the ``youtube_transcript_api`` package.

	    Three API shapes of the library are probed with ``hasattr``, in order:
	    the class-level ``get_transcript``, the instance-level ``fetch`` (given
	    a requests session carrying cookies and proxies), and finally
	    ``list_transcripts`` with manual, then generated, then first-available
	    track selection.

	    Up to three attempts are made with a growing pause (1.5s, 3.0s) between
	    them. An error recognised by ``_is_fast_fail_ssl_error`` stops the
	    retries immediately.

	    Args:
	        video_id: The YouTube video ID.

	    Returns:
	        The joined transcript text.

	    Raises:
	        TranscriptProviderError: When the package cannot be imported or all
	            attempts fail; the last error is chained as the cause.
	    """
	    # Import once, up front: a missing package cannot be fixed by retrying.
	    try:
	        from youtube_transcript_api import YouTubeTranscriptApi
	    except ImportError as exc:
	        raise TranscriptProviderError(f"YouTube Transcript API failed: {exc}") from exc

	    last_error: Exception | None = None
	    languages = ["en", "ar", "en-US"]
	    cookie_file = self._resolve_cookie_file()
	    proxies = self._requests_proxies()
	    max_attempts = 3

	    for attempt in range(1, max_attempts + 1):
	        try:
	            if hasattr(YouTubeTranscriptApi, "get_transcript"):
	                kwargs = {"languages": languages}
	                if cookie_file:
	                    kwargs["cookies"] = str(cookie_file)
	                if proxies:
	                    kwargs["proxies"] = proxies

	                data = YouTubeTranscriptApi.get_transcript(video_id, **kwargs)
	                logger.info("YouTube Transcript API get_transcript succeeded on attempt %s", attempt)
	                return self._join_transcript_entries(data)

	            session = self._build_requests_session(cookie_file)
	            try:
	                api = YouTubeTranscriptApi(http_client=session)
	                if hasattr(api, "fetch"):
	                    data = api.fetch(video_id, languages=languages)
	                    logger.info("YouTube Transcript API fetch succeeded on attempt %s", attempt)
	                    return self._join_transcript_entries(data)
	            finally:
	                # The session is only needed for this attempt; release its connections.
	                session.close()

	            list_kwargs = {}
	            if cookie_file:
	                list_kwargs["cookies"] = str(cookie_file)
	            if proxies:
	                list_kwargs["proxies"] = proxies

	            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, **list_kwargs)
	            try:
	                transcript = transcript_list.find_manually_created_transcript(languages)
	            except Exception:
	                try:
	                    transcript = transcript_list.find_generated_transcript(languages)
	                except Exception:
	                    # No preferred language: take whatever track comes first.
	                    transcript = next(iter(transcript_list))

	            entries = transcript.fetch()
	            logger.info("YouTube Transcript API succeeded on attempt %s", attempt)
	            return self._join_transcript_entries(entries)
	        except Exception as exc:
	            last_error = exc
	            if _is_fast_fail_ssl_error(exc):
	                break
	            if attempt < max_attempts:
	                time.sleep(1.5 * attempt)

	    raise TranscriptProviderError(f"YouTube Transcript API failed: {last_error}") from last_error

	def _get_transcript_via_rapidapi(self, video_id: str) -> str:
	    """Fetch a transcript from the configured RapidAPI transcript host.

	    The response is expected to be a list whose first item carries either
	    ``transcriptionAsText`` (preferred) or a ``transcription`` list of
	    segments with a ``subtitle`` field.

	    Args:
	        video_id: The YouTube video ID.

	    Returns:
	        The stripped transcript text.

	    Raises:
	        TranscriptProviderError: When no key is configured, the request
	            fails, the API reports an error, or no usable text is found.
	    """
	    import httpx

	    if not self._rapidapi_key:
	        raise TranscriptProviderError("RapidAPI key not configured.")

	    url = f"https://{self._rapidapi_host}/transcript"
	    headers = {
	        "x-rapidapi-key": self._rapidapi_key,
	        "x-rapidapi-host": self._rapidapi_host,
	    }
	    params = {
	        "video_id": video_id,
	        "lang": "en",
	    }

	    try:
	        with httpx.Client(timeout=30) as client:
	            response = client.get(url, headers=headers, params=params)
	            response.raise_for_status()
	            data = response.json()
	    except Exception as exc:
	        raise TranscriptProviderError(f"RapidAPI request failed: {exc}") from exc

	    if isinstance(data, dict) and "error" in data:
	        # Same exception type as every other provider failure in this class.
	        raise TranscriptProviderError(f"RapidAPI: {data['error']}")

	    item = data[0] if isinstance(data, list) and data else None
	    if isinstance(item, dict):
	        # 1. Prefer the ready-made full text. Coerce to str first so a
	        #    non-string value cannot crash on .strip().
	        full_text = str(item.get("transcriptionAsText") or "").strip()
	        if full_text:
	            logger.info("RapidAPI transcript (full text) fetched successfully (%d chars).", len(full_text))
	            return full_text

	        # 2. Otherwise join the per-segment subtitles.
	        segments = item.get("transcription", [])
	        if isinstance(segments, list) and segments:
	            transcript = " ".join(
	                str(seg.get("subtitle", "")) for seg in segments if isinstance(seg, dict)
	            ).strip()
	            if transcript:
	                logger.info("RapidAPI transcript (segments) fetched successfully (%d chars).", len(transcript))
	                return transcript

	    # 3. Nothing usable: keep the raw payload in the log for diagnosis.
	    logger.warning("RapidAPI raw response: %s", data)
	    raise TranscriptProviderError("RapidAPI response did not contain usable transcript content.")

	def _get_transcript_via_rapidapi_v2(self, video_id: str) -> str:
	    """Fetch a transcript from the RapidAPI "transcribe" endpoint.

	    The JSON response is probed for a text field (``transcript``,
	    ``content``, ``text``, ``result``) and then for a segment list
	    (``segments``, ``transcription``, ``words``) whose items carry ``text``.

	    Args:
	        video_id: The YouTube video ID.

	    Returns:
	        The stripped transcript text.

	    Raises:
	        TranscriptProviderError: When no key is configured, the request
	            fails, the API reports an error, or no usable text is found.
	    """
	    import httpx

	    if not self._rapidapi_key:
	        raise TranscriptProviderError("RapidAPI key not configured.")

	    url = "https://youtube-transcripts-transcribe-youtube-video-to-text.p.rapidapi.com/transcribe"
	    headers = {
	        "x-rapidapi-key": self._rapidapi_key,
	        "x-rapidapi-host": "youtube-transcripts-transcribe-youtube-video-to-text.p.rapidapi.com",
	        "Content-Type": "application/json",
	    }
	    payload = {
	        "url": f"https://www.youtube.com/watch?v={video_id}"
	    }

	    try:
	        with httpx.Client(timeout=60) as client:
	            response = client.post(url, headers=headers, json=payload)
	            response.raise_for_status()
	            data = response.json()
	    except Exception as exc:
	        raise TranscriptProviderError(f"RapidAPI-v2 request failed: {exc}") from exc

	    # Log a short preview of the raw response for debugging (lazy %-args).
	    logger.info(
	        "[RapidAPI-v2 DEBUG] status=%s preview=%s",
	        response.status_code,
	        str(data)[:300],
	    )

	    if isinstance(data, dict) and "error" in data:
	        # Same exception type as every other provider failure in this class.
	        raise TranscriptProviderError(f"RapidAPI-v2: {data['error']}")

	    if isinstance(data, dict):
	        # Plain-text fields first.
	        for key in ("transcript", "content", "text", "result"):
	            value = data.get(key)
	            if isinstance(value, str) and value.strip():
	                return value.strip()

	        # Fallback: join a segment list. A missing or None "text" becomes ""
	        # so that str.join never receives a non-string.
	        for key in ("segments", "transcription", "words"):
	            segments = data.get(key)
	            if isinstance(segments, list):
	                joined = " ".join(
	                    str(seg.get("text") or "") for seg in segments
	                    if isinstance(seg, dict)
	                ).strip()
	                if joined:
	                    return joined

	    raise TranscriptProviderError("RapidAPI-v2 response did not contain usable transcript content.")

	def _get_transcript_via_supadata(self, video_id: str) -> str:
	    """Fetch a transcript from the Supadata transcript endpoint.

	    The response may carry the text under ``content`` (a string or a list
	    of segments) or under ``segments``.

	    Args:
	        video_id: The YouTube video ID.

	    Returns:
	        The stripped transcript text.

	    Raises:
	        TranscriptProviderError: When no key is configured, the request
	            fails, or the response holds no usable text.
	    """
	    if not self._supadata_key:
	        raise TranscriptProviderError("Supadata API key not configured.")

	    clean_url = f"https://www.youtube.com/watch?v={video_id}"
	    headers = {
	        "x-api-key": self._supadata_key,
	        "User-Agent": (
	            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	            "AppleWebKit/537.36 (KHTML, like Gecko) "
	            "Chrome/120.0.0.0 Safari/537.36"
	        ),
	    }
	    try:
	        resp = curl_requests.get(
	            "https://api.supadata.ai/v1/youtube/transcript",
	            # Pass the query as params so the nested watch URL ("?", "=", "/")
	            # is percent-encoded instead of being spliced raw into the query.
	            params={"url": clean_url, "text": "true"},
	            headers=headers,
	            impersonate="chrome124",
	            timeout=30,
	            proxies=self._requests_proxies() or None,
	        )
	        resp.raise_for_status()
	        data = resp.json()
	    except Exception as exc:
	        raise TranscriptProviderError(f"Supadata request failed: {exc}") from exc

	    if not isinstance(data, dict):
	        # A list or scalar payload has no .get(); report it as a provider failure.
	        raise TranscriptProviderError("Supadata response did not include usable transcript content.")

	    def join_segments(items) -> str:
	        # Join the "text" of each dict segment; None or missing text becomes "".
	        return " ".join(
	            str(seg.get("text") or "") for seg in items if isinstance(seg, dict)
	        ).strip()

	    # "content" is either plain text or a list of segments.
	    text = ""
	    content_val = data.get("content")
	    if isinstance(content_val, str) and content_val.strip():
	        text = content_val.strip()
	    elif isinstance(content_val, list):
	        text = join_segments(content_val)

	    # Fall back to the "segments" key when "content" yielded nothing.
	    if not text:
	        segments = data.get("segments", [])
	        if segments and isinstance(segments, list):
	            text = join_segments(segments)

	    if not text:
	        raise TranscriptProviderError("Supadata response did not include usable transcript content.")

	    logger.info("Supadata transcript fetched successfully (%d chars).", len(text))
	    return text

	def _get_transcript_via_pytubefix(self, video_id: str) -> str:
	    """Read a caption track through pytubefix and convert it to plain text.

	    Args:
	        video_id: The YouTube video ID.

	    Returns:
	        The caption text.

	    Raises:
	        TranscriptProviderError: When pytubefix cannot be imported, exposes
	            no captions, offers no selectable track, yields empty text, or
	            raises any other error (which is wrapped).
	    """
	    watch_url = f"https://www.youtube.com/watch?v={video_id}"

	    try:
	        from pytubefix import YouTube
	    except Exception as exc:
	        raise TranscriptProviderError(f"pytubefix import failed: {exc}") from exc

	    try:
	        video = YouTube(watch_url, **self._pytubefix_init_kwargs(YouTube))
	        tracks = getattr(video, "captions", None)
	        if not tracks:
	            raise TranscriptProviderError("pytubefix returned no captions.")

	        chosen = self._select_pytubefix_caption(tracks)
	        if chosen is None:
	            raise TranscriptProviderError("pytubefix found no preferred caption track.")

	        transcript = self._caption_to_text(chosen)
	        if not transcript:
	            raise TranscriptProviderError("pytubefix caption track was empty.")
	    except TranscriptProviderError:
	        # Already carries a precise message; do not wrap it again.
	        raise
	    except Exception as exc:
	        raise TranscriptProviderError(f"pytubefix captions failed: {exc}") from exc

	    logger.info("pytubefix captions fetched successfully (%d chars).", len(transcript))
	    return transcript

	def _get_transcript_via_ytdlp(self, video_id: str) -> str:
	    """Fetch subtitles by letting yt-dlp resolve the subtitle track URLs.

	    yt-dlp only extracts metadata (nothing is downloaded by it). For each
	    preferred language (en, en-US, ar) manual subtitles are tried before
	    automatic captions, and within a track the json3 format is tried before
	    vtt. A track that yields empty text is skipped so later tracks still
	    get a chance.

	    Args:
	        video_id: The YouTube video ID.

	    Returns:
	        The subtitle text.

	    Raises:
	        TranscriptProviderError: When yt-dlp is unavailable, extraction or
	            download fails, or no track yields text.
	    """
	    try:
	        import yt_dlp
	    except ImportError as exc:
	        raise TranscriptProviderError(f"yt-dlp failed: {exc}") from exc

	    url = f"https://www.youtube.com/watch?v={video_id}"

	    # Quiet, metadata-only run that asks for manual and automatic subtitles.
	    ydl_opts = {
	        'skip_download': True,
	        'writesubtitles': True,
	        'writeautomaticsubs': True,
	        'subtitleslangs': ['en', 'ar', 'en-US'],
	        'quiet': True,
	        'no_warnings': True,
	        'extract_flat': False,
	    }
	    self._apply_youtube_network_options(ydl_opts)
	    self._apply_cookie_options(ydl_opts)
	    proxies = self._requests_proxies() or None

	    def fetch(track_url: str):
	        # Bounded, status-checked download of one subtitle track.
	        resp = curl_requests.get(
	            track_url,
	            impersonate="chrome124",
	            proxies=proxies,
	            timeout=30,
	        )
	        resp.raise_for_status()
	        return resp

	    def track_url_for(formats, ext: str):
	        # First format entry of the given extension that has a URL.
	        return next(
	            (f["url"] for f in formats if f.get("ext") == ext and f.get("url")),
	            None,
	        )

	    def vtt_to_text(vtt_text: str) -> str:
	        # Drop the WEBVTT header block, cue numbers, timing lines and tags.
	        in_header = vtt_text.lstrip("\ufeff").startswith("WEBVTT")
	        kept = []
	        for line in vtt_text.splitlines():
	            stripped = line.strip()
	            if in_header:
	                # The header block ends at the first blank line.
	                if not stripped:
	                    in_header = False
	                continue
	            if not stripped or '-->' in stripped or stripped.isdigit():
	                continue
	            clean = re.sub(r'<[^>]+>', '', stripped).strip()
	            if clean:
	                kept.append(clean)
	        return " ".join(kept).strip()

	    try:
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            info = ydl.extract_info(url, download=False)

	        subtitles = info.get('subtitles') or {}
	        auto_subs = info.get('automatic_captions') or {}

	        for lang in ('en', 'en-US', 'ar'):
	            # Manual subtitles first, then automatic captions.
	            for source in (subtitles, auto_subs):
	                formats = source.get(lang) or []

	                json3_url = track_url_for(formats, 'json3')
	                if json3_url:
	                    text = self._join_caption_json(fetch(json3_url).json())
	                    if text:
	                        return text

	                vtt_url = track_url_for(formats, 'vtt')
	                if vtt_url:
	                    text = vtt_to_text(fetch(vtt_url).text)
	                    if text:
	                        return text

	        raise TranscriptProviderError("No usable subtitle formats found via yt-dlp.")
	    except Exception as exc:
	        raise TranscriptProviderError(f"yt-dlp failed: {exc}") from exc
	    finally:
	        # Remove the throwaway cookie file, but never a file the user
	        # configured explicitly (it may itself live under a tmp/temp path).
	        cookiefile = ydl_opts.get("cookiefile")
	        if cookiefile:
	            user_supplied = {
	                Path(p)
	                for p in (self._youtube_cookies_path, self._youtube_cookies_file)
	                if p
	            }
	            lowered = str(cookiefile).lower()
	            if (
	                Path(cookiefile) not in user_supplied
	                and os.path.exists(cookiefile)
	                and ("tmp" in lowered or "temp" in lowered)
	            ):
	                try:
	                    os.remove(cookiefile)
	                except OSError:
	                    pass
	def download_audio(self, url: str, output_stem: str) -> Path:
	    """Download the audio of a video into ``settings.temp_dir``.

	    The providers from ``_build_audio_download_plan`` are tried in order
	    until one succeeds. The resulting file is ``<stem>.mp3`` when present,
	    otherwise the first non-empty ``<stem>.*`` file.

	    Args:
	        url: The video URL.
	        output_stem: Desired file stem; characters outside
	            ``[A-Za-z0-9_-]`` are replaced by underscores.

	    Returns:
	        Path to the downloaded audio file.

	    Raises:
	        RuntimeError: When every provider fails (the message lists each
	            failure, separated by " | ") or no audio file was produced.
	    """
	    # yt_dlp is imported by _download_audio_via_ytdlp itself, so a missing
	    # yt-dlp installation only fails that provider and not the fallbacks.
	    settings.temp_dir.mkdir(parents=True, exist_ok=True)
	    safe_stem = re.sub(r"[^A-Za-z0-9_-]+", "_", output_stem).strip("_") or "audio"
	    output_template = str(settings.temp_dir / f"{safe_stem}.%(ext)s")
	    expected_audio_path = settings.temp_dir / f"{safe_stem}.mp3"

	    ydl_opts = {
	        "format": "bestaudio/best",
	        "outtmpl": output_template,
	        "quiet": True,
	        "no_warnings": True,
	        "noplaylist": True,
	        "postprocessors": [
	            {
	                "key": "FFmpegExtractAudio",
	                "preferredcodec": "mp3",
	                "preferredquality": "128",
	            }
	        ],
	    }
	    self._apply_youtube_network_options(ydl_opts)
	    self._apply_cookie_options(ydl_opts)

	    failures: List[str] = []
	    try:
	        for provider_name, provider in self._build_audio_download_plan(ydl_opts):
	            try:
	                provider(url, safe_stem)
	                break
	            except Exception as exc:
	                failures.append(f"{provider_name}: {exc}")
	                logger.warning("%s audio extraction failed: %s", provider_name, exc)
	        else:
	            # Loop ended without break: every provider failed.
	            auth_hint = ""
	            if self._looks_like_youtube_auth_block(failures) and not self._has_youtube_auth():
	                auth_hint = (
	                    " YouTube authentication is required for this video/Space. "
	                    "Set YOUTUBE_COOKIES_B64 (recommended) or YOUTUBE_COOKIES, "
	                    "and optionally YOUTUBE_PO_TOKEN."
	                )
	            failure_summary = " | ".join(failures)
	            raise RuntimeError(f"Audio extraction failed.{auth_hint} {failure_summary}")

	        if expected_audio_path.exists() and expected_audio_path.stat().st_size > 0:
	            logger.info("Audio extracted for deep scan: %s", expected_audio_path)
	            return expected_audio_path

	        for candidate in sorted(settings.temp_dir.glob(f"{safe_stem}.*")):
	            # Skip leftovers of an interrupted yt-dlp download.
	            if candidate.suffix in (".part", ".ytdl"):
	                continue
	            if candidate.is_file() and candidate.stat().st_size > 0:
	                logger.info("Audio extracted for deep scan: %s", candidate)
	                return candidate

	        raise RuntimeError("Audio extraction completed but no audio file was produced.")
	    finally:
	        # Remove the throwaway cookie file, but never a file the user
	        # configured explicitly (it may itself live under a tmp/temp path).
	        cookiefile = ydl_opts.get("cookiefile")
	        if cookiefile:
	            user_supplied = {
	                Path(p)
	                for p in (self._youtube_cookies_path, self._youtube_cookies_file)
	                if p
	            }
	            lowered = str(cookiefile).lower()
	            if (
	                Path(cookiefile) not in user_supplied
	                and os.path.exists(cookiefile)
	                and ("tmp" in lowered or "temp" in lowered)
	            ):
	                try:
	                    os.remove(cookiefile)
	                except OSError:
	                    pass

	def _build_audio_download_plan(self, ydl_opts: dict) -> List[Tuple[str, Callable[[str, str], None]]]:
	return [
	("yt-dlp", lambda url, _safe_stem: self._download_audio_via_ytdlp(url, ydl_opts)),
	("Invidious proxy", self._download_audio_via_invidious),
	("pytubefix", self._download_audio_via_pytubefix),
	]

	def _download_audio_via_ytdlp(self, url: str, ydl_opts: dict) -> None:
	    """Run yt-dlp with the given options and download the media at ``url``."""
	    from yt_dlp import YoutubeDL

	    with YoutubeDL(ydl_opts) as downloader:
	        downloader.extract_info(url, download=True)

	def _download_audio_via_pytubefix(self, url: str, safe_stem: str) -> None:
	    """Download the highest-bitrate audio-only stream through pytubefix.

	    The file is written to ``settings.temp_dir`` as
	    ``<safe_stem>.<stream subtype>`` ("mp4" when the subtype is empty).

	    Raises:
	        RuntimeError: When no audio stream exists or pytubefix raises.
	    """
	    from pytubefix import YouTube

	    try:
	        video = YouTube(url, **self._pytubefix_init_kwargs(YouTube))
	        audio_only = video.streams.filter(only_audio=True)
	        best = audio_only.order_by("abr").desc().first()
	        if best is None:
	            raise RuntimeError("No audio stream returned by pytubefix.")

	        target_dir = str(settings.temp_dir)
	        target_name = f"{safe_stem}.{best.subtype or 'mp4'}"
	        best.download(output_path=target_dir, filename=target_name)
	    except Exception as exc:
	        raise RuntimeError(f"pytubefix failed: {exc}") from exc

	def _download_audio_via_invidious(self, url: str, safe_stem: str) -> None:
	video_id = self._extract_video_id(url)
	if not video_id or video_id == "unknown":
	raise RuntimeError("Could not extract video ID for Invidious fallback.")

	failures: List[str] = []
	for instance in self._invidious_instances:
	instance = instance.rstrip("/")
	try:
	api_url = f"{instance}/api/v1/videos/{video_id}"
	resp = requests.get(
	api_url,
	headers=self._browser_headers(),
	proxies=self._requests_proxies() or None,
	timeout=20,
	)
	resp.raise_for_status()
	data = resp.json()
	audio_formats = self._extract_invidious_audio_formats(data)
	if not audio_formats:
	raise RuntimeError("No audio formats in Invidious response.")

	selected = audio_formats[0]
	itag = selected.get("itag")
	download_url = (
	f"{instance}/latest_version?id={video_id}&itag={itag}&local=true"
	if itag
	else selected.get("url", "")
	)
	if not download_url:
	raise RuntimeError("No downloadable audio URL in Invidious response.")

	extension = self._extension_from_mime(selected.get("type", "audio/webm"))
	output_path = settings.temp_dir / f"{safe_stem}.{extension}"
	self._stream_download(download_url, output_path)
	logger.info("Invidious audio extracted via %s: %s", instance, output_path)
	return
	except Exception as exc:
	failures.append(f"{instance}: {exc}")
	logger.warning("Invidious instance failed for audio extraction: %s", exc)

	raise RuntimeError("All Invidious instances failed. " + " \| ".join(failures))

	def _extract_invidious_audio_formats(self, data: dict) -> List[dict]:
	formats = data.get("adaptiveFormats") or data.get("formatStreams") or []
	audio_formats = [
	item
	for item in formats
	if isinstance(item, dict)
	and str(item.get("type", "")).startswith("audio/")
	and (item.get("itag") or item.get("url"))
	]
	return sorted(
	audio_formats,
	key=lambda item: int(item.get("bitrate") or item.get("bitrateBps") or 0),
	reverse=True,
	)

	def _stream_download(self, url: str, output_path: Path) -> None:
	    """Stream the response body of ``url`` into ``output_path``.

	    Args:
	        url: The download URL.
	        output_path: Destination file; it is overwritten.

	    Raises:
	        RuntimeError: When the response looks like HTML or JSON rather than
	            audio, or when the written file is empty.
	        Exception: Any request error is propagated.

	    A file that was started but not completed (or ended up empty) is
	    removed, so that a later search for ``<stem>.*`` files cannot mistake
	    a truncated download for valid audio.
	    """
	    started_writing = False
	    try:
	        with requests.get(
	            url,
	            headers=self._browser_headers(),
	            proxies=self._requests_proxies() or None,
	            stream=True,
	            timeout=60,
	        ) as resp:
	            resp.raise_for_status()
	            content_type = resp.headers.get("Content-Type", "").lower()
	            # An HTML or JSON body is an error page, not audio.
	            if "text/html" in content_type or "application/json" in content_type:
	                raise RuntimeError(f"Unexpected audio response type: {content_type}")

	            started_writing = True
	            with output_path.open("wb") as audio_file:
	                for chunk in resp.iter_content(chunk_size=1024 * 1024):
	                    if chunk:
	                        audio_file.write(chunk)

	        if not output_path.exists() or output_path.stat().st_size == 0:
	            raise RuntimeError("Downloaded audio file is empty.")
	    except Exception:
	        # Only delete what this call created; leave a pre-existing file
	        # alone when the failure happened before writing began.
	        if started_writing:
	            try:
	                output_path.unlink()
	            except OSError:
	                pass
	        raise

	def _extension_from_mime(self, mime_type: str) -> str:
	if "mp4" in mime_type or "m4a" in mime_type:
	return "m4a"
	if "mpeg" in mime_type or "mp3" in mime_type:
	return "mp3"
	if "ogg" in mime_type:
	return "ogg"
	return "webm"

	def _join_transcript_entries(self, entries) -> str:
	texts = []
	for entry in entries:
	if isinstance(entry, dict):
	text = entry.get("text", "")
	else:
	text = getattr(entry, "text", "")
	if text:
	texts.append(str(text))
	return " ".join(texts).strip()

	def _build_requests_session(self, cookie_file: Path | None = None) -> requests.Session:
	    """Create a requests session with browser headers, proxies and cookies.

	    Args:
	        cookie_file: Optional cookie file loaded with ``MozillaCookieJar``.
	            A file that cannot be loaded is logged and ignored.

	    Returns:
	        The configured session.
	    """
	    session = requests.Session()
	    session.headers.update(self._browser_headers())
	    proxies = self._requests_proxies()
	    if proxies:
	        session.proxies.update(proxies)

	    if cookie_file:
	        try:
	            cookie_jar = http.cookiejar.MozillaCookieJar(str(cookie_file))
	            # Keep session cookies and expired entries as well.
	            cookie_jar.load(ignore_discard=True, ignore_expires=True)
	            session.cookies.update(cookie_jar)
	        except Exception as exc:
	            logger.warning("Could not load YouTube cookies from %s: %s", cookie_file, exc)

	    return session

	def _pytubefix_init_kwargs(self, youtube_cls) -> dict:
	kwargs = {}
	try:
	params = inspect.signature(youtube_cls).parameters
	except (TypeError, ValueError):
	params = {}

	if "use_oauth" in params:
	kwargs["use_oauth"] = False
	if "allow_oauth_cache" in params:
	kwargs["allow_oauth_cache"] = True
	if self._proxy_url:
	if "proxies" in params:
	kwargs["proxies"] = self._requests_proxies()
	elif "proxy" in params:
	kwargs["proxy"] = self._proxy_url

	return kwargs

	def _select_pytubefix_caption(self, captions):
	preferred_codes = ["en", "a.en", "en-US", "a.en-US", "ar", "a.ar"]

	for code in preferred_codes:
	try:
	return captions[code]
	except Exception:
	pass

	getter = getattr(captions, "get_by_language_code", None)
	if callable(getter):
	try:
	caption = getter(code)
	if caption is not None:
	return caption
	except Exception:
	pass

	try:
	for caption in captions:
	if not isinstance(caption, str):
	return caption
	try:
	return captions[caption]
	except Exception:
	pass
	except Exception:
	return None

	return None

	def _caption_to_text(self, caption) -> str:
	srt_method = getattr(caption, "generate_srt_captions", None)
	if callable(srt_method):
	return self._strip_srt(srt_method())

	for attr_name in ("xml_captions", "xml_caption", "caption_xml"):
	value = getattr(caption, attr_name, None)
	if value:
	return self._strip_markup(str(value))

	json_value = getattr(caption, "json_captions", None)
	if json_value:
	try:
	data = json.loads(json_value) if isinstance(json_value, str) else json_value
	return self._join_caption_json(data)
	except Exception:
	pass

	return self._strip_markup(str(caption))

	def _strip_srt(self, srt_text: str) -> str:
	lines = []
	for line in srt_text.splitlines():
	stripped = line.strip()
	if not stripped or stripped.isdigit() or "-->" in stripped:
	continue
	lines.append(stripped)
	return " ".join(lines).strip()

	def _strip_markup(self, value: str) -> str:
	no_tags = re.sub(r"<[^>]+>", " ", value)
	return re.sub(r"\s+", " ", html.unescape(no_tags)).strip()

	def _join_caption_json(self, data) -> str:
	texts = []
	for event in data.get("events", []) if isinstance(data, dict) else []:
	for segment in event.get("segs", []) or []:
	text = segment.get("utf8", "")
	if text:
	texts.append(text)
	return " ".join(texts).strip()

	def _apply_cookie_options(self, ydl_opts: dict) -> None:
	cookie_b64 = os.getenv("YOUTUBE_COOKIES_B64")
	if cookie_b64:
	import tempfile, base64
	try:
	cookie_bytes = base64.b64decode(cookie_b64)
	with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
	f.write(cookie_bytes)
	cookie_path = f.name
	ydl_opts["cookiefile"] = cookie_path
	except Exception as exc:
	logger.warning("Failed to decode YOUTUBE_COOKIES_B64: %s", exc)
	else:
	cookie_file = self._resolve_cookie_file()
	if cookie_file:
	ydl_opts["cookiefile"] = str(cookie_file)

	def _apply_youtube_network_options(self, ydl_opts: dict) -> None:
	youtube_args = {
	"player_client": ["android", "web_safari", "tv"],
	}
	po_tokens = self._build_po_token_args()
	if po_tokens:
	youtube_args["po_token"] = po_tokens

	ydl_opts.update(
	{
	"source_address": "0.0.0.0",
	"socket_timeout": 30,
	"retries": 5,
	"fragment_retries": 5,
	"geo_bypass": True,
	"http_headers": self._browser_headers(),
	"extractor_args": {
	"youtube": youtube_args,
	},
	}
	)
	if self._proxy_url:
	ydl_opts["proxy"] = self._proxy_url

	def _browser_headers(self) -> dict:
	return {
	"User-Agent": (
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	"AppleWebKit/537.36 (KHTML, like Gecko) "
	"Chrome/124.0.0.0 Safari/537.36"
	),
	"Accept-Language": "en-US,en;q=0.9",
	}

	def _requests_proxies(self) -> dict:
	if not self._proxy_url:
	return {}
	return {
	"http": self._proxy_url,
	"https": self._proxy_url,
	}

	def _configure_proxy_environment(self) -> None:
	if not self._proxy_url:
	return

	for key in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy"):
	os.environ.setdefault(key, self._proxy_url)

	def _load_invidious_instances(self) -> List[str]:
	configured = os.environ.get("INVIDIOUS_INSTANCES", "").strip()
	if configured:
	return [
	item.strip().rstrip("/")
	for item in configured.split(",")
	if item.strip()
	]

	return [
	"https://yewtu.be",
	"https://inv.nadeko.net",
	"https://invidious.privacyredirect.com",
	"https://vid.puffyan.us",
	]

	def _build_po_token_args(self) -> List[str]:
	if not self._youtube_po_token:
	return []

	raw_tokens = [
	token.strip()
	for token in re.split(r"[\n,]+", self._youtube_po_token)
	if token.strip()
	]
	if not raw_tokens:
	return []

	po_tokens = []
	for token in raw_tokens:
	if "+" in token:
	po_tokens.append(token)
	else:
	client = self._youtube_po_token_client or "web"
	context = self._youtube_po_token_context or "gvs"
	po_tokens.append(f"{client}.{context}+{token}")
	return po_tokens

	def _has_youtube_auth(self) -> bool:
	return bool(
	self._youtube_cookies
	or self._youtube_cookies_b64
	or self._youtube_cookies_path
	or self._youtube_cookies_file
	or self._youtube_po_token
	or self._proxy_url
	)

	def _looks_like_youtube_auth_block(self, failures: List[str]) -> bool:
	combined = " ".join(failures).lower()
	return any(
	marker in combined
	for marker in (
	"sign in to confirm",
	"detected as a bot",
	"po_token",
	"bot",
	"forbidden",
	"403",
	)
	)

	def _resolve_cookie_file(self) -> Path \| None:
	if self._youtube_cookies_path:
	cookie_path = Path(self._youtube_cookies_path)
	if cookie_path.exists():
	return cookie_path
	logger.warning("YOUTUBE_COOKIES_PATH is set but does not exist: %s", cookie_path)

	if self._youtube_cookies_file:
	cookie_path = Path(self._youtube_cookies_file)
	if cookie_path.exists():
	return cookie_path
	logger.warning("YOUTUBE_COOKIES_FILE is set but does not exist: %s", cookie_path)

	cookie_text = self._youtube_cookies
	if not cookie_text and self._youtube_cookies_b64:
	try:
	cookie_text = base64.b64decode(self._youtube_cookies_b64).decode("utf-8")
	except (binascii.Error, UnicodeDecodeError) as exc:
	logger.warning("YOUTUBE_COOKIES_B64 could not be decoded: %s", exc)
	return None

	if not cookie_text:
	return None

	settings.temp_dir.mkdir(parents=True, exist_ok=True)
	cookie_path = settings.temp_dir / "youtube_cookies.txt"
	cookie_text = cookie_text.replace("\\n", "\n")
	if not cookie_text.endswith("\n"):
	cookie_text += "\n"
	cookie_path.write_text(cookie_text, encoding="utf-8")
	return cookie_path

	def cleanup(self, path=None):
	if path is None:
	return

	try:
	audio_path = Path(path)
	if audio_path.exists() and audio_path.is_file():
	audio_path.unlink()
	except Exception as exc:
	logger.warning("Failed to clean up temporary audio file %s: %s", path, exc)

	def _extract_video_id(self, url: str) -> str:
	match = re.search(r"(?:v=\|youtu\.be/\|shorts/\|embed/)([A-Za-z0-9_-]{11})", str(url))
	return match.group(1) if match else "unknown"