Spaces:

andytaylor-smg
/

cfb40

Sleeping

App Files Files Community

cfb40 / src /source_finding /downloader.py

andytaylor-smg

first bite of the app

bb3e8ea about 1 month ago

raw

history blame contribute delete

13.3 kB

	"""
	Video download utilities for college football games.

	This module provides multiple strategies for downloading videos:
	1. Generate yt-dlp commands for users to run locally (offline mode)
	2. Extract direct video URLs for browser-direct downloads (preferred for apps)
	3. Stream video through a proxy (fallback, uses server bandwidth)
	"""

	import logging
	import shlex
	import subprocess
	from pathlib import Path
	from typing import Generator, Optional

	import requests
	from bs4 import BeautifulSoup

	from .models import GameResult

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Browser-like User-Agent header sent with page requests.
	HEADERS = {
	    "User-Agent": (
	        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
	        "AppleWebKit/537.36 (KHTML, like Gecko) "
	        "Chrome/120.0.0.0 Safari/537.36"
	    ),
	}

	# How many fragments yt-dlp downloads concurrently (passed to its -N flag).
	# Kept at 4 for compatibility with the free Hugging Face Spaces tier (2 vCPU).
	CONCURRENT_FRAGMENTS = 4


	def _extract_video_source_url(page_url: str) -> Optional[str]:
	    """
	    Find the URL of the actual video source referenced by a game page.

	    The page is fetched with the module's browser-like headers and searched
	    in three passes; the first hit wins:

	    1. ``<iframe>`` elements (``src``, ``data-src`` or ``data-lazy-src``)
	       pointing at a known embed host such as ok.ru or filemoon.
	    2. Links whose text contains "watch" (or is exactly "play"/"stream")
	       and whose target is a known external host.
	    3. Any link whose target is a known external host.

	    Host matching is a case-insensitive substring test, and
	    protocol-relative URLs ("//host/...") are returned with an "https:"
	    prefix. Both rules apply to iframes and links alike.

	    Args:
	        page_url: The nfl-video.com page URL

	    Returns:
	        The extracted video source URL, or None if the page could not be
	        fetched or no known host was found
	    """
	    # Known video hosting sites: targets of ordinary links vs. embedded players.
	    external_link_hosts = ("collegegamestoday.com", "sportsurge", "streamsport", "sportshd")
	    embed_hosts = ("ok.ru", "okru", "filemoon", "streamtape", "mixdrop", "doodstream", "dailymotion", "vimeo")

	    def _normalize(raw_url: str) -> str:
	        """Trim surrounding whitespace and give "//host/..." URLs an https scheme."""
	        cleaned = raw_url.strip()
	        return "https:" + cleaned if cleaned.startswith("//") else cleaned

	    def _mentions_host(candidate: str, hosts) -> bool:
	        """Return True if any host marker occurs in the URL, ignoring case."""
	        lowered = candidate.lower()
	        return any(host in lowered for host in hosts)

	    # Only the network round trip can raise RequestException, so only it is guarded.
	    try:
	        logger.debug("Fetching page to extract video source: %s", page_url)
	        response = requests.get(page_url, headers=HEADERS, timeout=30)
	        response.raise_for_status()
	    except requests.RequestException as e:
	        logger.error("Failed to fetch page %s: %s", page_url, e)
	        return None

	    soup = BeautifulSoup(response.text, "html.parser")

	    # Strategy 1: an embedded player iframe is checked first.
	    for iframe in soup.find_all("iframe"):
	        raw_src = iframe.get("src") or iframe.get("data-src") or iframe.get("data-lazy-src")
	        if not raw_src:
	            continue
	        src = _normalize(raw_src)
	        if _mentions_host(src, embed_hosts):
	            logger.info("Found embedded video iframe: %s", src)
	            return src

	    # Collect, in document order, every link that targets a known external host.
	    # Both remaining strategies choose from this one list.
	    host_links = []
	    for anchor in soup.find_all("a", href=True):
	        href = _normalize(anchor["href"])
	        if _mentions_host(href, external_link_hosts):
	            host_links.append((href, anchor.get_text(strip=True).lower()))

	    # Strategy 2: prefer a link whose visible text marks it as a watch/play button.
	    for href, label in host_links:
	        if "watch" in label or label in ("play", "stream"):
	            logger.info("Found video source URL via watch button: %s", href)
	            return href

	    # Strategy 3: otherwise fall back to the first link to a known host.
	    if host_links:
	        href = host_links[0][0]
	        logger.info("Found video source URL (fallback link): %s", href)
	        return href

	    logger.warning("No video source URL found on page: %s", page_url)
	    return None


	def get_suggested_filename(game: GameResult, extension: str = "mp4") -> str:
	    """
	    Build the default file name for a game's video.

	    The name is the value of ``game.get_filename_base()`` followed by a dot
	    and the extension.

	    Args:
	        game: GameResult object supplying the base name
	        extension: File extension without the leading dot (default "mp4")

	    Returns:
	        File name string such as "Ohio_State_vs_Oregon_2024.mp4"
	        (presumably filesystem-safe; that depends on get_filename_base())
	    """
	    stem = game.get_filename_base()
	    return "{}.{}".format(stem, extension)


	def get_download_command(game: GameResult, output_dir: str = ".", output_filename: Optional[str] = None) -> str:
	    """
	    Generate a yt-dlp command string for downloading a game video.

	    This is the "offline mode" - returns a command users can copy and run locally.

	    Since nfl-video.com is an aggregator that links to external video hosts,
	    this function first fetches the game page to extract the external host
	    URL; if that fails, the original game URL is used instead.

	    The URL and output path are shell-quoted with ``shlex.quote``, and the URL
	    is placed after a ``--`` terminator so that a scraped URL beginning with
	    "-" cannot be interpreted by yt-dlp as a command-line option.

	    Args:
	        game: GameResult object containing the game URL
	        output_dir: Directory to save the video (default current directory)
	        output_filename: Optional custom filename; if None or empty, generates
	            one from game metadata

	    Returns:
	        A complete yt-dlp command string ready to run in a terminal

	    Example:
	        >>> cmd = get_download_command(game, output_dir="~/Downloads")
	        >>> print(cmd)
	        yt-dlp -N 4 -o '~/Downloads/Ohio_State_vs_Oregon_2024.mp4' -- https://collegegamestoday.com/...

	    NOTE(review): a quoted leading "~" is not expanded by the shell; confirm
	    yt-dlp's own handling of it or pass an absolute output_dir.
	    """
	    # An empty name would make the output path point at the directory itself.
	    if not output_filename:
	        output_filename = get_suggested_filename(game)

	    output_path = Path(output_dir) / output_filename

	    # Extract the actual video source URL since nfl-video.com is just an aggregator
	    video_url = _extract_video_source_url(game.url)
	    if video_url is None:
	        # Fall back to the game URL if extraction fails
	        logger.warning("Could not extract video source URL, using original game URL")
	        video_url = game.url

	    # Quote the URL and path for shell safety
	    quoted_url = shlex.quote(video_url)
	    quoted_output = shlex.quote(str(output_path))

	    # -N enables concurrent fragment downloads for fragmented streams (HLS/DASH).
	    # Options come first; "--" ends option parsing before the untrusted URL.
	    return f"yt-dlp -N {CONCURRENT_FRAGMENTS} -o {quoted_output} -- {quoted_url}"


	def _run_ytdlp_get_url(url: str, timeout_seconds: int = 60) -> Optional[str]:
	    """
	    Run ``yt-dlp --get-url`` on a URL and return the direct video URL.

	    Failures are deliberately silent: a missing yt-dlp executable, a timeout,
	    a non-zero exit status and empty output all yield None, so callers can
	    simply try the next strategy.

	    Args:
	        url: URL to extract video from
	        timeout_seconds: Maximum time to wait for yt-dlp to finish

	    Returns:
	        Direct video URL string, or None if extraction failed
	    """
	    # Nothing to extract from; avoid spawning a process that can only fail.
	    if not url:
	        return None

	    # The URL may come from a scraped page. "--" ends option parsing so a value
	    # starting with "-" is treated as a URL and never as a yt-dlp flag.
	    command = ["yt-dlp", "--get-url", "--no-warnings", "-f", "best", "--", url]

	    try:
	        completed = subprocess.run(
	            command,
	            capture_output=True,
	            text=True,
	            timeout=timeout_seconds,
	            check=False,  # the exit status is inspected below
	        )
	    except (subprocess.TimeoutExpired, FileNotFoundError):
	        return None

	    if completed.returncode != 0:
	        return None

	    # yt-dlp can print several URLs (e.g. separate video/audio streams);
	    # the first non-empty line is taken as the video URL.
	    for line in completed.stdout.splitlines():
	        candidate = line.strip()
	        if candidate:
	            return candidate

	    return None


	def extract_direct_video_url(game: GameResult, timeout_seconds: int = 60) -> Optional[str]:
	    """
	    Extract the direct video URL from a game page using yt-dlp.

	    This is the preferred method for in-app downloads - extracts the actual video
	    URL from the hosting service (ok.ru, mixdrop, etc.) so the browser can
	    download directly without going through your server.

	    Steps, in order:

	    1. Run yt-dlp directly on the game URL (in case the site structure changes).
	    2. If that yields nothing and yt-dlp is not on PATH, log an install hint
	       and stop - no further step could succeed.
	    3. Otherwise fetch the page, locate the external video source URL and
	       run yt-dlp on that.

	    IMPORTANT: The extracted URL expires after some time (varies by host). Call this
	    function on-demand when the user clicks "Download", not when displaying results.

	    Args:
	        game: GameResult object containing the game page URL
	        timeout_seconds: Maximum time to wait for each yt-dlp invocation

	    Returns:
	        Direct video URL string, or None if extraction failed

	    Example:
	        >>> url = extract_direct_video_url(game)
	        >>> print(url)
	        'https://vd608.okcdn.ru/expires/1769790485013/...'
	    """
	    import shutil  # pylint: disable=import-outside-toplevel

	    try:
	        logger.info("Extracting direct video URL for: %s", game.url)

	        # First, try yt-dlp directly on the game URL
	        direct_url = _run_ytdlp_get_url(game.url, timeout_seconds)
	        if direct_url:
	            # Log only a prefix; slicing is a no-op for URLs under 100 characters.
	            logger.info("Extracted direct URL: %s...", direct_url[:100])
	            return direct_url

	        # _run_ytdlp_get_url reports a missing executable as a plain None, so the
	        # "not installed" case has to be detected here. It is checked only after
	        # a failed attempt so the success path is unaffected.
	        if shutil.which("yt-dlp") is None:
	            logger.error("yt-dlp not found. Please install it: pip install yt-dlp")
	            return None

	        # If direct extraction failed, nfl-video.com likely links to an external host
	        logger.info("Direct extraction failed, looking for external video source...")
	        video_source_url = _extract_video_source_url(game.url)

	        if not video_source_url:
	            logger.error("Could not find video source URL on page")
	            return None

	        logger.info("Found external video source: %s", video_source_url)

	        # Now try yt-dlp on the external video source
	        direct_url = _run_ytdlp_get_url(video_source_url, timeout_seconds)
	        if direct_url:
	            logger.info("Extracted direct URL: %s...", direct_url[:100])
	            return direct_url

	        logger.error("yt-dlp failed to extract URL from video source")
	        return None

	    except Exception as e:  # pylint: disable=broad-exception-caught
	        # Boundary handler: callers get None instead of an exception.
	        logger.error("Failed to extract video URL: %s", e)
	        return None


	def is_ytdlp_available() -> bool:
	    """
	    Report whether the yt-dlp executable can be run.

	    Runs ``yt-dlp --version`` with a 5 second timeout.

	    Returns:
	        True if the command exits with status 0; False if the executable is
	        missing, the call times out, or the exit status is non-zero
	    """
	    version_cmd = ["yt-dlp", "--version"]
	    try:
	        probe = subprocess.run(version_cmd, capture_output=True, text=True, timeout=5, check=False)
	    except (FileNotFoundError, subprocess.TimeoutExpired):
	        return False
	    return probe.returncode == 0


	def stream_video_proxy(game: GameResult, chunk_size: int = 8192) -> Generator[bytes, None, None]:
	    """
	    Relay a game's video through this process, one chunk at a time (fallback method).

	    The direct video URL is resolved with extract_direct_video_url() and then
	    requested with ``stream=True``; the body is yielded in pieces of up to
	    ``chunk_size`` bytes, so the whole video is not requested as one blob.
	    Use this only if handing the direct URL to the browser doesn't work
	    reliably (e.g., URLs expire too fast), since it uses server bandwidth.

	    Errors are logged, not raised: if the URL cannot be resolved the generator
	    yields nothing, and if the transfer fails part-way it simply stops, so a
	    consumer cannot tell a truncated stream from a complete one.

	    Args:
	        game: GameResult object
	        chunk_size: Size of chunks to yield (default 8KB)

	    Yields:
	        Bytes chunks of the video file

	    Example:
	        >>> for chunk in stream_video_proxy(game):
	        ...     response.write(chunk)  # In a web framework
	    """
	    source_url = extract_direct_video_url(game)

	    # Guard clause: without a direct URL there is nothing to stream.
	    if source_url is None:
	        logger.error("Cannot stream - failed to extract direct URL")
	        return

	    try:
	        logger.info("Starting streaming proxy for: %s", game.title)

	        with requests.get(source_url, stream=True, timeout=30) as upstream:
	            upstream.raise_for_status()

	            # filter(None, ...) drops empty keep-alive chunks.
	            for piece in filter(None, upstream.iter_content(chunk_size=chunk_size)):
	                yield piece

	        logger.info("Streaming complete for: %s", game.title)

	    except Exception as e:  # pylint: disable=broad-exception-caught
	        logger.error("Streaming failed: %s", e)


	def get_video_info(game: GameResult, timeout_seconds: int = 30) -> Optional[dict]:
	    """
	    Get video metadata without downloading.

	    Useful for showing file size, duration, quality options to users before download.

	    Runs ``yt-dlp --dump-json --no-download`` on ``game.url`` itself (the page
	    is not first resolved to an external video source) and parses the output.
	    yt-dlp prints one JSON document per video, so when the output holds several
	    documents the first one is returned.

	    Args:
	        game: GameResult object
	        timeout_seconds: Maximum time to wait for yt-dlp to finish

	    Returns:
	        Dictionary with video info (title, duration, filesize, etc.) or None if
	        yt-dlp is missing, times out, exits with an error, or prints nothing usable
	    """
	    try:
	        import json  # pylint: disable=import-outside-toplevel

	        logger.info("Getting video info for: %s", game.url)

	        result = subprocess.run(
	            # "--" ends option parsing so the URL can never be read as a flag.
	            ["yt-dlp", "--dump-json", "--no-download", "--no-warnings", "--", game.url],
	            capture_output=True,
	            text=True,
	            timeout=timeout_seconds,
	            check=False,  # We handle return codes manually
	        )

	        if result.returncode != 0:
	            logger.error("yt-dlp info extraction failed: %s", result.stderr)
	            return None

	        json_lines = [line for line in result.stdout.splitlines() if line.strip()]
	        if not json_lines:
	            logger.error("yt-dlp returned no video info for: %s", game.url)
	            return None

	        try:
	            # Common case: the whole output is a single JSON document.
	            return json.loads(result.stdout)
	        except json.JSONDecodeError:
	            # Several documents (one per line): use the first entry. A failure
	            # here is reported by the generic handler below.
	            return json.loads(json_lines[0])

	    except subprocess.TimeoutExpired:
	        logger.error("yt-dlp info timed out after %d seconds", timeout_seconds)
	        return None
	    except FileNotFoundError:
	        logger.error("yt-dlp not found. Please install it: pip install yt-dlp")
	        return None
	    except Exception as e:  # pylint: disable=broad-exception-caught
	        logger.error("Failed to get video info: %s", e)
	        return None