""" Video download utilities for college football games. This module provides multiple strategies for downloading videos: 1. Generate yt-dlp commands for users to run locally (offline mode) 2. Extract direct video URLs for browser-direct downloads (preferred for apps) 3. Stream video through a proxy (fallback, uses server bandwidth) """ import logging import shlex import subprocess from pathlib import Path from typing import Generator, Optional import requests from bs4 import BeautifulSoup from .models import GameResult logger = logging.getLogger(__name__) # User agent for web requests HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"} # Number of concurrent fragment downloads for yt-dlp # Set to 4 for compatibility with free Hugging Face Spaces tier (2 vCPU) CONCURRENT_FRAGMENTS = 4 def _extract_video_source_url(page_url: str) -> Optional[str]: """ Extract the actual video source URL from an nfl-video.com page. nfl-video.com has two types of pages: 1. Aggregator pages that link to external hosts like collegegamestoday.com 2. Pages with embedded video players (iframes to ok.ru, filemoon, etc.) This function handles both cases. Args: page_url: The nfl-video.com page URL Returns: The extracted video source URL, or None if not found """ try: logger.debug("Fetching page to extract video source: %s", page_url) response = requests.get(page_url, headers=HEADERS, timeout=30) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") # Known video hosting sites (both external links and embed sources) external_link_hosts = ["collegegamestoday.com", "sportsurge", "streamsport", "sportshd"] embed_hosts = ["ok.ru", "okru", "filemoon", "streamtape", "mixdrop", "doodstream", "dailymotion", "vimeo"] # Strategy 1: Look for embedded video iframes (direct video player on page) # These are often the most reliable source as they're the actual player for iframe in soup.find_all("iframe"): src = iframe.get("src", "") or iframe.get("data-src", "") or iframe.get("data-lazy-src", "") if not src: continue # Normalize URL (some start with //) if src.startswith("//"): src = "https:" + src # Check if iframe points to a known video embed host for host in embed_hosts: if host in src.lower(): logger.info("Found embedded video iframe: %s", src) return src # Strategy 2: Look for "Watch" button links that point to external video hosts # These are typically styled buttons with class "su-button" for link in soup.find_all("a", href=True): href = link["href"] text = link.get_text(strip=True).lower() # Look for watch/play buttons linking to known video hosts if text in ["watch", "play", "stream"] or "watch" in text: for host in external_link_hosts: if host in href: logger.info("Found video source URL via watch button: %s", href) return href # Strategy 3: Fallback - look for any link to known video hosting sites for link in soup.find_all("a", href=True): href = link["href"] for host in external_link_hosts: if host in href: logger.info("Found video source URL (fallback link): %s", href) return href logger.warning("No video source URL found on page: %s", page_url) return None except requests.RequestException as e: logger.error("Failed to fetch page %s: %s", page_url, e) return None def get_suggested_filename(game: GameResult, extension: str = "mp4") -> str: """ Generate a suggested filename for a game video. Args: game: GameResult object extension: File extension (default "mp4") Returns: Safe filename string like "Ohio_State_vs_Oregon_2024.mp4" """ return f"{game.get_filename_base()}.{extension}" def get_download_command(game: GameResult, output_dir: str = ".", output_filename: Optional[str] = None) -> str: """ Generate a yt-dlp command string for downloading a game video. This is the "offline mode" - returns a command users can copy and run locally. Since nfl-video.com is an aggregator that links to external video hosts, this function first extracts the external host URL to generate a working command. Args: game: GameResult object containing the game URL output_dir: Directory to save the video (default current directory) output_filename: Optional custom filename; if None, generates from game metadata Returns: A complete yt-dlp command string ready to run in a terminal Example: >>> cmd = get_download_command(game, output_dir="~/Downloads") >>> print(cmd) yt-dlp "https://collegegamestoday.com/..." -o "~/Downloads/Ohio_State_vs_Oregon_2024.mp4" """ if output_filename is None: output_filename = get_suggested_filename(game) # Build the output path output_path = Path(output_dir) / output_filename # Extract the actual video source URL since nfl-video.com is just an aggregator video_url = _extract_video_source_url(game.url) if video_url is None: # Fall back to the game URL if extraction fails logger.warning("Could not extract video source URL, using original game URL") video_url = game.url # Quote the URL and path for shell safety quoted_url = shlex.quote(video_url) quoted_output = shlex.quote(str(output_path)) # Build the command # Using yt-dlp with common options for best compatibility # -N flag enables concurrent fragment downloads for faster speeds on fragmented streams (HLS/DASH) command = f"yt-dlp -N {CONCURRENT_FRAGMENTS} {quoted_url} -o {quoted_output}" return command def _run_ytdlp_get_url(url: str, timeout_seconds: int = 60) -> Optional[str]: """ Run yt-dlp --get-url on a URL and return the direct video URL. Args: url: URL to extract video from timeout_seconds: Maximum time to wait Returns: Direct video URL string, or None if extraction failed """ try: result = subprocess.run( ["yt-dlp", "--get-url", "--no-warnings", "-f", "best", url], capture_output=True, text=True, timeout=timeout_seconds, check=False, ) if result.returncode != 0: return None video_url = result.stdout.strip() if not video_url: return None # Sometimes yt-dlp returns multiple URLs (for video/audio streams) # Take the first one which is typically the video if "\n" in video_url: video_url = video_url.split("\n")[0] return video_url except (subprocess.TimeoutExpired, FileNotFoundError): return None def extract_direct_video_url(game: GameResult, timeout_seconds: int = 60) -> Optional[str]: """ Extract the direct video URL from a game page using yt-dlp. This is the preferred method for in-app downloads - extracts the actual video URL from the hosting service (ok.ru, mixdrop, etc.) so the browser can download directly without going through your server. Since nfl-video.com is an aggregator that links to external video hosts, this function first extracts the external host URL, then uses yt-dlp on that. IMPORTANT: The extracted URL expires after some time (varies by host). Call this function on-demand when the user clicks "Download", not when displaying results. Args: game: GameResult object containing the game page URL timeout_seconds: Maximum time to wait for URL extraction Returns: Direct video URL string, or None if extraction failed Example: >>> url = extract_direct_video_url(game) >>> print(url) 'https://vd608.okcdn.ru/expires/1769790485013/...' """ try: logger.info("Extracting direct video URL for: %s", game.url) # First, try yt-dlp directly on the game URL (in case the site structure changes) direct_url = _run_ytdlp_get_url(game.url, timeout_seconds) if direct_url: logger.info("Extracted direct URL: %s...", direct_url[:100] if len(direct_url) > 100 else direct_url) return direct_url # If direct extraction failed, nfl-video.com likely links to an external host # Extract the video source URL from the page logger.info("Direct extraction failed, looking for external video source...") video_source_url = _extract_video_source_url(game.url) if not video_source_url: logger.error("Could not find video source URL on page") return None logger.info("Found external video source: %s", video_source_url) # Now try yt-dlp on the external video source direct_url = _run_ytdlp_get_url(video_source_url, timeout_seconds) if direct_url: logger.info("Extracted direct URL: %s...", direct_url[:100] if len(direct_url) > 100 else direct_url) return direct_url logger.error("yt-dlp failed to extract URL from video source") return None except FileNotFoundError: logger.error("yt-dlp not found. Please install it: pip install yt-dlp") return None except Exception as e: # pylint: disable=broad-exception-caught logger.error("Failed to extract video URL: %s", e) return None def is_ytdlp_available() -> bool: """ Check if yt-dlp is available on the system. Returns: True if yt-dlp is installed and accessible """ try: result = subprocess.run(["yt-dlp", "--version"], capture_output=True, text=True, timeout=5, check=False) return result.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired): return False def stream_video_proxy(game: GameResult, chunk_size: int = 8192) -> Generator[bytes, None, None]: """ Stream video content in chunks (fallback method). This is Option B from the architecture - streams video through your server. Use this only if extract_direct_video_url() doesn't work reliably (e.g., URLs expire too fast). Memory usage is constant regardless of video size (~chunk_size bytes). NOTE: This is a stub implementation. For production use, you would: 1. First extract the direct URL using yt-dlp 2. Stream from that URL using requests with stream=True Args: game: GameResult object chunk_size: Size of chunks to yield (default 8KB) Yields: Bytes chunks of the video file Example: >>> for chunk in stream_video_proxy(game): ... response.write(chunk) # In a web framework """ # First, get the direct video URL direct_url = extract_direct_video_url(game) if direct_url is None: logger.error("Cannot stream - failed to extract direct URL") return try: logger.info("Starting streaming proxy for: %s", game.title) # Stream the video content with requests.get(direct_url, stream=True, timeout=30) as response: response.raise_for_status() # Yield chunks as they arrive for chunk in response.iter_content(chunk_size=chunk_size): if chunk: # Filter out keep-alive chunks yield chunk logger.info("Streaming complete for: %s", game.title) except Exception as e: # pylint: disable=broad-exception-caught logger.error("Streaming failed: %s", e) return def get_video_info(game: GameResult, timeout_seconds: int = 30) -> Optional[dict]: """ Get video metadata without downloading. Useful for showing file size, duration, quality options to users before download. Args: game: GameResult object timeout_seconds: Maximum time to wait Returns: Dictionary with video info (title, duration, filesize, etc.) or None if failed """ try: import json # pylint: disable=import-outside-toplevel logger.info("Getting video info for: %s", game.url) result = subprocess.run( ["yt-dlp", "--dump-json", "--no-download", "--no-warnings", game.url], capture_output=True, text=True, timeout=timeout_seconds, check=False, # We handle return codes manually ) if result.returncode != 0: logger.error("yt-dlp info extraction failed: %s", result.stderr) return None info = json.loads(result.stdout) return info except subprocess.TimeoutExpired: logger.error("yt-dlp info timed out after %d seconds", timeout_seconds) return None except Exception as e: # pylint: disable=broad-exception-caught logger.error("Failed to get video info: %s", e) return None