Spaces:
Sleeping
Sleeping
| """ | |
| Video download utilities for college football games. | |
| This module provides multiple strategies for downloading videos: | |
| 1. Generate yt-dlp commands for users to run locally (offline mode) | |
| 2. Extract direct video URLs for browser-direct downloads (preferred for apps) | |
| 3. Stream video through a proxy (fallback, uses server bandwidth) | |
| """ | |
| import logging | |
| import shlex | |
| import subprocess | |
| from pathlib import Path | |
| from typing import Generator, Optional | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from .models import GameResult | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# User agent for web requests.
# Sent as the request headers when fetching game pages
# (see _extract_video_source_url).
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

# Number of concurrent fragment downloads for yt-dlp; interpolated after the
# -N flag of the command built by get_download_command.
# Set to 4 for compatibility with free Hugging Face Spaces tier (2 vCPU)
CONCURRENT_FRAGMENTS = 4
def _extract_video_source_url(page_url: str) -> Optional[str]:
    """
    Extract the actual video source URL from an nfl-video.com page.

    nfl-video.com has two types of pages:
    1. Aggregator pages that link to external hosts like collegegamestoday.com
    2. Pages with embedded video players (iframes to ok.ru, filemoon, etc.)
    This function handles both cases. Candidates are tried in this order:
    embedded iframes, then "watch"/"play"/"stream" links to a known external
    host, then any link to a known external host.

    Host matching is a case-insensitive substring test against the whole
    candidate URL. Protocol-relative candidates ("//host/...") are returned
    with an "https:" scheme so they can be handed straight to yt-dlp.

    Args:
        page_url: The nfl-video.com page URL

    Returns:
        The extracted video source URL, or None if nothing matched or the
        page could not be fetched
    """
    # Known video hosting sites (both external links and embed sources).
    # All entries are lowercase because candidates are lowercased before matching.
    external_link_hosts = ("collegegamestoday.com", "sportsurge", "streamsport", "sportshd")
    embed_hosts = ("ok.ru", "okru", "filemoon", "streamtape", "mixdrop", "doodstream", "dailymotion", "vimeo")

    def normalize(raw) -> str:
        """Trim whitespace and give protocol-relative URLs an https scheme."""
        candidate = (raw or "").strip()
        return "https:" + candidate if candidate.startswith("//") else candidate

    def mentions_host(candidate: str, hosts) -> bool:
        """Return True if any known host appears in the URL, ignoring case."""
        lowered = candidate.lower()
        return any(host in lowered for host in hosts)

    try:
        logger.debug("Fetching page to extract video source: %s", page_url)
        response = requests.get(page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Strategy 1: Look for embedded video iframes (direct video player on page).
        # Lazy-loading pages keep the real URL in data-src / data-lazy-src.
        for iframe in soup.find_all("iframe"):
            src = normalize(iframe.get("src") or iframe.get("data-src") or iframe.get("data-lazy-src"))
            if src and mentions_host(src, embed_hosts):
                logger.info("Found embedded video iframe: %s", src)
                return src

        # Read every anchor once; strategies 2 and 3 both scan this list.
        links = [
            (normalize(anchor["href"]), anchor.get_text(strip=True).lower())
            for anchor in soup.find_all("a", href=True)
        ]

        # Strategy 2: Look for "Watch" button links that point to external video hosts
        for href, text in links:
            is_watch_button = text in ("play", "stream") or "watch" in text
            if is_watch_button and mentions_host(href, external_link_hosts):
                logger.info("Found video source URL via watch button: %s", href)
                return href

        # Strategy 3: Fallback - look for any link to known video hosting sites
        for href, _text in links:
            if mentions_host(href, external_link_hosts):
                logger.info("Found video source URL (fallback link): %s", href)
                return href

        logger.warning("No video source URL found on page: %s", page_url)
        return None
    except requests.RequestException as e:
        logger.error("Failed to fetch page %s: %s", page_url, e)
        return None
def get_suggested_filename(game: GameResult, extension: str = "mp4") -> str:
    """
    Build a suggested filename for a game video.

    The name is whatever ``game.get_filename_base()`` returns, followed by a
    dot and ``extension``.

    Args:
        game: GameResult object
        extension: File extension without the leading dot (default "mp4")

    Returns:
        Filename string, e.g. "Ohio_State_vs_Oregon_2024.mp4"
    """
    stem = game.get_filename_base()
    return "{}.{}".format(stem, extension)
def get_download_command(game: GameResult, output_dir: str = ".", output_filename: Optional[str] = None) -> str:
    """
    Build a yt-dlp command line for downloading a game video.

    This is the "offline mode": the returned string is meant to be copied and
    run by the user in their own terminal. Because nfl-video.com only links to
    or embeds videos hosted elsewhere, the page is fetched first to find the
    real video source URL; if that lookup fails, the game page URL itself is
    used in the command.

    Args:
        game: GameResult object containing the game URL
        output_dir: Directory to save the video (default current directory)
        output_filename: Optional custom filename; if None, one is generated
            with get_suggested_filename()

    Returns:
        A command of the form
        ``yt-dlp -N <CONCURRENT_FRAGMENTS> <url> -o <output path>``
        where the URL and output path have been passed through shlex.quote.
    """
    filename = get_suggested_filename(game) if output_filename is None else output_filename
    destination = Path(output_dir) / filename

    # nfl-video.com is only an aggregator, so prefer the real host's URL.
    source_url = _extract_video_source_url(game.url)
    if source_url is None:
        logger.warning("Could not extract video source URL, using original game URL")
        source_url = game.url

    # -N enables concurrent fragment downloads, which speeds up fragmented
    # streams (HLS/DASH). URL and path are shell-quoted for safe copy/paste.
    pieces = [
        "yt-dlp",
        "-N",
        str(CONCURRENT_FRAGMENTS),
        shlex.quote(source_url),
        "-o",
        shlex.quote(str(destination)),
    ]
    return " ".join(pieces)
| def _run_ytdlp_get_url(url: str, timeout_seconds: int = 60) -> Optional[str]: | |
| """ | |
| Run yt-dlp --get-url on a URL and return the direct video URL. | |
| Args: | |
| url: URL to extract video from | |
| timeout_seconds: Maximum time to wait | |
| Returns: | |
| Direct video URL string, or None if extraction failed | |
| """ | |
| try: | |
| result = subprocess.run( | |
| ["yt-dlp", "--get-url", "--no-warnings", "-f", "best", url], | |
| capture_output=True, | |
| text=True, | |
| timeout=timeout_seconds, | |
| check=False, | |
| ) | |
| if result.returncode != 0: | |
| return None | |
| video_url = result.stdout.strip() | |
| if not video_url: | |
| return None | |
| # Sometimes yt-dlp returns multiple URLs (for video/audio streams) | |
| # Take the first one which is typically the video | |
| if "\n" in video_url: | |
| video_url = video_url.split("\n")[0] | |
| return video_url | |
| except (subprocess.TimeoutExpired, FileNotFoundError): | |
| return None | |
def extract_direct_video_url(game: GameResult, timeout_seconds: int = 60) -> Optional[str]:
    """
    Extract the direct video URL from a game page using yt-dlp.

    This is the preferred method for in-app downloads - extracts the actual video
    URL from the hosting service (ok.ru, mixdrop, etc.) so the browser can
    download directly without going through your server.

    yt-dlp is first tried on the game URL itself. If that yields nothing, the
    page is scraped for an external/embedded video source and yt-dlp is run
    on that instead.

    IMPORTANT: The extracted URL expires after some time (varies by host). Call this
    function on-demand when the user clicks "Download", not when displaying results.

    Args:
        game: GameResult object containing the game page URL
        timeout_seconds: Maximum time to wait for each yt-dlp invocation

    Returns:
        Direct video URL string, or None if extraction failed (including when
        yt-dlp is not installed)

    Example:
        >>> url = extract_direct_video_url(game)
        >>> print(url)
        'https://vd608.okcdn.ru/expires/1769790485013/...'
    """
    try:
        logger.info("Extracting direct video URL for: %s", game.url)

        # First, try yt-dlp directly on the game URL (in case the site structure changes)
        direct_url = _run_ytdlp_get_url(game.url, timeout_seconds)
        if direct_url:
            # Slicing already leaves strings shorter than 100 chars untouched.
            logger.info("Extracted direct URL: %s...", direct_url[:100])
            return direct_url

        # _run_ytdlp_get_url() reports a missing yt-dlp binary as a plain None
        # (it swallows FileNotFoundError), so tell that case apart here.
        # Otherwise the install hint is never logged and the page is scraped
        # for a source URL that yt-dlp could not process anyway.
        if not is_ytdlp_available():
            logger.error("yt-dlp not found. Please install it: pip install yt-dlp")
            return None

        # If direct extraction failed, nfl-video.com likely links to an external host
        # Extract the video source URL from the page
        logger.info("Direct extraction failed, looking for external video source...")
        video_source_url = _extract_video_source_url(game.url)
        if not video_source_url:
            logger.error("Could not find video source URL on page")
            return None
        logger.info("Found external video source: %s", video_source_url)

        # Now try yt-dlp on the external video source
        direct_url = _run_ytdlp_get_url(video_source_url, timeout_seconds)
        if direct_url:
            logger.info("Extracted direct URL: %s...", direct_url[:100])
            return direct_url

        logger.error("yt-dlp failed to extract URL from video source")
        return None
    except Exception as e:  # pylint: disable=broad-exception-caught
        # Boundary handler: callers rely on None rather than an exception.
        logger.error("Failed to extract video URL: %s", e)
        return None
def is_ytdlp_available() -> bool:
    """
    Report whether the yt-dlp executable can be run on this system.

    Runs ``yt-dlp --version`` with a 5 second timeout.

    Returns:
        True if the command exits with status 0; False if it exits non-zero,
        cannot be found, or times out
    """
    probe_command = ["yt-dlp", "--version"]
    try:
        probe = subprocess.run(
            probe_command,
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return False
    return probe.returncode == 0
def stream_video_proxy(game: GameResult, chunk_size: int = 8192) -> Generator[bytes, None, None]:
    """
    Stream video content in chunks through this server (fallback method).

    Use this only if extract_direct_video_url() doesn't work reliably
    (e.g., URLs expire too fast), since the video bytes pass through the
    server instead of going straight to the browser.

    The direct URL is resolved with extract_direct_video_url() and then
    fetched with ``requests`` in streaming mode, so chunks are yielded as
    they arrive rather than after the whole file has been read.

    If the direct URL cannot be resolved, or any error occurs while
    streaming, the error is logged and the generator simply stops; no
    exception is propagated to the consumer.

    Args:
        game: GameResult object
        chunk_size: Size of chunks to request from the upstream response
            (default 8KB)

    Yields:
        Non-empty byte chunks of the video file

    Example:
        >>> for chunk in stream_video_proxy(game):
        ...     response.write(chunk)  # In a web framework
    """
    upstream_url = extract_direct_video_url(game)
    if upstream_url is None:
        logger.error("Cannot stream - failed to extract direct URL")
        return

    try:
        logger.info("Starting streaming proxy for: %s", game.title)
        with requests.get(upstream_url, stream=True, timeout=30) as upstream:
            upstream.raise_for_status()
            for piece in upstream.iter_content(chunk_size=chunk_size):
                if not piece:
                    # Skip empty keep-alive chunks.
                    continue
                yield piece
        logger.info("Streaming complete for: %s", game.title)
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error("Streaming failed: %s", e)
def get_video_info(game: GameResult, timeout_seconds: int = 30) -> Optional[dict]:
    """
    Fetch video metadata for a game without downloading the video.

    Runs ``yt-dlp --dump-json --no-download`` on ``game.url`` and parses the
    JSON it prints. Useful for showing details to users before download.

    Args:
        game: GameResult object
        timeout_seconds: Maximum time to wait for yt-dlp

    Returns:
        The parsed JSON output of yt-dlp, or None if yt-dlp exits non-zero,
        times out, or any other error occurs (all failures are logged)
    """
    try:
        import json  # pylint: disable=import-outside-toplevel

        logger.info("Getting video info for: %s", game.url)
        argv = ["yt-dlp", "--dump-json", "--no-download", "--no-warnings", game.url]
        completed = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            check=False,  # The exit status is inspected below instead.
        )
        if completed.returncode == 0:
            return json.loads(completed.stdout)

        logger.error("yt-dlp info extraction failed: %s", completed.stderr)
        return None
    except subprocess.TimeoutExpired:
        logger.error("yt-dlp info timed out after %d seconds", timeout_seconds)
        return None
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error("Failed to get video info: %s", e)
        return None