cfb40 / src /source_finding /downloader.py
andytaylor-smg's picture
first bite of the app
bb3e8ea
"""
Video download utilities for college football games.
This module provides multiple strategies for downloading videos:
1. Generate yt-dlp commands for users to run locally (offline mode)
2. Extract direct video URLs for browser-direct downloads (preferred for apps)
3. Stream video through a proxy (fallback, uses server bandwidth)
"""
import logging
import shlex
import subprocess
from pathlib import Path
from typing import Generator, Optional
import requests
from bs4 import BeautifulSoup
from .models import GameResult
# Module-level logger; all functions in this file log through it.
logger = logging.getLogger(__name__)
# Browser-like User-Agent header sent with the page fetch in _extract_video_source_url().
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Number of concurrent fragment downloads for yt-dlp (passed as the -N value
# in the command built by get_download_command()).
# Set to 4 for compatibility with free Hugging Face Spaces tier (2 vCPU)
CONCURRENT_FRAGMENTS = 4
def _normalize_candidate_url(raw_url: Optional[str]) -> str:
    """Strip whitespace from *raw_url* and turn ``//host/...`` into ``https://host/...``."""
    url = (raw_url or "").strip()
    if url.startswith("//"):
        url = "https:" + url
    return url


def _is_known_host_url(url: str, hosts: tuple) -> bool:
    """Return True if *url* is an absolute http(s) URL that mentions one of *hosts*."""
    lowered = url.lower()
    # The result is handed to yt-dlp / embedded in a shell command, so only
    # real web URLs are accepted (no relative paths, "javascript:", "-opt", ...).
    if not lowered.startswith(("http://", "https://")):
        return False
    # NOTE: substring match against the whole URL (same as before), not a
    # strict hostname comparison.
    return any(host in lowered for host in hosts)


def _extract_video_source_url(page_url: str) -> Optional[str]:
    """
    Extract the actual video source URL from an nfl-video.com page.

    nfl-video.com has two types of pages:
    1. Aggregator pages that link to external hosts like collegegamestoday.com
    2. Pages with embedded video players (iframes to ok.ru, filemoon, etc.)

    Lookup order:
    1. The first iframe whose ``src`` / ``data-src`` / ``data-lazy-src`` points
       at a known embed host.
    2. The first link to a known external host whose text contains "watch" or
       is exactly "play" / "stream".
    3. The first link to a known external host, regardless of its text.

    Host matching is case-insensitive and protocol-relative URLs (``//host/..``)
    are normalised to ``https://`` for both iframes and links.

    Args:
        page_url: The nfl-video.com page URL

    Returns:
        The extracted video source URL, or None if the page could not be
        fetched or no known host was found on it
    """
    # Known video hosting sites (both external links and embed sources)
    external_link_hosts = ("collegegamestoday.com", "sportsurge", "streamsport", "sportshd")
    embed_hosts = ("ok.ru", "okru", "filemoon", "streamtape", "mixdrop", "doodstream", "dailymotion", "vimeo")
    iframe_url_attrs = ("src", "data-src", "data-lazy-src")

    # Only the network call can raise RequestException, so keep the try small.
    try:
        logger.debug("Fetching page to extract video source: %s", page_url)
        response = requests.get(page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error("Failed to fetch page %s: %s", page_url, e)
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Strategy 1: embedded video iframes (the actual player on the page).
    # Every URL-carrying attribute is checked, because lazy-loaded iframes
    # often have a placeholder ``src`` and the real URL in ``data-src``.
    for iframe in soup.find_all("iframe"):
        for attr in iframe_url_attrs:
            src = _normalize_candidate_url(iframe.get(attr))
            if src and _is_known_host_url(src, embed_hosts):
                logger.info("Found embedded video iframe: %s", src)
                return src

    # Strategies 2 and 3 in a single pass over the links: a "watch"-style
    # button wins immediately; otherwise remember the first plain link to a
    # known host and use it as the fallback.
    fallback_href = None
    for link in soup.find_all("a", href=True):
        href = _normalize_candidate_url(link["href"])
        if not _is_known_host_url(href, external_link_hosts):
            continue
        text = link.get_text(strip=True).lower()
        if "watch" in text or text in ("play", "stream"):
            logger.info("Found video source URL via watch button: %s", href)
            return href
        if fallback_href is None:
            fallback_href = href

    if fallback_href is not None:
        logger.info("Found video source URL (fallback link): %s", fallback_href)
        return fallback_href

    logger.warning("No video source URL found on page: %s", page_url)
    return None
def get_suggested_filename(game: GameResult, extension: str = "mp4") -> str:
    """
    Build the default filename for a game's video file.

    The name is the value returned by ``game.get_filename_base()`` followed by
    a dot and ``extension``.

    Args:
        game: GameResult object
        extension: File extension without the leading dot (default "mp4")

    Returns:
        Filename string such as "Ohio_State_vs_Oregon_2024.mp4"
    """
    stem = game.get_filename_base()
    return "{}.{}".format(stem, extension)
def get_download_command(game: GameResult, output_dir: str = ".", output_filename: Optional[str] = None) -> str:
    """
    Build a yt-dlp command line that downloads a game video ("offline mode").

    The returned string is meant to be copied and run by the user locally.
    Because nfl-video.com is an aggregator, the page is fetched first to find
    the external video host URL; if none can be found, the original game URL
    is used in the command instead.

    Args:
        game: GameResult object containing the game URL
        output_dir: Directory to save the video (default current directory)
        output_filename: Optional custom filename; if None, one is generated
            with get_suggested_filename()

    Returns:
        A yt-dlp command string of the form
        ``yt-dlp -N <CONCURRENT_FRAGMENTS> <url> -o <output path>``, where the
        URL and the output path have been passed through ``shlex.quote``.

    Example:
        >>> cmd = get_download_command(game, output_dir="videos")
        >>> print(cmd)
        yt-dlp -N 4 https://collegegamestoday.com/... -o videos/Ohio_State_vs_Oregon_2024.mp4
    """
    # Resolve the target file first (default name comes from game metadata).
    filename = get_suggested_filename(game) if output_filename is None else output_filename
    destination = Path(output_dir, filename)

    # Prefer the external host URL; the aggregator page itself is the fallback.
    source_url = _extract_video_source_url(game.url)
    if source_url is None:
        logger.warning("Could not extract video source URL, using original game URL")
        source_url = game.url

    # -N enables concurrent fragment downloads (faster on HLS/DASH streams).
    # URL and path are shell-quoted so the command is safe to paste.
    parts = [
        "yt-dlp",
        "-N",
        str(CONCURRENT_FRAGMENTS),
        shlex.quote(source_url),
        "-o",
        shlex.quote(str(destination)),
    ]
    return " ".join(parts)
def _run_ytdlp_get_url(url: str, timeout_seconds: int = 60) -> Optional[str]:
    """
    Run ``yt-dlp --get-url`` on a URL and return the direct video URL.

    This helper never raises for the expected failure modes and does not log;
    callers are expected to report a None result themselves.

    Args:
        url: URL to extract video from
        timeout_seconds: Maximum time to wait for the yt-dlp process

    Returns:
        Direct video URL string, or None if the URL is empty, yt-dlp could not
        be started (missing, not executable, ...), it timed out, exited with a
        non-zero status, or printed nothing
    """
    if not url:
        # Nothing to extract; avoid spawning a process just to see it fail.
        return None

    try:
        result = subprocess.run(
            # "--" ends option parsing: the URL may come from a scraped page,
            # and a value starting with "-" must not be read as a yt-dlp option.
            ["yt-dlp", "--get-url", "--no-warnings", "-f", "best", "--", url],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            check=False,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers FileNotFoundError (yt-dlp not installed) as well as
        # PermissionError and other launch failures.
        return None

    if result.returncode != 0:
        return None

    # Sometimes yt-dlp returns multiple URLs (for video/audio streams), one
    # per line. Take the first one, which is typically the video.
    urls = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    return urls[0] if urls else None
def extract_direct_video_url(game: GameResult, timeout_seconds: int = 60) -> Optional[str]:
    """
    Extract the direct video URL from a game page using yt-dlp.

    This is the preferred method for in-app downloads - extracts the actual video
    URL from the hosting service (ok.ru, mixdrop, etc.) so the browser can
    download directly without going through your server.

    Steps:
    1. Run yt-dlp directly on the game URL.
    2. If that fails, fetch the page, locate the external video source URL
       (nfl-video.com is an aggregator) and run yt-dlp on that URL instead.

    IMPORTANT: The extracted URL expires after some time (varies by host). Call this
    function on-demand when the user clicks "Download", not when displaying results.

    Args:
        game: GameResult object containing the game page URL
        timeout_seconds: Maximum time to wait for EACH yt-dlp invocation. Up to
            two invocations plus one page fetch may happen, so the total wall
            time can exceed this value.

    Returns:
        Direct video URL string, or None if extraction failed (the reason is
        logged; this function does not raise)

    Example:
        >>> url = extract_direct_video_url(game)
        >>> print(url)
        'https://vd608.okcdn.ru/expires/1769790485013/...'
    """
    import shutil  # pylint: disable=import-outside-toplevel

    try:
        logger.info("Extracting direct video URL for: %s", game.url)

        # First, try yt-dlp directly on the game URL (in case the site structure changes)
        direct_url = _run_ytdlp_get_url(game.url, timeout_seconds)
        if direct_url:
            # Only the first 100 characters are logged.
            logger.info("Extracted direct URL: %s...", direct_url[:100])
            return direct_url

        # The helper returns None for every failure, including a missing
        # yt-dlp binary. Detect that case here so the user gets an actionable
        # message and we skip a page fetch that cannot lead anywhere.
        if shutil.which("yt-dlp") is None:
            logger.error("yt-dlp not found. Please install it: pip install yt-dlp")
            return None

        # If direct extraction failed, nfl-video.com likely links to an external host
        # Extract the video source URL from the page
        logger.info("Direct extraction failed, looking for external video source...")
        video_source_url = _extract_video_source_url(game.url)
        if not video_source_url:
            logger.error("Could not find video source URL on page")
            return None
        logger.info("Found external video source: %s", video_source_url)

        # Now try yt-dlp on the external video source
        direct_url = _run_ytdlp_get_url(video_source_url, timeout_seconds)
        if direct_url:
            logger.info("Extracted direct URL: %s...", direct_url[:100])
            return direct_url

        logger.error("yt-dlp failed to extract URL from video source")
        return None
    except Exception as e:  # pylint: disable=broad-exception-caught
        # Boundary handler: callers rely on None rather than an exception.
        logger.error("Failed to extract video URL: %s", e)
        return None
def is_ytdlp_available() -> bool:
    """
    Check if yt-dlp is available on the system.

    Runs ``yt-dlp --version`` with a 5 second timeout.

    Returns:
        True if the command could be started and exited with status 0;
        False if it is missing, cannot be executed, times out, or exits
        with a non-zero status
    """
    try:
        result = subprocess.run(
            ["yt-dlp", "--version"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError (not just FileNotFoundError): a yt-dlp file that exists on
        # PATH but is not executable raises PermissionError, which also means
        # "not available" rather than a crash for the caller.
        return False
    return result.returncode == 0
def stream_video_proxy(game: GameResult, chunk_size: int = 8192) -> Generator[bytes, None, None]:
    """
    Relay a game's video through this server, chunk by chunk (fallback method).

    This is Option B from the architecture. Use it only if
    extract_direct_video_url() doesn't work reliably (e.g., URLs expire too fast).

    The direct URL is resolved with extract_direct_video_url() and then fetched
    with ``stream=True``; chunks are yielded as they arrive instead of buffering
    the whole video. If the URL cannot be resolved, or the request fails, the
    error is logged and the generator simply ends (possibly after having
    yielded part of the video) - no exception is propagated to the consumer.

    Args:
        game: GameResult object
        chunk_size: Size of chunks to request from the upstream response (default 8KB)

    Yields:
        Non-empty bytes chunks of the video file

    Example:
        >>> for chunk in stream_video_proxy(game):
        ...     response.write(chunk)  # In a web framework
    """
    # Resolve the expiring direct URL right before streaming.
    source_url = extract_direct_video_url(game)
    if source_url is None:
        logger.error("Cannot stream - failed to extract direct URL")
        return

    try:
        logger.info("Starting streaming proxy for: %s", game.title)
        with requests.get(source_url, stream=True, timeout=30) as upstream:
            upstream.raise_for_status()
            # filter(None, ...) drops empty keep-alive chunks.
            yield from filter(None, upstream.iter_content(chunk_size=chunk_size))
        logger.info("Streaming complete for: %s", game.title)
    except Exception as err:  # pylint: disable=broad-exception-caught
        logger.error("Streaming failed: %s", err)
def get_video_info(game: GameResult, timeout_seconds: int = 30) -> Optional[dict]:
    """
    Get video metadata without downloading.

    Runs ``yt-dlp --dump-json`` on ``game.url`` and parses the result.
    Useful for showing file size, duration, quality options to users before download.

    yt-dlp prints one JSON document per line (one per video), so when several
    entries are reported only the first one is returned.

    NOTE(review): unlike extract_direct_video_url(), this runs yt-dlp on the
    game page URL only and does not fall back to the external video host -
    confirm that this is intended.

    Args:
        game: GameResult object
        timeout_seconds: Maximum time to wait for the yt-dlp process

    Returns:
        Dictionary with video info as reported by yt-dlp, or None if yt-dlp
        failed, timed out, or produced no usable JSON object (the reason is
        logged; this function does not raise)
    """
    try:
        import json  # pylint: disable=import-outside-toplevel

        logger.info("Getting video info for: %s", game.url)
        result = subprocess.run(
            # "--" ends option parsing so the URL can never be read as an option.
            ["yt-dlp", "--dump-json", "--no-download", "--no-warnings", "--", game.url],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            check=False,  # We handle return codes manually
        )
        if result.returncode != 0:
            logger.error("yt-dlp info extraction failed: %s", result.stderr)
            return None

        # Parse only the first non-empty line: feeding multi-line output
        # (several entries) or empty output to json.loads would fail.
        first_record = next((line for line in result.stdout.splitlines() if line.strip()), None)
        if first_record is None:
            logger.error("yt-dlp returned no video info for: %s", game.url)
            return None

        info = json.loads(first_record)
        if not isinstance(info, dict):
            logger.error("Unexpected yt-dlp info format for: %s", game.url)
            return None
        return info
    except subprocess.TimeoutExpired:
        logger.error("yt-dlp info timed out after %d seconds", timeout_seconds)
        return None
    except Exception as e:  # pylint: disable=broad-exception-caught
        # Boundary handler: covers a missing yt-dlp binary and malformed JSON.
        logger.error("Failed to get video info: %s", e)
        return None