#!/usr/bin/env python3
"""
Test script to inspect and test video extraction from nfl-video.com pages.
Tests both the page inspection (to see what's there) and the actual extraction function.
"""
import logging
import sys

import requests
from bs4 import BeautifulSoup

# Add src directory to path for imports
sys.path.insert(0, ".")
from src.source_finding.downloader import _extract_video_source_url  # noqa: E402

# Set up logging to see the extraction function's output
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(name)s - %(message)s")

# Browser-like User-Agent: the value is split with implicit string concatenation
# for readability but is byte-identical to a single literal.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}

# Test pages: one with embedded video (iframe), one with external link
TEST_URLS = [
    # Page with embedded video iframes (ok.ru, filemoon)
    ("Indiana vs Ohio State (embedded)",
     "https://nfl-video.com/indiana-vs-ohio-state-football-december-6-2025-big-ten-championship-full-game-replay"),
    # Page with external link to collegegamestoday.com
    ("Texas vs Ohio State (external link)",
     "https://nfl-video.com/texas-longhorns-vs-ohio-state-buckeyes-football-august-30-2025-ncaa-full-game-replay"),
]


def inspect_page(url: str) -> None:
    """Fetch page and look for video elements (for debugging).

    Prints every <iframe> (including lazy-loaded ``data-src`` values) and
    every <a href> pointing at a known video host. Raises
    ``requests.HTTPError`` on a non-2xx response.
    """
    print(f"\n{'='*80}")
    print(f"Page structure inspection: {url}")
    print(f"{'='*80}\n")

    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Look for iframes; players are often lazy-loaded, so fall back to data-src.
    print("=== IFRAMES ===")
    for i, iframe in enumerate(soup.find_all("iframe")):
        src = iframe.get("src", "") or iframe.get("data-src", "") or ""
        print(f" [{i}] src={src}")

    # Look for links with video hosts
    print("\n=== LINKS TO VIDEO HOSTS ===")
    video_hosts = ["collegegamestoday", "ok.ru", "filemoon", "streamtape", "mixdrop", "doodstream"]
    for link in soup.find_all("a", href=True):
        href = link["href"]
        for host in video_hosts:
            if host in href.lower():
                text = link.get_text(strip=True)[:50]
                print(f" {host}: {href[:100]} (text: {text})")
                break  # report each link under its first matching host only


def test_extraction(name: str, url: str) -> bool:
    """Test the video extraction function on a URL.

    Returns True when ``_extract_video_source_url`` yields a (truthy) URL,
    False otherwise.
    """
    print(f"\n{'='*80}")
    print(f"Testing extraction: {name}")
    print(f"URL: {url}")
    print(f"{'='*80}\n")

    result = _extract_video_source_url(url)
    if result:
        print(f"✓ SUCCESS - Extracted URL: {result}")
        return True
    print("✗ FAILED - No video source URL found")
    return False


def main() -> None:
    """Inspect each test page, run the extractor on each, and print a summary."""
    print("\n" + "=" * 80)
    print("VIDEO EXTRACTION TEST SUITE")
    print("=" * 80)

    # First, inspect page structures (names are only used in the extraction phase).
    for _name, url in TEST_URLS:
        inspect_page(url)

    # Then test extraction
    print("\n\n" + "=" * 80)
    print("EXTRACTION FUNCTION TESTS")
    print("=" * 80)
    results = [(name, test_extraction(name, url)) for name, url in TEST_URLS]

    # Summary
    print("\n\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    for name, success in results:
        status = "✓ PASS" if success else "✗ FAIL"
        print(f" {status}: {name}")
    all_passed = all(success for _, success in results)
    print("\n" + ("All tests passed!" if all_passed else "Some tests failed!"))


if __name__ == "__main__":
    main()