# Source: cfb40 / scripts/test_video_extraction.py
# (commit bb3e8ea, "first bite of the app", by andytaylor-smg)
#!/usr/bin/env python3
"""
Test script to inspect and test video extraction from nfl-video.com pages.
Tests both the page inspection (to see what's there) and the actual extraction function.
"""
import logging
import sys
import requests
from bs4 import BeautifulSoup
# Add src directory to path for imports
sys.path.insert(0, ".")
from src.source_finding.downloader import _extract_video_source_url # noqa: E402
# Set up logging so the extraction function's own logger output is visible on stderr.
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(name)s - %(message)s")
# Browser-like User-Agent so the site serves the regular desktop page instead of blocking us.
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Test pages: one with embedded video (iframe), one with external link.
# Each entry is a (human-readable label, page URL) pair consumed by the __main__ loops.
TEST_URLS = [
    # Page with embedded video iframes (ok.ru, filemoon)
    ("Indiana vs Ohio State (embedded)", "https://nfl-video.com/indiana-vs-ohio-state-football-december-6-2025-big-ten-championship-full-game-replay"),
    # Page with external link to collegegamestoday.com
    ("Texas vs Ohio State (external link)", "https://nfl-video.com/texas-longhorns-vs-ohio-state-buckeyes-football-august-30-2025-ncaa-full-game-replay"),
]
def inspect_page(url: str):
    """Fetch page and look for video elements (for debugging)."""
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Page structure inspection: {url}")
    print(f"{banner}\n")

    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "html.parser")

    # Dump every iframe's src (falling back to lazy-load data-src).
    print("=== IFRAMES ===")
    for idx, frame in enumerate(page.find_all("iframe")):
        frame_src = frame.get("src", "") or frame.get("data-src", "") or ""
        print(f" [{idx}] src={frame_src}")

    # Dump anchors whose href mentions a known video-host domain.
    print("\n=== LINKS TO VIDEO HOSTS ===")
    known_hosts = ["collegegamestoday", "ok.ru", "filemoon", "streamtape", "mixdrop", "doodstream"]
    for anchor in page.find_all("a", href=True):
        target = anchor["href"]
        # First matching host wins, mirroring a break after the first hit.
        matched = next((h for h in known_hosts if h in target.lower()), None)
        if matched:
            label = anchor.get_text(strip=True)[:50]
            print(f" {matched}: {target[:100]} (text: {label})")
def test_extraction(name: str, url: str):
    """Test the video extraction function on a URL."""
    divider = "=" * 80
    print(f"\n{divider}")
    print(f"Testing extraction: {name}")
    print(f"URL: {url}")
    print(f"{divider}\n")

    extracted = _extract_video_source_url(url)
    # Guard clause: report the failure path first, success otherwise.
    if not extracted:
        print("βœ— FAILED - No video source URL found")
        return False
    print(f"βœ“ SUCCESS - Extracted URL: {extracted}")
    return True
if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("VIDEO EXTRACTION TEST SUITE")
    print("=" * 80)
    # First, inspect page structures (debug output only; does not affect pass/fail).
    for _name, url in TEST_URLS:
        inspect_page(url)
    # Then test extraction.
    print("\n\n" + "=" * 80)
    print("EXTRACTION FUNCTION TESTS")
    print("=" * 80)
    results = []
    for name, url in TEST_URLS:
        success = test_extraction(name, url)
        results.append((name, success))
    # Summary
    print("\n\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    all_passed = True
    for name, success in results:
        status = "βœ“ PASS" if success else "βœ— FAIL"
        print(f" {status}: {name}")
        if not success:
            all_passed = False
    print("\n" + ("All tests passed!" if all_passed else "Some tests failed!"))
    # BUG FIX: previously the script always exited 0 even when tests failed,
    # so CI / shell callers could not detect failure. Propagate a real status.
    sys.exit(0 if all_passed else 1)