#!/usr/bin/env python3
"""
Test script to inspect and test video extraction from nfl-video.com pages.
Tests both the page inspection (to see what's there) and the actual extraction function.
"""
import logging
import sys
import requests
from bs4 import BeautifulSoup
# Add src directory to path for imports
sys.path.insert(0, ".")
from src.source_finding.downloader import _extract_video_source_url # noqa: E402
# Set up logging to see the extraction function's output
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(name)s - %(message)s")
# Browser-like User-Agent so nfl-video.com serves the normal HTML page
# (sites often block or alter responses for default requests/python agents).
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Test pages: one with embedded video (iframe), one with external link.
# Each entry is (human-readable label, page URL).
TEST_URLS = [
# Page with embedded video iframes (ok.ru, filemoon)
("Indiana vs Ohio State (embedded)", "https://nfl-video.com/indiana-vs-ohio-state-football-december-6-2025-big-ten-championship-full-game-replay"),
# Page with external link to collegegamestoday.com
("Texas vs Ohio State (external link)", "https://nfl-video.com/texas-longhorns-vs-ohio-state-buckeyes-football-august-30-2025-ncaa-full-game-replay"),
]
def inspect_page(url: str):
    """Fetch page and look for video elements (for debugging)."""
    print(f"\n{'='*80}")
    print(f"Page structure inspection: {url}")
    print(f"{'='*80}\n")

    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "html.parser")

    # Dump every iframe's src (falling back to lazy-load data-src).
    print("=== IFRAMES ===")
    for i, frame in enumerate(page.find_all("iframe")):
        frame_src = frame.get("src", "") or frame.get("data-src", "") or ""
        print(f" [{i}] src={frame_src}")

    # Dump anchors whose href mentions a known video-hosting domain.
    print("\n=== LINKS TO VIDEO HOSTS ===")
    video_hosts = ["collegegamestoday", "ok.ru", "filemoon", "streamtape", "mixdrop", "doodstream"]
    for anchor in page.find_all("a", href=True):
        href = anchor["href"]
        # First matching host wins (mirrors a for/break scan).
        host = next((h for h in video_hosts if h in href.lower()), None)
        if host is not None:
            text = anchor.get_text(strip=True)[:50]
            print(f" {host}: {href[:100]} (text: {text})")
def test_extraction(name: str, url: str):
    """Test the video extraction function on a URL."""
    print(f"\n{'='*80}")
    print(f"Testing extraction: {name}")
    print(f"URL: {url}")
    print(f"{'='*80}\n")

    # Guard clause: report failure and bail out early when nothing was found.
    extracted = _extract_video_source_url(url)
    if not extracted:
        print("✗ FAILED - No video source URL found")
        return False
    print(f"✓ SUCCESS - Extracted URL: {extracted}")
    return True
if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("VIDEO EXTRACTION TEST SUITE")
    print("=" * 80)

    # First, inspect page structures (label is unused here; only the URL matters).
    for _name, url in TEST_URLS:
        inspect_page(url)

    # Then test extraction
    print("\n\n" + "=" * 80)
    print("EXTRACTION FUNCTION TESTS")
    print("=" * 80)
    results = []
    for name, url in TEST_URLS:
        success = test_extraction(name, url)
        results.append((name, success))

    # Summary
    print("\n\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    all_passed = True
    for name, success in results:
        status = "✓ PASS" if success else "✗ FAIL"
        print(f" {status}: {name}")
        if not success:
            all_passed = False
    print("\n" + ("All tests passed!" if all_passed else "Some tests failed!"))

    # Fix: previously the script always exited 0; propagate failure to the
    # shell so CI / calling scripts can detect a failed test run.
    sys.exit(0 if all_passed else 1)