#!/usr/bin/env python3
"""
Test script to inspect and test video extraction from nfl-video.com pages.
Tests both the page inspection (to see what's there) and the actual extraction function.
"""
import logging
import sys
import requests
from bs4 import BeautifulSoup
# Add src directory to path for imports
sys.path.insert(0, ".")
from src.source_finding.downloader import _extract_video_source_url # noqa: E402
# Set up logging to see the extraction function's output
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(name)s - %(message)s")
# Browser-like User-Agent so nfl-video.com serves the normal HTML page
# (sites often block or alter responses for default requests/python agents).
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Test pages: one with embedded video (iframe), one with external link.
# Each entry is (human-readable label, page URL).
TEST_URLS = [
# Page with embedded video iframes (ok.ru, filemoon)
("Indiana vs Ohio State (embedded)", "https://nfl-video.com/indiana-vs-ohio-state-football-december-6-2025-big-ten-championship-full-game-replay"),
# Page with external link to collegegamestoday.com
("Texas vs Ohio State (external link)", "https://nfl-video.com/texas-longhorns-vs-ohio-state-buckeyes-football-august-30-2025-ncaa-full-game-replay"),
]
def inspect_page(url: str):
    """Fetch page and look for video elements (for debugging)."""
    print(f"\n{'='*80}")
    print(f"Page structure inspection: {url}")
    print(f"{'='*80}\n")

    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "html.parser")

    # Dump every iframe's src (falling back to lazy-load data-src).
    print("=== IFRAMES ===")
    for i, frame in enumerate(page.find_all("iframe")):
        frame_src = frame.get("src", "") or frame.get("data-src", "") or ""
        print(f" [{i}] src={frame_src}")

    # Dump anchors whose href mentions a known video-hosting domain.
    print("\n=== LINKS TO VIDEO HOSTS ===")
    video_hosts = ["collegegamestoday", "ok.ru", "filemoon", "streamtape", "mixdrop", "doodstream"]
    for anchor in page.find_all("a", href=True):
        href = anchor["href"]
        # First matching host wins (mirrors a for/break scan).
        host = next((h for h in video_hosts if h in href.lower()), None)
        if host is not None:
            text = anchor.get_text(strip=True)[:50]
            print(f" {host}: {href[:100]} (text: {text})")
def test_extraction(name: str, url: str):
    """Test the video extraction function on a URL."""
    print(f"\n{'='*80}")
    print(f"Testing extraction: {name}")
    print(f"URL: {url}")
    print(f"{'='*80}\n")

    # Guard clause: report failure and bail out early when nothing was found.
    extracted = _extract_video_source_url(url)
    if not extracted:
        print("✗ FAILED - No video source URL found")
        return False
    print(f"✓ SUCCESS - Extracted URL: {extracted}")
    return True
if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("VIDEO EXTRACTION TEST SUITE")
    print("=" * 80)

    # First, inspect page structures (label is unused here; only the URL matters).
    for _name, url in TEST_URLS:
        inspect_page(url)

    # Then test extraction
    print("\n\n" + "=" * 80)
    print("EXTRACTION FUNCTION TESTS")
    print("=" * 80)
    results = []
    for name, url in TEST_URLS:
        success = test_extraction(name, url)
        results.append((name, success))

    # Summary
    print("\n\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    all_passed = True
    for name, success in results:
        status = "✓ PASS" if success else "✗ FAIL"
        print(f" {status}: {name}")
        if not success:
            all_passed = False
    print("\n" + ("All tests passed!" if all_passed else "Some tests failed!"))

    # Fix: previously the script always exited 0; propagate failure to the
    # shell so CI / calling scripts can detect a failed test run.
    sys.exit(0 if all_passed else 1)