# NOTE: a "Spaces: Sleeping" Hugging Face status banner was captured with the
# page scrape here; converted to a comment so the file parses.
#!/usr/bin/env python3
"""
Test script to inspect and test video extraction from nfl-video.com pages.

Tests both the page inspection (to see what's there) and the actual
extraction function.
"""
import logging
import sys

import requests
from bs4 import BeautifulSoup

# Make the repo root importable so `src.*` resolves when this script is run
# directly from the project root.
sys.path.insert(0, ".")

from src.source_finding.downloader import _extract_video_source_url  # noqa: E402

# Set up logging so the extraction function's own log output is visible.
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(name)s - %(message)s")

# Browser-like User-Agent; the site may reject requests without one.
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

# Test pages: one with embedded video (iframe), one with an external link.
TEST_URLS = [
    # Page with embedded video iframes (ok.ru, filemoon)
    ("Indiana vs Ohio State (embedded)", "https://nfl-video.com/indiana-vs-ohio-state-football-december-6-2025-big-ten-championship-full-game-replay"),
    # Page with external link to collegegamestoday.com
    ("Texas vs Ohio State (external link)", "https://nfl-video.com/texas-longhorns-vs-ohio-state-buckeyes-football-august-30-2025-ncaa-full-game-replay"),
]
def inspect_page(url: str) -> None:
    """Fetch *url* and print its video-related elements (debugging aid).

    Prints every ``<iframe>`` ``src``/``data-src`` found on the page, then
    every anchor whose href contains one of the known video-host domains.

    Args:
        url: Page URL to fetch and inspect.

    Raises:
        requests.HTTPError: If the page responds with a non-2xx status.
    """
    print(f"\n{'=' * 80}")
    print(f"Page structure inspection: {url}")
    print(f"{'=' * 80}\n")

    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Embedded players are usually iframes; some are lazy-loaded via data-src.
    print("=== IFRAMES ===")
    iframes = soup.find_all("iframe")
    for i, iframe in enumerate(iframes):
        src = iframe.get("src", "") or iframe.get("data-src", "") or ""
        print(f"  [{i}] src={src}")

    # External mirrors appear as plain links to known video-host domains.
    print("\n=== LINKS TO VIDEO HOSTS ===")
    video_hosts = ["collegegamestoday", "ok.ru", "filemoon", "streamtape", "mixdrop", "doodstream"]
    links = soup.find_all("a", href=True)
    for link in links:
        href = link["href"]
        for host in video_hosts:
            if host in href.lower():
                text = link.get_text(strip=True)[:50]
                print(f"  {host}: {href[:100]} (text: {text})")
                break  # report each link once, under the first matching host
def test_extraction(name: str, url: str) -> bool:
    """Run the extraction function against *url* and report the outcome.

    Args:
        name: Human-readable label for the test case (used in output only).
        url: Page URL to pass to ``_extract_video_source_url``.

    Returns:
        True if a video source URL was extracted, False otherwise.
    """
    print(f"\n{'=' * 80}")
    print(f"Testing extraction: {name}")
    print(f"URL: {url}")
    print(f"{'=' * 80}\n")

    result = _extract_video_source_url(url)
    # The scraped file had mojibake ("β") for both status markers; restored
    # to distinct check/cross marks.
    if result:
        print(f"✓ SUCCESS - Extracted URL: {result}")
        return True
    print("✗ FAILED - No video source URL found")
    return False
| if __name__ == "__main__": | |
| print("\n" + "=" * 80) | |
| print("VIDEO EXTRACTION TEST SUITE") | |
| print("=" * 80) | |
| # First, inspect page structures | |
| for name, url in TEST_URLS: | |
| inspect_page(url) | |
| # Then test extraction | |
| print("\n\n" + "=" * 80) | |
| print("EXTRACTION FUNCTION TESTS") | |
| print("=" * 80) | |
| results = [] | |
| for name, url in TEST_URLS: | |
| success = test_extraction(name, url) | |
| results.append((name, success)) | |
| # Summary | |
| print("\n\n" + "=" * 80) | |
| print("SUMMARY") | |
| print("=" * 80) | |
| all_passed = True | |
| for name, success in results: | |
| status = "β PASS" if success else "β FAIL" | |
| print(f" {status}: {name}") | |
| if not success: | |
| all_passed = False | |
| print("\n" + ("All tests passed!" if all_passed else "Some tests failed!")) | |