# Source: cfb40 / scripts/test_video_extraction.py
# (commit bb3e8ea, "first bite of the app", by andytaylor-smg)
#!/usr/bin/env python3
"""
Test script to inspect and test video extraction from nfl-video.com pages.
Tests both the page inspection (to see what's there) and the actual extraction function.
"""
import logging
import sys
import requests
from bs4 import BeautifulSoup
# Add src directory to path for imports
sys.path.insert(0, ".")
from src.source_finding.downloader import _extract_video_source_url # noqa: E402
# Set up logging so the extraction function's own logger output is visible on stderr.
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(name)s - %(message)s")
# Browser-like User-Agent so the site serves the regular desktop page instead of blocking us.
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Test pages: one with embedded video (iframe), one with external link.
# Each entry is a (human-readable label, page URL) pair consumed by the __main__ loops.
TEST_URLS = [
    # Page with embedded video iframes (ok.ru, filemoon)
    ("Indiana vs Ohio State (embedded)", "https://nfl-video.com/indiana-vs-ohio-state-football-december-6-2025-big-ten-championship-full-game-replay"),
    # Page with external link to collegegamestoday.com
    ("Texas vs Ohio State (external link)", "https://nfl-video.com/texas-longhorns-vs-ohio-state-buckeyes-football-august-30-2025-ncaa-full-game-replay"),
]
def inspect_page(url: str):
    """Fetch page and look for video elements (for debugging)."""
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Page structure inspection: {url}")
    print(f"{banner}\n")

    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "html.parser")

    # Dump every iframe's src (falling back to lazy-load data-src).
    print("=== IFRAMES ===")
    for idx, frame in enumerate(page.find_all("iframe")):
        frame_src = frame.get("src", "") or frame.get("data-src", "") or ""
        print(f" [{idx}] src={frame_src}")

    # Dump anchors whose href mentions a known video-host domain.
    print("\n=== LINKS TO VIDEO HOSTS ===")
    known_hosts = ["collegegamestoday", "ok.ru", "filemoon", "streamtape", "mixdrop", "doodstream"]
    for anchor in page.find_all("a", href=True):
        target = anchor["href"]
        # First matching host wins, mirroring a break after the first hit.
        matched = next((h for h in known_hosts if h in target.lower()), None)
        if matched:
            label = anchor.get_text(strip=True)[:50]
            print(f" {matched}: {target[:100]} (text: {label})")
def test_extraction(name: str, url: str):
    """Test the video extraction function on a URL."""
    divider = "=" * 80
    print(f"\n{divider}")
    print(f"Testing extraction: {name}")
    print(f"URL: {url}")
    print(f"{divider}\n")

    extracted = _extract_video_source_url(url)
    # Guard clause: report the failure path first, success otherwise.
    if not extracted:
        print("βœ— FAILED - No video source URL found")
        return False
    print(f"βœ“ SUCCESS - Extracted URL: {extracted}")
    return True
if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("VIDEO EXTRACTION TEST SUITE")
    print("=" * 80)
    # First, inspect page structures (debug output only; does not affect pass/fail).
    for _name, url in TEST_URLS:
        inspect_page(url)
    # Then test extraction.
    print("\n\n" + "=" * 80)
    print("EXTRACTION FUNCTION TESTS")
    print("=" * 80)
    results = []
    for name, url in TEST_URLS:
        success = test_extraction(name, url)
        results.append((name, success))
    # Summary
    print("\n\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    all_passed = True
    for name, success in results:
        status = "βœ“ PASS" if success else "βœ— FAIL"
        print(f" {status}: {name}")
        if not success:
            all_passed = False
    print("\n" + ("All tests passed!" if all_passed else "Some tests failed!"))
    # BUG FIX: previously the script always exited 0 even when tests failed,
    # so CI / shell callers could not detect failure. Propagate a real status.
    sys.exit(0 if all_passed else 1)