# cfb40 / src/source_finding/scraper.py
# Author: andytaylor-smg — initial commit "first bite of the app" (bb3e8ea)
"""
Web scraper for finding college football games on nfl-video.com.
This module uses team-specific pages for efficient searching:
1. Builds an index of team names to their dedicated pages
2. Fetches games directly from the team's page
3. Optionally filters by opponent (team_b)
"""
import logging
import re
import time
from typing import Optional
import requests
from bs4 import BeautifulSoup
from .models import GameResult, SearchResults
logger = logging.getLogger(__name__)
# Base URL for college football section
BASE_URL = "https://nfl-video.com/cfb"
# Main page with team links in sidebar
MAIN_PAGE_URL = "https://nfl-video.com/cfb/ncaa_college_football_highlights_games_replay/ncaa_college_football_full_game_replays/2"
# User agent to avoid being blocked
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Delay between page requests to be polite to the server
REQUEST_DELAY_SECONDS = 0.5
# Cache for team index (team name -> URL mapping)
_team_index_cache: Optional[dict[str, str]] = None # pylint: disable=invalid-name
def _fetch_page(url: str) -> Optional[BeautifulSoup]:
    """
    Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to retrieve

    Returns:
        Parsed document, or None when the request failed
    """
    logger.debug("Fetching URL: %s", url)
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Failed to fetch %s: %s", url, exc)
        return None
    return BeautifulSoup(response.text, "html.parser")
def _build_team_index() -> dict[str, str]:
    """
    Construct (and memoize) a mapping of team names to their page URLs.

    The main listing page's sidebar links each team's dedicated page;
    those anchors are harvested here. Results are cached module-wide.

    Returns:
        Dictionary keyed by lowercase team name, valued by absolute URL
    """
    global _team_index_cache  # pylint: disable=global-statement
    if _team_index_cache is not None:
        logger.debug("Using cached team index with %d teams", len(_team_index_cache))
        return _team_index_cache

    logger.info("Building team index from main page...")
    soup = _fetch_page(MAIN_PAGE_URL)
    if soup is None:
        logger.error("Failed to fetch main page for team index")
        return {}

    index: dict[str, str] = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        label = anchor.get_text(strip=True)
        # Team pages follow a pattern like /cfb/sec_football/alabama_crimson_tide_football/59
        if "/cfb/" not in href or "_football/" not in href or not label:
            continue
        # Drop the broad category / replay listing links
        if "ncaa_college_football" in href or "replay" in href.lower():
            continue
        # Normalize relative links to absolute URLs
        if not href.startswith("http"):
            href = f"https://nfl-video.com{href}"
        # Lowercase key enables case-insensitive lookup; first hit wins
        key = label.lower().strip()
        if key and key not in index:
            index[key] = href
            logger.debug("Indexed team: %s -> %s", label, href)

    logger.info("Built team index with %d teams", len(index))
    _team_index_cache = index
    return index
def get_team_index() -> dict[str, str]:
    """
    Return the team index (public wrapper, useful for debugging or listing teams).

    Returns:
        Dictionary mapping team names to their page URLs
    """
    return _build_team_index()
def clear_team_index_cache() -> None:
    """Drop the memoized team index so the next lookup rebuilds it (testing aid)."""
    global _team_index_cache  # pylint: disable=global-statement
    _team_index_cache = None
    logger.debug("Cleared team index cache")
def _find_team_url(team_name: str) -> Optional[str]:
    """
    Resolve a team name to its dedicated page URL.

    Lookup is case-insensitive and falls back to substring matching:
    - "Ohio State" matches "ohio state"
    - "OSU" might not match (would need alias support)
    - "Texas A&M" matches "texas a&m"

    Args:
        team_name: Team name to look up

    Returns:
        URL of the team's page, or None when nothing matches
    """
    index = _build_team_index()
    needle = team_name.lower().strip()

    # Prefer an exact hit before trying anything fuzzy
    exact = index.get(needle)
    if exact is not None:
        return exact

    # Substring match in either direction handles partial queries
    for indexed_name, url in index.items():
        if needle in indexed_name or indexed_name in needle:
            logger.debug("Fuzzy matched '%s' to '%s'", team_name, indexed_name)
            return url

    logger.warning("Team not found in index: %s", team_name)
    return None
def _parse_game_title(title: str) -> dict:
"""
Parse a game title to extract team names, year, and event.
Example titles:
"Ohio State vs Oregon Football 2024 Big Ten Championship Full Game Replay"
"Alabama vs. Oklahoma Football December 19, 2025 CFP First Round Full Game Replay"
Args:
title: Full game title string
Returns:
Dictionary with keys: team_a, team_b, year, event (some may be None)
"""
result = {"team_a": None, "team_b": None, "year": None, "event": None}
# Clean up the title
title = title.strip()
# Pattern: "Team A vs Team B Football [Date] [Event] Full Game Replay"
# Handle both "vs" and "vs." (with period)
vs_match = re.match(r"^(.+?)\s+vs\.?\s+(.+?)\s+Football\s+", title, re.IGNORECASE)
if vs_match:
result["team_a"] = vs_match.group(1).strip()
result["team_b"] = vs_match.group(2).strip()
# Extract year (4-digit number)
year_match = re.search(r"\b(20\d{2})\b", title)
if year_match:
result["year"] = int(year_match.group(1))
# Extract event - everything between the date and "Full Game Replay"
# Format: "Football [Month Day, Year] [Event] Full Game Replay"
event_match = re.search(r"Football\s+\w+\s+\d{1,2},?\s+20\d{2}\s+(.+?)\s+Full Game Replay", title, re.IGNORECASE)
if event_match:
event = event_match.group(1).strip()
if event and event.lower() not in ["full", "game", "replay"]:
result["event"] = event
return result
def _extract_games_from_page(soup: BeautifulSoup, filter_team: Optional[str] = None) -> list[GameResult]:
    """
    Extract game listings from a parsed page.

    Args:
        soup: BeautifulSoup object of the page
        filter_team: Optional team name to filter for (for team-specific pages that may include unrelated games)

    Returns:
        List of GameResult objects found on the page
    """
    games: list[GameResult] = []
    # Track seen URLs in a set: O(1) duplicate detection instead of the
    # previous O(n) rescan of `games` for every candidate link.
    seen_urls: set[str] = set()
    # Hoist the lowercase conversion out of the loop
    filter_lower = filter_team.lower() if filter_team else None

    for link in soup.find_all("a", href=True):
        text = link.get_text(strip=True)
        # Skip links that don't look like game titles
        if "Full Game Replay" not in text:
            continue
        # Skip if it doesn't have "vs" or "vs." (not a game matchup)
        text_lower = text.lower()
        if " vs " not in text_lower and " vs. " not in text_lower:
            continue

        href = link["href"]
        # Make sure it's a full URL
        if not href.startswith("http"):
            href = f"https://nfl-video.com{href}"
        # Skip if we've already seen this URL (duplicates on page)
        if href in seen_urls:
            continue

        # Parse the title for metadata; both teams are required
        parsed = _parse_game_title(text)
        if not (parsed["team_a"] and parsed["team_b"]):
            continue
        # If filter_team specified, skip games that don't involve that team
        if filter_lower and filter_lower not in parsed["team_a"].lower() and filter_lower not in parsed["team_b"].lower():
            continue

        # Best-effort thumbnail: look for an <img> under the link's immediate parent
        thumbnail_url = None
        parent = link.find_parent()
        if parent:
            img = parent.find("img")
            if img and img.get("src"):
                thumbnail_url = img["src"]
                if not thumbnail_url.startswith("http"):
                    thumbnail_url = f"https://nfl-video.com{thumbnail_url}"

        game = GameResult(
            title=text,
            team_a=parsed["team_a"],
            team_b=parsed["team_b"],
            url=href,
            thumbnail_url=thumbnail_url,
            year=parsed["year"],
            event=parsed["event"],
        )
        games.append(game)
        seen_urls.add(href)
        logger.debug("Found game: %s", game)
    return games
def _team_matches(game: GameResult, team_name: str) -> bool:
    """
    Determine whether a given team plays in a game.

    Args:
        game: GameResult to inspect
        team_name: Team name to search for

    Returns:
        True if the team appears in either team_a or team_b
    """
    needle = team_name.lower()
    return any(needle in side.lower() for side in (game.team_a, game.team_b))
def search_games(
    team_a: str,
    team_b: Optional[str] = None,
    max_pages: int = 5,
    delay_seconds: float = REQUEST_DELAY_SECONDS,
) -> SearchResults:
    """
    Search for college football games by team name(s).

    Uses team-specific pages for efficient searching:
    1. Looks up team_a's dedicated page in the team index
    2. Fetches all games from that team's page
    3. If team_b specified, filters to only games against that opponent

    Args:
        team_a: Primary team name to search for (required)
        team_b: Optional opponent name - if provided, only games against this team are returned
        max_pages: Maximum number of pages to search on the team's page (default 5)
        delay_seconds: Delay between page requests to avoid rate limiting

    Returns:
        SearchResults object containing matching games and search metadata

    Example:
        # Find all Alabama games
        results = search_games("Alabama")
        # Find Alabama vs Georgia specifically
        results = search_games("Alabama", "Georgia")
    """
    logger.info("Searching for games: team_a='%s', team_b='%s'", team_a, team_b)

    # Find the team's dedicated page URL
    team_url = _find_team_url(team_a)
    if team_url is None:
        logger.warning("Could not find team page for '%s', returning empty results", team_a)
        return SearchResults(
            query_team_a=team_a,
            query_team_b=team_b,
            games=[],
            pages_searched=0,
            total_games_scanned=0,
        )
    logger.info("Found team page for '%s': %s", team_a, team_url)

    matching_games: list[GameResult] = []
    seen_urls: set[str] = set()  # O(1) duplicate check across pages
    total_scanned = 0
    # BUG FIX: count pages actually fetched; previously the result always
    # reported max_pages even when a failed fetch stopped the loop early.
    pages_searched = 0

    for page_num in range(1, max_pages + 1):
        # Team pages use same pagination pattern as main: base URL, then base-2, base-3, etc.
        page_url = team_url if page_num == 1 else f"{team_url}-{page_num}"
        logger.info("Searching page %d/%d: %s", page_num, max_pages, page_url)
        soup = _fetch_page(page_url)
        if soup is None:
            logger.warning("Failed to fetch page %d, stopping search", page_num)
            break
        pages_searched += 1

        # Extract games, filtering for the team (team pages sometimes include unrelated sidebar games)
        page_games = _extract_games_from_page(soup, filter_team=team_a)
        total_scanned += len(page_games)

        for game in page_games:
            # If team_b specified, must also match team_b
            if team_b and not _team_matches(game, team_b):
                continue
            # Avoid duplicates (same URL)
            if game.url in seen_urls:
                continue
            seen_urls.add(game.url)
            matching_games.append(game)
            logger.info("Found matching game: %s", game)

        # Be polite - add delay between requests
        if page_num < max_pages:
            time.sleep(delay_seconds)

    results = SearchResults(
        query_team_a=team_a,
        query_team_b=team_b,
        games=matching_games,
        pages_searched=pages_searched,
        total_games_scanned=total_scanned,
    )
    logger.info("Search complete: %s", results)
    return results
def list_available_teams() -> list[str]:
    """
    Return every known team name, alphabetically sorted.

    Useful for populating dropdown menus or autocomplete in the UI.

    Returns:
        Sorted list of team names (title-cased from the lowercase index keys)
    """
    index = _build_team_index()
    return sorted(name.title() for name in index)