"""
Web scraper for finding college football games on nfl-video.com.

This module uses team-specific pages for efficient searching:
1. Builds an index of team names to their dedicated pages
2. Fetches games directly from the team's page
3. Optionally filters by opponent (team_b)
"""

import logging
import re
import time
from typing import Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from .models import GameResult, SearchResults

logger = logging.getLogger(__name__)

# Base URL for college football section
BASE_URL = "https://nfl-video.com/cfb"

# Main page with team links in sidebar
MAIN_PAGE_URL = (
    "https://nfl-video.com/cfb/ncaa_college_football_highlights_games_replay"
    "/ncaa_college_football_full_game_replays/2"
)

# User agent to avoid being blocked
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# Delay between page requests to be polite to the server
REQUEST_DELAY_SECONDS = 0.5

# Site root against which relative links found in scraped pages are resolved
_SITE_ROOT = "https://nfl-video.com/"

# "Team A vs Team B Football ..." - handles both "vs" and "vs." (with period)
_VS_PATTERN = re.compile(r"^(.+?)\s+vs\.?\s+(.+?)\s+Football\s+", re.IGNORECASE)

# First 4-digit year of the form 20xx
_YEAR_PATTERN = re.compile(r"\b(20\d{2})\b")

# "Football [Month Day,] Year <event> Full Game Replay"; the month/day part is
# optional so that year-only titles ("Football 2024 Big Ten Championship ...")
# also yield an event.
_EVENT_PATTERN = re.compile(
    r"Football\s+(?:\w+\s+\d{1,2},?\s+)?20\d{2}\s+(.+?)\s+Full Game Replay",
    re.IGNORECASE,
)

# Cache for team index (team name -> URL mapping)
_team_index_cache: Optional[dict[str, str]] = None  # pylint: disable=invalid-name


def _absolute_url(href: str) -> str:
    """
    Resolve a link found on the site to an absolute URL.

    Absolute URLs are returned unchanged; relative and protocol-relative
    links are resolved against the site root.

    Args:
        href: Raw ``href``/``src`` attribute value

    Returns:
        Absolute URL string
    """
    return urljoin(_SITE_ROOT, href)


def _fetch_page(url: str) -> Optional[BeautifulSoup]:
    """
    Fetch a page and return its parsed HTML.

    Args:
        url: URL to fetch

    Returns:
        BeautifulSoup object or None if fetch failed
    """
    try:
        logger.debug("Fetching URL: %s", url)
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")
    except requests.RequestException as e:
        # Network errors and non-2xx responses are logged, not raised
        logger.error("Failed to fetch %s: %s", url, e)
        return None


def _build_team_index() -> dict[str, str]:
    """
    Build an index mapping team names to their dedicated page URLs.

    Scrapes the main page sidebar to find all team links. A successfully
    built index is cached at module level; a failed fetch is not cached, so
    the next call retries.

    Returns:
        Dictionary mapping team names (lowercase) to full URLs; empty if the
        main page could not be fetched
    """
    global _team_index_cache  # pylint: disable=global-statement

    if _team_index_cache is not None:
        logger.debug("Using cached team index with %d teams", len(_team_index_cache))
        return _team_index_cache

    logger.info("Building team index from main page...")

    soup = _fetch_page(MAIN_PAGE_URL)
    if soup is None:
        logger.error("Failed to fetch main page for team index")
        return {}

    team_index: dict[str, str] = {}

    for link in soup.find_all("a", href=True):
        href = link["href"]
        text = link.get_text(strip=True)

        # Team pages follow pattern like /cfb/sec_football/alabama_crimson_tide_football/59
        if not text or "/cfb/" not in href or "_football/" not in href:
            continue

        # Skip main category links
        if "ncaa_college_football" in href or "replay" in href.lower():
            continue

        # Store with lowercase key for case-insensitive lookup; first link wins
        team_key = text.lower().strip()
        if team_key and team_key not in team_index:
            full_url = _absolute_url(href)
            team_index[team_key] = full_url
            logger.debug("Indexed team: %s -> %s", text, full_url)

    logger.info("Built team index with %d teams", len(team_index))
    _team_index_cache = team_index
    return team_index


def get_team_index() -> dict[str, str]:
    """
    Get the team index (public API for debugging/listing teams).

    Returns:
        Dictionary mapping team names to their page URLs
    """
    return _build_team_index()


def clear_team_index_cache() -> None:
    """Clear the cached team index (useful for testing)."""
    global _team_index_cache  # pylint: disable=global-statement
    _team_index_cache = None
    logger.debug("Cleared team index cache")


def _find_team_url(team_name: str) -> Optional[str]:
    """
    Find the URL for a team's dedicated page.

    Performs fuzzy matching to handle variations like:
    - "Ohio State" matches "ohio state"
    - "OSU" might not match (would need alias support)
    - "Texas A&M" matches "texas a&m"

    Args:
        team_name: Team name to search for

    Returns:
        Team page URL or None if not found (including when the name is
        empty or whitespace-only)
    """
    team_lower = team_name.lower().strip()

    # An empty string is a substring of every name, so without this guard the
    # fuzzy match below would return an arbitrary (first-indexed) team.
    if not team_lower:
        logger.warning("Team not found in index: %s", team_name)
        return None

    team_index = _build_team_index()

    # Exact match first
    if team_lower in team_index:
        return team_index[team_lower]

    # Partial match - the search term contains, or is contained in, a team name.
    # The first hit in index order wins.
    for indexed_name, url in team_index.items():
        if team_lower in indexed_name or indexed_name in team_lower:
            logger.debug("Fuzzy matched '%s' to '%s'", team_name, indexed_name)
            return url

    logger.warning("Team not found in index: %s", team_name)
    return None


def _parse_game_title(title: str) -> dict:
    """
    Parse a game title to extract team names, year, and event.

    Example titles:
        "Ohio State vs Oregon Football 2024 Big Ten Championship Full Game Replay"
        "Alabama vs. Oklahoma Football December 19, 2025 CFP First Round Full Game Replay"

    Args:
        title: Full game title string

    Returns:
        Dictionary with keys: team_a, team_b, year, event (some may be None)
    """
    result = {"team_a": None, "team_b": None, "year": None, "event": None}

    # Clean up the title
    title = title.strip()

    # Pattern: "Team A vs Team B Football [Date] [Event] Full Game Replay"
    vs_match = _VS_PATTERN.match(title)
    if vs_match:
        result["team_a"] = vs_match.group(1).strip()
        result["team_b"] = vs_match.group(2).strip()

    # Extract year (first 4-digit 20xx number anywhere in the title)
    year_match = _YEAR_PATTERN.search(title)
    if year_match:
        result["year"] = int(year_match.group(1))

    # Extract event - everything between the date and "Full Game Replay"
    event_match = _EVENT_PATTERN.search(title)
    if event_match:
        event = event_match.group(1).strip()
        # Guard against capturing a fragment of the trailing boilerplate
        if event and event.lower() not in ("full", "game", "replay"):
            result["event"] = event

    return result


def _extract_games_from_page(soup: BeautifulSoup, filter_team: Optional[str] = None) -> list[GameResult]:
    """
    Extract game listings from a parsed page.

    Args:
        soup: BeautifulSoup object of the page
        filter_team: Optional team name to filter for (for team-specific pages
            that may include unrelated games)

    Returns:
        List of GameResult objects found on the page, in document order and
        without duplicate URLs
    """
    games: list[GameResult] = []
    seen_urls: set[str] = set()
    filter_lower = filter_team.lower() if filter_team else None

    for link in soup.find_all("a", href=True):
        text = link.get_text(strip=True)

        # Skip links that don't look like game titles
        if "Full Game Replay" not in text:
            continue

        # Skip if it doesn't have "vs" or "vs." (not a game matchup)
        text_lower = text.lower()
        if " vs " not in text_lower and " vs. " not in text_lower:
            continue

        # Make sure it's a full URL
        href = _absolute_url(link["href"])

        # Skip if we've already recorded a game for this URL (duplicates on page)
        if href in seen_urls:
            continue

        # Parse the title for metadata; both teams are required
        parsed = _parse_game_title(text)
        if not (parsed["team_a"] and parsed["team_b"]):
            continue

        # If filter_team specified, skip games that don't involve that team
        if (
            filter_lower
            and filter_lower not in parsed["team_a"].lower()
            and filter_lower not in parsed["team_b"].lower()
        ):
            continue

        # Try to find a thumbnail: first <img> under the link's direct parent
        thumbnail_url = None
        parent = link.find_parent()
        if parent:
            img = parent.find("img")
            if img and img.get("src"):
                thumbnail_url = _absolute_url(img["src"])

        game = GameResult(
            title=text,
            team_a=parsed["team_a"],
            team_b=parsed["team_b"],
            url=href,
            thumbnail_url=thumbnail_url,
            year=parsed["year"],
            event=parsed["event"],
        )
        # Only URLs that produced a game are marked seen, so a later link to
        # the same URL with a parseable title is still picked up.
        seen_urls.add(href)
        games.append(game)
        logger.debug("Found game: %s", game)

    return games


def _team_matches(game: GameResult, team_name: str) -> bool:
    """
    Check if a game involves a given team.

    Args:
        game: GameResult to check
        team_name: Team name to search for

    Returns:
        True if the team appears (case-insensitive substring) in either
        team_a or team_b
    """
    team_lower = team_name.lower()
    return team_lower in game.team_a.lower() or team_lower in game.team_b.lower()


def search_games(
    team_a: str,
    team_b: Optional[str] = None,
    max_pages: int = 5,
    delay_seconds: float = REQUEST_DELAY_SECONDS,
) -> SearchResults:
    """
    Search for college football games by team name(s).

    Uses team-specific pages for efficient searching:
    1. Looks up team_a's dedicated page in the team index
    2. Fetches all games from that team's page
    3. If team_b specified, filters to only games against that opponent

    Args:
        team_a: Primary team name to search for (required)
        team_b: Optional opponent name - if provided, only games against this
            team are returned
        max_pages: Maximum number of pages to search on the team's page (default 5)
        delay_seconds: Delay between page requests to avoid rate limiting

    Returns:
        SearchResults object containing matching games and search metadata.
        ``pages_searched`` is the number of pages actually fetched, which is
        lower than ``max_pages`` when a fetch fails and the search stops early.

    Example:
        # Find all Alabama games
        results = search_games("Alabama")

        # Find Alabama vs Georgia specifically
        results = search_games("Alabama", "Georgia")
    """
    logger.info("Searching for games: team_a='%s', team_b='%s'", team_a, team_b)

    # Find the team's dedicated page URL
    team_url = _find_team_url(team_a)
    if team_url is None:
        logger.warning("Could not find team page for '%s', returning empty results", team_a)
        return SearchResults(
            query_team_a=team_a,
            query_team_b=team_b,
            games=[],
            pages_searched=0,
            total_games_scanned=0,
        )

    logger.info("Found team page for '%s': %s", team_a, team_url)

    matching_games: list[GameResult] = []
    seen_urls: set[str] = set()
    total_scanned = 0
    pages_searched = 0

    # Fetch pages from the team's dedicated page
    for page_num in range(1, max_pages + 1):
        # Team pages use same pagination pattern as main: base URL, then base-2, base-3, etc.
        page_url = team_url if page_num == 1 else f"{team_url}-{page_num}"

        logger.info("Searching page %d/%d: %s", page_num, max_pages, page_url)

        soup = _fetch_page(page_url)
        if soup is None:
            logger.warning("Failed to fetch page %d, stopping search", page_num)
            break
        pages_searched += 1

        # Extract games, filtering for the team (team pages sometimes include unrelated sidebar games)
        page_games = _extract_games_from_page(soup, filter_team=team_a)
        total_scanned += len(page_games)

        for game in page_games:
            # If team_b specified, must also match team_b
            if team_b and not _team_matches(game, team_b):
                continue

            # Avoid duplicates (same URL) across pages
            if game.url in seen_urls:
                continue
            seen_urls.add(game.url)
            matching_games.append(game)
            logger.info("Found matching game: %s", game)

        # Be polite - add delay between requests (not after the final page)
        if page_num < max_pages:
            time.sleep(delay_seconds)

    results = SearchResults(
        query_team_a=team_a,
        query_team_b=team_b,
        games=matching_games,
        pages_searched=pages_searched,
        total_games_scanned=total_scanned,
    )

    logger.info("Search complete: %s", results)
    return results


def list_available_teams() -> list[str]:
    """
    Get a list of all available team names.

    Useful for populating dropdown menus or autocomplete in the UI.

    Returns:
        Sorted list of team names
    """
    team_index = _build_team_index()
    # The index stores lowercase keys, so the site's original casing is not
    # available; approximate display casing with str.title().
    return sorted(name.title() for name in team_index)