"""
Web scraper for finding college football games on nfl-video.com.

This module uses team-specific pages for efficient searching:
1. Builds an index of team names to their dedicated pages
2. Fetches games directly from the team's page
3. Optionally filters by opponent (team_b)
"""
import logging
import re
import time
from typing import Optional

import requests
from bs4 import BeautifulSoup

from .models import GameResult, SearchResults

# Module-level logger, named after this module per logging convention
logger = logging.getLogger(__name__)

# Base URL for college football section
BASE_URL = "https://nfl-video.com/cfb"

# Main page with team links in sidebar
MAIN_PAGE_URL = "https://nfl-video.com/cfb/ncaa_college_football_highlights_games_replay/ncaa_college_football_full_game_replays/2"

# User agent to avoid being blocked
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

# Delay between page requests to be polite to the server
REQUEST_DELAY_SECONDS = 0.5

# Cache for team index (team name -> URL mapping); None until first build
_team_index_cache: Optional[dict[str, str]] = None  # pylint: disable=invalid-name
def _fetch_page(url: str) -> Optional[BeautifulSoup]:
    """
    Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to retrieve.

    Returns:
        Parsed document, or None when the request fails for any reason.
    """
    logger.debug("Fetching URL: %s", url)
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Failed to fetch %s: %s", url, exc)
        return None
    return BeautifulSoup(resp.text, "html.parser")
def _build_team_index() -> dict[str, str]:
    """
    Scrape the main page sidebar and map team names to their page URLs.

    The result is memoized in a module-level cache, so the main page is
    fetched at most once per process (a failed fetch is not cached).

    Returns:
        Dictionary keyed by lowercase team name, valued by absolute URL.
        Empty dictionary when the main page could not be fetched.
    """
    global _team_index_cache  # pylint: disable=global-statement
    if _team_index_cache is not None:
        logger.debug("Using cached team index with %d teams", len(_team_index_cache))
        return _team_index_cache

    logger.info("Building team index from main page...")
    soup = _fetch_page(MAIN_PAGE_URL)
    if soup is None:
        logger.error("Failed to fetch main page for team index")
        return {}

    index: dict[str, str] = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        label = anchor.get_text(strip=True)
        # Team pages follow the pattern /cfb/sec_football/alabama_crimson_tide_football/59
        if not label or "/cfb/" not in href or "_football/" not in href:
            continue
        # Exclude the main category / replay listing links
        if "ncaa_college_football" in href or "replay" in href.lower():
            continue
        # Normalize relative links to absolute URLs
        if not href.startswith("http"):
            href = f"https://nfl-video.com{href}"
        # Lowercase keys give case-insensitive lookup; first occurrence wins
        key = label.lower().strip()
        if key and key not in index:
            index[key] = href
            logger.debug("Indexed team: %s -> %s", label, href)

    logger.info("Built team index with %d teams", len(index))
    _team_index_cache = index
    return index
def get_team_index() -> dict[str, str]:
    """
    Public accessor for the team index (handy for debugging/listing teams).

    Returns:
        Dictionary mapping team names to their page URLs.
    """
    return _build_team_index()
def clear_team_index_cache() -> None:
    """Reset the cached team index so the next lookup re-scrapes (useful for testing)."""
    global _team_index_cache  # pylint: disable=global-statement
    _team_index_cache = None
    logger.debug("Cleared team index cache")
def _find_team_url(team_name: str) -> Optional[str]:
    """
    Resolve a team name to its dedicated page URL.

    Matching is case-insensitive and falls back to substring containment
    in either direction, so for example:
    - "Ohio State" matches "ohio state"
    - "Texas A&M" matches "texas a&m"
    - "OSU" would likely not match (that would need alias support)

    Args:
        team_name: Team name to look up.

    Returns:
        URL of the team's page, or None when no index entry matches.
    """
    index = _build_team_index()
    needle = team_name.lower().strip()

    # Prefer an exact hit before trying fuzzy containment
    exact = index.get(needle)
    if exact is not None:
        return exact

    for candidate, url in index.items():
        if needle in candidate or candidate in needle:
            logger.debug("Fuzzy matched '%s' to '%s'", team_name, candidate)
            return url

    logger.warning("Team not found in index: %s", team_name)
    return None
| def _parse_game_title(title: str) -> dict: | |
| """ | |
| Parse a game title to extract team names, year, and event. | |
| Example titles: | |
| "Ohio State vs Oregon Football 2024 Big Ten Championship Full Game Replay" | |
| "Alabama vs. Oklahoma Football December 19, 2025 CFP First Round Full Game Replay" | |
| Args: | |
| title: Full game title string | |
| Returns: | |
| Dictionary with keys: team_a, team_b, year, event (some may be None) | |
| """ | |
| result = {"team_a": None, "team_b": None, "year": None, "event": None} | |
| # Clean up the title | |
| title = title.strip() | |
| # Pattern: "Team A vs Team B Football [Date] [Event] Full Game Replay" | |
| # Handle both "vs" and "vs." (with period) | |
| vs_match = re.match(r"^(.+?)\s+vs\.?\s+(.+?)\s+Football\s+", title, re.IGNORECASE) | |
| if vs_match: | |
| result["team_a"] = vs_match.group(1).strip() | |
| result["team_b"] = vs_match.group(2).strip() | |
| # Extract year (4-digit number) | |
| year_match = re.search(r"\b(20\d{2})\b", title) | |
| if year_match: | |
| result["year"] = int(year_match.group(1)) | |
| # Extract event - everything between the date and "Full Game Replay" | |
| # Format: "Football [Month Day, Year] [Event] Full Game Replay" | |
| event_match = re.search(r"Football\s+\w+\s+\d{1,2},?\s+20\d{2}\s+(.+?)\s+Full Game Replay", title, re.IGNORECASE) | |
| if event_match: | |
| event = event_match.group(1).strip() | |
| if event and event.lower() not in ["full", "game", "replay"]: | |
| result["event"] = event | |
| return result | |
def _extract_games_from_page(soup: BeautifulSoup, filter_team: Optional[str] = None) -> list[GameResult]:
    """
    Pull game listings out of a parsed page.

    Args:
        soup: Parsed page HTML.
        filter_team: When given, only games whose parsed matchup mentions
            this team are kept (team pages can include unrelated sidebar games).

    Returns:
        GameResult objects for every distinct game link found on the page.
    """
    games: list[GameResult] = []
    seen_urls: set[str] = set()

    for anchor in soup.find_all("a", href=True):
        title = anchor.get_text(strip=True)
        # Game links always carry the replay suffix...
        if "Full Game Replay" not in title:
            continue
        # ...and describe a matchup with "vs" or "vs."
        lowered = title.lower()
        if " vs " not in lowered and " vs. " not in lowered:
            continue

        url = anchor["href"]
        if not url.startswith("http"):
            url = f"https://nfl-video.com{url}"
        # Pages repeat links; keep only the first occurrence of each URL
        if url in seen_urls:
            continue

        parsed = _parse_game_title(title)
        if not (parsed["team_a"] and parsed["team_b"]):
            continue
        if filter_team:
            wanted = filter_team.lower()
            if wanted not in parsed["team_a"].lower() and wanted not in parsed["team_b"].lower():
                continue

        # Thumbnail, if the link's parent element wraps an <img>
        thumb = None
        container = anchor.find_parent()
        if container:
            img = container.find("img")
            if img and img.get("src"):
                thumb = img["src"]
                if not thumb.startswith("http"):
                    thumb = f"https://nfl-video.com{thumb}"

        game = GameResult(
            title=title,
            team_a=parsed["team_a"],
            team_b=parsed["team_b"],
            url=url,
            thumbnail_url=thumb,
            year=parsed["year"],
            event=parsed["event"],
        )
        seen_urls.add(url)
        games.append(game)
        logger.debug("Found game: %s", game)

    return games
def _team_matches(game: GameResult, team_name: str) -> bool:
    """
    Report whether a game involves the given team.

    Args:
        game: GameResult to inspect.
        team_name: Name to look for (case-insensitive substring match).

    Returns:
        True when the name appears in either team_a or team_b.
    """
    needle = team_name.lower()
    return any(needle in side.lower() for side in (game.team_a, game.team_b))
def search_games(
    team_a: str,
    team_b: Optional[str] = None,
    max_pages: int = 5,
    delay_seconds: float = REQUEST_DELAY_SECONDS,
) -> SearchResults:
    """
    Search for college football games by team name(s).

    Uses team-specific pages for efficient searching:
    1. Looks up team_a's dedicated page in the team index
    2. Fetches all games from that team's page
    3. If team_b specified, filters to only games against that opponent

    Args:
        team_a: Primary team name to search for (required)
        team_b: Optional opponent name - if provided, only games against this team are returned
        max_pages: Maximum number of pages to search on the team's page (default 5)
        delay_seconds: Delay between page requests to avoid rate limiting

    Returns:
        SearchResults object containing matching games and search metadata.
        ``pages_searched`` reflects the number of pages actually fetched
        successfully, which may be fewer than ``max_pages``.

    Example:
        # Find all Alabama games
        results = search_games("Alabama")
        # Find Alabama vs Georgia specifically
        results = search_games("Alabama", "Georgia")
    """
    logger.info("Searching for games: team_a='%s', team_b='%s'", team_a, team_b)

    # Find the team's dedicated page URL
    team_url = _find_team_url(team_a)
    if team_url is None:
        logger.warning("Could not find team page for '%s', returning empty results", team_a)
        return SearchResults(
            query_team_a=team_a,
            query_team_b=team_b,
            games=[],
            pages_searched=0,
            total_games_scanned=0,
        )
    logger.info("Found team page for '%s': %s", team_a, team_url)

    matching_games: list[GameResult] = []
    total_scanned = 0
    # BUG FIX: previously the results always reported pages_searched=max_pages,
    # even when a failed fetch stopped the loop early. Count actual pages.
    pages_searched = 0

    for page_num in range(1, max_pages + 1):
        # Team pages use same pagination pattern as main: base URL, then base-2, base-3, etc.
        page_url = team_url if page_num == 1 else f"{team_url}-{page_num}"
        logger.info("Searching page %d/%d: %s", page_num, max_pages, page_url)
        soup = _fetch_page(page_url)
        if soup is None:
            logger.warning("Failed to fetch page %d, stopping search", page_num)
            break
        pages_searched += 1

        # Extract games, filtering for the team (team pages sometimes include unrelated sidebar games)
        page_games = _extract_games_from_page(soup, filter_team=team_a)
        total_scanned += len(page_games)

        for game in page_games:
            # If team_b specified, must also match team_b
            if team_b and not _team_matches(game, team_b):
                continue
            # Avoid duplicates (same URL can appear on multiple pages)
            if not any(g.url == game.url for g in matching_games):
                matching_games.append(game)
                logger.info("Found matching game: %s", game)

        # Be polite - add delay between requests
        if page_num < max_pages:
            time.sleep(delay_seconds)

    results = SearchResults(
        query_team_a=team_a,
        query_team_b=team_b,
        games=matching_games,
        pages_searched=pages_searched,
        total_games_scanned=total_scanned,
    )
    logger.info("Search complete: %s", results)
    return results
def list_available_teams() -> list[str]:
    """
    Return every known team name, sorted alphabetically.

    Useful for populating dropdown menus or autocomplete in the UI.
    The index stores lowercase keys, so names are re-title-cased here.

    Returns:
        Sorted list of team names.
    """
    return sorted(name.title() for name in _build_team_index())