# cfb40 / src/source_finding/scraper.py
# Author: andytaylor-smg — initial commit "first bite of the app" (bb3e8ea)
"""
Web scraper for finding college football games on nfl-video.com.
This module uses team-specific pages for efficient searching:
1. Builds an index of team names to their dedicated pages
2. Fetches games directly from the team's page
3. Optionally filters by opponent (team_b)
"""
import logging
import re
import time
from typing import Optional
import requests
from bs4 import BeautifulSoup
from .models import GameResult, SearchResults
logger = logging.getLogger(__name__)
# Base URL for college football section
BASE_URL = "https://nfl-video.com/cfb"
# Main page with team links in sidebar
MAIN_PAGE_URL = "https://nfl-video.com/cfb/ncaa_college_football_highlights_games_replay/ncaa_college_football_full_game_replays/2"
# User agent to avoid being blocked
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Delay between page requests to be polite to the server
REQUEST_DELAY_SECONDS = 0.5
# Cache for team index (team name -> URL mapping)
_team_index_cache: Optional[dict[str, str]] = None # pylint: disable=invalid-name
def _fetch_page(url: str) -> Optional[BeautifulSoup]:
    """
    Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to retrieve

    Returns:
        Parsed document, or None when the request failed
    """
    logger.debug("Fetching URL: %s", url)
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Failed to fetch %s: %s", url, exc)
        return None
    return BeautifulSoup(response.text, "html.parser")
def _build_team_index() -> dict[str, str]:
    """
    Construct (and memoize) a mapping of team names to their page URLs.

    The main listing page's sidebar links each team's dedicated page;
    those anchors are harvested here. Results are cached module-wide.

    Returns:
        Dictionary keyed by lowercase team name, valued by absolute URL
    """
    global _team_index_cache  # pylint: disable=global-statement
    if _team_index_cache is not None:
        logger.debug("Using cached team index with %d teams", len(_team_index_cache))
        return _team_index_cache

    logger.info("Building team index from main page...")
    soup = _fetch_page(MAIN_PAGE_URL)
    if soup is None:
        logger.error("Failed to fetch main page for team index")
        return {}

    index: dict[str, str] = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        label = anchor.get_text(strip=True)
        # Team pages follow a pattern like /cfb/sec_football/alabama_crimson_tide_football/59
        if "/cfb/" not in href or "_football/" not in href or not label:
            continue
        # Drop the broad category / replay listing links
        if "ncaa_college_football" in href or "replay" in href.lower():
            continue
        # Normalize relative links to absolute URLs
        if not href.startswith("http"):
            href = f"https://nfl-video.com{href}"
        # Lowercase key enables case-insensitive lookup; first hit wins
        key = label.lower().strip()
        if key and key not in index:
            index[key] = href
            logger.debug("Indexed team: %s -> %s", label, href)

    logger.info("Built team index with %d teams", len(index))
    _team_index_cache = index
    return index
def get_team_index() -> dict[str, str]:
    """
    Return the team index (public wrapper, useful for debugging or listing teams).

    Returns:
        Dictionary mapping team names to their page URLs
    """
    return _build_team_index()
def clear_team_index_cache() -> None:
    """Drop the memoized team index so the next lookup rebuilds it (testing aid)."""
    global _team_index_cache  # pylint: disable=global-statement
    _team_index_cache = None
    logger.debug("Cleared team index cache")
def _find_team_url(team_name: str) -> Optional[str]:
    """
    Resolve a team name to its dedicated page URL.

    Lookup is case-insensitive and falls back to substring matching:
    - "Ohio State" matches "ohio state"
    - "OSU" might not match (would need alias support)
    - "Texas A&M" matches "texas a&m"

    Args:
        team_name: Team name to look up

    Returns:
        URL of the team's page, or None when nothing matches
    """
    index = _build_team_index()
    needle = team_name.lower().strip()

    # Prefer an exact hit before trying anything fuzzy
    exact = index.get(needle)
    if exact is not None:
        return exact

    # Substring match in either direction handles partial queries
    for indexed_name, url in index.items():
        if needle in indexed_name or indexed_name in needle:
            logger.debug("Fuzzy matched '%s' to '%s'", team_name, indexed_name)
            return url

    logger.warning("Team not found in index: %s", team_name)
    return None
def _parse_game_title(title: str) -> dict:
"""
Parse a game title to extract team names, year, and event.
Example titles:
"Ohio State vs Oregon Football 2024 Big Ten Championship Full Game Replay"
"Alabama vs. Oklahoma Football December 19, 2025 CFP First Round Full Game Replay"
Args:
title: Full game title string
Returns:
Dictionary with keys: team_a, team_b, year, event (some may be None)
"""
result = {"team_a": None, "team_b": None, "year": None, "event": None}
# Clean up the title
title = title.strip()
# Pattern: "Team A vs Team B Football [Date] [Event] Full Game Replay"
# Handle both "vs" and "vs." (with period)
vs_match = re.match(r"^(.+?)\s+vs\.?\s+(.+?)\s+Football\s+", title, re.IGNORECASE)
if vs_match:
result["team_a"] = vs_match.group(1).strip()
result["team_b"] = vs_match.group(2).strip()
# Extract year (4-digit number)
year_match = re.search(r"\b(20\d{2})\b", title)
if year_match:
result["year"] = int(year_match.group(1))
# Extract event - everything between the date and "Full Game Replay"
# Format: "Football [Month Day, Year] [Event] Full Game Replay"
event_match = re.search(r"Football\s+\w+\s+\d{1,2},?\s+20\d{2}\s+(.+?)\s+Full Game Replay", title, re.IGNORECASE)
if event_match:
event = event_match.group(1).strip()
if event and event.lower() not in ["full", "game", "replay"]:
result["event"] = event
return result
def _extract_games_from_page(soup: BeautifulSoup, filter_team: Optional[str] = None) -> list[GameResult]:
    """
    Extract game listings from a parsed page.

    Args:
        soup: BeautifulSoup object of the page
        filter_team: Optional team name to filter for (for team-specific pages that may include unrelated games)

    Returns:
        List of GameResult objects found on the page
    """
    games: list[GameResult] = []
    # Track seen URLs in a set: O(1) duplicate detection instead of the
    # previous O(n) rescan of `games` for every candidate link.
    seen_urls: set[str] = set()
    # Hoist the lowercase conversion out of the loop
    filter_lower = filter_team.lower() if filter_team else None

    for link in soup.find_all("a", href=True):
        text = link.get_text(strip=True)
        # Skip links that don't look like game titles
        if "Full Game Replay" not in text:
            continue
        # Skip if it doesn't have "vs" or "vs." (not a game matchup)
        text_lower = text.lower()
        if " vs " not in text_lower and " vs. " not in text_lower:
            continue

        href = link["href"]
        # Make sure it's a full URL
        if not href.startswith("http"):
            href = f"https://nfl-video.com{href}"
        # Skip if we've already seen this URL (duplicates on page)
        if href in seen_urls:
            continue

        # Parse the title for metadata; both teams are required
        parsed = _parse_game_title(text)
        if not (parsed["team_a"] and parsed["team_b"]):
            continue
        # If filter_team specified, skip games that don't involve that team
        if filter_lower and filter_lower not in parsed["team_a"].lower() and filter_lower not in parsed["team_b"].lower():
            continue

        # Best-effort thumbnail: look for an <img> under the link's immediate parent
        thumbnail_url = None
        parent = link.find_parent()
        if parent:
            img = parent.find("img")
            if img and img.get("src"):
                thumbnail_url = img["src"]
                if not thumbnail_url.startswith("http"):
                    thumbnail_url = f"https://nfl-video.com{thumbnail_url}"

        game = GameResult(
            title=text,
            team_a=parsed["team_a"],
            team_b=parsed["team_b"],
            url=href,
            thumbnail_url=thumbnail_url,
            year=parsed["year"],
            event=parsed["event"],
        )
        games.append(game)
        seen_urls.add(href)
        logger.debug("Found game: %s", game)
    return games
def _team_matches(game: GameResult, team_name: str) -> bool:
    """
    Determine whether a given team plays in a game.

    Args:
        game: GameResult to inspect
        team_name: Team name to search for

    Returns:
        True if the team appears in either team_a or team_b
    """
    needle = team_name.lower()
    return any(needle in side.lower() for side in (game.team_a, game.team_b))
def search_games(
    team_a: str,
    team_b: Optional[str] = None,
    max_pages: int = 5,
    delay_seconds: float = REQUEST_DELAY_SECONDS,
) -> SearchResults:
    """
    Search for college football games by team name(s).

    Uses team-specific pages for efficient searching:
    1. Looks up team_a's dedicated page in the team index
    2. Fetches all games from that team's page
    3. If team_b specified, filters to only games against that opponent

    Args:
        team_a: Primary team name to search for (required)
        team_b: Optional opponent name - if provided, only games against this team are returned
        max_pages: Maximum number of pages to search on the team's page (default 5)
        delay_seconds: Delay between page requests to avoid rate limiting

    Returns:
        SearchResults object containing matching games and search metadata

    Example:
        # Find all Alabama games
        results = search_games("Alabama")
        # Find Alabama vs Georgia specifically
        results = search_games("Alabama", "Georgia")
    """
    logger.info("Searching for games: team_a='%s', team_b='%s'", team_a, team_b)

    # Find the team's dedicated page URL
    team_url = _find_team_url(team_a)
    if team_url is None:
        logger.warning("Could not find team page for '%s', returning empty results", team_a)
        return SearchResults(
            query_team_a=team_a,
            query_team_b=team_b,
            games=[],
            pages_searched=0,
            total_games_scanned=0,
        )
    logger.info("Found team page for '%s': %s", team_a, team_url)

    matching_games: list[GameResult] = []
    seen_urls: set[str] = set()  # O(1) duplicate check across pages
    total_scanned = 0
    # BUG FIX: count pages actually fetched; previously the result always
    # reported max_pages even when a failed fetch stopped the loop early.
    pages_searched = 0

    for page_num in range(1, max_pages + 1):
        # Team pages use same pagination pattern as main: base URL, then base-2, base-3, etc.
        page_url = team_url if page_num == 1 else f"{team_url}-{page_num}"
        logger.info("Searching page %d/%d: %s", page_num, max_pages, page_url)
        soup = _fetch_page(page_url)
        if soup is None:
            logger.warning("Failed to fetch page %d, stopping search", page_num)
            break
        pages_searched += 1

        # Extract games, filtering for the team (team pages sometimes include unrelated sidebar games)
        page_games = _extract_games_from_page(soup, filter_team=team_a)
        total_scanned += len(page_games)

        for game in page_games:
            # If team_b specified, must also match team_b
            if team_b and not _team_matches(game, team_b):
                continue
            # Avoid duplicates (same URL)
            if game.url in seen_urls:
                continue
            seen_urls.add(game.url)
            matching_games.append(game)
            logger.info("Found matching game: %s", game)

        # Be polite - add delay between requests
        if page_num < max_pages:
            time.sleep(delay_seconds)

    results = SearchResults(
        query_team_a=team_a,
        query_team_b=team_b,
        games=matching_games,
        pages_searched=pages_searched,
        total_games_scanned=total_scanned,
    )
    logger.info("Search complete: %s", results)
    return results
def list_available_teams() -> list[str]:
    """
    Return every known team name, alphabetically sorted.

    Useful for populating dropdown menus or autocomplete in the UI.

    Returns:
        Sorted list of team names (title-cased from the lowercase index keys)
    """
    index = _build_team_index()
    return sorted(name.title() for name in index)