import os
import logging
from datetime import datetime
from typing import Dict, List, Tuple, Optional

import requests
import numpy as np
import pandas as pd
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session

from src.entity.match import Match, RawMatch
from src.entity.odds import Odds
from src.entity.player import Player
from src.repository import match_repo
from src.jobs.player import schedule_player_details

# Set up logging
logger = logging.getLogger(__name__)

# Directory where the yearly spreadsheets are stored.
_DATA_DIR = "./data/atp"

# Upper bound (seconds) for connecting / waiting on the download server, so a
# stalled connection cannot block the job forever.
_DOWNLOAD_TIMEOUT_SECONDS = 30

# Raw-match key -> Match attribute it is copied to.
_MATCH_FIELDS = (
    ("Date", "date"),
    ("Comment", "comment"),
    ("WRank", "winner_rank"),
    ("WPts", "winner_points"),
    ("LRank", "loser_rank"),
    ("LPts", "loser_points"),
    ("Tournament", "tournament_name"),
    ("Series", "tournament_series"),
    ("Surface", "tournament_surface"),
    ("Court", "tournament_court"),
    ("Round", "tournament_round"),
    ("Location", "tournament_location"),
)


def get_match(raw_match: Dict) -> Match:
    """
    Parse a raw match

    Args:
        raw_match (Dict): Raw match record. Keys that are absent are stored
            as None on the resulting match.

    Returns:
        Match: A new match carrying the date, comment, ranking and
            tournament fields of the raw record.
    """
    match = Match()
    for raw_key, attribute in _MATCH_FIELDS:
        setattr(match, attribute, raw_match.get(raw_key))

    return match


def get_all_odds(raw_match: Dict) -> List[Odds]:
    """
    Parse the odds data from the raw match

    Every key ending in "W" or "L" is treated as an odds column whose prefix
    is the bookmaker name (e.g. "B365W" / "B365L" -> bookmaker "B365").

    Args:
        raw_match (Dict): Raw match record.

    Returns:
        List[Odds]: One entry per bookmaker, ordered by bookmaker name. When
            only one side of a bookmaker's odds is present, the missing side
            is None instead of raising KeyError.
    """
    # Non-string keys and the bare keys "W"/"L" (which would yield an empty
    # bookmaker name) are ignored. Sorting makes the output deterministic.
    bookmakers = sorted(
        {
            key[:-1]
            for key in raw_match
            if isinstance(key, str)
            and len(key) > 1
            and key.endswith(("W", "L"))
        }
    )

    all_odds = []
    for bookmaker in bookmakers:
        odds = Odds()
        odds.bookmaker = bookmaker
        # .get: a partially populated record may hold only one of the two
        # columns for a bookmaker.
        odds.winner = raw_match.get(f"{bookmaker}W")
        odds.loser = raw_match.get(f"{bookmaker}L")
        all_odds.append(odds)

    return all_odds


def get_players(raw_match: Dict) -> Tuple[Player, Player]:
    """
    Parse the players data from the raw match

    Args:
        raw_match (Dict): Raw match record; the "Winner" and "Loser" keys are
            read.

    Returns:
        Tuple[Player, Player]: The (winner, loser) players, built from their
            names only.
    """
    winner = Player(name=raw_match.get("Winner"))
    loser = Player(name=raw_match.get("Loser"))

    return winner, loser


def parse_raw_match(raw_match: Dict) -> Match:
    """
    Parse a raw match and odds

    Args:
        raw_match (Dict): Raw match record.

    Returns:
        Match: The parsed match with its odds, winner and loser attached.
    """
    match = get_match(raw_match)
    winner, loser = get_players(raw_match)

    match.odds = get_all_odds(raw_match)
    match.winner = winner
    match.loser = loser

    return match


def parse_raw_matches(raw_matches: List[Dict]) -> List[Match]:
    """
    Parse a list of raw matches

    Args:
        raw_matches (List[Dict]): Raw match records.

    Returns:
        List[Match]: Parsed matches, in input order.
    """
    return [parse_raw_match(raw_match) for raw_match in raw_matches]


def insert_new_match(db: Session, raw_match: Dict, on_conflict_do_nothing: bool = False) -> Match:
    """
    Insert a new match into the database

    Args:
        db (Session): Database session used for the insert. It is rolled back
            whenever the insert fails.
        raw_match (Dict): Raw match record to parse and insert.
        on_conflict_do_nothing (bool): When True, an IntegrityError is
            swallowed and the parsed match is returned as is.

    Returns:
        Match: The parsed match.

    Raises:
        IntegrityError: If the insert violates a constraint and
            on_conflict_do_nothing is False.
        Exception: Any other error raised by the insert is re-raised after
            the rollback.
    """
    match = parse_raw_match(raw_match)

    try:
        match_repo.insert_match(db, match)
    except IntegrityError as e:
        if on_conflict_do_nothing:
            logger.debug(f"Match already exists: {match.date}")
            db.rollback()
            return match

        # Log the error and re-raise
        logger.error(f"Error inserting match: {e}")
        db.rollback()
        raise
    except Exception as e:
        # Log the error and re-raise
        logger.error(f"Error inserting match: {e}")
        db.rollback()
        raise

    # Schedule tasks to fetch player details
    for player in (match.winner, match.loser):
        if _should_fetch_details(player):
            schedule_player_details(player.name)

    return match


def insert_batch_matches(db: Session, raw_matches: List[Dict], on_conflict_do_nothing: bool = False) -> Dict:
    """
    Insert several matches, one at a time

    Args:
        db (Session): Database session used for the inserts.
        raw_matches (List[Dict]): Raw match records. RawMatch instances are
            also accepted and are dumped to a dict (unset fields excluded)
            before being parsed.
        on_conflict_do_nothing (bool): Forwarded to insert_new_match.

    Returns:
        Dict: {'matches': matches that have an id after the insert,
            'nb_errors': number of inserts that raised IntegrityError}.

    Raises:
        Exception: Errors other than IntegrityError are not caught here and
            interrupt the batch.
    """
    matches = []
    nb_errors = 0

    for raw_match in raw_matches:
        if isinstance(raw_match, RawMatch):
            raw_match = raw_match.model_dump(exclude_unset=True)

        try:
            match = insert_new_match(
                db=db,
                raw_match=raw_match,
                on_conflict_do_nothing=on_conflict_do_nothing,
            )
        except IntegrityError as e:
            nb_errors += 1
            logger.error(f"Error inserting match: {e}")
            continue

        # A match returned without an id was not inserted (conflict ignored).
        if match.id is not None:
            matches.append(match)

    logger.info(f"Number of matches inserted: {len(matches)}")
    if nb_errors > 0:
        logger.warning(f"Number of errors: {nb_errors}")

    return {'matches': matches, 'nb_errors': nb_errors}


def _should_fetch_details(player: Player) -> bool:
    """
    Check if player details should be fetched

    Returns:
        bool: True when the player has no tennis_id or no caracteristics.
    """
    return player.tennis_id is None or player.caracteristics is None


def fetch_raw_data(year: Optional[int] = None) -> None:
    """
    Fetch data from tennis-data.co.uk for a given year and save it to a file

    The spreadsheet is stored as ./data/atp/<year>.xlsx. A file that already
    exists is kept for past years and downloaded again for the current year.

    Args:
        year (int, optional): Year to retrieve. If None, fetch current year data.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection errors or timeouts.
        OSError: If the file cannot be written.
    """
    current_year = datetime.now().year
    if not year:
        year = current_year

    filename = f"{year}.xlsx"
    file_path = f"{_DATA_DIR}/{filename}"

    # Check if the file already exists
    if os.path.exists(file_path) and year != current_year:
        logger.info(f"File {file_path} already exists. Skipping download.")
        return

    logger.info(f"Fetching data from tennis-data.co.uk for year {year}")

    url = f"http://www.tennis-data.co.uk/{year}/{filename}"

    os.makedirs(_DATA_DIR, exist_ok=True)

    # Download to a temporary sibling file and move it into place only once
    # complete: an interrupted download must not leave a truncated file that
    # the "already exists" check above would later accept.
    partial_path = f"{file_path}.part"
    try:
        with requests.get(url, stream=True, timeout=_DOWNLOAD_TIMEOUT_SECONDS) as response:
            # Check response status code
            response.raise_for_status()

            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)

        os.replace(partial_path, file_path)
    finally:
        # Only present when the download failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    logger.info(f"Data fetched from {url} 👍 and saved to {file_path}")


def get_cleaned_data(year: Optional[int] = None) -> pd.DataFrame:
    """
    Load the spreadsheet of a given year and clean it

    Args:
        year (int, optional): Year to load. If None, load current year data.

    Returns:
        pd.DataFrame: Rows with both rankings present, missing set counts
            filled with 0, player names stripped of surrounding whitespace
            and remaining missing values replaced with None.
    """
    if not year:
        year = datetime.now().year

    df = pd.read_excel(f'./data/atp/{year}.xlsx')

    # Remove rows where LRank or WRank is NaN. The explicit copy makes the
    # column assignments below act on an independent frame.
    df = df.dropna(subset=['LRank', 'WRank']).copy()

    for sets_column in ('Lsets', 'Wsets'):
        df[sets_column] = df[sets_column].fillna(0)

    # Strip whitespace from 'winner' and 'loser' columns
    for name_column in ('Winner', 'Loser'):
        df[name_column] = df[name_column].str.strip()

    # Replace NaN values with None
    df = df.replace({np.nan: None})
    df = df.where(pd.notnull(df), None)

    return df