| | from typing import Optional |
| | from src.repository.common import get_connection |
| | import pandas as pd |
| |
|
| | def load_model_data(limit: Optional[int] = None) -> pd.DataFrame: |
| | """ |
| | Load model data from Postgres |
| | """ |
| | with get_connection() as conn: |
| | query = "SELECT * FROM ml.model_data_m_view" |
| |
|
| | if limit is not None: |
| | query += f" LIMIT {limit}" |
| | |
| | data = pd.read_sql_query(query, conn) |
| | |
| | |
| | data = _clean_data(data) |
| |
|
| | return data |
| |
|
| | def _clean_data(data: pd.DataFrame) -> pd.DataFrame: |
| | """ |
| | Clean the data by removing unnecessary columns and rows with null values |
| | and filtering based on specific criteria. |
| | """ |
| | |
| | columns_to_drop = [ |
| | 'match_id', 'date', |
| | 'tournament_name', 'tournament_location', |
| | 'winner_name', 'loser_name', |
| | 'min_winner', 'min_loser', |
| | 'avg_winner', 'avg_loser', |
| | 'max_winner', 'max_loser' |
| | ] |
| | data.drop(columns=columns_to_drop, inplace=True, errors='ignore') |
| |
|
| | |
| | data = data[data['victory_type'] == 'Completed'] |
| | data.drop(columns='victory_type', inplace=True, errors='ignore') |
| | |
| | |
| | data = data[data['tournament_series'].isin(['Grand Slam', 'Masters 1000', 'Masters', 'Masters Cup', 'ATP500', 'ATP250'])] |
| |
|
| | |
| | data.dropna(inplace=True) |
| |
|
| | |
| | data.reset_index(drop=True, inplace=True) |
| |
|
| | return data |