tennis-ml / src /repository /model_data.py
SlimG's picture
add a cleaning step in phase with the new data model
e39c0ca
from typing import Optional
from src.repository.common import get_connection
import pandas as pd
def load_model_data(limit: Optional[int] = None) -> pd.DataFrame:
"""
Load model data from Postgres
"""
with get_connection() as conn:
query = "SELECT * FROM ml.model_data_m_view"
if limit is not None:
query += f" LIMIT {limit}"
data = pd.read_sql_query(query, conn)
# Clean the data
data = _clean_data(data)
return data
def _clean_data(data: pd.DataFrame) -> pd.DataFrame:
"""
Clean the data by removing unnecessary columns and rows with null values
and filtering based on specific criteria.
"""
# Drop unnecessary columns
columns_to_drop = [
'match_id', 'date',
'tournament_name', 'tournament_location',
'winner_name', 'loser_name',
'min_winner', 'min_loser',
'avg_winner', 'avg_loser',
'max_winner', 'max_loser'
]
data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
# Remove rows where 'victory_type' is not 'Completed'
data = data[data['victory_type'] == 'Completed']
data.drop(columns='victory_type', inplace=True, errors='ignore')
# Remove rows where 'tournament_series' is not in the specified list
data = data[data['tournament_series'].isin(['Grand Slam', 'Masters 1000', 'Masters', 'Masters Cup', 'ATP500', 'ATP250'])]
# Remove rows with null values
data.dropna(inplace=True)
# Reset the index
data.reset_index(drop=True, inplace=True)
return data