"""Polars expressions that derive the YpERA columns from pitch-type stats.

Every module-level name ending in an ``alias(...)`` is a lazily evaluated
``pl.Expr``; nothing is computed until :func:`compute_pera` applies the
expressions to a frame.
"""
import polars as pl

from grade_utils import (
    season_to_kwera_constant,
    pitch_season_to_avg_velo_mean,
    pitch_season_to_avg_velo_std,
    pitch_season_to_lg_avg_velo,
    pitch_season_to_lg_swstr,
    pitch_season_to_lg_ball,
    pitch_season_to_lg_gb,
    pitch_season_to_lg_iffb,
    pitch_season_to_ypera_mean,
    pitch_season_to_ypera_std,
    map_columns,
)

# Collapses the detailed pitch names onto the pitch groups used for YpERA.
pera_ball_kind = {
    'Fastball (4-seam)': 'Fastball (4-seam)',
    'Slider': 'Slider',
    'Curve': 'Curve',
    'Splitter': 'Splitter',
    'Sinker': 'Sinker',
    'Cutter': 'Cutter',
    'Changeup': 'Changeup',
    'Vertical Slider': 'Slider',
    'Palmball': 'Changeup',
    'Screwball': 'Curve',
    'Slurve': 'Curve',
    'Eephus': 'Curve',
    'Knuckleball': 'Curve',
    'Sweeper': 'Slider',
}

# `replace_strict` raises for any pitch name missing from `pera_ball_kind`.
pera_ball_kind_col = (
    pl.col('general_ballKind')
    .replace_strict(pera_ball_kind)
    .alias('YpERA Pitch')
)
# For when pitch data renames general_ballKind to ballKind (see `stats.py`).
pera_ball_kind_col_alt = (
    pl.col('ballKind')
    .replace_strict(pera_ball_kind)
    .alias('YpERA Pitch')
)

# Season -> kwERA constant, shared by the pitcher and league formulas.
# `Expr.replace` keeps the dtype of its input column (Polars >= 1.0), so the
# season is cast to Float64 first; otherwise the replacement values would be
# cast to the season's own dtype.
# NOTE(review): this matters if 'season' is an integer column and the
# constants are fractional -- neither is visible from this file; confirm.
# Seasons missing from the mapping still pass through unchanged, as before.
_kwera_constant = (
    pl.col('season')
    .cast(pl.Float64)
    .replace(season_to_kwera_constant)
)

# ---------------------------------------------------------------------------
# Pitcher-level expressions
# ---------------------------------------------------------------------------

# Velocity z-score. The mean/std come from grade_utils lookups via
# `map_columns`; an earlier version computed them in-frame with
# `.over('YpERA Pitch')`.
p_velo_z = (
    (pl.col('Avg Velo') - map_columns(pitch_season_to_avg_velo_mean))
    / map_columns(pitch_season_to_avg_velo_std)
).alias('Velo Z')
p_swstr = (
    (pl.col('SwStr%') * 0.7 + p_velo_z * 0.004 + 0.03)
    .clip(0, 1)
    .alias('xSwStr%')
)
p_k = (p_swstr * 1.7 + 0.02).clip(0, 1).alias('xK%')
p_bb = (pl.col('Ball%') * 0.8 - 0.2).clip(0, 1).alias('xBB%')
p_kwera = (_kwera_constant - 10 * (p_k - p_bb)).alias('kwERA')
# NOTE(review): GB% is weighted -3 here but -4 in `lg_gb_pu_adj` below.
# Both values are kept as found; confirm which one is intended.
p_gb_pu_adj = (
    pl.col('GB%') * -3 + pl.col('IFFB%') * -5 + 2
).alias('GB% IFFB% Adj')
pera = (p_kwera + p_gb_pu_adj).alias('YpERA')


def average(col):
    """Return an expression for the 'Count'-weighted mean of column ``col``."""
    return (pl.col('Count') * pl.col(col)).sum() / pl.sum('Count')


# ---------------------------------------------------------------------------
# League-level expressions
# ---------------------------------------------------------------------------

# League baselines are read from grade_utils lookups; an earlier version
# computed them in-frame as `average(<col>).over('YpERA Pitch')`.
lg_velo = map_columns(pitch_season_to_lg_avg_velo).alias('lg_velo')
lg_swstr_ = map_columns(pitch_season_to_lg_swstr).alias('lg_swstr')
lg_ball = map_columns(pitch_season_to_lg_ball).alias('lg_ball')
lg_gb = map_columns(pitch_season_to_lg_gb).alias('lg_gb')
lg_iffb = map_columns(pitch_season_to_lg_iffb).alias('lg_iffb')

# Reads the 'lg_velo' column, so it must be applied after `lg_velo`.
lg_velo_z = (
    (pl.col('lg_velo') - map_columns(pitch_season_to_avg_velo_mean))
    / map_columns(pitch_season_to_avg_velo_std)
).alias('lg_velo_z')
lg_swstr = (
    (lg_swstr_ * 0.7 + lg_velo_z * 0.004 + 0.03)
    .clip(0, 1)
    .alias('lg_xSwStr%')
)
lg_k = (lg_swstr * 1.7 + 0.02).clip(0, 1).alias('lg_xK%')
lg_bb = (lg_ball * 0.8 - 0.2).clip(0, 1).alias('lg_xBB%')
lg_kwera = (_kwera_constant - 10 * (lg_k - lg_bb)).alias('lg_kwERA')
# NOTE(review): -4 here vs -3 in `p_gb_pu_adj`; see the note there.
lg_gb_pu_adj = (lg_gb * -4 + lg_iffb * -5 + 2).alias('lg_gb_pu_adj')
lg_pera = (lg_kwera + lg_gb_pu_adj).alias('lg_YpERA')

# ---------------------------------------------------------------------------
# Shrinkage, grade and index
# ---------------------------------------------------------------------------

# Below 100 pitches, blend linearly towards the league value:
# weight Count/100 on the pitcher's YpERA, the remainder on lg_YpERA.
shrunk_pera = (
    pl.when(pl.col('Count') < 100)
    .then(
        pl.col('YpERA') * pl.col('Count') / 100
        + pl.col('lg_YpERA') * (1 - pl.col('Count') / 100)
    )
    .otherwise(pl.col('YpERA'))
    .alias('shrunk_YpERA')
)

# Scaling parameters for the grade come from grade_utils lookups; an earlier
# version took the mean/std of 'YpERA' over rows with Count >= 100.
mean_pera = map_columns(pitch_season_to_ypera_mean).alias('mean_YpERA')
std_pera = map_columns(pitch_season_to_ypera_std).alias('std_YpERA')

# Grade = 50 - 10 * z, so a lower shrunk YpERA yields a higher grade.
pera_grade = (
    ((pl.col('shrunk_YpERA') - mean_pera) / std_pera)
    .mul(-10)
    .add(50)
    .round()
    .cast(pl.Int32())
).alias('YpERA Grade')
# Index of the unshrunk YpERA against the league value, times 100.
pera_minus = (
    (pl.col('YpERA') / pl.col('lg_YpERA') * 100)
    .cast(pl.Int32())
    .alias('YpERA-')
)
# Grade clipped to [20, 80] and rescaled to [0, 1].
pera_grade_pctl = (
    (pera_grade.clip(20, 80) - 20) / 60
).alias('YpERA Grade_pctl')
# (200 - YpERA-) / 200, clipped to [0, 1].
pera_minus_pctl = (
    ((200 - pera_minus) / 200).clip(0, 1).alias('YpERA-_pctl')
)


def compute_pera(pitch_stats: pl.DataFrame) -> pl.DataFrame:
    """Add the YpERA columns to a frame of pitch stats.

    Args:
        pitch_stats: Frame containing either 'general_ballKind' or
            'ballKind', plus 'season', 'Count', 'Avg Velo', 'SwStr%',
            'Ball%', 'GB%' and 'IFFB%'. `map_columns` may read further
            columns; see grade_utils.

    Returns:
        The input frame with NaN replaced by 0 and these columns added:
        'YpERA Pitch', 'Velo Z', 'xSwStr%', 'xK%', 'xBB%', 'kwERA',
        'GB% IFFB% Adj', 'YpERA', the 'lg_*' league counterparts,
        'shrunk_YpERA', 'mean_YpERA', 'std_YpERA', 'YpERA Grade',
        'YpERA Grade_pctl', 'YpERA-' and 'YpERA-_pctl'.

    Raises:
        polars.exceptions.InvalidOperationError: If a pitch name is not a
            key of `pera_ball_kind` (raised by `replace_strict`).
    """
    # The pitch-name column is called either of two names (see `stats.py`).
    if 'general_ballKind' in pitch_stats.columns:
        ball_kind_col = pera_ball_kind_col
    else:
        ball_kind_col = pera_ball_kind_col_alt

    # Each `with_columns` step only sees columns created by earlier steps,
    # so the order below follows the `pl.col(...)` references above.
    pitch_stats = (
        pitch_stats
        .fill_nan(0)
        .with_columns(ball_kind_col)
        .with_columns(
            p_velo_z,
            p_swstr,
            p_k,
            p_bb,
            p_kwera,
            p_gb_pu_adj,
            pera,
        )
        .with_columns(
            lg_velo,
            lg_swstr_,
            lg_ball,
            lg_gb,
            lg_iffb,
        )
        .with_columns(
            lg_velo_z,
            lg_swstr,
            lg_k,
            lg_bb,
            lg_kwera,
            lg_gb_pu_adj,
            lg_pera,
        )
        .with_columns(shrunk_pera)
        .with_columns(
            mean_pera,
            std_pera,
        )
        .with_columns(pera_grade, pera_grade_pctl)
        # NOTE(review): kept as its own step as in the original; both
        # expressions only read columns that already exist at this point,
        # so the reason for the separate call is not evident from here.
        .with_columns(pera_minus, pera_minus_pctl)
    )
    return pitch_stats


def _main():
    """Debug harness: print one pitcher's 2025 rows, then open the debugger."""
    from datetime import date
    from data import data_df
    from stats import filter_data_by_date_and_game_kind, compute_pitch_stats

    _data = filter_data_by_date_and_game_kind(
        data=data_df,
        start_date=date(2021, 1, 1),
        end_date=date(2025, 12, 31),
        game_kind='Regular Season',
    )
    pitch_stats = compute_pitch_stats(
        data=_data,
        player_type='pitcher',
        pitch_class_type='specific',
        min_pitches=1,
        group_by_season=True,
    )
    pera_data = compute_pera(pitch_stats)

    wanted = [
        'season',
        'pitcher_name',
        'general_ballKind',
        'ballKind',
        'YpERA Pitch',
        'YpERA Grade',
        'YpERA-',
        'YpERA',
    ]
    # `compute_pera` treats 'general_ballKind' and 'ballKind' as
    # alternatives, so only select the ones that are actually present.
    cols = [name for name in wanted if name in pera_data.columns]
    print(
        pera_data
        .filter(
            pl.col('pitcher_name') == 'Miyagi Hiroya',
            pl.col('season') == 2025,
        )
        .sort('season')
        [cols]
    )
    # Drop into the debugger for interactive inspection of the locals above.
    breakpoint()


if __name__ == '__main__':
    _main()