npb_data_app / grade.py
patrickramos's picture
Add pseudo-percentiles to pERA grades for coloring
e19444f
import polars as pl
from grade_utils import (
season_to_kwera_constant,
pitch_season_to_avg_velo_mean,
pitch_season_to_avg_velo_std,
pitch_season_to_lg_avg_velo,
pitch_season_to_lg_swstr,
pitch_season_to_lg_ball,
pitch_season_to_lg_gb,
pitch_season_to_lg_iffb,
pitch_season_to_ypera_mean,
pitch_season_to_ypera_std,
map_columns
)
# Pitch types that keep their own 'YpERA Pitch' bucket (each maps to itself).
_ypera_base_pitches = (
    'Fastball (4-seam)',
    'Slider',
    'Curve',
    'Splitter',
    'Sinker',
    'Cutter',
    'Changeup',
)

# Pitch types that are folded into one of the base buckets above.
_ypera_folded_pitches = {
    'Vertical Slider': 'Slider',
    'Palmball': 'Changeup',
    'Screwball': 'Curve',
    'Slurve': 'Curve',
    'Eephus': 'Curve',
    'Knuckleball': 'Curve',
    'Sweeper': 'Slider',
}

# Raw pitch type -> pitch bucket used for the 'YpERA Pitch' column.
pera_ball_kind = {
    **{pitch: pitch for pitch in _ypera_base_pitches},
    **_ypera_folded_pitches,
}
# Polars expressions that build the 'YpERA Pitch' column by mapping each raw pitch type
# through `pera_ball_kind`. No `default` is passed to `replace_strict`, so (per the Polars
# docs) a value that is not covered by the mapping raises instead of passing through.
# `compute_pera` picks one of the two depending on which source column the frame has.
pera_ball_kind_col = pl.col('general_ballKind').replace_strict(pera_ball_kind).alias('YpERA Pitch')
# Same mapping, but reading the pitch type from 'ballKind'.
pera_ball_kind_col_alt = pl.col('ballKind').replace_strict(pera_ball_kind).alias('YpERA Pitch') # for when pitch data renames general_ballKind to ballKind (see `stats.py`)
# --- Player-level YpERA components -------------------------------------------------
# These are unevaluated Polars expressions; `compute_pera` applies them to a frame.
# Each later expression embeds the earlier ones directly (e.g. `p_k` contains `p_swstr`),
# so they only need the raw stat columns plus whatever `map_columns` reads.
#
# Earlier version that standardized velocity against the frame itself, kept for reference:
# p_velo_z = ((pl.col('Avg Velo') - pl.mean('Avg Velo').over('YpERA Pitch')) / pl.std('Avg Velo').over('YpERA Pitch')).alias('Velo Z')
# 'Velo Z': 'Avg Velo' standardized with a mean and std supplied by `map_columns`
# (presumably looked up per pitch bucket and season -- confirm in `grade_utils`).
p_velo_z = ((pl.col('Avg Velo') - map_columns(pitch_season_to_avg_velo_mean)) / map_columns(pitch_season_to_avg_velo_std)).alias('Velo Z')
# 'xSwStr%': linear blend of 'SwStr%' and the velocity z-score, clipped to [0, 1].
p_swstr = (pl.col('SwStr%') * 0.7 + p_velo_z * 0.004 + 0.03).clip(0, 1).alias('xSwStr%')
# 'xK%': linear function of xSwStr%, clipped to [0, 1].
p_k = (p_swstr * 1.7 + 0.02).clip(0, 1).alias('xK%')
# 'xBB%': linear function of 'Ball%', clipped to [0, 1].
p_bb = (pl.col('Ball%') * 0.8 - 0.2).clip(0, 1).alias('xBB%')
# 'kwERA': per-season constant minus 10 * (xK% - xBB%). `replace` (non-strict) leaves
# values that are absent from the mapping unchanged.
# NOTE(review): a season missing from `season_to_kwera_constant` would therefore feed the
# raw season value into this formula -- confirm every season in the data has a constant.
p_kwera = (pl.col('season').replace(season_to_kwera_constant) - 10 * (p_k - p_bb)).alias('kwERA')
# 'GB% IFFB% Adj': batted-ball adjustment; both 'GB%' and 'IFFB%' carry negative weights,
# so higher rates lower the resulting value.
p_gb_pu_adj = (pl.col('GB%') * -3 + pl.col('IFFB%') * -5 + 2).alias('GB% IFFB% Adj')
# 'YpERA': kwERA plus the batted-ball adjustment.
pera = (p_kwera + p_gb_pu_adj).alias('YpERA')
def average(col):
    """Build a Polars expression for the mean of `col` weighted by the 'Count' column.

    Args:
        col: Name of the column to average.

    Returns:
        An expression equal to sum('Count' * col) / sum('Count').
    """
    weighted_total = (pl.col('Count') * pl.col(col)).sum()
    total_weight = pl.sum('Count')
    return weighted_total / total_weight
# --- League-level YpERA components -------------------------------------------------
# Mirror of the player-level expressions, fed with league values instead of the row's own
# stats. The result ('lg_YpERA') is the baseline used by `shrunk_pera` and `pera_minus`.
#
# Earlier version that computed the league values as Count-weighted averages of the frame
# itself (via `average`), kept for reference:
# lg_velo = average('Avg Velo').over('YpERA Pitch').alias('lg_velo')
# lg_swstr_ = average('SwStr%').over('YpERA Pitch').alias('lg_swstr')
# lg_ball = average('Ball%').over('YpERA Pitch').alias('lg_ball')
# lg_gb = average('GB%').over('YpERA Pitch').alias('lg_gb')
# lg_iffb = average('IFFB%').over('YpERA Pitch').alias('lg_iffb')
# Current version: league values come from `map_columns` and the imported lookup tables
# (presumably keyed by pitch bucket and season -- confirm in `grade_utils`).
lg_velo = map_columns(pitch_season_to_lg_avg_velo).alias('lg_velo')
# Trailing underscore: raw league SwStr%, as opposed to `lg_swstr` (league xSwStr%) below.
lg_swstr_ = map_columns(pitch_season_to_lg_swstr).alias('lg_swstr')
lg_ball = map_columns(pitch_season_to_lg_ball).alias('lg_ball')
lg_gb = map_columns(pitch_season_to_lg_gb).alias('lg_gb')
lg_iffb = map_columns(pitch_season_to_lg_iffb).alias('lg_iffb')
# Earlier in-frame version of the league velocity z-score, kept for reference:
# lg_velo_z = ((pl.col('lg_velo') - pl.mean('Avg Velo')) / pl.std('Avg Velo')).over('YpERA Pitch').alias('lg_velo_z')
# Reads the 'lg_velo' column, so `lg_velo` must already have been added to the frame.
lg_velo_z = ((pl.col('lg_velo') - map_columns(pitch_season_to_avg_velo_mean)) / map_columns(pitch_season_to_avg_velo_std)).alias('lg_velo_z')
# Same coefficients as the player-level `p_swstr`, `p_k`, `p_bb` and `p_kwera`.
lg_swstr = (lg_swstr_ * 0.7 + lg_velo_z * 0.004 + 0.03).clip(0, 1).alias('lg_xSwStr%')
lg_k = (lg_swstr * 1.7 + 0.02).clip(0, 1).alias('lg_xK%')
lg_bb = (lg_ball * 0.8 - 0.2).clip(0, 1).alias('lg_xBB%')
lg_kwera = (pl.col('season').replace(season_to_kwera_constant) - 10 * (lg_k - lg_bb)).alias('lg_kwERA')
# NOTE(review): the GB coefficient here is -4, while the player-level `p_gb_pu_adj` uses -3.
# Every other coefficient matches its player-level counterpart, so this may be a typo that
# biases the league baseline -- confirm which value is intended.
lg_gb_pu_adj = (lg_gb * -4 + lg_iffb * -5 + 2).alias('lg_gb_pu_adj')
# 'lg_YpERA': league kwERA plus the league batted-ball adjustment.
lg_pera = (lg_kwera + lg_gb_pu_adj).alias('lg_YpERA')
# 'shrunk_YpERA': for rows with fewer than 100 pitches ('Count'), blend the row's own
# 'YpERA' with 'lg_YpERA', weighting the row by Count/100; rows with 100+ keep 'YpERA'.
_below_full_sample = pl.col('Count') < 100
_own_share = pl.col('YpERA') * pl.col('Count') / 100
_league_share = pl.col('lg_YpERA') * (1 - pl.col('Count') / 100)
shrunk_pera = (
    pl.when(_below_full_sample)
    .then(_own_share + _league_share)
    .otherwise('YpERA')
    .alias('shrunk_YpERA')
)
# Reference mean and standard deviation of YpERA used to scale `pera_grade`.
# They come from `map_columns` and the imported lookup tables rather than from the frame
# being graded (presumably keyed by pitch bucket and season -- confirm in `grade_utils`).
mean_pera = map_columns(pitch_season_to_ypera_mean).alias('mean_YpERA')
std_pera = map_columns(pitch_season_to_ypera_std).alias('std_YpERA')
# Earlier version, kept for reference: mean/std computed from the frame itself, restricted
# to rows with at least 100 pitches.
# mean_pera = (
# pl.col('YpERA')
# .filter(pl.col('Count') >= 100)
# .mean()
# .alias('mean_YpERA')
# )
# std_pera = (
# pl.col('YpERA')
# .filter(pl.col('Count') >= 100)
# .std()
# .alias('std_YpERA')
# )
# 'YpERA Grade': z-score of 'shrunk_YpERA' against `mean_pera` / `std_pera`, rescaled so that
# the mean maps to 50 and each standard deviation is worth 10 points. The factor is negative,
# so a lower YpERA yields a higher grade. Rounded, then cast to Int32.
pera_grade = (
(
(pl.col('shrunk_YpERA') - mean_pera) /
std_pera
)
.mul(-10).add(50).round().cast(pl.Int32())
).alias('YpERA Grade')
# 'YpERA-': the row's YpERA as a percentage of 'lg_YpERA' (100 = equal to the league value).
# Uses the unshrunk 'YpERA' column and, unlike the grade, is cast to Int32 without a
# preceding round().
pera_minus = (pl.col('YpERA') / pl.col('lg_YpERA') * 100).cast(pl.Int32()).alias('YpERA-')
# Pseudo-percentiles in [0, 1]: linear rescalings of the two metrics above, not ranks.
# Grade: clipped to [20, 80], then mapped so 20 -> 0 and 80 -> 1.
pera_grade_pctl = ((pera_grade.clip(20, 80) - 20) / 60).alias('YpERA Grade_pctl')
# YpERA-: (200 - value) / 200 clipped to [0, 1], so 0 -> 1, 100 -> 0.5 and 200 or more -> 0.
pera_minus_pctl = ((200 - pera_minus)/ 200).clip(0, 1).alias('YpERA-_pctl')
def compute_pera(pitch_stats):
    """Add the YpERA columns to a frame of per-pitch-type stats.

    NaN values are replaced with 0 first, then the module-level expressions are applied
    in stages; a later stage may read columns that an earlier stage created.

    Args:
        pitch_stats: Frame exposing `.columns`, `.fill_nan` and `.with_columns`. It must
            carry either a 'general_ballKind' or a 'ballKind' column, plus the stat
            columns the module-level expressions read.

    Returns:
        The frame with the 'YpERA Pitch', player, league, shrunk, mean/std, grade and
        'YpERA-' columns (and their `_pctl` companions) added.
    """
    # Choose the pitch-bucket expression that matches the column this frame carries.
    if 'general_ballKind' in pitch_stats.columns:
        pitch_bucket = pera_ball_kind_col
    else:
        pitch_bucket = pera_ball_kind_col_alt

    # Applied strictly in this order, one `with_columns` call per stage.
    stages = (
        (pitch_bucket,),
        (p_velo_z, p_swstr, p_k, p_bb, p_kwera, p_gb_pu_adj, pera),
        (lg_velo, lg_swstr_, lg_ball, lg_gb, lg_iffb),
        (lg_velo_z, lg_swstr, lg_k, lg_bb, lg_kwera, lg_gb_pu_adj, lg_pera),
        (shrunk_pera,),
        (mean_pera, std_pera),
        (pera_grade, pera_grade_pctl),
        (pera_minus, pera_minus_pctl),
    )

    result = pitch_stats.fill_nan(0)
    for stage in stages:
        result = result.with_columns(*stage)
    return result
if __name__ == '__main__':
    # Ad-hoc check: compute YpERA for regular-season pitcher stats, print one pitcher's
    # rows for one season, then drop into the debugger for interactive inspection.
    from datetime import date

    from data import data_df
    from stats import compute_pitch_stats, filter_data_by_date_and_game_kind

    first_day = date(2021, 1, 1)
    last_day = date(2025, 12, 31)

    _data = filter_data_by_date_and_game_kind(
        data=data_df,
        start_date=first_day,
        end_date=last_day,
        game_kind='Regular Season',
    )
    pitch_stats = compute_pitch_stats(
        data=_data,
        player_type='pitcher',
        pitch_class_type='specific',
        min_pitches=1,
        group_by_season=True,
    )
    pera_data = compute_pera(pitch_stats)

    cols = ['season', 'pitcher_name', 'general_ballKind', 'ballKind', 'YpERA Pitch', 'YpERA Grade', 'YpERA-', 'YpERA']

    # Filters used at other times: a single 'general_ballKind', or a range of seasons.
    is_target_pitcher = pl.col('pitcher_name') == 'Miyagi Hiroya'
    is_target_season = pl.col('season') == 2025
    preview = pera_data.filter(is_target_pitcher, is_target_season).sort('season')
    print(preview[cols])

    breakpoint()