nesticot committed on
Commit
9d8401c
·
verified ·
1 Parent(s): 001dc45

Update functions/df_update.py

Browse files
Files changed (1) hide show
  1. functions/df_update.py +503 -471
functions/df_update.py CHANGED
@@ -1,472 +1,504 @@
1
- import polars as pl
2
- import numpy as np
3
- import joblib
4
-
5
- loaded_model = joblib.load('joblib_model/barrel_model.joblib')
6
- in_zone_model = joblib.load('joblib_model/in_zone_model_knn_20240410.joblib')
7
- attack_zone_model = joblib.load('joblib_model/model_attack_zone.joblib')
8
- xwoba_model = joblib.load('joblib_model/xwoba_model.joblib')
9
- px_model = joblib.load('joblib_model/linear_reg_model_x.joblib')
10
- pz_model = joblib.load('joblib_model/linear_reg_model_z.joblib')
11
-
12
-
13
- class df_update:
14
- def __init__(self):
15
- pass
16
-
17
- def update(self, df_clone: pl.DataFrame):
18
-
19
- df = df_clone.clone()
20
- # Assuming px_model is defined and df is your DataFrame
21
- hit_codes = ['single',
22
- 'double','home_run', 'triple']
23
-
24
- ab_codes = ['single', 'strikeout', 'field_out',
25
- 'grounded_into_double_play', 'fielders_choice', 'force_out',
26
- 'double', 'field_error', 'home_run', 'triple',
27
- 'double_play',
28
- 'fielders_choice_out', 'strikeout_double_play',
29
- 'other_out','triple_play']
30
-
31
-
32
- obp_true_codes = ['single', 'walk',
33
- 'double','home_run', 'triple',
34
- 'hit_by_pitch', 'intent_walk']
35
-
36
- obp_codes = ['single', 'strikeout', 'walk', 'field_out',
37
- 'grounded_into_double_play', 'fielders_choice', 'force_out',
38
- 'double', 'sac_fly', 'field_error', 'home_run', 'triple',
39
- 'hit_by_pitch', 'double_play', 'intent_walk',
40
- 'fielders_choice_out', 'strikeout_double_play',
41
- 'sac_fly_double_play',
42
- 'other_out','triple_play']
43
-
44
-
45
- contact_codes = ['In play, no out',
46
- 'Foul', 'In play, out(s)',
47
- 'In play, run(s)',
48
- 'Foul Bunt']
49
-
50
- bip_codes = ['In play, no out', 'In play, run(s)','In play, out(s)']
51
-
52
-
53
- conditions_barrel = [
54
- df['launch_speed'].is_null(),
55
- (df['launch_speed'] * 1.5 - df['launch_angle'] >= 117) &
56
- (df['launch_speed'] + df['launch_angle'] >= 124) &
57
- (df['launch_speed'] >= 98) &
58
- (df['launch_angle'] >= 4) & (df['launch_angle'] <= 50)
59
- ]
60
- choices_barrel = [False, True]
61
-
62
- conditions_tb = [
63
- (df['event_type'] == 'single'),
64
- (df['event_type'] == 'double'),
65
- (df['event_type'] == 'triple'),
66
- (df['event_type'] == 'home_run')
67
- ]
68
- choices_tb = [1, 2, 3, 4]
69
-
70
-
71
- conditions_woba = [
72
- df['event_type'].is_in(['strikeout', 'field_out', 'sac_fly', 'force_out', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']),
73
- df['event_type'] == 'walk',
74
- df['event_type'] == 'hit_by_pitch',
75
- df['event_type'] == 'single',
76
- df['event_type'] == 'double',
77
- df['event_type'] == 'triple',
78
- df['event_type'] == 'home_run'
79
- ]
80
- choices_woba = [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
81
-
82
- woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch', 'double', 'sac_fly', 'force_out', 'home_run', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'triple', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']
83
-
84
- pitch_cat = {'FA': 'Fastball',
85
- 'FF': 'Fastball',
86
- 'FT': 'Fastball',
87
- 'FC': 'Fastball',
88
- 'FS': 'Off-Speed',
89
- 'FO': 'Off-Speed',
90
- 'SI': 'Fastball',
91
- 'ST': 'Breaking',
92
- 'SL': 'Breaking',
93
- 'CU': 'Breaking',
94
- 'KC': 'Breaking',
95
- 'SC': 'Off-Speed',
96
- 'GY': 'Off-Speed',
97
- 'SV': 'Breaking',
98
- 'CS': 'Breaking',
99
- 'CH': 'Off-Speed',
100
- 'KN': 'Off-Speed',
101
- 'EP': 'Breaking',
102
- 'UN': None,
103
- 'IN': None,
104
- 'PO': None,
105
- 'AB': None,
106
- 'AS': None,
107
- 'NP': None}
108
-
109
-
110
- df = df.with_columns([
111
- pl.when(df['type_ab'].is_not_null()).then(1).otherwise(0).alias('pa'),
112
- pl.when(df['is_pitch']).then(1).otherwise(0).alias('pitches'),
113
- pl.when(df['sz_top'] == 0).then(None).otherwise(df['sz_top']).alias('sz_top'),
114
- pl.when(df['sz_bot'] == 0).then(None).otherwise(df['sz_bot']).alias('sz_bot'),
115
- pl.when(df['zone'] > 0).then(df['zone'] < 10).otherwise(None).alias('in_zone'),
116
- pl.Series(px_model.predict(df[['x']].fill_null(0).to_numpy())[:, 0]).alias('px_predict'),
117
- pl.Series(pz_model.predict(df[['y']].fill_null(0).to_numpy())[:, 0] + 3.2).alias('pz_predict'),
118
- pl.Series(in_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('in_zone_predict'),
119
- pl.Series(attack_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('attack_zone_predict'),
120
- pl.when(df['event_type'].is_in(hit_codes)).then(True).otherwise(False).alias('hits'),
121
- pl.when(df['event_type'].is_in(ab_codes)).then(True).otherwise(False).alias('ab'),
122
- pl.when(df['event_type'].is_in(obp_true_codes)).then(True).otherwise(False).alias('on_base'),
123
- pl.when(df['event_type'].is_in(obp_codes)).then(True).otherwise(False).alias('obp'),
124
- pl.when(df['play_description'].is_in(bip_codes)).then(True).otherwise(False).alias('bip'),
125
- pl.when(conditions_barrel[0]).then(choices_barrel[0]).when(conditions_barrel[1]).then(choices_barrel[1]).otherwise(None).alias('barrel'),
126
- pl.when(df['launch_angle'].is_null()).then(False).when((df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)).then(True).otherwise(None).alias('sweet_spot'),
127
- pl.when(df['launch_speed'].is_null()).then(False).when(df['launch_speed'] >= 94.5).then(True).otherwise(None).alias('hard_hit'),
128
- pl.when(conditions_tb[0]).then(choices_tb[0]).when(conditions_tb[1]).then(choices_tb[1]).when(conditions_tb[2]).then(choices_tb[2]).when(conditions_tb[3]).then(choices_tb[3]).otherwise(None).alias('tb'),
129
- pl.when(conditions_woba[0]).then(choices_woba[0]).when(conditions_woba[1]).then(choices_woba[1]).when(conditions_woba[2]).then(choices_woba[2]).when(conditions_woba[3]).then(choices_woba[3]).when(conditions_woba[4]).then(choices_woba[4]).when(conditions_woba[5]).then(choices_woba[5]).when(conditions_woba[6]).then(choices_woba[6]).otherwise(None).alias('woba'),
130
- pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T')).then(1).otherwise(0).alias('whiffs'),
131
- pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T') | (df['play_code'] == 'C')).then(1).otherwise(0).alias('csw'),
132
- pl.when(pl.col('is_swing').cast(pl.Boolean)).then(1).otherwise(0).alias('swings'),
133
- pl.col('event_type').is_in(['strikeout','strikeout_double_play']).alias('k'),
134
- pl.col('event_type').is_in(['walk', 'intent_walk']).alias('bb'),
135
- pl.lit(None).alias('attack_zone'),
136
- pl.lit(None).alias('woba_pred'),
137
- pl.lit(None).alias('woba_pred_contact')
138
-
139
- ])
140
-
141
- df = df.with_columns([
142
- pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('woba_codes'),
143
- pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('xwoba_codes'),
144
- pl.when((pl.col('tb') >= 0)).then(df['woba']).otherwise(None).alias('woba_contact'),
145
- pl.when(pl.col('px').is_null()).then(pl.col('px_predict')).otherwise(pl.col('px')).alias('px'),
146
- pl.when(pl.col('pz').is_null()).then(pl.col('pz_predict')).otherwise(pl.col('pz')).alias('pz'),
147
- pl.when(pl.col('in_zone').is_null()).then(pl.col('in_zone_predict')).otherwise(pl.col('in_zone')).alias('in_zone'),
148
- pl.when(df['launch_speed'].is_null()).then(None).otherwise(df['barrel']).alias('barrel'),
149
- pl.lit('average').alias('average'),
150
- pl.when(pl.col('in_zone') == False).then(True).otherwise(False).alias('out_zone'),
151
- pl.when((pl.col('in_zone') == True) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('zone_swing'),
152
- pl.when((pl.col('in_zone') == True) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('zone_contact'),
153
- pl.when((pl.col('in_zone') == False) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('ozone_swing'),
154
- pl.when((pl.col('in_zone') == False) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('ozone_contact'),
155
- pl.when(pl.col('event_type').str.contains('strikeout')).then(True).otherwise(False).alias('k'),
156
- pl.when(pl.col('event_type').is_in(['walk', 'intent_walk'])).then(True).otherwise(False).alias('bb'),
157
- pl.when(pl.col('attack_zone').is_null()).then(pl.col('attack_zone_predict')).otherwise(pl.col('attack_zone')).alias('attack_zone'),
158
-
159
-
160
- ])
161
-
162
- df = df.with_columns([
163
- (df['k'].cast(pl.Float32) - df['bb'].cast(pl.Float32)).alias('k_minus_bb'),
164
- (df['bb'].cast(pl.Float32) - df['k'].cast(pl.Float32)).alias('bb_minus_k'),
165
- (df['launch_speed'] > 0).alias('bip_div'),
166
- (df['attack_zone'] == 0).alias('heart'),
167
- (df['attack_zone'] == 1).alias('shadow'),
168
- (df['attack_zone'] == 2).alias('chase'),
169
- (df['attack_zone'] == 3).alias('waste'),
170
- ((df['attack_zone'] == 0) & (df['swings'] == 1)).alias('heart_swing'),
171
- ((df['attack_zone'] == 1) & (df['swings'] == 1)).alias('shadow_swing'),
172
- ((df['attack_zone'] == 2) & (df['swings'] == 1)).alias('chase_swing'),
173
- ((df['attack_zone'] == 3) & (df['swings'] == 1)).alias('waste_swing'),
174
- ((df['attack_zone'] == 0) & (df['whiffs'] == 1)).alias('heart_whiff'),
175
- ((df['attack_zone'] == 1) & (df['whiffs'] == 1)).alias('shadow_whiff'),
176
- ((df['attack_zone'] == 2) & (df['whiffs'] == 1)).alias('chase_whiff'),
177
- ((df['attack_zone'] == 3) & (df['whiffs'] == 1)).alias('waste_whiff')
178
- ])
179
-
180
-
181
- [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
182
-
183
- df = df.with_columns([
184
- pl.Series(
185
- [sum(x) for x in xwoba_model.predict_proba(df[['launch_angle', 'launch_speed']].fill_null(0).to_numpy()[:]) * ([0, 0.881, 1.254, 1.589, 2.048])]
186
- ).alias('woba_pred_predict')
187
- ])
188
-
189
- df = df.with_columns([
190
- pl.when(pl.col('event_type').is_in(['walk'])).then(0.689)
191
- .when(pl.col('event_type').is_in(['hit_by_pitch'])).then(0.720)
192
- .when(pl.col('event_type').is_in(['strikeout', 'strikeout_double_play'])).then(0)
193
- .otherwise(pl.col('woba_pred_predict')).alias('woba_pred_predict')
194
- ])
195
-
196
- df = df.with_columns([
197
- pl.when(pl.col('woba_codes').is_null()).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred'),
198
- pl.when(pl.col('bip')!=1).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred_contact'),
199
- ])
200
-
201
- df = df.with_columns([
202
- pl.when(pl.col('trajectory').is_in(['bunt_popup'])).then(pl.lit('popup'))
203
- .when(pl.col('trajectory').is_in(['bunt_grounder'])).then(pl.lit('ground_ball'))
204
- .when(pl.col('trajectory').is_in(['bunt_line_drive'])).then(pl.lit('line_drive'))
205
- .when(pl.col('trajectory').is_in([''])).then(pl.lit(None))
206
- .otherwise(pl.col('trajectory')).alias('trajectory')
207
- ])
208
-
209
-
210
- # Create one-hot encoded columns for the trajectory column
211
- dummy_df = df.select(pl.col('trajectory')).to_dummies()
212
-
213
- # Rename the one-hot encoded columns
214
- dummy_df = dummy_df.rename({
215
- 'trajectory_fly_ball': 'trajectory_fly_ball',
216
- 'trajectory_ground_ball': 'trajectory_ground_ball',
217
- 'trajectory_line_drive': 'trajectory_line_drive',
218
- 'trajectory_popup': 'trajectory_popup'
219
- })
220
-
221
- # Ensure the columns are present in the DataFrame
222
- for col in ['trajectory_fly_ball', 'trajectory_ground_ball', 'trajectory_line_drive', 'trajectory_popup']:
223
- if col not in dummy_df.columns:
224
- dummy_df = dummy_df.with_columns(pl.lit(0).alias(col))
225
-
226
- # Join the one-hot encoded columns back to the original DataFrame
227
- df = df.hstack(dummy_df)
228
-
229
- # Check if 'trajectory_null' column exists and drop it
230
- if 'trajectory_null' in df.columns:
231
- df = df.drop('trajectory_null')
232
-
233
- return df
234
-
235
- # Assuming df is your Polars DataFrame
236
- def update_summary(self, df: pl.DataFrame, pitcher: bool = True) -> pl.DataFrame:
237
- """
238
- Update summary statistics for pitchers or batters.
239
-
240
- Parameters:
241
- df (pl.DataFrame): The input Polars DataFrame containing player statistics.
242
- pitcher (bool): A flag indicating whether to calculate statistics for pitchers (True) or batters (False).
243
-
244
- Returns:
245
- pl.DataFrame: A Polars DataFrame with aggregated and calculated summary statistics.
246
- """
247
-
248
- # Determine the position based on the pitcher flag
249
- if pitcher:
250
- position = 'pitcher'
251
- else:
252
- position = 'batter'
253
-
254
- # Group by position_id and position_name, then aggregate various statistics
255
- df_summ = df.group_by([f'{position}_id', f'{position}_name']).agg([
256
- pl.col('pa').sum().alias('pa'),
257
- pl.col('ab').sum().alias('ab'),
258
- pl.col('obp').sum().alias('obp_pa'),
259
- pl.col('hits').sum().alias('hits'),
260
- pl.col('on_base').sum().alias('on_base'),
261
- pl.col('k').sum().alias('k'),
262
- pl.col('bb').sum().alias('bb'),
263
- pl.col('bb_minus_k').sum().alias('bb_minus_k'),
264
- pl.col('csw').sum().alias('csw'),
265
- pl.col('bip').sum().alias('bip'),
266
- pl.col('bip_div').sum().alias('bip_div'),
267
- pl.col('tb').sum().alias('tb'),
268
- pl.col('woba').sum().alias('woba'),
269
- pl.col('woba_contact').sum().alias('woba_contact'),
270
- pl.col('woba_pred').sum().alias('xwoba'),
271
- pl.col('woba_pred_contact').sum().alias('xwoba_contact'),
272
- pl.col('woba_codes').sum().alias('woba_codes'),
273
- pl.col('xwoba_codes').sum().alias('xwoba_codes'),
274
- pl.col('hard_hit').sum().alias('hard_hit'),
275
- pl.col('barrel').sum().alias('barrel'),
276
- pl.col('sweet_spot').sum().alias('sweet_spot'),
277
- pl.col('launch_speed').max().alias('max_launch_speed'),
278
- pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
279
- pl.col('launch_speed').mean().alias('launch_speed'),
280
- pl.col('launch_angle').mean().alias('launch_angle'),
281
- pl.col('is_pitch').sum().alias('pitches'),
282
- pl.col('swings').sum().alias('swings'),
283
- pl.col('in_zone').sum().alias('in_zone'),
284
- pl.col('out_zone').sum().alias('out_zone'),
285
- pl.col('whiffs').sum().alias('whiffs'),
286
- pl.col('zone_swing').sum().alias('zone_swing'),
287
- pl.col('zone_contact').sum().alias('zone_contact'),
288
- pl.col('ozone_swing').sum().alias('ozone_swing'),
289
- pl.col('ozone_contact').sum().alias('ozone_contact'),
290
- pl.col('trajectory_ground_ball').sum().alias('ground_ball'),
291
- pl.col('trajectory_line_drive').sum().alias('line_drive'),
292
- pl.col('trajectory_fly_ball').sum().alias('fly_ball'),
293
- pl.col('trajectory_popup').sum().alias('pop_up'),
294
- pl.col('attack_zone').count().alias('attack_zone'),
295
- pl.col('heart').sum().alias('heart'),
296
- pl.col('shadow').sum().alias('shadow'),
297
- pl.col('chase').sum().alias('chase'),
298
- pl.col('waste').sum().alias('waste'),
299
- pl.col('heart_swing').sum().alias('heart_swing'),
300
- pl.col('shadow_swing').sum().alias('shadow_swing'),
301
- pl.col('chase_swing').sum().alias('chase_swing'),
302
- pl.col('waste_swing').sum().alias('waste_swing'),
303
- pl.col('heart_whiff').sum().alias('heart_whiff'),
304
- pl.col('shadow_whiff').sum().alias('shadow_whiff'),
305
- pl.col('chase_whiff').sum().alias('chase_whiff'),
306
- pl.col('waste_whiff').sum().alias('waste_whiff')
307
- ])
308
-
309
- # Add calculated columns to the summary DataFrame
310
- df_summ = df_summ.with_columns([
311
- (pl.col('hits') / pl.col('ab')).alias('avg'),
312
- (pl.col('on_base') / pl.col('obp_pa')).alias('obp'),
313
- (pl.col('tb') / pl.col('ab')).alias('slg'),
314
- (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
315
- (pl.col('k') / pl.col('pa')).alias('k_percent'),
316
- (pl.col('bb') / pl.col('pa')).alias('bb_percent'),
317
- (pl.col('bb_minus_k') / pl.col('pa')).alias('bb_minus_k_percent'),
318
- (pl.col('bb') / pl.col('k')).alias('bb_over_k_percent'),
319
- (pl.col('csw') / pl.col('pitches')).alias('csw_percent'),
320
- (pl.col('sweet_spot') / pl.col('bip_div')).alias('sweet_spot_percent'),
321
- (pl.col('woba') / pl.col('woba_codes')).alias('woba_percent'),
322
- (pl.col('woba_contact') / pl.col('bip')).alias('woba_percent_contact'),
323
- (pl.col('hard_hit') / pl.col('bip_div')).alias('hard_hit_percent'),
324
- (pl.col('barrel') / pl.col('bip_div')).alias('barrel_percent'),
325
- (pl.col('zone_contact') / pl.col('zone_swing')).alias('zone_contact_percent'),
326
- (pl.col('zone_swing') / pl.col('in_zone')).alias('zone_swing_percent'),
327
- (pl.col('in_zone') / pl.col('pitches')).alias('zone_percent'),
328
- (pl.col('ozone_swing') / (pl.col('pitches') - pl.col('in_zone'))).alias('chase_percent'),
329
- (pl.col('ozone_contact') / pl.col('ozone_swing')).alias('chase_contact'),
330
- (pl.col('swings') / pl.col('pitches')).alias('swing_percent'),
331
- (pl.col('whiffs') / pl.col('swings')).alias('whiff_rate'),
332
- (pl.col('whiffs') / pl.col('pitches')).alias('swstr_rate'),
333
- (pl.col('ground_ball') / pl.col('bip')).alias('ground_ball_percent'),
334
- (pl.col('line_drive') / pl.col('bip')).alias('line_drive_percent'),
335
- (pl.col('fly_ball') / pl.col('bip')).alias('fly_ball_percent'),
336
- (pl.col('pop_up') / pl.col('bip')).alias('pop_up_percent'),
337
- (pl.col('heart') / pl.col('attack_zone')).alias('heart_zone_percent'),
338
- (pl.col('shadow') / pl.col('attack_zone')).alias('shadow_zone_percent'),
339
- (pl.col('chase') / pl.col('attack_zone')).alias('chase_zone_percent'),
340
- (pl.col('waste') / pl.col('attack_zone')).alias('waste_zone_percent'),
341
- (pl.col('heart_swing') / pl.col('heart')).alias('heart_zone_swing_percent'),
342
- (pl.col('shadow_swing') / pl.col('shadow')).alias('shadow_zone_swing_percent'),
343
- (pl.col('chase_swing') / pl.col('chase')).alias('chase_zone_swing_percent'),
344
- (pl.col('waste_swing') / pl.col('waste')).alias('waste_zone_swing_percent'),
345
- (pl.col('heart_whiff') / pl.col('heart_swing')).alias('heart_zone_whiff_percent'),
346
- (pl.col('shadow_whiff') / pl.col('shadow_swing')).alias('shadow_zone_whiff_percent'),
347
- (pl.col('chase_whiff') / pl.col('chase_swing')).alias('chase_zone_whiff_percent'),
348
- (pl.col('waste_whiff') / pl.col('waste_swing')).alias('waste_zone_whiff_percent'),
349
- (pl.col('xwoba') / pl.col('xwoba_codes')).alias('xwoba_percent'),
350
- (pl.col('xwoba_contact') / pl.col('bip')).alias('xwoba_percent_contact')
351
- ])
352
-
353
- return df_summ
354
-
355
-
356
-
357
-
358
-
359
-
360
- # Assuming df is your Polars DataFrame
361
- def update_summary_select(self, df: pl.DataFrame, selection: list) -> pl.DataFrame:
362
- """
363
- Update summary statistics for pitchers or batters.
364
-
365
- Parameters:
366
- df (pl.DataFrame): The input Polars DataFrame containing player statistics.
367
- pitcher (bool): A flag indicating whether to calculate statistics for pitchers (True) or batters (False).
368
-
369
- Returns:
370
- pl.DataFrame: A Polars DataFrame with aggregated and calculated summary statistics.
371
- """
372
-
373
- # Group by position_id and position_name, then aggregate various statistics
374
- df_summ = df.group_by(selection).agg([
375
- pl.col('pa').sum().alias('pa'),
376
- pl.col('ab').sum().alias('ab'),
377
- pl.col('obp').sum().alias('obp_pa'),
378
- pl.col('hits').sum().alias('hits'),
379
- pl.col('on_base').sum().alias('on_base'),
380
- pl.col('k').sum().alias('k'),
381
- pl.col('bb').sum().alias('bb'),
382
- pl.col('bb_minus_k').sum().alias('bb_minus_k'),
383
- pl.col('csw').sum().alias('csw'),
384
- pl.col('bip').sum().alias('bip'),
385
- pl.col('bip_div').sum().alias('bip_div'),
386
- pl.col('tb').sum().alias('tb'),
387
- pl.col('woba').sum().alias('woba'),
388
- pl.col('woba_contact').sum().alias('woba_contact'),
389
- pl.col('woba_pred').sum().alias('xwoba'),
390
- pl.col('woba_pred_contact').sum().alias('xwoba_contact'),
391
- pl.col('woba_codes').sum().alias('woba_codes'),
392
- pl.col('xwoba_codes').sum().alias('xwoba_codes'),
393
- pl.col('hard_hit').sum().alias('hard_hit'),
394
- pl.col('barrel').sum().alias('barrel'),
395
- pl.col('sweet_spot').sum().alias('sweet_spot'),
396
- pl.col('launch_speed').max().alias('max_launch_speed'),
397
- pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
398
- pl.col('launch_speed').mean().alias('launch_speed'),
399
- pl.col('launch_angle').mean().alias('launch_angle'),
400
- pl.col('is_pitch').sum().alias('pitches'),
401
- pl.col('swings').sum().alias('swings'),
402
- pl.col('in_zone').sum().alias('in_zone'),
403
- pl.col('out_zone').sum().alias('out_zone'),
404
- pl.col('whiffs').sum().alias('whiffs'),
405
- pl.col('zone_swing').sum().alias('zone_swing'),
406
- pl.col('zone_contact').sum().alias('zone_contact'),
407
- pl.col('ozone_swing').sum().alias('ozone_swing'),
408
- pl.col('ozone_contact').sum().alias('ozone_contact'),
409
- pl.col('trajectory_ground_ball').sum().alias('ground_ball'),
410
- pl.col('trajectory_line_drive').sum().alias('line_drive'),
411
- pl.col('trajectory_fly_ball').sum().alias('fly_ball'),
412
- pl.col('trajectory_popup').sum().alias('pop_up'),
413
- pl.col('attack_zone').count().alias('attack_zone'),
414
- pl.col('heart').sum().alias('heart'),
415
- pl.col('shadow').sum().alias('shadow'),
416
- pl.col('chase').sum().alias('chase'),
417
- pl.col('waste').sum().alias('waste'),
418
- pl.col('heart_swing').sum().alias('heart_swing'),
419
- pl.col('shadow_swing').sum().alias('shadow_swing'),
420
- pl.col('chase_swing').sum().alias('chase_swing'),
421
- pl.col('waste_swing').sum().alias('waste_swing'),
422
- pl.col('heart_whiff').sum().alias('heart_whiff'),
423
- pl.col('shadow_whiff').sum().alias('shadow_whiff'),
424
- pl.col('chase_whiff').sum().alias('chase_whiff'),
425
- pl.col('waste_whiff').sum().alias('waste_whiff')
426
- ])
427
-
428
- # Add calculated columns to the summary DataFrame
429
- df_summ = df_summ.with_columns([
430
- (pl.col('hits') / pl.col('ab')).alias('avg'),
431
- (pl.col('on_base') / pl.col('obp_pa')).alias('obp'),
432
- (pl.col('tb') / pl.col('ab')).alias('slg'),
433
- (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
434
- (pl.col('k') / pl.col('pa')).alias('k_percent'),
435
- (pl.col('bb') / pl.col('pa')).alias('bb_percent'),
436
- (pl.col('bb_minus_k') / pl.col('pa')).alias('bb_minus_k_percent'),
437
- (pl.col('bb') / pl.col('k')).alias('bb_over_k_percent'),
438
- (pl.col('csw') / pl.col('pitches')).alias('csw_percent'),
439
- (pl.col('sweet_spot') / pl.col('bip_div')).alias('sweet_spot_percent'),
440
- (pl.col('woba') / pl.col('woba_codes')).alias('woba_percent'),
441
- (pl.col('woba_contact') / pl.col('bip')).alias('woba_percent_contact'),
442
- (pl.col('hard_hit') / pl.col('bip_div')).alias('hard_hit_percent'),
443
- (pl.col('barrel') / pl.col('bip_div')).alias('barrel_percent'),
444
- (pl.col('zone_contact') / pl.col('zone_swing')).alias('zone_contact_percent'),
445
- (pl.col('zone_swing') / pl.col('in_zone')).alias('zone_swing_percent'),
446
- (pl.col('in_zone') / pl.col('pitches')).alias('zone_percent'),
447
- (pl.col('ozone_swing') / (pl.col('pitches') - pl.col('in_zone'))).alias('chase_percent'),
448
- (pl.col('ozone_contact') / pl.col('ozone_swing')).alias('chase_contact'),
449
- (pl.col('swings') / pl.col('pitches')).alias('swing_percent'),
450
- (pl.col('whiffs') / pl.col('swings')).alias('whiff_rate'),
451
- (pl.col('whiffs') / pl.col('pitches')).alias('swstr_rate'),
452
- (pl.col('ground_ball') / pl.col('bip')).alias('ground_ball_percent'),
453
- (pl.col('line_drive') / pl.col('bip')).alias('line_drive_percent'),
454
- (pl.col('fly_ball') / pl.col('bip')).alias('fly_ball_percent'),
455
- (pl.col('pop_up') / pl.col('bip')).alias('pop_up_percent'),
456
- (pl.col('heart') / pl.col('attack_zone')).alias('heart_zone_percent'),
457
- (pl.col('shadow') / pl.col('attack_zone')).alias('shadow_zone_percent'),
458
- (pl.col('chase') / pl.col('attack_zone')).alias('chase_zone_percent'),
459
- (pl.col('waste') / pl.col('attack_zone')).alias('waste_zone_percent'),
460
- (pl.col('heart_swing') / pl.col('heart')).alias('heart_zone_swing_percent'),
461
- (pl.col('shadow_swing') / pl.col('shadow')).alias('shadow_zone_swing_percent'),
462
- (pl.col('chase_swing') / pl.col('chase')).alias('chase_zone_swing_percent'),
463
- (pl.col('waste_swing') / pl.col('waste')).alias('waste_zone_swing_percent'),
464
- (pl.col('heart_whiff') / pl.col('heart_swing')).alias('heart_zone_whiff_percent'),
465
- (pl.col('shadow_whiff') / pl.col('shadow_swing')).alias('shadow_zone_whiff_percent'),
466
- (pl.col('chase_whiff') / pl.col('chase_swing')).alias('chase_zone_whiff_percent'),
467
- (pl.col('waste_whiff') / pl.col('waste_swing')).alias('waste_zone_whiff_percent'),
468
- (pl.col('xwoba') / pl.col('xwoba_codes')).alias('xwoba_percent'),
469
- (pl.col('xwoba_contact') / pl.col('bip')).alias('xwoba_percent_contact')
470
- ])
471
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  return df_summ
 
1
+ import polars as pl
2
+ import numpy as np
3
+ import joblib
4
+
5
+ loaded_model = joblib.load('joblib_model/barrel_model.joblib')
6
+ in_zone_model = joblib.load('joblib_model/in_zone_model_knn_20240410.joblib')
7
+ attack_zone_model = joblib.load('joblib_model/model_attack_zone.joblib')
8
+ xwoba_model = joblib.load('joblib_model/xwoba_model.joblib')
9
+ px_model = joblib.load('joblib_model/linear_reg_model_x.joblib')
10
+ pz_model = joblib.load('joblib_model/linear_reg_model_z.joblib')
11
+
12
+
13
+ class df_update:
14
+ def __init__(self):
15
+ pass
16
+
17
+ def update(self, df_clone: pl.DataFrame):
18
+
19
+ df = df_clone.clone()
20
+ # Assuming px_model is defined and df is your DataFrame
21
+ hit_codes = ['single',
22
+ 'double','home_run', 'triple']
23
+
24
+ ab_codes = ['single', 'strikeout', 'field_out',
25
+ 'grounded_into_double_play', 'fielders_choice', 'force_out',
26
+ 'double', 'field_error', 'home_run', 'triple',
27
+ 'double_play',
28
+ 'fielders_choice_out', 'strikeout_double_play',
29
+ 'other_out','triple_play']
30
+
31
+
32
+ obp_true_codes = ['single', 'walk',
33
+ 'double','home_run', 'triple',
34
+ 'hit_by_pitch', 'intent_walk']
35
+
36
+ obp_codes = ['single', 'strikeout', 'walk', 'field_out',
37
+ 'grounded_into_double_play', 'fielders_choice', 'force_out',
38
+ 'double', 'sac_fly', 'field_error', 'home_run', 'triple',
39
+ 'hit_by_pitch', 'double_play', 'intent_walk',
40
+ 'fielders_choice_out', 'strikeout_double_play',
41
+ 'sac_fly_double_play',
42
+ 'other_out','triple_play']
43
+
44
+
45
+ contact_codes = ['In play, no out',
46
+ 'Foul', 'In play, out(s)',
47
+ 'In play, run(s)',
48
+ 'Foul Bunt']
49
+
50
+ bip_codes = ['In play, no out', 'In play, run(s)','In play, out(s)']
51
+
52
+
53
+ conditions_barrel = [
54
+ df['launch_speed'].is_null(),
55
+ (df['launch_speed'] * 1.5 - df['launch_angle'] >= 117) &
56
+ (df['launch_speed'] + df['launch_angle'] >= 124) &
57
+ (df['launch_speed'] >= 98) &
58
+ (df['launch_angle'] >= 4) & (df['launch_angle'] <= 50)
59
+ ]
60
+ choices_barrel = [False, True]
61
+
62
+ conditions_tb = [
63
+ (df['event_type'] == 'single'),
64
+ (df['event_type'] == 'double'),
65
+ (df['event_type'] == 'triple'),
66
+ (df['event_type'] == 'home_run')
67
+ ]
68
+ choices_tb = [1, 2, 3, 4]
69
+
70
+
71
+ conditions_woba = [
72
+ df['event_type'].is_in(['strikeout', 'field_out', 'sac_fly', 'force_out', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']),
73
+ df['event_type'] == 'walk',
74
+ df['event_type'] == 'hit_by_pitch',
75
+ df['event_type'] == 'single',
76
+ df['event_type'] == 'double',
77
+ df['event_type'] == 'triple',
78
+ df['event_type'] == 'home_run'
79
+ ]
80
+ choices_woba = [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
81
+
82
+ woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch', 'double', 'sac_fly', 'force_out', 'home_run', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'triple', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']
83
+
84
+ pitch_cat = {'FA': 'Fastball',
85
+ 'FF': 'Fastball',
86
+ 'FT': 'Fastball',
87
+ 'FC': 'Fastball',
88
+ 'FS': 'Off-Speed',
89
+ 'FO': 'Off-Speed',
90
+ 'SI': 'Fastball',
91
+ 'ST': 'Breaking',
92
+ 'SL': 'Breaking',
93
+ 'CU': 'Breaking',
94
+ 'KC': 'Breaking',
95
+ 'SC': 'Off-Speed',
96
+ 'GY': 'Off-Speed',
97
+ 'SV': 'Breaking',
98
+ 'CS': 'Breaking',
99
+ 'CH': 'Off-Speed',
100
+ 'KN': 'Off-Speed',
101
+ 'EP': 'Breaking',
102
+ 'UN': None,
103
+ 'IN': None,
104
+ 'PO': None,
105
+ 'AB': None,
106
+ 'AS': None,
107
+ 'NP': None}
108
+
109
+
110
+ df = df.with_columns([
111
+ pl.when(df['type_ab'].is_not_null()).then(1).otherwise(0).alias('pa'),
112
+ pl.when(df['is_pitch']).then(1).otherwise(0).alias('pitches'),
113
+ pl.when(df['sz_top'] == 0).then(None).otherwise(df['sz_top']).alias('sz_top'),
114
+ pl.when(df['sz_bot'] == 0).then(None).otherwise(df['sz_bot']).alias('sz_bot'),
115
+ pl.when(df['zone'] > 0).then(df['zone'] < 10).otherwise(None).alias('in_zone'),
116
+ pl.Series(px_model.predict(df[['x']].fill_null(0).to_numpy())[:, 0]).alias('px_predict'),
117
+ pl.Series(pz_model.predict(df[['y']].fill_null(0).to_numpy())[:, 0] + 3.2).alias('pz_predict'),
118
+ pl.Series(in_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('in_zone_predict'),
119
+ pl.Series(attack_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('attack_zone_predict'),
120
+ pl.when(df['event_type'].is_in(hit_codes)).then(True).otherwise(False).alias('hits'),
121
+ pl.when(df['event_type'].is_in(ab_codes)).then(True).otherwise(False).alias('ab'),
122
+ pl.when(df['event_type'].is_in(obp_true_codes)).then(True).otherwise(False).alias('on_base'),
123
+ pl.when(df['event_type'].is_in(obp_codes)).then(True).otherwise(False).alias('obp'),
124
+ pl.when(df['play_description'].is_in(bip_codes)).then(True).otherwise(False).alias('bip'),
125
+ pl.when(conditions_barrel[0]).then(choices_barrel[0]).when(conditions_barrel[1]).then(choices_barrel[1]).otherwise(None).alias('barrel'),
126
+ pl.when(df['launch_angle'].is_null()).then(False).when((df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)).then(True).otherwise(None).alias('sweet_spot'),
127
+ pl.when(df['launch_speed'].is_null()).then(False).when(df['launch_speed'] >= 94.5).then(True).otherwise(None).alias('hard_hit'),
128
+ pl.when(conditions_tb[0]).then(choices_tb[0]).when(conditions_tb[1]).then(choices_tb[1]).when(conditions_tb[2]).then(choices_tb[2]).when(conditions_tb[3]).then(choices_tb[3]).otherwise(None).alias('tb'),
129
+ pl.when(conditions_woba[0]).then(choices_woba[0]).when(conditions_woba[1]).then(choices_woba[1]).when(conditions_woba[2]).then(choices_woba[2]).when(conditions_woba[3]).then(choices_woba[3]).when(conditions_woba[4]).then(choices_woba[4]).when(conditions_woba[5]).then(choices_woba[5]).when(conditions_woba[6]).then(choices_woba[6]).otherwise(None).alias('woba'),
130
+ pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T')).then(1).otherwise(0).alias('whiffs'),
131
+ pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T') | (df['play_code'] == 'C')).then(1).otherwise(0).alias('csw'),
132
+ pl.when(pl.col('is_swing').cast(pl.Boolean)).then(1).otherwise(0).alias('swings'),
133
+ pl.col('event_type').is_in(['strikeout','strikeout_double_play']).alias('k'),
134
+ pl.col('event_type').is_in(['walk', 'intent_walk']).alias('bb'),
135
+ pl.lit(None).alias('attack_zone'),
136
+ pl.lit(None).alias('woba_pred'),
137
+ pl.lit(None).alias('woba_pred_contact')
138
+
139
+ ])
140
+
141
+ df = df.with_columns([
142
+ pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('woba_codes'),
143
+ pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('xwoba_codes'),
144
+ pl.when((pl.col('tb') >= 0)).then(df['woba']).otherwise(None).alias('woba_contact'),
145
+ pl.when(pl.col('px').is_null()).then(pl.col('px_predict')).otherwise(pl.col('px')).alias('px'),
146
+ pl.when(pl.col('pz').is_null()).then(pl.col('pz_predict')).otherwise(pl.col('pz')).alias('pz'),
147
+ pl.when(pl.col('in_zone').is_null()).then(pl.col('in_zone_predict')).otherwise(pl.col('in_zone')).alias('in_zone'),
148
+ pl.when(df['launch_speed'].is_null()).then(None).otherwise(df['barrel']).alias('barrel'),
149
+ pl.lit('average').alias('average'),
150
+ pl.when(pl.col('in_zone') == False).then(True).otherwise(False).alias('out_zone'),
151
+ pl.when((pl.col('in_zone') == True) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('zone_swing'),
152
+ pl.when((pl.col('in_zone') == True) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('zone_contact'),
153
+ pl.when((pl.col('in_zone') == False) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('ozone_swing'),
154
+ pl.when((pl.col('in_zone') == False) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('ozone_contact'),
155
+ pl.when(pl.col('event_type').str.contains('strikeout')).then(True).otherwise(False).alias('k'),
156
+ pl.when(pl.col('event_type').is_in(['walk', 'intent_walk'])).then(True).otherwise(False).alias('bb'),
157
+ pl.when(pl.col('attack_zone').is_null()).then(pl.col('attack_zone_predict')).otherwise(pl.col('attack_zone')).alias('attack_zone'),
158
+
159
+
160
+ ])
161
+
162
+ df = df.with_columns([
163
+ (df['k'].cast(pl.Float32) - df['bb'].cast(pl.Float32)).alias('k_minus_bb'),
164
+ (df['bb'].cast(pl.Float32) - df['k'].cast(pl.Float32)).alias('bb_minus_k'),
165
+ (df['launch_speed'] > 0).alias('bip_div'),
166
+ (df['attack_zone'] == 0).alias('heart'),
167
+ (df['attack_zone'] == 1).alias('shadow'),
168
+ (df['attack_zone'] == 2).alias('chase'),
169
+ (df['attack_zone'] == 3).alias('waste'),
170
+ ((df['attack_zone'] == 0) & (df['swings'] == 1)).alias('heart_swing'),
171
+ ((df['attack_zone'] == 1) & (df['swings'] == 1)).alias('shadow_swing'),
172
+ ((df['attack_zone'] == 2) & (df['swings'] == 1)).alias('chase_swing'),
173
+ ((df['attack_zone'] == 3) & (df['swings'] == 1)).alias('waste_swing'),
174
+ ((df['attack_zone'] == 0) & (df['whiffs'] == 1)).alias('heart_whiff'),
175
+ ((df['attack_zone'] == 1) & (df['whiffs'] == 1)).alias('shadow_whiff'),
176
+ ((df['attack_zone'] == 2) & (df['whiffs'] == 1)).alias('chase_whiff'),
177
+ ((df['attack_zone'] == 3) & (df['whiffs'] == 1)).alias('waste_whiff')
178
+ ])
179
+
180
+
181
+ [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
182
+
183
+ df = df.with_columns([
184
+ pl.Series(
185
+ [sum(x) for x in xwoba_model.predict_proba(df[['launch_angle', 'launch_speed']].fill_null(0).to_numpy()[:]) * ([0, 0.881, 1.254, 1.589, 2.048])]
186
+ ).alias('woba_pred_predict')
187
+ ])
188
+
189
+ df = df.with_columns([
190
+ pl.when(pl.col('event_type').is_in(['walk'])).then(0.689)
191
+ .when(pl.col('event_type').is_in(['hit_by_pitch'])).then(0.720)
192
+ .when(pl.col('event_type').is_in(['strikeout', 'strikeout_double_play'])).then(0)
193
+ .otherwise(pl.col('woba_pred_predict')).alias('woba_pred_predict')
194
+ ])
195
+
196
+ df = df.with_columns([
197
+ pl.when(pl.col('woba_codes').is_null()).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred'),
198
+ pl.when(pl.col('bip')!=1).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred_contact'),
199
+ ])
200
+
201
+ df = df.with_columns([
202
+ pl.when(pl.col('trajectory').is_in(['bunt_popup'])).then(pl.lit('popup'))
203
+ .when(pl.col('trajectory').is_in(['bunt_grounder'])).then(pl.lit('ground_ball'))
204
+ .when(pl.col('trajectory').is_in(['bunt_line_drive'])).then(pl.lit('line_drive'))
205
+ .when(pl.col('trajectory').is_in([''])).then(pl.lit(None))
206
+ .otherwise(pl.col('trajectory')).alias('trajectory')
207
+ ])
208
+
209
+
210
+ # Create one-hot encoded columns for the trajectory column
211
+ dummy_df = df.select(pl.col('trajectory')).to_dummies()
212
+
213
+ # Rename the one-hot encoded columns
214
+ dummy_df = dummy_df.rename({
215
+ 'trajectory_fly_ball': 'trajectory_fly_ball',
216
+ 'trajectory_ground_ball': 'trajectory_ground_ball',
217
+ 'trajectory_line_drive': 'trajectory_line_drive',
218
+ 'trajectory_popup': 'trajectory_popup'
219
+ })
220
+
221
+ # Ensure the columns are present in the DataFrame
222
+ for col in ['trajectory_fly_ball', 'trajectory_ground_ball', 'trajectory_line_drive', 'trajectory_popup']:
223
+ if col not in dummy_df.columns:
224
+ dummy_df = dummy_df.with_columns(pl.lit(0).alias(col))
225
+
226
+ # Join the one-hot encoded columns back to the original DataFrame
227
+ df = df.hstack(dummy_df)
228
+
229
+ # Check if 'trajectory_null' column exists and drop it
230
+ if 'trajectory_null' in df.columns:
231
+ df = df.drop('trajectory_null')
232
+
233
+
234
+ pitch_cat = {'FA': None,
235
+ 'FF': 'Fastball',
236
+ 'FT': 'Fastball',
237
+ 'FC': 'Fastball',
238
+ 'FS': 'Off-Speed',
239
+ 'FO': 'Off-Speed',
240
+ 'SI': 'Fastball',
241
+ 'ST': 'Breaking',
242
+ 'SL': 'Breaking',
243
+ 'CU': 'Breaking',
244
+ 'KC': 'Breaking',
245
+ 'SC': 'Off-Speed',
246
+ 'GY': 'Off-Speed',
247
+ 'SV': 'Breaking',
248
+ 'CS': 'Breaking',
249
+ 'CH': 'Off-Speed',
250
+ 'KN': 'Off-Speed',
251
+ 'EP': 'Breaking',
252
+ 'UN': None,
253
+ 'IN': None,
254
+ 'PO': None,
255
+ 'AB': None,
256
+ 'AS': None,
257
+ 'NP': None}
258
+ df = df.with_columns(
259
+ df["pitch_type"].map_elements(lambda x: pitch_cat.get(x, x)).alias("pitch_group")
260
+ )
261
+
262
+ return df
263
+
264
+ # Assuming df is your Polars DataFrame
265
def update_summary(self, df: pl.DataFrame, pitcher: bool = True) -> pl.DataFrame:
    """
    Collapse ``df`` into one row per player with counting and rate statistics.

    Parameters:
    df (pl.DataFrame): Input frame. It must already contain every column
        aggregated below (for example 'pa', 'ab', 'is_pitch', 'swings',
        'attack_zone' and the 'trajectory_*' indicator columns).
    pitcher (bool): Group by 'pitcher_id'/'pitcher_name' when True,
        otherwise by 'batter_id'/'batter_name'.

    Returns:
    pl.DataFrame: The two grouping keys, the aggregated columns, and the
        ratio columns derived from those aggregates.
    """
    role = 'pitcher' if pitcher else 'batter'
    zones = ('heart', 'shadow', 'chase', 'waste')

    def total(source, output=None):
        # Sum one input column, optionally publishing it under a new name.
        return pl.col(source).sum().alias(output or source)

    def ratio(output, numerator, denominator):
        # Quotient of two already-aggregated columns.
        return (pl.col(numerator) / pl.col(denominator)).alias(output)

    # Order matters: it fixes the column order of the returned frame.
    aggregations = [
        total('pa'),
        total('ab'),
        total('obp', 'obp_pa'),
        *[total(name) for name in (
            'hits', 'on_base', 'k', 'bb', 'bb_minus_k', 'csw', 'bip',
            'bip_div', 'tb', 'woba', 'woba_contact',
        )],
        total('woba_pred', 'xwoba'),
        total('woba_pred_contact', 'xwoba_contact'),
        *[total(name) for name in (
            'woba_codes', 'xwoba_codes', 'hard_hit', 'barrel', 'sweet_spot',
        )],
        pl.col('launch_speed').max().alias('max_launch_speed'),
        pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
        pl.col('launch_speed').mean().alias('launch_speed'),
        pl.col('launch_angle').mean().alias('launch_angle'),
        total('is_pitch', 'pitches'),
        *[total(name) for name in (
            'swings', 'in_zone', 'out_zone', 'whiffs', 'zone_swing',
            'zone_contact', 'ozone_swing', 'ozone_contact',
        )],
        total('trajectory_ground_ball', 'ground_ball'),
        total('trajectory_line_drive', 'line_drive'),
        total('trajectory_fly_ball', 'fly_ball'),
        total('trajectory_popup', 'pop_up'),
        # Non-null count: used as the denominator of the zone shares below.
        pl.col('attack_zone').count().alias('attack_zone'),
        *[total(zone) for zone in zones],
        *[total(f'{zone}_swing') for zone in zones],
        *[total(f'{zone}_whiff') for zone in zones],
    ]

    # Shared sub-expressions for the rates that are not a plain a / b.
    on_base_rate = pl.col('on_base') / pl.col('obp_pa')
    slugging = pl.col('tb') / pl.col('ab')
    pitches_minus_in_zone = pl.col('pitches') - pl.col('in_zone')

    rates = [
        ratio('avg', 'hits', 'ab'),
        on_base_rate.alias('obp'),
        slugging.alias('slg'),
        (on_base_rate + slugging).alias('ops'),
        ratio('k_percent', 'k', 'pa'),
        ratio('bb_percent', 'bb', 'pa'),
        ratio('bb_minus_k_percent', 'bb_minus_k', 'pa'),
        ratio('bb_over_k_percent', 'bb', 'k'),
        ratio('csw_percent', 'csw', 'pitches'),
        ratio('sweet_spot_percent', 'sweet_spot', 'bip_div'),
        ratio('woba_percent', 'woba', 'woba_codes'),
        ratio('woba_percent_contact', 'woba_contact', 'bip'),
        ratio('hard_hit_percent', 'hard_hit', 'bip_div'),
        ratio('barrel_percent', 'barrel', 'bip_div'),
        ratio('zone_contact_percent', 'zone_contact', 'zone_swing'),
        ratio('zone_swing_percent', 'zone_swing', 'in_zone'),
        ratio('zone_percent', 'in_zone', 'pitches'),
        (pl.col('ozone_swing') / pitches_minus_in_zone).alias('chase_percent'),
        ratio('chase_contact', 'ozone_contact', 'ozone_swing'),
        ratio('swing_percent', 'swings', 'pitches'),
        ratio('whiff_rate', 'whiffs', 'swings'),
        ratio('swstr_rate', 'whiffs', 'pitches'),
        ratio('ground_ball_percent', 'ground_ball', 'bip'),
        ratio('line_drive_percent', 'line_drive', 'bip'),
        ratio('fly_ball_percent', 'fly_ball', 'bip'),
        ratio('pop_up_percent', 'pop_up', 'bip'),
        *[ratio(f'{zone}_zone_percent', zone, 'attack_zone') for zone in zones],
        *[ratio(f'{zone}_zone_swing_percent', f'{zone}_swing', zone) for zone in zones],
        *[ratio(f'{zone}_zone_whiff_percent', f'{zone}_whiff', f'{zone}_swing') for zone in zones],
        ratio('xwoba_percent', 'xwoba', 'xwoba_codes'),
        ratio('xwoba_percent_contact', 'xwoba_contact', 'bip'),
    ]

    grouped = df.group_by([f'{role}_id', f'{role}_name']).agg(aggregations)
    return grouped.with_columns(rates)
383
+
384
+
385
+
386
+
387
+
388
+
389
+ # Assuming df is your Polars DataFrame
390
def update_summary_select(self, df: pl.DataFrame, selection: list) -> pl.DataFrame:
    """
    Aggregate ``df`` by caller-chosen grouping columns and derive rate statistics.

    Parameters:
    df (pl.DataFrame): Input frame. It must already contain every column
        aggregated below (for example 'pa', 'ab', 'is_pitch', 'swings',
        'attack_zone', the 'trajectory_*' indicator columns and
        'tj_stuff_plus').
    selection (list): Column names handed to ``group_by``; one output row is
        produced per distinct combination of their values.

    Returns:
    pl.DataFrame: The grouping columns, the aggregated columns, and the
        ratio columns derived from those aggregates.
    """
    zone_names = ('heart', 'shadow', 'chase', 'waste')

    def summed(source, output=None):
        # Sum one input column, optionally publishing it under a new name.
        return pl.col(source).sum().alias(output or source)

    def share(output, numerator, denominator):
        # Quotient of two already-aggregated columns.
        return (pl.col(numerator) / pl.col(denominator)).alias(output)

    # Order matters: it fixes the column order of the returned frame.
    aggregations = [
        summed('pa'),
        summed('ab'),
        summed('obp', 'obp_pa'),
        *[summed(name) for name in (
            'hits', 'on_base', 'k', 'bb', 'bb_minus_k', 'csw', 'bip',
            'bip_div', 'tb', 'woba', 'woba_contact',
        )],
        summed('woba_pred', 'xwoba'),
        summed('woba_pred_contact', 'xwoba_contact'),
        *[summed(name) for name in (
            'woba_codes', 'xwoba_codes', 'hard_hit', 'barrel', 'sweet_spot',
        )],
        pl.col('launch_speed').max().alias('max_launch_speed'),
        pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
        pl.col('launch_speed').mean().alias('launch_speed'),
        pl.col('launch_angle').mean().alias('launch_angle'),
        summed('is_pitch', 'pitches'),
        *[summed(name) for name in (
            'swings', 'in_zone', 'out_zone', 'whiffs', 'zone_swing',
            'zone_contact', 'ozone_swing', 'ozone_contact',
        )],
        summed('trajectory_ground_ball', 'ground_ball'),
        summed('trajectory_line_drive', 'line_drive'),
        summed('trajectory_fly_ball', 'fly_ball'),
        summed('trajectory_popup', 'pop_up'),
        # Non-null count: used as the denominator of the zone shares below.
        pl.col('attack_zone').count().alias('attack_zone'),
        *[summed(zone) for zone in zone_names],
        *[summed(f'{zone}_swing') for zone in zone_names],
        *[summed(f'{zone}_whiff') for zone in zone_names],
        summed('tj_stuff_plus'),
    ]

    # Shared sub-expressions for the rates that are not a plain a / b.
    on_base_rate = pl.col('on_base') / pl.col('obp_pa')
    slugging = pl.col('tb') / pl.col('ab')
    pitches_minus_in_zone = pl.col('pitches') - pl.col('in_zone')

    rates = [
        share('avg', 'hits', 'ab'),
        on_base_rate.alias('obp'),
        slugging.alias('slg'),
        (on_base_rate + slugging).alias('ops'),
        share('k_percent', 'k', 'pa'),
        share('bb_percent', 'bb', 'pa'),
        share('bb_minus_k_percent', 'bb_minus_k', 'pa'),
        share('bb_over_k_percent', 'bb', 'k'),
        share('csw_percent', 'csw', 'pitches'),
        share('sweet_spot_percent', 'sweet_spot', 'bip_div'),
        share('woba_percent', 'woba', 'woba_codes'),
        share('woba_percent_contact', 'woba_contact', 'bip'),
        share('hard_hit_percent', 'hard_hit', 'bip_div'),
        share('barrel_percent', 'barrel', 'bip_div'),
        share('zone_contact_percent', 'zone_contact', 'zone_swing'),
        share('zone_swing_percent', 'zone_swing', 'in_zone'),
        share('zone_percent', 'in_zone', 'pitches'),
        (pl.col('ozone_swing') / pitches_minus_in_zone).alias('chase_percent'),
        share('chase_contact', 'ozone_contact', 'ozone_swing'),
        share('swing_percent', 'swings', 'pitches'),
        share('whiff_rate', 'whiffs', 'swings'),
        share('swstr_rate', 'whiffs', 'pitches'),
        share('ground_ball_percent', 'ground_ball', 'bip'),
        share('line_drive_percent', 'line_drive', 'bip'),
        share('fly_ball_percent', 'fly_ball', 'bip'),
        share('pop_up_percent', 'pop_up', 'bip'),
        *[share(f'{zone}_zone_percent', zone, 'attack_zone') for zone in zone_names],
        *[share(f'{zone}_zone_swing_percent', f'{zone}_swing', zone) for zone in zone_names],
        *[share(f'{zone}_zone_whiff_percent', f'{zone}_whiff', f'{zone}_swing') for zone in zone_names],
        share('xwoba_percent', 'xwoba', 'xwoba_codes'),
        share('xwoba_percent_contact', 'xwoba_contact', 'bip'),
        # NOTE(review): this is sum / pitch count, not a mean of the non-null
        # scores; rows with a null 'tj_stuff_plus' would still count in the
        # denominator - confirm that is intended.
        share('tj_stuff_plus_avg', 'tj_stuff_plus', 'pitches'),
    ]

    grouped = df.group_by(selection).agg(aggregations)
    return grouped.with_columns(rates)