nesticot committed on
Commit
1e488de
·
verified ·
1 Parent(s): 1dd8af2

Upload 27 files

Browse files
app.py CHANGED
@@ -1,242 +1,488 @@
1
- from shiny import App, ui, render, reactive
2
- import polars as pl
3
- import numpy as np
4
- import pandas as pd
5
- import api_scraper
6
- scrape = api_scraper.MLB_Scrape()
7
- from functions import df_update
8
- from functions import pitch_summary_functions
9
- update = df_update.df_update()
10
- from stuff_model import feature_engineering as fe
11
- from stuff_model import stuff_apply
12
- import requests
13
- import joblib
14
- from matplotlib.gridspec import GridSpec
15
- import math
16
-
17
- colour_palette = ['#FFB000','#648FFF','#785EF0',
18
- '#DC267F','#FE6100','#3D1EB2','#894D80','#16AA02','#B5592B','#A3C1ED']
19
-
20
- # df = pl.read_csv("data.csv")
21
- df = pl.read_parquet("data.parquet")
22
- print('df')
23
-
24
- year_input = 2024
25
- sport_id = 1
26
-
27
- df_schedule = scrape.get_schedule(year_input=[year_input],sport_id=[sport_id])
28
- df = df.join(df_schedule, on='game_id', how='left')
29
-
30
- df = df.with_columns(
31
- pl.when((pl.col('batter_team_id') == pl.col('away_id')))
32
- .then(pl.lit('Away'))
33
- .when((pl.col('batter_team_id') == pl.col('home_id')))
34
- .then(pl.lit('Home'))
35
- .otherwise(None)
36
- .alias('home_away_batter')
37
- )
38
-
39
- df = df.with_columns(
40
- pl.when((pl.col('pitcher_team_id') == pl.col('away_id')))
41
- .then(pl.lit('Away'))
42
- .when((pl.col('pitcher_team_id') == pl.col('home_id')))
43
- .then(pl.lit('Home'))
44
- .otherwise(None)
45
- .alias('home_away_pitcher')
46
- )
47
-
48
- print('schedule')
49
-
50
- df_stuff = stuff_apply.stuff_apply(fe.feature_engineering(df))
51
- print('stuff')
52
- df_up = update.update(df)
53
- print('update')
54
- df_total = df_up.join(df_stuff[['play_id','tj_stuff_plus']], on='play_id', how='left')
55
- print('total')
56
-
57
- stat_titles = {
58
-
59
-
60
-
61
- 'pa': 'PA',
62
- 'bip': 'BBE',
63
- 'hits': 'H',
64
- 'k': 'K',
65
- 'bb': 'BB',
66
- 'max_launch_speed': 'Max EV',
67
- 'launch_speed_90': '90th% EV',
68
- 'launch_speed': 'EV',
69
- 'pitches': 'Pitches',
70
- 'tj_stuff_plus_avg': 'tjStuff+',
71
- 'avg': 'AVG',
72
- 'obp': 'OBP',
73
- 'slg': 'SLG',
74
- 'ops': 'OPS',
75
- 'k_percent': 'K%',
76
- 'bb_percent': 'BB%',
77
- 'k_minus_bb_percent': 'K-BB%',
78
- 'sweet_spot_percent': 'SwSpot%',
79
- 'woba_percent': 'wOBA',
80
- 'xwoba_percent': 'xwOBA',
81
- 'woba_percent_contact': 'wOBACON',
82
- 'xwoba_percent_contact': 'xwOBACON',
83
- 'hard_hit_percent': 'HardHit%',
84
- 'barrel_percent': 'Barrel%',
85
- 'zone_contact_percent': 'Z-Contact%',
86
- 'zone_swing_percent': 'Z-Swing%',
87
- 'zone_percent': 'Zone%',
88
- 'chase_percent': 'O-Swing%',
89
- 'chase_contact': 'O-Contact%',
90
- 'swing_percent': 'Swing%',
91
- 'whiff_rate': 'Whiff%',
92
- 'swstr_rate': 'SwStr%',
93
- 'ground_ball_percent': 'GB%',
94
- 'line_drive_percent': 'LD%',
95
- 'fly_ball_percent': 'FB%',
96
- 'pop_up_percent': 'PU%',
97
- 'heart_zone_swing_percent': 'Heart Swing%',
98
- 'shadow_zone_swing_percent': 'Shadow Swing%',
99
- 'chase_zone_swing_percent': 'Chase Swing%',
100
- 'waste_zone_swing_percent': 'Waste Swing%',
101
- 'heart_zone_whiff_percent': 'Heart Whiff%',
102
- 'shadow_zone_whiff_percent': 'Shadow Whiff%',
103
- 'chase_zone_whiff_percent': 'Chase Whiff%',
104
- 'waste_zone_whiff_percent': 'Waste Whiff%'
105
- }
106
-
107
- agg_titles = { 'batter_id':'Batter ID',
108
- 'batter_name':'Batter Name',
109
- 'batter_team':'Batter Team',
110
- 'batter_hand':'Batter Hand',
111
- 'pitcher_id':'Pitcher ID',
112
- 'pitcher_name':'Pitcher Name',
113
- 'pitcher_team':'Pitcher Team',
114
- 'pitcher_hand':'Pitcher Hand',
115
- 'pitch_type':'Pitch Type',
116
- 'pitch_group':'Pitch Group',
117
- 'home_away_batter':'Home/Away Batter',
118
- 'home_away_pitcher':'Home/Away Pitcher'}
119
-
120
- stat_selection = [key for key in stat_titles.keys()]
121
-
122
- # Pad each value with non-breaking spaces to a length of 30
123
- stat_titles_padded = {key: value + '\u00A0' * (30 - len(value)) for key, value in (agg_titles | stat_titles).items()}
124
-
125
- rounding_dict = {
126
- 'pa': 0,
127
- 'bip': 0,
128
- 'hits': 0,
129
- 'k': 0,
130
- 'bb': 0,
131
- 'max_launch_speed': 1,
132
- 'launch_speed_90': 1,
133
- 'launch_speed': 1,
134
- 'pitches': 0,
135
- 'tj_stuff_plus_avg': 1,
136
- 'avg': 3,
137
- 'obp': 3,
138
- 'slg': 3,
139
- 'ops': 3,
140
- 'k_percent': 3,
141
- 'bb_percent': 3,
142
- 'sweet_spot_percent': 3,
143
- 'woba_percent': 3,
144
- 'xwoba_percent': 3,
145
- 'woba_percent_contact': 3,
146
- 'xwoba_percent_contact': 3,
147
- 'hard_hit_percent': 3,
148
- 'barrel_percent': 3,
149
- 'zone_contact_percent': 3,
150
- 'zone_swing_percent': 3,
151
- 'zone_percent': 3,
152
- 'chase_percent': 3,
153
- 'chase_contact': 3,
154
- 'swing_percent': 3,
155
- 'whiff_rate': 3,
156
- 'swstr_rate': 3,
157
- 'ground_ball_percent': 3,
158
- 'line_drive_percent': 3,
159
- 'fly_ball_percent': 3,
160
- 'pop_up_percent': 3,
161
- 'heart_zone_swing_percent': 3,
162
- 'shadow_zone_swing_percent': 3,
163
- 'chase_zone_swing_percent': 3,
164
- 'waste_zone_swing_percent': 3,
165
- 'heart_zone_whiff_percent': 3,
166
- 'shadow_zone_whiff_percent': 3,
167
- 'chase_zone_whiff_percent': 3,
168
- 'waste_zone_whiff_percent': 3
169
- }
170
-
171
-
172
-
173
- app_ui = ui.page_sidebar(
174
- ui.sidebar(
175
-
176
-
177
- ui.input_selectize(
178
- "list_input",
179
- "Select Aggregation:",
180
- choices=agg_titles,
181
- multiple=True,
182
- selected=['batter_id', 'batter_name']
183
- ),
184
- ui.input_selectize(
185
- "list_stats",
186
- "Select Stats:",
187
- choices=stat_titles,
188
- multiple=True,
189
- selected=['pa']
190
- ),
191
- ui.input_date_range("date_id", "Select Date Range",
192
- start=df_total['game_date'].min(),
193
- end=df_total['game_date'].max(),
194
- min=df_total['game_date'].min(),
195
- max=df_total['game_date'].max()),
196
-
197
- ui.input_action_button("generate_table", "Generate Table", class_="btn-primary"),
198
- width="400px"
199
-
200
- ),
201
- ui.card(
202
- ui.card_header("Leaderboard"),
203
- ui.output_data_frame("list_view")
204
- )
205
- )
206
-
207
- def server(input, output, session):
208
- @output
209
- @render.data_frame
210
- @reactive.event(input.generate_table, ignore_none=False)
211
- def list_view():
212
-
213
- start_date = str(input.date_id()[0])
214
- end_date = str(input.date_id()[1])
215
- selection_list = list(input.list_input())
216
- stat_list = list(input.list_stats())
217
- print(selection_list)
218
- df_agg = update.update_summary_select(df=df_total.filter((pl.col('game_date')>=start_date)&(pl.col('game_date')<=end_date)),
219
- selection=selection_list)
220
- df_agg = df_agg.select(selection_list+stat_list)
221
- for col in df_agg.columns[len(selection_list):]:
222
- if col in rounding_dict:
223
- df_agg = df_agg.with_columns(pl.col(col).round(rounding_dict[col]))
224
-
225
-
226
- # Only rename columns that exist in stat_titles
227
- print(df_agg.columns)
228
- rename_dict = {col: stat_titles_padded.get(col, col) for col in df_agg.columns}
229
- # print(rename_dict)
230
- result_df = df_agg.rename(rename_dict)
231
-
232
- return render.DataGrid(
233
- result_df,
234
- width='2000px',
235
- height='750px',
236
- filters=True,
237
-
238
-
239
-
240
- )
241
-
242
- app = App(app_ui, server)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from shiny import App, ui, render, reactive
2
+ import polars as pl
3
+ import numpy as np
4
+ import pandas as pd
5
+ import api_scraper
6
+ scrape = api_scraper.MLB_Scrape()
7
+ from functions import df_update
8
+ from functions import pitch_summary_functions
9
+ update = df_update.df_update()
10
+ from stuff_model import feature_engineering as fe
11
+ from stuff_model import stuff_apply
12
+ import requests
13
+ import joblib
14
+ from matplotlib.gridspec import GridSpec
15
+ import math
16
+ from pytabulator import TableOptions, Tabulator, output_tabulator, render_tabulator, theme
17
+ theme.tabulator_site()
18
+
19
# Hex colour codes shared by the app.
colour_palette = [
    '#FFB000', '#648FFF', '#785EF0', '#DC267F', '#FE6100',
    '#3D1EB2', '#894D80', '#16AA02', '#B5592B', '#A3C1ED',
]

# Season of the data to load. The parquet file names below are derived from
# it so that the files and the season can never silently disagree.
season = 2024

# One pitch-level parquet file per competition level.
df_mlb = pl.read_parquet(f"data/data_mlb_{season}.parquet")
df_aaa = pl.read_parquet(f"data/data_aaa_{season}.parquet")
df_a = pl.read_parquet(f"data/data_a_{season}.parquet")
31
+
32
+
33
+
34
def _home_away_expr(team_id_col: str, alias: str) -> pl.Expr:
    """Build a 'Home'/'Away' label by matching a team id to the schedule ids.

    The result is 'Away' when ``team_id_col`` equals ``away_id``, 'Home' when
    it equals ``home_id`` and null otherwise.
    """
    return (
        pl.when(pl.col(team_id_col) == pl.col('away_id'))
        .then(pl.lit('Away'))
        .when(pl.col(team_id_col) == pl.col('home_id'))
        .then(pl.lit('Home'))
        .otherwise(None)
        .alias(alias)
    )


def df_final(df: pl.DataFrame, year_input: int, sport_id: int) -> pl.DataFrame:
    """Enrich one level's pitch data with schedule, summary and stuff columns.

    Args:
        df: Pitch-level frame; must contain ``game_id``, ``play_id``,
            ``batter_team_id`` and ``pitcher_team_id``.
        year_input: Season passed to ``scrape.get_schedule``.
        sport_id: Sport/level id passed to ``scrape.get_schedule``.

    Returns:
        The output of ``update.update`` with ``tj_stuff_plus`` left-joined
        on ``play_id``.
    """
    df_schedule = scrape.get_schedule(year_input=[year_input], sport_id=[sport_id])
    df = df.join(df_schedule, on='game_id', how='left')

    # NOTE(review): the batter label is aliased 'home_away' here, while the UI
    # offers a 'home_away_batter' aggregation key -- confirm that
    # update.update() produces/renames that column, otherwise selecting it
    # will fail.
    df = df.with_columns(_home_away_expr('batter_team_id', 'home_away'))
    df = df.with_columns(_home_away_expr('pitcher_team_id', 'home_away_pitcher'))

    # Progress markers printed while the app starts up.
    print('schedule')

    df_stuff = stuff_apply.stuff_apply(fe.feature_engineering(df))
    print('stuff')
    df_up = update.update(df)
    print('update')
    df_total = df_up.join(df_stuff[['play_id', 'tj_stuff_plus']], on='play_id', how='left')
    print('total')
    return df_total
67
+
68
+
69
# Fully processed frame for each level, built once at start-up. Each level is
# paired with the sport_id used for its schedule lookup.
_df_a_with_speed = df_a.drop_nulls(subset=['start_speed'])  # drop rows without start_speed

df_mlb_total = df_final(df=df_mlb, year_input=season, sport_id=1)
df_aaa_total = df_final(df=df_aaa, year_input=season, sport_id=11)
df_a_total = df_final(df=_df_a_with_speed, year_input=season, sport_id=14)
72
+
73
# Stats displayed as whole numbers.
_ROUND_TO_0 = (
    'pa', 'bip', 'hits', 'k', 'bb', 'pitches',
    'tj_stuff_plus_avg', 'spin_rate_avg',
)

# Stats displayed with one decimal place.
_ROUND_TO_1 = (
    'max_launch_speed', 'launch_speed_90', 'launch_speed',
    'start_speed_avg', 'vb_avg', 'ivb_avg', 'hb_avg',
    'z0_avg', 'x0_avg', 'vaa_avg', 'haa_avg', 'extension_avg',
)

# Rate stats kept to three decimal places.
_ROUND_TO_3 = (
    'avg', 'obp', 'slg', 'ops',
    'k_percent', 'bb_percent', 'k_minus_bb_percent', 'sweet_spot_percent',
    'woba_percent', 'xwoba_percent',
    'woba_percent_contact', 'xwoba_percent_contact',
    'hard_hit_percent', 'barrel_percent',
    'zone_contact_percent', 'zone_swing_percent', 'zone_percent',
    'chase_percent', 'chase_contact', 'swing_percent',
    'whiff_rate', 'swstr_rate',
    'ground_ball_percent', 'line_drive_percent',
    'fly_ball_percent', 'pop_up_percent',
    'heart_zone_swing_percent', 'shadow_zone_swing_percent',
    'chase_zone_swing_percent', 'waste_zone_swing_percent',
    'heart_zone_whiff_percent', 'shadow_zone_whiff_percent',
    'chase_zone_whiff_percent', 'waste_zone_whiff_percent',
)

# Maps a stat column name to the number of decimals it is rounded to.
rounding_dict = {
    **dict.fromkeys(_ROUND_TO_0, 0),
    **dict.fromkeys(_ROUND_TO_1, 1),
    **dict.fromkeys(_ROUND_TO_3, 3),
}
129
+
130
# formatterParams used by every percentage column (rendered through
# Tabulator's "money" formatter with a trailing "%" and one decimal place).
# Values are kept exactly as they were written inline for each column.
_PERCENT_FORMATTER_PARAMS = {
    "decimal": ".",
    "thousand": ".",
    "symbol": "%",
    "symbolAfter": "%",
    "negativeSign": True,
    "precision": 1,
}


def _stat_column(title, field):
    """Return a plain 150px-wide column definition."""
    return {"title": title, "field": field, "width": 150}


def _percent_column(title, field):
    """Return a column definition formatted as a percentage."""
    return {
        **_stat_column(title, field),
        "formatter": "money",
        # Each column gets its own copy so no definition shares mutable state.
        "formatterParams": dict(_PERCENT_FORMATTER_PARAMS),
    }


# Column definitions for every selectable stat, in display order.
# Each field appears exactly once: the list used to contain a second
# "Extension" entry, which produced a duplicated table column.
columns = [
    _stat_column("PA", "pa"),
    _stat_column("BBE", "bip"),
    _stat_column("H", "hits"),
    _stat_column("K", "k"),
    _stat_column("BB", "bb"),
    _stat_column("Max EV", "max_launch_speed"),
    _stat_column("90th% EV", "launch_speed_90"),
    _stat_column("EV", "launch_speed"),
    _stat_column("Pitches", "pitches"),
    _stat_column("AVG", "avg"),
    _stat_column("OBP", "obp"),
    _stat_column("SLG", "slg"),
    _stat_column("OPS", "ops"),
    _percent_column("K%", "k_percent"),
    _percent_column("BB%", "bb_percent"),
    _percent_column("K-BB%", "k_minus_bb_percent"),
    _percent_column("SwSpot%", "sweet_spot_percent"),
    _stat_column("wOBA", "woba_percent"),
    _stat_column("xwOBA", "xwoba_percent"),
    _stat_column("wOBACON", "woba_percent_contact"),
    _stat_column("xwOBACON", "xwoba_percent_contact"),
    _percent_column("HardHit%", "hard_hit_percent"),
    _percent_column("Barrel%", "barrel_percent"),
    _percent_column("Z-Contact%", "zone_contact_percent"),
    _percent_column("Z-Swing%", "zone_swing_percent"),
    _percent_column("Zone%", "zone_percent"),
    _percent_column("O-Swing%", "chase_percent"),
    _percent_column("O-Contact%", "chase_contact"),
    _percent_column("Swing%", "swing_percent"),
    _percent_column("Whiff%", "whiff_rate"),
    _percent_column("SwStr%", "swstr_rate"),
    _percent_column("GB%", "ground_ball_percent"),
    _percent_column("LD%", "line_drive_percent"),
    _percent_column("FB%", "fly_ball_percent"),
    _percent_column("PU%", "pop_up_percent"),
    _percent_column("Heart Swing%", "heart_zone_swing_percent"),
    _percent_column("Shadow Swing%", "shadow_zone_swing_percent"),
    _percent_column("Chase Swing%", "chase_zone_swing_percent"),
    _percent_column("Waste Swing%", "waste_zone_swing_percent"),
    _percent_column("Heart Whiff%", "heart_zone_whiff_percent"),
    _percent_column("Shadow Whiff%", "shadow_zone_whiff_percent"),
    _percent_column("Chase Whiff%", "chase_zone_whiff_percent"),
    _percent_column("Waste Whiff%", "waste_zone_whiff_percent"),
    _stat_column("tjStuff+", "tj_stuff_plus_avg"),
    _stat_column("Velocity", "start_speed_avg"),
    _stat_column("Extension", "extension_avg"),
    _stat_column("VB", "vb_avg"),
    _stat_column("iVB", "ivb_avg"),
    _stat_column("HB", "hb_avg"),
    _stat_column("vRel", "z0_avg"),
    _stat_column("hRel", "x0_avg"),
    _stat_column("VAA", "vaa_avg"),
    _stat_column("HAA", "haa_avg"),
    _stat_column("Spin Rate", "spin_rate_avg"),
]
188
+
189
# Stat field name -> display title, taken from the column definitions.
stat_titles = {col["field"]: col["title"] for col in columns}

# All selectable stat field names.
stat_selection = list(stat_titles)
192
+
193
# Grouping (aggregation) field name -> display title.
agg_titles = {
    'batter_id': 'Batter ID',
    'batter_name': 'Batter Name',
    'batter_team': 'Batter Team',
    'batter_hand': 'Batter Hand',
    'pitcher_id': 'Pitcher ID',
    'pitcher_name': 'Pitcher Name',
    'pitcher_team': 'Pitcher Team',
    'pitcher_hand': 'Pitcher Hand',
    'pitch_type': 'Pitch Type',
    'pitch_group': 'Pitch Group',
    'home_away_batter': 'Home/Away Batter',
    'home_away_pitcher': 'Home/Away Pitcher',
    'is_swing': 'Is Swing?',
    'is_bip': 'Is BIP?',
    'in_zone_final': 'In Zone?',
    'attack_zone_final': 'Attack Zone',
}

# Name columns are wider than the other grouping columns.
_WIDE_GROUP_FIELDS = ('batter_name', 'pitcher_name')

# Column definitions for the grouping fields: frozen, each with a text
# header filter. One definition per agg_titles entry, in the same order.
columns_group = [
    {
        "title": group_title,
        "field": group_field,
        "width": 200 if group_field in _WIDE_GROUP_FIELDS else 150,
        "frozen": True,
        "headerFilter": "input",
    }
    for group_field, group_title in agg_titles.items()
]
229
+
230
+
231
# --- Sidebar selectors ------------------------------------------------------
_level_select = ui.input_selectize(
    "level_input",
    "Select Level:",
    choices=['MLB', 'AAA', 'A'],
    multiple=False,
    selected=['MLB'],
)
_aggregation_select = ui.input_selectize(
    "list_input",
    "Select Aggregation:",
    choices=agg_titles,
    multiple=True,
    selected=['batter_id', 'batter_name'],
)
_stats_select = ui.input_selectize(
    "list_stats",
    "Select Stats:",
    choices=stat_titles,
    multiple=True,
    selected=['pa'],
)
_date_range = ui.input_date_range(
    "date_id",
    "Select Date Range",
    start=f'{season}-01-01',
    end=f'{season}-12-01',
    min=f'{season}-01-01',
    max=f'{season}-12-01',
)

# --- First filter row -------------------------------------------------------
# Metric / operator / value inputs plus a delete button. The row carries an
# id so it can be removed; the metric choices start empty and are filled in
# by the server.
_first_filter_row = ui.div(
    {"class": "filter-row", "id": "filter_row_1"},
    ui.row(
        ui.column(5, ui.input_select("filter_column_1", "Metric", choices={})),
        ui.column(3, ui.input_select("filter_operator_1", "Operator", choices=[">=", "<="])),
        ui.column(3, ui.input_numeric("filter_value_1", "Value", value=0)),
        ui.column(
            1,
            ui.markdown("&nbsp;"),
            ui.input_action_button(
                "delete_filter_1",
                "",
                class_="btn-danger btn-sm",
                style="padding: 3px 6px;",
                icon='✖',
            ),
        ),
    ),
)

# Container that further filter rows are inserted into.
_filter_container = ui.div({"id": "filter-container"}, _first_filter_row)

_sidebar = ui.sidebar(
    _level_select,
    _aggregation_select,
    _stats_select,
    _date_range,
    ui.hr(),
    ui.h4("Filters"),
    _filter_container,
    ui.input_action_button("add_filter", "Add Filter", class_="btn-secondary"),
    ui.br(),
    ui.br(),
    ui.input_action_button("generate_table", "Generate Table", class_="btn-primary"),
    width="400px",
)

# Page layout: the control sidebar plus a single "Leaderboard" tab holding
# the Tabulator output.
app_ui = ui.page_sidebar(
    _sidebar,
    ui.navset_tab(
        ui.nav_panel(
            "Leaderboard",
            ui.card(output_tabulator("tabulator")),
        ),
    ),
)
330
+
331
def server(input, output, session):
    """Shiny server: manage the dynamic filter rows and render the leaderboard.

    Filter rows are numbered from 1. ``filter_count`` holds the highest id
    handed out so far and ``active_filters`` the ids whose rows are still
    shown. The table is rebuilt only when "Generate Table" is pressed (and
    once at start-up, because ``ignore_none=False``).
    """
    # Local import so the block does not depend on a new file-level import.
    # Reading an input that has no value yet raises SilentException.
    from shiny.types import SilentException

    # Highest filter id handed out so far.
    filter_count = reactive.value(1)
    # Ids of the filter rows that are currently shown.
    active_filters = reactive.value([1])

    # Fields displayed as percentages; they are stored as fractions.
    percent_fields = {col["field"] for col in columns if col.get("formatter") == "money"}
    # Processed frame for each selectable level.
    level_frames = {"MLB": df_mlb_total, "AAA": df_aaa_total, "A": df_a_total}

    def _metric_choices():
        """Return {field: title} for the currently selected stats."""
        return {key: stat_titles[key] for key in input.list_stats()}

    def _filter_row(index, choices):
        """Build one filter row (metric, operator, value, delete button)."""
        return ui.div(
            {"class": "filter-row", "id": f"filter_row_{index}"},
            ui.row(
                ui.column(5, ui.input_select(f"filter_column_{index}", "Metric", choices=choices)),
                ui.column(3, ui.input_select(f"filter_operator_{index}", "Operator", choices=[">=", "<="])),
                ui.column(3, ui.input_numeric(f"filter_value_{index}", "Value", value=0)),
                ui.column(
                    1,
                    ui.markdown("&nbsp;"),
                    ui.input_action_button(
                        f"delete_filter_{index}",
                        "",
                        class_="btn-danger btn-sm",
                        style="padding: 3px 6px;",
                        icon='✖',
                    ),
                ),
            ),
        )

    @reactive.effect
    @reactive.event(input.list_stats)
    def _sync_filter_metrics():
        # Keep the metric dropdown of every visible row (not only row 1) in
        # step with the selected stats.
        choices = _metric_choices()
        for i in active_filters.get():
            ui.update_select(f"filter_column_{i}", choices=choices)

    @reactive.effect
    @reactive.event(input.add_filter)
    def _add_filter_row():
        new_id = filter_count.get() + 1
        ui.insert_ui(
            selector="#filter-container",
            where="beforeEnd",
            ui=_filter_row(new_id, _metric_choices()),
        )
        filter_count.set(new_id)
        # Store a new list instead of mutating the held one in place, so the
        # reactive value sees a real change.
        active_filters.set([*active_filters.get(), new_id])

    @reactive.effect
    def _remove_deleted_rows():
        # Watch the delete button of every row created so far.
        for i in range(1, filter_count.get() + 1):
            try:
                clicks = getattr(input, f"delete_filter_{i}")()
            except SilentException:
                # Button not registered with the session (yet).
                continue
            if not clicks:
                continue
            # Read without taking a dependency: this effect writes
            # active_filters itself and must not re-trigger on its own write.
            with reactive.isolate():
                remaining = active_filters.get()
            if i in remaining:
                ui.remove_ui(f"#filter_row_{i}")
                active_filters.set([fid for fid in remaining if fid != i])

    @output
    @render_tabulator
    @reactive.event(input.generate_table, ignore_none=False)
    def tabulator():
        selection_list = list(input.list_input())
        stat_list = list(input.list_stats())
        start_date = str(input.date_id()[0])
        end_date = str(input.date_id()[1])

        df_level = level_frames[input.level_input()]
        in_range = (pl.col('game_date') >= start_date) & (pl.col('game_date') <= end_date)
        df_agg = update.update_summary_select(
            df=df_level.filter(in_range),
            selection=selection_list,
        )
        df_agg = df_agg.select(selection_list + stat_list)

        # Apply the filters of the rows that are still shown. A row that
        # cannot be applied is skipped rather than failing the whole table.
        for i in active_filters.get():
            try:
                col_name = getattr(input, f"filter_column_{i}")()
                operator = getattr(input, f"filter_operator_{i}")()
                value = getattr(input, f"filter_value_{i}")()
            except SilentException:
                continue
            # Skip rows with no metric, a blank value, or a metric that is
            # no longer among the selected stats.
            if not col_name or value is None or col_name not in df_agg.columns:
                continue
            if col_name in percent_fields:
                # The user types a percentage; the data holds a fraction.
                value = value / 100
            if operator == ">=":
                df_agg = df_agg.filter(pl.col(col_name) >= value)
            elif operator == "<=":
                df_agg = df_agg.filter(pl.col(col_name) <= value)

        # Round the stat columns (everything after the grouping columns).
        for col in df_agg.columns[len(selection_list):]:
            if col in rounding_dict:
                df_agg = df_agg.with_columns(pl.col(col).round(rounding_dict[col]))

        # Scale fractions to percentages for display.
        for col in df_agg.columns:
            if col in percent_fields:
                df_agg = df_agg.with_columns(pl.col(col) * 100)

        # Column definitions for the fields present in the result: grouping
        # columns first, then stats. Each field is emitted at most once.
        present = set(df_agg.columns)
        emitted = set()
        table_columns = []
        for column in [*columns_group, *columns]:
            field = column.get("field")
            if field in present and field not in emitted:
                emitted.add(field)
                table_columns.append(column)

        return Tabulator(
            df_agg.to_pandas(),
            table_options=TableOptions(
                height=800,
                columns=table_columns,
            ),
        )
487
+
488
# Combine the UI definition and the server function into the Shiny app object.
app = App(app_ui, server)
data/data_a_2024.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11ddeb5b35ced8d2a3c627af2cded75c78fbb333fa4331569b78a7fb29ddce1f
3
+ size 44964738
data/data_aaa_2024.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf48b06263a3b0a25b701e98c589f7e11bdf0ec3665562082ea209a61b55468e
3
+ size 117704536
data/data_mlb_2024.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f2c52277a98ecbba343650996f32bbb8a847631d676bce2f07dbd3931f09a8
3
+ size 128295721
functions/__pycache__/df_update.cpython-39.pyc CHANGED
Binary files a/functions/__pycache__/df_update.cpython-39.pyc and b/functions/__pycache__/df_update.cpython-39.pyc differ
 
functions/df_update.py CHANGED
@@ -1,506 +1,579 @@
1
- import polars as pl
2
- import numpy as np
3
- import joblib
4
-
5
# Pre-trained estimators, deserialized once when this module is imported.
# Paths are relative, so they resolve against the process working directory.
(
    loaded_model,
    in_zone_model,
    attack_zone_model,
    xwoba_model,
    px_model,
    pz_model,
) = (
    joblib.load(model_path)
    for model_path in (
        'joblib_model/barrel_model.joblib',
        'joblib_model/in_zone_model_knn_20240410.joblib',
        'joblib_model/model_attack_zone.joblib',
        'joblib_model/xwoba_model.joblib',
        'joblib_model/linear_reg_model_x.joblib',
        'joblib_model/linear_reg_model_z.joblib',
    )
)
11
-
12
-
13
class df_update:
    """Derive per-pitch indicator columns and roll them up into summary tables.

    The methods rely on the module-level joblib models ``px_model``,
    ``pz_model``, ``in_zone_model``, ``attack_zone_model`` and ``xwoba_model``.
    """

    def __init__(self):
        pass

    def update(self, df_clone: pl.DataFrame):
        """Return a copy of ``df_clone`` enriched with per-pitch stat columns.

        Parameters:
        df_clone (pl.DataFrame): Pitch-level data. The columns read here are
            ``type_ab``, ``is_pitch``, ``is_swing``, ``sz_top``, ``sz_bot``,
            ``zone``, ``x``, ``y``, ``px``, ``pz``, ``event_type``,
            ``play_description``, ``play_code``, ``launch_speed``,
            ``launch_angle``, ``trajectory`` and ``pitch_type``.

        Returns:
        pl.DataFrame: The cloned frame with counting flags (``pa``, ``ab``,
            ``hits``, ``k``, ``bb``, ...), wOBA / predicted-wOBA values,
            zone and attack-zone flags, one-hot ``trajectory_*`` columns and
            ``pitch_group`` added.
        """
        df = df_clone.clone()

        hit_codes = ['single', 'double', 'home_run', 'triple']

        ab_codes = ['single', 'strikeout', 'field_out',
                    'grounded_into_double_play', 'fielders_choice', 'force_out',
                    'double', 'field_error', 'home_run', 'triple',
                    'double_play',
                    'fielders_choice_out', 'strikeout_double_play',
                    'other_out', 'triple_play']

        obp_true_codes = ['single', 'walk',
                          'double', 'home_run', 'triple',
                          'hit_by_pitch', 'intent_walk']

        obp_codes = ['single', 'strikeout', 'walk', 'field_out',
                     'grounded_into_double_play', 'fielders_choice', 'force_out',
                     'double', 'sac_fly', 'field_error', 'home_run', 'triple',
                     'hit_by_pitch', 'double_play', 'intent_walk',
                     'fielders_choice_out', 'strikeout_double_play',
                     'sac_fly_double_play',
                     'other_out', 'triple_play']

        bip_codes = ['In play, no out', 'In play, run(s)', 'In play, out(s)']

        # Events that score a wOBA value of 0.
        woba_zero_codes = ['strikeout', 'field_out', 'sac_fly', 'force_out',
                           'grounded_into_double_play', 'fielders_choice',
                           'field_error', 'sac_bunt', 'double_play',
                           'fielders_choice_out', 'strikeout_double_play',
                           'sac_fly_double_play', 'other_out']

        # Events that count in the wOBA / xwOBA denominators.
        woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch',
                      'double', 'sac_fly', 'force_out', 'home_run',
                      'grounded_into_double_play', 'fielders_choice',
                      'field_error', 'triple', 'sac_bunt', 'double_play',
                      'fielders_choice_out', 'strikeout_double_play',
                      'sac_fly_double_play', 'other_out']

        pitch_cat = {'FA': None,
                     'FF': 'Fastball',
                     'FT': 'Fastball',
                     'FC': 'Fastball',
                     'FS': 'Off-Speed',
                     'FO': 'Off-Speed',
                     'SI': 'Fastball',
                     'ST': 'Breaking',
                     'SL': 'Breaking',
                     'CU': 'Breaking',
                     'KC': 'Breaking',
                     'SC': 'Off-Speed',
                     'GY': 'Off-Speed',
                     'SV': 'Breaking',
                     'CS': 'Breaking',
                     'CH': 'Off-Speed',
                     'KN': 'Off-Speed',
                     'EP': 'Breaking',
                     'UN': None,
                     'IN': None,
                     'PO': None,
                     'AB': None,
                     'AS': None,
                     'NP': None}

        event_type = pl.col('event_type')
        play_code = pl.col('play_code')
        launch_speed = pl.col('launch_speed')
        launch_angle = pl.col('launch_angle')

        is_barrel = (
            (launch_speed * 1.5 - launch_angle >= 117)
            & (launch_speed + launch_angle >= 124)
            & (launch_speed >= 98)
            & (launch_angle >= 4)
            & (launch_angle <= 50)
        )

        # Both zone models take the same feature matrix, built from the raw
        # (pre-imputation) location columns with nulls replaced by 0.
        zone_features = df[['px', 'pz', 'sz_top', 'sz_bot']].fill_null(0).to_numpy()

        # Step 1: flags and model predictions derived from the raw columns.
        df = df.with_columns([
            pl.when(pl.col('type_ab').is_not_null()).then(1).otherwise(0).alias('pa'),
            pl.when(pl.col('is_pitch')).then(1).otherwise(0).alias('pitches'),
            pl.when(pl.col('sz_top') == 0).then(None).otherwise(pl.col('sz_top')).alias('sz_top'),
            pl.when(pl.col('sz_bot') == 0).then(None).otherwise(pl.col('sz_bot')).alias('sz_bot'),
            pl.when(pl.col('zone') > 0).then(pl.col('zone') < 10).otherwise(None).alias('in_zone'),
            pl.Series(px_model.predict(df[['x']].fill_null(0).to_numpy())[:, 0]).alias('px_predict'),
            pl.Series(pz_model.predict(df[['y']].fill_null(0).to_numpy())[:, 0] + 3.2).alias('pz_predict'),
            pl.Series(in_zone_model.predict(zone_features)).alias('in_zone_predict'),
            pl.Series(attack_zone_model.predict(zone_features)).alias('attack_zone_predict'),
            pl.when(event_type.is_in(hit_codes)).then(True).otherwise(False).alias('hits'),
            pl.when(event_type.is_in(ab_codes)).then(True).otherwise(False).alias('ab'),
            pl.when(event_type.is_in(obp_true_codes)).then(True).otherwise(False).alias('on_base'),
            pl.when(event_type.is_in(obp_codes)).then(True).otherwise(False).alias('obp'),
            pl.when(pl.col('play_description').is_in(bip_codes)).then(True).otherwise(False).alias('bip'),
            pl.when(launch_speed.is_null()).then(False)
              .when(is_barrel).then(True)
              .otherwise(None).alias('barrel'),
            pl.when(launch_angle.is_null()).then(False)
              .when((launch_angle >= 8) & (launch_angle <= 32)).then(True)
              .otherwise(None).alias('sweet_spot'),
            pl.when(launch_speed.is_null()).then(False)
              .when(launch_speed >= 94.5).then(True)
              .otherwise(None).alias('hard_hit'),
            pl.when(event_type == 'single').then(1)
              .when(event_type == 'double').then(2)
              .when(event_type == 'triple').then(3)
              .when(event_type == 'home_run').then(4)
              .otherwise(None).alias('tb'),
            pl.when(event_type.is_in(woba_zero_codes)).then(0)
              .when(event_type == 'walk').then(0.689)
              .when(event_type == 'hit_by_pitch').then(0.720)
              .when(event_type == 'single').then(0.881)
              .when(event_type == 'double').then(1.254)
              .when(event_type == 'triple').then(1.589)
              .when(event_type == 'home_run').then(2.048)
              .otherwise(None).alias('woba'),
            pl.when(play_code.is_in(['S', 'W', 'T'])).then(1).otherwise(0).alias('whiffs'),
            pl.when(play_code.is_in(['S', 'W', 'T', 'C'])).then(1).otherwise(0).alias('csw'),
            pl.when(pl.col('is_swing').cast(pl.Boolean)).then(1).otherwise(0).alias('swings'),
            # k / bb were previously computed twice; only the later definition
            # took effect, so it is the one kept here.
            pl.when(event_type.str.contains('strikeout')).then(True).otherwise(False).alias('k'),
            pl.when(event_type.is_in(['walk', 'intent_walk'])).then(True).otherwise(False).alias('bb'),
            # Placeholders (filled below) that pin these columns' positions.
            pl.lit(None).alias('attack_zone'),
            pl.lit(None).alias('woba_pred'),
            pl.lit(None).alias('woba_pred_contact'),
        ])

        # Step 2a: back-fill missing measurements with the model predictions.
        df = df.with_columns([
            pl.when(event_type.is_in(woba_codes)).then(1).otherwise(None).alias('woba_codes'),
            pl.when(event_type.is_in(woba_codes)).then(1).otherwise(None).alias('xwoba_codes'),
            pl.when(pl.col('tb') >= 0).then(pl.col('woba')).otherwise(None).alias('woba_contact'),
            pl.when(pl.col('px').is_null()).then(pl.col('px_predict')).otherwise(pl.col('px')).alias('px'),
            pl.when(pl.col('pz').is_null()).then(pl.col('pz_predict')).otherwise(pl.col('pz')).alias('pz'),
            pl.when(pl.col('in_zone').is_null()).then(pl.col('in_zone_predict')).otherwise(pl.col('in_zone')).alias('in_zone'),
            pl.when(launch_speed.is_null()).then(None).otherwise(pl.col('barrel')).alias('barrel'),
            pl.lit('average').alias('average'),
            pl.when(pl.col('attack_zone').is_null()).then(pl.col('attack_zone_predict')).otherwise(pl.col('attack_zone')).alias('attack_zone'),
        ])

        # Step 2b: zone flags. These must come AFTER the in_zone back-fill:
        # expressions inside one with_columns call all read the input frame,
        # so computing them alongside the fill would use the unfilled in_zone
        # and drop every pitch whose zone was imputed.
        in_zone = pl.col('in_zone')
        swung = pl.col('swings') == 1
        made_contact = pl.col('whiffs') == 0
        df = df.with_columns([
            pl.when(in_zone == False).then(True).otherwise(False).alias('out_zone'),
            pl.when((in_zone == True) & swung).then(True).otherwise(False).alias('zone_swing'),
            pl.when((in_zone == True) & swung & made_contact).then(True).otherwise(False).alias('zone_contact'),
            pl.when((in_zone == False) & swung).then(True).otherwise(False).alias('ozone_swing'),
            pl.when((in_zone == False) & swung & made_contact).then(True).otherwise(False).alias('ozone_contact'),
        ])

        # Step 3: K/BB differentials and attack-zone flags
        # (0 = heart, 1 = shadow, 2 = chase, 3 = waste).
        attack_zone = pl.col('attack_zone')
        whiffed = pl.col('whiffs') == 1
        df = df.with_columns([
            (pl.col('k').cast(pl.Float32) - pl.col('bb').cast(pl.Float32)).alias('k_minus_bb'),
            (pl.col('bb').cast(pl.Float32) - pl.col('k').cast(pl.Float32)).alias('bb_minus_k'),
            (launch_speed > 0).alias('bip_div'),
            (attack_zone == 0).alias('heart'),
            (attack_zone == 1).alias('shadow'),
            (attack_zone == 2).alias('chase'),
            (attack_zone == 3).alias('waste'),
            ((attack_zone == 0) & swung).alias('heart_swing'),
            ((attack_zone == 1) & swung).alias('shadow_swing'),
            ((attack_zone == 2) & swung).alias('chase_swing'),
            ((attack_zone == 3) & swung).alias('waste_swing'),
            ((attack_zone == 0) & whiffed).alias('heart_whiff'),
            ((attack_zone == 1) & whiffed).alias('shadow_whiff'),
            ((attack_zone == 2) & whiffed).alias('chase_whiff'),
            ((attack_zone == 3) & whiffed).alias('waste_whiff'),
        ])

        # Predicted wOBA on contact: class probabilities weighted by the
        # wOBA value of each class (out, single, double, triple, home run).
        class_probabilities = xwoba_model.predict_proba(
            df[['launch_angle', 'launch_speed']].fill_null(0).to_numpy()
        )
        df = df.with_columns([
            pl.Series(
                [sum(row) for row in class_probabilities * ([0, 0.881, 1.254, 1.589, 2.048])]
            ).alias('woba_pred_predict')
        ])

        # Non-contact outcomes get their fixed wOBA value instead.
        df = df.with_columns([
            pl.when(event_type.is_in(['walk'])).then(0.689)
              .when(event_type.is_in(['hit_by_pitch'])).then(0.720)
              .when(event_type.is_in(['strikeout', 'strikeout_double_play'])).then(0)
              .otherwise(pl.col('woba_pred_predict')).alias('woba_pred_predict')
        ])

        df = df.with_columns([
            pl.when(pl.col('woba_codes').is_null()).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred'),
            pl.when(pl.col('bip') != 1).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred_contact'),
        ])

        # Fold bunt trajectories into the regular categories; '' becomes null.
        trajectory = pl.col('trajectory')
        df = df.with_columns([
            pl.when(trajectory.is_in(['bunt_popup'])).then(pl.lit('popup'))
              .when(trajectory.is_in(['bunt_grounder'])).then(pl.lit('ground_ball'))
              .when(trajectory.is_in(['bunt_line_drive'])).then(pl.lit('line_drive'))
              .when(trajectory.is_in([''])).then(pl.lit(None))
              .otherwise(trajectory).alias('trajectory')
        ])

        # One-hot encode the trajectory column.
        dummy_df = df.select(pl.col('trajectory')).to_dummies()

        # Guarantee the four expected columns even when a category is absent
        # from the data. (An identity rename used to run before this loop; it
        # changed nothing and could raise on exactly the missing columns this
        # loop is meant to supply, so it has been removed.)
        for col in ['trajectory_fly_ball', 'trajectory_ground_ball',
                    'trajectory_line_drive', 'trajectory_popup']:
            if col not in dummy_df.columns:
                dummy_df = dummy_df.with_columns(pl.lit(0).alias(col))

        df = df.hstack(dummy_df)

        # to_dummies emits a column for null trajectories; it is not needed.
        if 'trajectory_null' in df.columns:
            df = df.drop('trajectory_null')

        # Pitch types missing from pitch_cat pass through unchanged.
        df = df.with_columns(
            df["pitch_type"].map_elements(lambda x: pitch_cat.get(x, x)).alias("pitch_group")
        )

        return df

    @staticmethod
    def _summary_aggregations(extended: bool = False) -> list:
        """Build the group-level aggregation expressions, in output order.

        With ``extended=True`` the ``k_minus_bb`` and ``tj_stuff_plus`` sums
        used by ``update_summary_select`` are included.
        """
        def total(source, alias=None):
            return pl.col(source).sum().alias(alias or source)

        exprs = [
            total('pa'), total('ab'), total('obp', 'obp_pa'), total('hits'),
            total('on_base'), total('k'), total('bb'), total('bb_minus_k'),
        ]
        if extended:
            exprs.append(total('k_minus_bb'))
        exprs += [
            total('csw'), total('bip'), total('bip_div'), total('tb'),
            total('woba'), total('woba_contact'),
            total('woba_pred', 'xwoba'),
            total('woba_pred_contact', 'xwoba_contact'),
            total('woba_codes'), total('xwoba_codes'),
            total('hard_hit'), total('barrel'), total('sweet_spot'),
            pl.col('launch_speed').max().alias('max_launch_speed'),
            pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
            pl.col('launch_speed').mean().alias('launch_speed'),
            pl.col('launch_angle').mean().alias('launch_angle'),
            total('is_pitch', 'pitches'), total('swings'),
            total('in_zone'), total('out_zone'), total('whiffs'),
            total('zone_swing'), total('zone_contact'),
            total('ozone_swing'), total('ozone_contact'),
            total('trajectory_ground_ball', 'ground_ball'),
            total('trajectory_line_drive', 'line_drive'),
            total('trajectory_fly_ball', 'fly_ball'),
            total('trajectory_popup', 'pop_up'),
            # Number of pitches with a non-null attack zone.
            pl.col('attack_zone').count().alias('attack_zone'),
        ]
        zones = ['heart', 'shadow', 'chase', 'waste']
        exprs += [total(zone) for zone in zones]
        exprs += [total(f'{zone}_swing') for zone in zones]
        exprs += [total(f'{zone}_whiff') for zone in zones]
        if extended:
            exprs.append(total('tj_stuff_plus'))
        return exprs

    @staticmethod
    def _summary_rates(extended: bool = False) -> list:
        """Build the rate expressions computed from the aggregated columns.

        With ``extended=True`` the ``k_minus_bb_percent`` and
        ``tj_stuff_plus_avg`` columns are included.
        """
        def ratio(numerator, denominator, alias):
            return (pl.col(numerator) / pl.col(denominator)).alias(alias)

        exprs = [
            ratio('hits', 'ab', 'avg'),
            ratio('on_base', 'obp_pa', 'obp'),
            ratio('tb', 'ab', 'slg'),
            (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
            ratio('k', 'pa', 'k_percent'),
            ratio('bb', 'pa', 'bb_percent'),
            ratio('bb_minus_k', 'pa', 'bb_minus_k_percent'),
        ]
        if extended:
            exprs.append(ratio('k_minus_bb', 'pa', 'k_minus_bb_percent'))
        exprs += [
            ratio('bb', 'k', 'bb_over_k_percent'),
            ratio('csw', 'pitches', 'csw_percent'),
            ratio('sweet_spot', 'bip_div', 'sweet_spot_percent'),
            ratio('woba', 'woba_codes', 'woba_percent'),
            ratio('woba_contact', 'bip', 'woba_percent_contact'),
            ratio('hard_hit', 'bip_div', 'hard_hit_percent'),
            ratio('barrel', 'bip_div', 'barrel_percent'),
            ratio('zone_contact', 'zone_swing', 'zone_contact_percent'),
            ratio('zone_swing', 'in_zone', 'zone_swing_percent'),
            ratio('in_zone', 'pitches', 'zone_percent'),
            (pl.col('ozone_swing') / (pl.col('pitches') - pl.col('in_zone'))).alias('chase_percent'),
            ratio('ozone_contact', 'ozone_swing', 'chase_contact'),
            ratio('swings', 'pitches', 'swing_percent'),
            ratio('whiffs', 'swings', 'whiff_rate'),
            ratio('whiffs', 'pitches', 'swstr_rate'),
            ratio('ground_ball', 'bip', 'ground_ball_percent'),
            ratio('line_drive', 'bip', 'line_drive_percent'),
            ratio('fly_ball', 'bip', 'fly_ball_percent'),
            ratio('pop_up', 'bip', 'pop_up_percent'),
        ]
        zones = ['heart', 'shadow', 'chase', 'waste']
        exprs += [ratio(zone, 'attack_zone', f'{zone}_zone_percent') for zone in zones]
        exprs += [ratio(f'{zone}_swing', zone, f'{zone}_zone_swing_percent') for zone in zones]
        exprs += [ratio(f'{zone}_whiff', f'{zone}_swing', f'{zone}_zone_whiff_percent') for zone in zones]
        exprs += [
            ratio('xwoba', 'xwoba_codes', 'xwoba_percent'),
            ratio('xwoba_contact', 'bip', 'xwoba_percent_contact'),
        ]
        if extended:
            exprs.append(ratio('tj_stuff_plus', 'pitches', 'tj_stuff_plus_avg'))
        return exprs

    def update_summary(self, df: pl.DataFrame, pitcher: bool = True) -> pl.DataFrame:
        """
        Aggregate summary statistics per pitcher or per batter.

        Parameters:
        df (pl.DataFrame): Pitch-level frame carrying the columns added by ``update``.
        pitcher (bool): Group by ``pitcher_id``/``pitcher_name`` when True,
            by ``batter_id``/``batter_name`` when False.

        Returns:
        pl.DataFrame: One row per player with summed counts and derived rates.
        """
        position = 'pitcher' if pitcher else 'batter'
        df_summ = df.group_by([f'{position}_id', f'{position}_name']).agg(
            self._summary_aggregations()
        )
        return df_summ.with_columns(self._summary_rates())

    def update_summary_select(self, df: pl.DataFrame, selection: list) -> pl.DataFrame:
        """
        Aggregate summary statistics over an arbitrary set of grouping columns.

        Parameters:
        df (pl.DataFrame): Pitch-level frame carrying the columns added by
            ``update`` plus ``tj_stuff_plus``.
        selection (list): Column names to group by.

        Returns:
        pl.DataFrame: One row per group with summed counts and derived rates,
            including ``k_minus_bb_percent`` and ``tj_stuff_plus_avg``.
        """
        df_summ = df.group_by(selection).agg(
            self._summary_aggregations(extended=True)
        )
        df_summ = df_summ.with_columns(self._summary_rates(extended=True))

        return df_summ
 
1
+ import polars as pl
2
+ import numpy as np
3
+ import joblib
4
+
5
# Pre-trained estimators, deserialized once when this module is imported.
# Paths are relative, so they resolve against the process working directory.
(
    loaded_model,
    in_zone_model,
    attack_zone_model,
    xwoba_model,
    px_model,
    pz_model,
) = (
    joblib.load(model_path)
    for model_path in (
        'joblib_model/barrel_model.joblib',
        'joblib_model/in_zone_model_knn_20240410.joblib',
        'joblib_model/model_attack_zone.joblib',
        'joblib_model/xwoba_model.joblib',
        'joblib_model/linear_reg_model_x.joblib',
        'joblib_model/linear_reg_model_z.joblib',
    )
)
11
+
12
+
13
+ class df_update:
14
+ def __init__(self):
15
+ pass
16
+
17
+ def update(self, df_clone: pl.DataFrame):
18
+
19
+ df = df_clone.clone()
20
+ # Assuming px_model is defined and df is your DataFrame
21
+ hit_codes = ['single',
22
+ 'double','home_run', 'triple']
23
+
24
+ ab_codes = ['single', 'strikeout', 'field_out',
25
+ 'grounded_into_double_play', 'fielders_choice', 'force_out',
26
+ 'double', 'field_error', 'home_run', 'triple',
27
+ 'double_play',
28
+ 'fielders_choice_out', 'strikeout_double_play',
29
+ 'other_out','triple_play']
30
+
31
+
32
+ obp_true_codes = ['single', 'walk',
33
+ 'double','home_run', 'triple',
34
+ 'hit_by_pitch', 'intent_walk']
35
+
36
+ obp_codes = ['single', 'strikeout', 'walk', 'field_out',
37
+ 'grounded_into_double_play', 'fielders_choice', 'force_out',
38
+ 'double', 'sac_fly', 'field_error', 'home_run', 'triple',
39
+ 'hit_by_pitch', 'double_play', 'intent_walk',
40
+ 'fielders_choice_out', 'strikeout_double_play',
41
+ 'sac_fly_double_play',
42
+ 'other_out','triple_play']
43
+
44
+
45
+ contact_codes = ['In play, no out',
46
+ 'Foul', 'In play, out(s)',
47
+ 'In play, run(s)',
48
+ 'Foul Bunt']
49
+
50
+ bip_codes = ['In play, no out', 'In play, run(s)','In play, out(s)']
51
+
52
+
53
+ conditions_barrel = [
54
+ df['launch_speed'].is_null(),
55
+ (df['launch_speed'] * 1.5 - df['launch_angle'] >= 117) &
56
+ (df['launch_speed'] + df['launch_angle'] >= 124) &
57
+ (df['launch_speed'] >= 98) &
58
+ (df['launch_angle'] >= 4) & (df['launch_angle'] <= 50)
59
+ ]
60
+ choices_barrel = [False, True]
61
+
62
+ conditions_tb = [
63
+ (df['event_type'] == 'single'),
64
+ (df['event_type'] == 'double'),
65
+ (df['event_type'] == 'triple'),
66
+ (df['event_type'] == 'home_run')
67
+ ]
68
+ choices_tb = [1, 2, 3, 4]
69
+
70
+
71
+ conditions_woba = [
72
+ df['event_type'].is_in(['strikeout', 'field_out', 'sac_fly', 'force_out', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']),
73
+ df['event_type'] == 'walk',
74
+ df['event_type'] == 'hit_by_pitch',
75
+ df['event_type'] == 'single',
76
+ df['event_type'] == 'double',
77
+ df['event_type'] == 'triple',
78
+ df['event_type'] == 'home_run'
79
+ ]
80
+ choices_woba = [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
81
+
82
+ woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch', 'double', 'sac_fly', 'force_out', 'home_run', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'triple', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']
83
+
84
+ pitch_cat = {'FA': 'Fastball',
85
+ 'FF': 'Fastball',
86
+ 'FT': 'Fastball',
87
+ 'FC': 'Fastball',
88
+ 'FS': 'Off-Speed',
89
+ 'FO': 'Off-Speed',
90
+ 'SI': 'Fastball',
91
+ 'ST': 'Breaking',
92
+ 'SL': 'Breaking',
93
+ 'CU': 'Breaking',
94
+ 'KC': 'Breaking',
95
+ 'SC': 'Off-Speed',
96
+ 'GY': 'Off-Speed',
97
+ 'SV': 'Breaking',
98
+ 'CS': 'Breaking',
99
+ 'CH': 'Off-Speed',
100
+ 'KN': 'Off-Speed',
101
+ 'EP': 'Breaking',
102
+ 'UN': None,
103
+ 'IN': None,
104
+ 'PO': None,
105
+ 'AB': None,
106
+ 'AS': None,
107
+ 'NP': None}
108
+
109
+
110
+ df = df.with_columns([
111
+ pl.when(df['type_ab'].is_not_null()).then(1).otherwise(0).alias('pa'),
112
+ pl.when(df['is_pitch']).then(1).otherwise(0).alias('pitches'),
113
+ pl.when(df['sz_top'] == 0).then(None).otherwise(df['sz_top']).alias('sz_top'),
114
+ pl.when(df['sz_bot'] == 0).then(None).otherwise(df['sz_bot']).alias('sz_bot'),
115
+ pl.when(df['zone'] > 0).then(df['zone'] < 10).otherwise(None).alias('in_zone'),
116
+ pl.Series(px_model.predict(df[['x']].fill_null(0).to_numpy())[:, 0]).alias('px_predict'),
117
+ pl.Series(pz_model.predict(df[['y']].fill_null(0).to_numpy())[:, 0] + 3.2).alias('pz_predict'),
118
+ pl.Series(in_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('in_zone_predict'),
119
+ pl.Series(attack_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('attack_zone_predict'),
120
+ pl.when(df['event_type'].is_in(hit_codes)).then(True).otherwise(False).alias('hits'),
121
+ pl.when(df['event_type'].is_in(ab_codes)).then(True).otherwise(False).alias('ab'),
122
+ pl.when(df['event_type'].is_in(obp_true_codes)).then(True).otherwise(False).alias('on_base'),
123
+ pl.when(df['event_type'].is_in(obp_codes)).then(True).otherwise(False).alias('obp'),
124
+ pl.when(df['play_description'].is_in(bip_codes)).then(True).otherwise(False).alias('bip'),
125
+ pl.when(conditions_barrel[0]).then(choices_barrel[0]).when(conditions_barrel[1]).then(choices_barrel[1]).otherwise(None).alias('barrel'),
126
+ pl.when(df['launch_angle'].is_null()).then(False).when((df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)).then(True).otherwise(None).alias('sweet_spot'),
127
+ pl.when(df['launch_speed'].is_null()).then(False).when(df['launch_speed'] >= 94.5).then(True).otherwise(None).alias('hard_hit'),
128
+ pl.when(conditions_tb[0]).then(choices_tb[0]).when(conditions_tb[1]).then(choices_tb[1]).when(conditions_tb[2]).then(choices_tb[2]).when(conditions_tb[3]).then(choices_tb[3]).otherwise(None).alias('tb'),
129
+ pl.when(conditions_woba[0]).then(choices_woba[0]).when(conditions_woba[1]).then(choices_woba[1]).when(conditions_woba[2]).then(choices_woba[2]).when(conditions_woba[3]).then(choices_woba[3]).when(conditions_woba[4]).then(choices_woba[4]).when(conditions_woba[5]).then(choices_woba[5]).when(conditions_woba[6]).then(choices_woba[6]).otherwise(None).alias('woba'),
130
+ pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T')).then(1).otherwise(0).alias('whiffs'),
131
+ pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T') | (df['play_code'] == 'C')).then(1).otherwise(0).alias('csw'),
132
+ pl.when(pl.col('is_swing').cast(pl.Boolean)).then(1).otherwise(0).alias('swings'),
133
+ pl.col('event_type').is_in(['strikeout','strikeout_double_play']).alias('k'),
134
+ pl.col('event_type').is_in(['walk', 'intent_walk']).alias('bb'),
135
+ pl.lit(None).alias('attack_zone'),
136
+ pl.lit(None).alias('woba_pred'),
137
+ pl.lit(None).alias('woba_pred_contact')
138
+
139
+ ])
140
+
141
+
142
+ df = df.with_columns([
143
+ pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('woba_codes'),
144
+ pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('xwoba_codes'),
145
+ pl.when((pl.col('tb') >= 0)).then(df['woba']).otherwise(None).alias('woba_contact'),
146
+ pl.when(pl.col('px').is_null()).then(pl.col('px_predict')).otherwise(pl.col('px')).alias('px'),
147
+ pl.when(pl.col('pz').is_null()).then(pl.col('pz_predict')).otherwise(pl.col('pz')).alias('pz'),
148
+ pl.when(pl.col('in_zone').is_null()).then(pl.col('in_zone_predict')).otherwise(pl.col('in_zone')).alias('in_zone_final'),
149
+
150
+ ])
151
+
152
+ df = df.with_columns([
153
+ pl.when(df['launch_speed'].is_null()).then(None).otherwise(df['barrel']).alias('barrel'),
154
+ pl.lit('average').alias('average'),
155
+ pl.when(pl.col('in_zone_final') == False).then(True).otherwise(False).alias('out_zone'),
156
+ pl.when((pl.col('in_zone_final') == True) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('zone_swing'),
157
+ pl.when((pl.col('in_zone_final') == True) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('zone_contact'),
158
+ pl.when((pl.col('in_zone_final') == False) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('ozone_swing'),
159
+ pl.when((pl.col('in_zone_final') == False) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('ozone_contact'),
160
+ pl.when(pl.col('event_type').str.contains('strikeout')).then(True).otherwise(False).alias('k'),
161
+ pl.when(pl.col('event_type').is_in(['walk', 'intent_walk'])).then(True).otherwise(False).alias('bb'),
162
+ pl.when(pl.col('attack_zone').is_null()).then(pl.col('attack_zone_predict')).otherwise(pl.col('attack_zone')).alias('attack_zone_final'),
163
+
164
+
165
+ ])
166
+
167
+ df = df.with_columns([
168
+ (df['k'].cast(pl.Float32) - df['bb'].cast(pl.Float32)).alias('k_minus_bb'),
169
+ (df['bb'].cast(pl.Float32) - df['k'].cast(pl.Float32)).alias('bb_minus_k'),
170
+ (df['launch_speed'] > 0).alias('bip_div'),
171
+ (df['attack_zone_final'] == 0).alias('heart'),
172
+ (df['attack_zone_final'] == 1).alias('shadow'),
173
+ (df['attack_zone_final'] == 2).alias('chase'),
174
+ (df['attack_zone_final'] == 3).alias('waste'),
175
+ ((df['attack_zone_final'] == 0) & (df['swings'] == 1)).alias('heart_swing'),
176
+ ((df['attack_zone_final'] == 1) & (df['swings'] == 1)).alias('shadow_swing'),
177
+ ((df['attack_zone_final'] == 2) & (df['swings'] == 1)).alias('chase_swing'),
178
+ ((df['attack_zone_final'] == 3) & (df['swings'] == 1)).alias('waste_swing'),
179
+ ((df['attack_zone_final'] == 0) & (df['whiffs'] == 1)).alias('heart_whiff'),
180
+ ((df['attack_zone_final'] == 1) & (df['whiffs'] == 1)).alias('shadow_whiff'),
181
+ ((df['attack_zone_final'] == 2) & (df['whiffs'] == 1)).alias('chase_whiff'),
182
+ ((df['attack_zone_final'] == 3) & (df['whiffs'] == 1)).alias('waste_whiff')
183
+ ])
184
+
185
+
186
+ [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
187
+
188
+ df = df.with_columns([
189
+ pl.Series(
190
+ [sum(x) for x in xwoba_model.predict_proba(df[['launch_angle', 'launch_speed']].fill_null(0).to_numpy()[:]) * ([0, 0.881, 1.254, 1.589, 2.048])]
191
+ ).alias('woba_pred_predict')
192
+ ])
193
+
194
+ df = df.with_columns([
195
+ pl.when(pl.col('event_type').is_in(['walk'])).then(0.689)
196
+ .when(pl.col('event_type').is_in(['hit_by_pitch'])).then(0.720)
197
+ .when(pl.col('event_type').is_in(['strikeout', 'strikeout_double_play'])).then(0)
198
+ .otherwise(pl.col('woba_pred_predict')).alias('woba_pred_predict')
199
+ ])
200
+
201
+ df = df.with_columns([
202
+ pl.when(pl.col('woba_codes').is_null()).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred'),
203
+ pl.when(pl.col('bip')!=1).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred_contact'),
204
+ ])
205
+
206
+ df = df.with_columns([
207
+ pl.when(pl.col('trajectory').is_in(['bunt_popup'])).then(pl.lit('popup'))
208
+ .when(pl.col('trajectory').is_in(['bunt_grounder'])).then(pl.lit('ground_ball'))
209
+ .when(pl.col('trajectory').is_in(['bunt_line_drive'])).then(pl.lit('line_drive'))
210
+ .when(pl.col('trajectory').is_in([''])).then(pl.lit(None))
211
+ .otherwise(pl.col('trajectory')).alias('trajectory')
212
+ ])
213
+
214
+
215
+ # Create one-hot encoded columns for the trajectory column
216
+ dummy_df = df.select(pl.col('trajectory')).to_dummies()
217
+
218
+ # Rename the one-hot encoded columns
219
+ dummy_df = dummy_df.rename({
220
+ 'trajectory_fly_ball': 'trajectory_fly_ball',
221
+ 'trajectory_ground_ball': 'trajectory_ground_ball',
222
+ 'trajectory_line_drive': 'trajectory_line_drive',
223
+ 'trajectory_popup': 'trajectory_popup'
224
+ })
225
+
226
+ # Ensure the columns are present in the DataFrame
227
+ for col in ['trajectory_fly_ball', 'trajectory_ground_ball', 'trajectory_line_drive', 'trajectory_popup']:
228
+ if col not in dummy_df.columns:
229
+ dummy_df = dummy_df.with_columns(pl.lit(0).alias(col))
230
+
231
+ # Join the one-hot encoded columns back to the original DataFrame
232
+ df = df.hstack(dummy_df)
233
+
234
+ # Check if 'trajectory_null' column exists and drop it
235
+ if 'trajectory_null' in df.columns:
236
+ df = df.drop('trajectory_null')
237
+
238
+
239
+ pitch_cat = {'FA': None,
240
+ 'FF': 'Fastball',
241
+ 'FT': 'Fastball',
242
+ 'FC': 'Fastball',
243
+ 'FS': 'Off-Speed',
244
+ 'FO': 'Off-Speed',
245
+ 'SI': 'Fastball',
246
+ 'ST': 'Breaking',
247
+ 'SL': 'Breaking',
248
+ 'CU': 'Breaking',
249
+ 'KC': 'Breaking',
250
+ 'SC': 'Off-Speed',
251
+ 'GY': 'Off-Speed',
252
+ 'SV': 'Breaking',
253
+ 'CS': 'Breaking',
254
+ 'CH': 'Off-Speed',
255
+ 'KN': 'Off-Speed',
256
+ 'EP': 'Breaking',
257
+ 'UN': None,
258
+ 'IN': None,
259
+ 'PO': None,
260
+ 'AB': None,
261
+ 'AS': None,
262
+ 'NP': None}
263
+ df = df.with_columns(
264
+ df["pitch_type"].map_elements(lambda x: pitch_cat.get(x, x)).alias("pitch_group")
265
+ )
266
+
267
+ df = df.with_columns([
268
+
269
+ (-(pl.col('vy0')**2 - (2 * pl.col('ay') * (pl.col('y0') - 17/12)))**0.5).alias('vy_f'),
270
+ ])
271
+
272
+ df = df.with_columns([
273
+ ((pl.col('vy_f') - pl.col('vy0')) / pl.col('ay')).alias('t'),
274
+ ])
275
+
276
+ df = df.with_columns([
277
+ (pl.col('vz0') + (pl.col('az') * pl.col('t'))).alias('vz_f'),
278
+ (pl.col('vx0') + (pl.col('ax') * pl.col('t'))).alias('vx_f')
279
+ ])
280
+
281
+ df = df.with_columns([
282
+ (-np.arctan(pl.col('vz_f') / pl.col('vy_f')) * (180 / np.pi)).alias('vaa'),
283
+ (-np.arctan(pl.col('vx_f') / pl.col('vy_f')) * (180 / np.pi)).alias('haa')
284
+ ])
285
+
286
+ # Mirror horizontal break for left-handed pitchers
287
+ df = df.with_columns(
288
+ pl.when(pl.col('pitcher_hand') == 'L')
289
+ .then(-pl.col('ax'))
290
+ .otherwise(pl.col('ax'))
291
+ .alias('ax')
292
+ )
293
+
294
+ # Mirror horizontal break for left-handed pitchers
295
+ df = df.with_columns(
296
+ pl.when(pl.col('pitcher_hand') == 'L')
297
+ .then(-pl.col('hb'))
298
+ .otherwise(pl.col('hb'))
299
+ .alias('hb')
300
+ )
301
+
302
+ # Mirror horizontal release point for left-handed pitchers
303
+ df = df.with_columns(
304
+ pl.when(pl.col('pitcher_hand') == 'L')
305
+ .then(pl.col('x0'))
306
+ .otherwise(-pl.col('x0'))
307
+ .alias('x0')
308
+ )
309
+
310
+ df = df.with_columns([
311
+ pl.when(df['swings'].is_null()).then(None).otherwise(df['swings']).alias('is_swing'),
312
+ pl.when(df['bip'].is_null()).then(None).otherwise(df['bip']).alias('is_bip')])
313
+
314
+
315
+ return df
316
+
317
+ # Assuming df is your Polars DataFrame
318
def update_summary(self, df: pl.DataFrame, pitcher: bool = True) -> pl.DataFrame:
    """
    Aggregate pitch-level rows into one summary row per pitcher or batter.

    Parameters:
    df (pl.DataFrame): Pitch-level frame that already carries the flag columns
        aggregated below (e.g. 'pa', 'ab', 'swings', 'whiffs', 'heart',
        'attack_zone_final', 'trajectory_*').
    pitcher (bool): Group by 'pitcher_id'/'pitcher_name' when True, otherwise by
        'batter_id'/'batter_name'.

    Returns:
    pl.DataFrame: One row per player holding the summed counts followed by the
        rate columns derived from them. A rate whose denominator is zero is not
        guarded here and is returned as whatever Polars division yields.
    """

    # Pick which pair of id/name columns identifies a player
    position = 'pitcher' if pitcher else 'batter'

    # Sum (or otherwise reduce) every per-pitch flag within each player
    df_summ = df.group_by([f'{position}_id', f'{position}_name']).agg([
        pl.col('pa').sum().alias('pa'),
        pl.col('ab').sum().alias('ab'),
        pl.col('obp').sum().alias('obp_pa'),
        pl.col('hits').sum().alias('hits'),
        pl.col('on_base').sum().alias('on_base'),
        pl.col('k').sum().alias('k'),
        pl.col('bb').sum().alias('bb'),
        pl.col('bb_minus_k').sum().alias('bb_minus_k'),
        pl.col('csw').sum().alias('csw'),
        pl.col('bip').sum().alias('bip'),
        pl.col('bip_div').sum().alias('bip_div'),
        pl.col('tb').sum().alias('tb'),
        pl.col('woba').sum().alias('woba'),
        pl.col('woba_contact').sum().alias('woba_contact'),
        pl.col('woba_pred').sum().alias('xwoba'),
        pl.col('woba_pred_contact').sum().alias('xwoba_contact'),
        pl.col('woba_codes').sum().alias('woba_codes'),
        pl.col('xwoba_codes').sum().alias('xwoba_codes'),
        pl.col('hard_hit').sum().alias('hard_hit'),
        pl.col('barrel').sum().alias('barrel'),
        pl.col('sweet_spot').sum().alias('sweet_spot'),
        pl.col('launch_speed').max().alias('max_launch_speed'),
        pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
        pl.col('launch_speed').mean().alias('launch_speed'),
        pl.col('launch_angle').mean().alias('launch_angle'),
        pl.col('is_pitch').sum().alias('pitches'),
        pl.col('swings').sum().alias('swings'),
        # NOTE(review): zone_swing/ozone_swing appear to be derived from
        # 'in_zone_final' while this denominator uses the raw 'in_zone' flag;
        # confirm that mismatch is intended.
        pl.col('in_zone').sum().alias('in_zone'),
        pl.col('out_zone').sum().alias('out_zone'),
        pl.col('whiffs').sum().alias('whiffs'),
        pl.col('zone_swing').sum().alias('zone_swing'),
        pl.col('zone_contact').sum().alias('zone_contact'),
        pl.col('ozone_swing').sum().alias('ozone_swing'),
        pl.col('ozone_contact').sum().alias('ozone_contact'),
        pl.col('trajectory_ground_ball').sum().alias('ground_ball'),
        pl.col('trajectory_line_drive').sum().alias('line_drive'),
        pl.col('trajectory_fly_ball').sum().alias('fly_ball'),
        pl.col('trajectory_popup').sum().alias('pop_up'),
        # Count the resolved attack zone: the heart/shadow/chase/waste flags are
        # built from 'attack_zone_final', whereas the raw 'attack_zone' column is
        # filled with nulls by the update step, which made this count 0 and turned
        # every *_zone_percent below into a division by zero.
        pl.col('attack_zone_final').count().alias('attack_zone'),
        pl.col('heart').sum().alias('heart'),
        pl.col('shadow').sum().alias('shadow'),
        pl.col('chase').sum().alias('chase'),
        pl.col('waste').sum().alias('waste'),
        pl.col('heart_swing').sum().alias('heart_swing'),
        pl.col('shadow_swing').sum().alias('shadow_swing'),
        pl.col('chase_swing').sum().alias('chase_swing'),
        pl.col('waste_swing').sum().alias('waste_swing'),
        pl.col('heart_whiff').sum().alias('heart_whiff'),
        pl.col('shadow_whiff').sum().alias('shadow_whiff'),
        pl.col('chase_whiff').sum().alias('chase_whiff'),
        pl.col('waste_whiff').sum().alias('waste_whiff'),
    ])

    # Derive rate statistics from the aggregated counts
    df_summ = df_summ.with_columns([
        (pl.col('hits') / pl.col('ab')).alias('avg'),
        (pl.col('on_base') / pl.col('obp_pa')).alias('obp'),
        (pl.col('tb') / pl.col('ab')).alias('slg'),
        (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
        (pl.col('k') / pl.col('pa')).alias('k_percent'),
        (pl.col('bb') / pl.col('pa')).alias('bb_percent'),
        (pl.col('bb_minus_k') / pl.col('pa')).alias('bb_minus_k_percent'),
        (pl.col('bb') / pl.col('k')).alias('bb_over_k_percent'),
        (pl.col('csw') / pl.col('pitches')).alias('csw_percent'),
        (pl.col('sweet_spot') / pl.col('bip_div')).alias('sweet_spot_percent'),
        (pl.col('woba') / pl.col('woba_codes')).alias('woba_percent'),
        (pl.col('woba_contact') / pl.col('bip')).alias('woba_percent_contact'),
        (pl.col('hard_hit') / pl.col('bip_div')).alias('hard_hit_percent'),
        (pl.col('barrel') / pl.col('bip_div')).alias('barrel_percent'),
        (pl.col('zone_contact') / pl.col('zone_swing')).alias('zone_contact_percent'),
        (pl.col('zone_swing') / pl.col('in_zone')).alias('zone_swing_percent'),
        (pl.col('in_zone') / pl.col('pitches')).alias('zone_percent'),
        (pl.col('ozone_swing') / (pl.col('pitches') - pl.col('in_zone'))).alias('chase_percent'),
        (pl.col('ozone_contact') / pl.col('ozone_swing')).alias('chase_contact'),
        (pl.col('swings') / pl.col('pitches')).alias('swing_percent'),
        (pl.col('whiffs') / pl.col('swings')).alias('whiff_rate'),
        (pl.col('whiffs') / pl.col('pitches')).alias('swstr_rate'),
        (pl.col('ground_ball') / pl.col('bip')).alias('ground_ball_percent'),
        (pl.col('line_drive') / pl.col('bip')).alias('line_drive_percent'),
        (pl.col('fly_ball') / pl.col('bip')).alias('fly_ball_percent'),
        (pl.col('pop_up') / pl.col('bip')).alias('pop_up_percent'),
        (pl.col('heart') / pl.col('attack_zone')).alias('heart_zone_percent'),
        (pl.col('shadow') / pl.col('attack_zone')).alias('shadow_zone_percent'),
        (pl.col('chase') / pl.col('attack_zone')).alias('chase_zone_percent'),
        (pl.col('waste') / pl.col('attack_zone')).alias('waste_zone_percent'),
        (pl.col('heart_swing') / pl.col('heart')).alias('heart_zone_swing_percent'),
        (pl.col('shadow_swing') / pl.col('shadow')).alias('shadow_zone_swing_percent'),
        (pl.col('chase_swing') / pl.col('chase')).alias('chase_zone_swing_percent'),
        (pl.col('waste_swing') / pl.col('waste')).alias('waste_zone_swing_percent'),
        (pl.col('heart_whiff') / pl.col('heart_swing')).alias('heart_zone_whiff_percent'),
        (pl.col('shadow_whiff') / pl.col('shadow_swing')).alias('shadow_zone_whiff_percent'),
        (pl.col('chase_whiff') / pl.col('chase_swing')).alias('chase_zone_whiff_percent'),
        (pl.col('waste_whiff') / pl.col('waste_swing')).alias('waste_zone_whiff_percent'),
        (pl.col('xwoba') / pl.col('xwoba_codes')).alias('xwoba_percent'),
        (pl.col('xwoba_contact') / pl.col('bip')).alias('xwoba_percent_contact'),
    ])

    return df_summ
436
+
437
+
438
+
439
+
440
+
441
+
442
+ # Assuming df is your Polars DataFrame
443
def update_summary_select(self, df: pl.DataFrame, selection: list) -> pl.DataFrame:
    """
    Aggregate pitch-level rows into summary statistics for arbitrary groups.

    Parameters:
    df (pl.DataFrame): Pitch-level frame that already carries the flag columns
        aggregated below, plus 'tj_stuff_plus' and the pitch-characteristic
        columns ('start_speed', 'vb', 'ivb', 'hb', 'x0', 'z0', 'vaa', 'haa',
        'spin_rate', 'extension').
    selection (list): Column names to group by.

    Returns:
    pl.DataFrame: One row per distinct combination of the `selection` columns,
        holding the summed counts followed by the rate and per-pitch average
        columns derived from them. A rate whose denominator is zero is not
        guarded here and is returned as whatever Polars division yields.
    """

    # Sum (or otherwise reduce) every per-pitch flag within each selected group
    df_summ = df.group_by(selection).agg([
        pl.col('pa').sum().alias('pa'),
        pl.col('ab').sum().alias('ab'),
        pl.col('obp').sum().alias('obp_pa'),
        pl.col('hits').sum().alias('hits'),
        pl.col('on_base').sum().alias('on_base'),
        pl.col('k').sum().alias('k'),
        pl.col('bb').sum().alias('bb'),
        pl.col('bb_minus_k').sum().alias('bb_minus_k'),
        pl.col('k_minus_bb').sum().alias('k_minus_bb'),
        pl.col('csw').sum().alias('csw'),
        pl.col('bip').sum().alias('bip'),
        pl.col('bip_div').sum().alias('bip_div'),
        pl.col('tb').sum().alias('tb'),
        pl.col('woba').sum().alias('woba'),
        pl.col('woba_contact').sum().alias('woba_contact'),
        pl.col('woba_pred').sum().alias('xwoba'),
        pl.col('woba_pred_contact').sum().alias('xwoba_contact'),
        pl.col('woba_codes').sum().alias('woba_codes'),
        pl.col('xwoba_codes').sum().alias('xwoba_codes'),
        pl.col('hard_hit').sum().alias('hard_hit'),
        pl.col('barrel').sum().alias('barrel'),
        pl.col('sweet_spot').sum().alias('sweet_spot'),
        pl.col('launch_speed').max().alias('max_launch_speed'),
        pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
        pl.col('launch_speed').mean().alias('launch_speed'),
        pl.col('launch_angle').mean().alias('launch_angle'),
        pl.col('is_pitch').sum().alias('pitches'),
        pl.col('swings').sum().alias('swings'),
        # NOTE(review): zone_swing/ozone_swing appear to be derived from
        # 'in_zone_final' while this denominator uses the raw 'in_zone' flag;
        # confirm that mismatch is intended.
        pl.col('in_zone').sum().alias('in_zone'),
        pl.col('out_zone').sum().alias('out_zone'),
        pl.col('whiffs').sum().alias('whiffs'),
        pl.col('zone_swing').sum().alias('zone_swing'),
        pl.col('zone_contact').sum().alias('zone_contact'),
        pl.col('ozone_swing').sum().alias('ozone_swing'),
        pl.col('ozone_contact').sum().alias('ozone_contact'),
        pl.col('trajectory_ground_ball').sum().alias('ground_ball'),
        pl.col('trajectory_line_drive').sum().alias('line_drive'),
        pl.col('trajectory_fly_ball').sum().alias('fly_ball'),
        pl.col('trajectory_popup').sum().alias('pop_up'),
        # Count the resolved attack zone: the heart/shadow/chase/waste flags are
        # built from 'attack_zone_final', whereas the raw 'attack_zone' column is
        # filled with nulls by the update step, which made this count 0 and turned
        # every *_zone_percent below into a division by zero.
        pl.col('attack_zone_final').count().alias('attack_zone'),
        pl.col('heart').sum().alias('heart'),
        pl.col('shadow').sum().alias('shadow'),
        pl.col('chase').sum().alias('chase'),
        pl.col('waste').sum().alias('waste'),
        pl.col('heart_swing').sum().alias('heart_swing'),
        pl.col('shadow_swing').sum().alias('shadow_swing'),
        pl.col('chase_swing').sum().alias('chase_swing'),
        pl.col('waste_swing').sum().alias('waste_swing'),
        pl.col('heart_whiff').sum().alias('heart_whiff'),
        pl.col('shadow_whiff').sum().alias('shadow_whiff'),
        pl.col('chase_whiff').sum().alias('chase_whiff'),
        pl.col('waste_whiff').sum().alias('waste_whiff'),
        # Raw sums of the pitch characteristics; divided by the pitch count below
        pl.col('tj_stuff_plus').sum().alias('tj_stuff_plus'),
        pl.col('start_speed').sum(),
        pl.col('vb').sum(),
        pl.col('ivb').sum(),
        pl.col('hb').sum(),
        pl.col('x0').sum(),
        pl.col('z0').sum(),
        pl.col('vaa').sum(),
        pl.col('haa').sum(),
        pl.col('spin_rate').sum(),
        pl.col('extension').sum(),
    ])

    # Derive rate statistics and per-pitch averages from the aggregated counts
    df_summ = df_summ.with_columns([
        (pl.col('hits') / pl.col('ab')).alias('avg'),
        (pl.col('on_base') / pl.col('obp_pa')).alias('obp'),
        (pl.col('tb') / pl.col('ab')).alias('slg'),
        (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
        (pl.col('k') / pl.col('pa')).alias('k_percent'),
        (pl.col('bb') / pl.col('pa')).alias('bb_percent'),
        (pl.col('bb_minus_k') / pl.col('pa')).alias('bb_minus_k_percent'),
        (pl.col('k_minus_bb') / pl.col('pa')).alias('k_minus_bb_percent'),
        (pl.col('bb') / pl.col('k')).alias('bb_over_k_percent'),
        (pl.col('csw') / pl.col('pitches')).alias('csw_percent'),
        (pl.col('sweet_spot') / pl.col('bip_div')).alias('sweet_spot_percent'),
        (pl.col('woba') / pl.col('woba_codes')).alias('woba_percent'),
        (pl.col('woba_contact') / pl.col('bip')).alias('woba_percent_contact'),
        (pl.col('hard_hit') / pl.col('bip_div')).alias('hard_hit_percent'),
        (pl.col('barrel') / pl.col('bip_div')).alias('barrel_percent'),
        (pl.col('zone_contact') / pl.col('zone_swing')).alias('zone_contact_percent'),
        (pl.col('zone_swing') / pl.col('in_zone')).alias('zone_swing_percent'),
        (pl.col('in_zone') / pl.col('pitches')).alias('zone_percent'),
        (pl.col('ozone_swing') / (pl.col('pitches') - pl.col('in_zone'))).alias('chase_percent'),
        (pl.col('ozone_contact') / pl.col('ozone_swing')).alias('chase_contact'),
        (pl.col('swings') / pl.col('pitches')).alias('swing_percent'),
        (pl.col('whiffs') / pl.col('swings')).alias('whiff_rate'),
        (pl.col('whiffs') / pl.col('pitches')).alias('swstr_rate'),
        (pl.col('ground_ball') / pl.col('bip')).alias('ground_ball_percent'),
        (pl.col('line_drive') / pl.col('bip')).alias('line_drive_percent'),
        (pl.col('fly_ball') / pl.col('bip')).alias('fly_ball_percent'),
        (pl.col('pop_up') / pl.col('bip')).alias('pop_up_percent'),
        (pl.col('heart') / pl.col('attack_zone')).alias('heart_zone_percent'),
        (pl.col('shadow') / pl.col('attack_zone')).alias('shadow_zone_percent'),
        (pl.col('chase') / pl.col('attack_zone')).alias('chase_zone_percent'),
        (pl.col('waste') / pl.col('attack_zone')).alias('waste_zone_percent'),
        (pl.col('heart_swing') / pl.col('heart')).alias('heart_zone_swing_percent'),
        (pl.col('shadow_swing') / pl.col('shadow')).alias('shadow_zone_swing_percent'),
        (pl.col('chase_swing') / pl.col('chase')).alias('chase_zone_swing_percent'),
        (pl.col('waste_swing') / pl.col('waste')).alias('waste_zone_swing_percent'),
        (pl.col('heart_whiff') / pl.col('heart_swing')).alias('heart_zone_whiff_percent'),
        (pl.col('shadow_whiff') / pl.col('shadow_swing')).alias('shadow_zone_whiff_percent'),
        (pl.col('chase_whiff') / pl.col('chase_swing')).alias('chase_zone_whiff_percent'),
        (pl.col('waste_whiff') / pl.col('waste_swing')).alias('waste_zone_whiff_percent'),
        (pl.col('xwoba') / pl.col('xwoba_codes')).alias('xwoba_percent'),
        (pl.col('xwoba_contact') / pl.col('bip')).alias('xwoba_percent_contact'),
        # Per-pitch averages: each summed characteristic over the pitch count
        (pl.col('tj_stuff_plus') / pl.col('pitches')).alias('tj_stuff_plus_avg'),
        (pl.col('start_speed') / pl.col('pitches')).alias('start_speed_avg'),
        (pl.col('vb') / pl.col('pitches')).alias('vb_avg'),
        (pl.col('ivb') / pl.col('pitches')).alias('ivb_avg'),
        (pl.col('hb') / pl.col('pitches')).alias('hb_avg'),
        (pl.col('x0') / pl.col('pitches')).alias('x0_avg'),
        (pl.col('z0') / pl.col('pitches')).alias('z0_avg'),
        (pl.col('vaa') / pl.col('pitches')).alias('vaa_avg'),
        (pl.col('haa') / pl.col('pitches')).alias('haa_avg'),
        (pl.col('spin_rate') / pl.col('pitches')).alias('spin_rate_avg'),
        (pl.col('extension') / pl.col('pitches')).alias('extension_avg'),
    ])

    return df_summ
stuff_model/__pycache__/feature_engineering.cpython-39.pyc CHANGED
Binary files a/stuff_model/__pycache__/feature_engineering.cpython-39.pyc and b/stuff_model/__pycache__/feature_engineering.cpython-39.pyc differ