nesticot committed on
Commit
c01f41d
·
verified ·
1 Parent(s): d12a6ec

Upload 13 files

Browse files
stuff_model/calculate_arm_angles.py CHANGED
@@ -1,112 +1,112 @@
1
- import polars as pl
2
- import numpy as np
3
- import requests
4
- from io import StringIO
5
-
6
- def calculate_arm_angles(df: pl.DataFrame, pitcher_id: int) -> pl.DataFrame:
7
- def fetch_arm_angle_data(url: str):
8
- headers = {
9
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
10
- }
11
- try:
12
- response = requests.get(url, headers=headers, timeout=5)
13
- if not response.ok or "<html" in response.text.lower():
14
- return None
15
- return pl.read_csv(StringIO(response.text), truncate_ragged_lines=True)
16
- except requests.RequestException:
17
- return None
18
-
19
- date_start = df['game_date'][0]
20
- date_end = df['game_date'][-1]
21
- season = int(date_start[:4])
22
- daily_check = date_start == date_end
23
-
24
- # Try fetching current-season arm angle data
25
- url = (
26
- f"https://baseballsavant.mlb.com/leaderboard/pitcher-arm-angles"
27
- f"?batSide=&dateStart={date_start}&dateEnd={date_end}&gameType=R&groupBy=&min=1"
28
- f"&minGroupPitches=1&perspective=back&pitchHand=&pitchType=&season={season}"
29
- f"&size=small&sort=ascending&team=&csv=true"
30
- )
31
- df_arm_angle = fetch_arm_angle_data(url)
32
- print("ARM ANGLE",df_arm_angle)
33
-
34
- old_data = False
35
- if df_arm_angle is None or pitcher_id not in df_arm_angle["pitcher"]:
36
- old_data = True
37
-
38
- # Fallback to saved CSVs if 2025 data isn't fetched or pitcher not found
39
- try:
40
- df_arm_angle_2025 = pl.read_csv("stuff_model/pitcher_arm_angles_2025.csv", truncate_ragged_lines=True)
41
- except Exception as e:
42
- raise RuntimeError("Failed to load fallback 2025 arm angle CSV.") from e
43
-
44
- try:
45
- df_arm_angle_2024 = pl.read_csv("stuff_model/pitcher_arm_angles_2024.csv", truncate_ragged_lines=True)
46
- df_arm_angle_2024 = df_arm_angle_2024.cast(df_arm_angle_2025.schema)
47
- except Exception as e:
48
- raise RuntimeError("Failed to load or cast 2024 arm angle CSV.") from e
49
-
50
- df_arm_angle = pl.concat([df_arm_angle_2025, df_arm_angle_2024]).unique(subset=["pitcher"], keep="first")
51
-
52
- # Filter your tracking data
53
- df_filter = df.filter(pl.col("pitcher_id") == pitcher_id).drop_nulls(subset=["release_pos_x", "release_pos_z"])
54
-
55
- if pitcher_id not in df_arm_angle["pitcher"]:
56
- data = requests.get(f'https://statsapi.mlb.com/api/v1/people?personIds={pitcher_id}').json()
57
- height_in = data['people'][0]['height']
58
- height = int(height_in.split("'")[0]) * 12 + int(height_in.split("'")[1].split('"')[0])
59
-
60
- df_filter = (
61
- df_filter.with_columns(
62
- (pl.col("release_pos_x") * 12).alias("release_pos_x"),
63
- (pl.col("release_pos_z") * 12).alias("release_pos_z"),
64
- (pl.lit(height * 0.70)).alias("shoulder_pos"),
65
- )
66
- .with_columns(
67
- (pl.col("release_pos_z") - pl.col("shoulder_pos")).alias("Opp"),
68
- pl.col("release_pos_x").abs().alias("Adj"),
69
- )
70
- .with_columns(
71
- pl.struct(["Opp", "Adj"]).map_elements(lambda x: np.arctan2(x["Opp"], x["Adj"])).alias("arm_angle_rad")
72
- )
73
- .with_columns(
74
- pl.col("arm_angle_rad").degrees().alias("arm_angle")
75
- )
76
- )
77
-
78
- else:
79
- row = df_arm_angle.filter(pl.col("pitcher") == pitcher_id).select([
80
- "relative_shoulder_x", "shoulder_z", "relative_release_ball_x", "release_ball_z", "ball_angle"
81
- ]).row(0)
82
- shoulder_x, shoulder_z, rel_x, rel_z, ball_angle = row
83
- hyp = np.sqrt((rel_x - shoulder_x)**2 + (rel_z - shoulder_z)**2)
84
-
85
- df_filter = (
86
- df_filter.with_columns(
87
- (pl.col("release_pos_z") - shoulder_z).alias("Opp"),
88
- pl.lit(hyp).alias("Hyp"),
89
- )
90
- .with_columns(
91
- pl.struct(["Opp", "Hyp"]).map_elements(lambda x: np.arcsin(x["Opp"] / x["Hyp"])).alias("arm_angle_rad")
92
- )
93
- .with_columns(
94
- pl.col("arm_angle_rad").degrees().alias("arm_angle")
95
- )
96
- )
97
-
98
- # Adjust based on data source freshness
99
- if old_data:
100
- df_filter = df_filter.with_columns(((pl.col("arm_angle") * 0.5) + (ball_angle * 0.5)).alias("arm_angle"))
101
- elif daily_check:
102
- df_filter = df_filter.with_columns(((pl.col("arm_angle") * 0.25) + (ball_angle * 0.75)).alias("arm_angle"))
103
- else:
104
- df_filter = df_filter.with_columns(((pl.col("arm_angle") * 0.0) + (ball_angle * 1)).alias("arm_angle"))
105
-
106
- # Fill missing arm_angle values with mean
107
- valid_mean = df_filter["arm_angle"].fill_nan(None).drop_nulls().mean()
108
- df_filter = df_filter.with_columns(
109
- df_filter["arm_angle"].fill_nan(None).fill_null(valid_mean)
110
- )
111
-
112
- return df_filter
 
1
+ import polars as pl
2
+ import numpy as np
3
+ import requests
4
+ from io import StringIO
5
+
6
def calculate_arm_angles(df: pl.DataFrame, pitcher_id: int) -> pl.DataFrame:
    """Estimate a per-pitch arm angle, in degrees, for a single pitcher.

    Reference geometry (shoulder and release coordinates plus ``ball_angle``)
    is looked up in this order:

    1. The Baseball Savant arm-angle leaderboard CSV for the date range
       spanned by ``df``.
    2. The bundled ``pitcher_arm_angles_2025.csv`` / ``..._2024.csv`` files,
       used when the download fails or does not list the pitcher.
    3. A height-based estimate (shoulder placed at 70% of the height returned
       by the MLB stats API) when the pitcher is in neither source.

    Args:
        df: Pitch-level frame. The code reads ``game_date`` (first and last
            row), ``pitcher_id``, ``release_pos_x`` and ``release_pos_z``.
            ``game_date`` is sliced with ``[:4]`` and passed to ``int``, so it
            is presumably a ``YYYY-...`` string -- confirm against the caller.
        pitcher_id: Identifier matched against ``df["pitcher_id"]`` and the
            ``pitcher`` column of the leaderboard data.

    Returns:
        The rows of ``df`` for ``pitcher_id`` that have non-null release
        positions, with the intermediate geometry columns and a final
        ``arm_angle`` column appended.

    Raises:
        RuntimeError: If a bundled fallback CSV cannot be read or cast.
    """
    def fetch_arm_angle_data(url: str):
        """Return the CSV at ``url`` as a DataFrame, or None if unusable."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        try:
            response = requests.get(url, headers=headers, timeout=5)
            # An HTML body means an error/landing page was served, not a CSV.
            if not response.ok or "<html" in response.text.lower():
                return None
            return pl.read_csv(StringIO(response.text), truncate_ragged_lines=True)
        except requests.RequestException:
            return None

    date_start = df['game_date'][0]
    date_end = df['game_date'][-1]
    season = int(date_start[:4])
    # A single-day request is trusted less than a multi-day one (see blend).
    daily_check = date_start == date_end

    # Try fetching current-season arm angle data
    url = (
        f"https://baseballsavant.mlb.com/leaderboard/pitcher-arm-angles"
        f"?batSide=&dateStart={date_start}&dateEnd={date_end}&gameType=R&groupBy=&min=1"
        f"&minGroupPitches=1&perspective=back&pitchHand=&pitchType=&season={season}"
        f"&size=small&sort=ascending&team=&csv=true"
    )
    df_arm_angle = fetch_arm_angle_data(url)
    print("ARM ANGLE",df_arm_angle)

    # A response without a "pitcher" column is treated like a failed download
    # instead of crashing on the column lookup.
    live_data_usable = (
        df_arm_angle is not None
        and "pitcher" in df_arm_angle.columns
        and pitcher_id in df_arm_angle["pitcher"]
    )
    old_data = not live_data_usable

    if old_data:
        # Fallback to saved CSVs if 2025 data isn't fetched or pitcher not found
        try:
            df_arm_angle_2025 = pl.read_csv("stuff_model/pitcher_arm_angles_2025.csv", truncate_ragged_lines=True)
        except Exception as e:
            raise RuntimeError("Failed to load fallback 2025 arm angle CSV.") from e

        try:
            df_arm_angle_2024 = pl.read_csv("stuff_model/pitcher_arm_angles_2024.csv", truncate_ragged_lines=True)
            df_arm_angle_2024 = df_arm_angle_2024.cast(df_arm_angle_2025.schema)
        except Exception as e:
            raise RuntimeError("Failed to load or cast 2024 arm angle CSV.") from e

        # 2025 rows are concatenated first so they win the de-duplication.
        df_arm_angle = pl.concat([df_arm_angle_2025, df_arm_angle_2024]).unique(subset=["pitcher"], keep="first")

    # Filter your tracking data
    df_filter = df.filter(pl.col("pitcher_id") == pitcher_id).drop_nulls(subset=["release_pos_x", "release_pos_z"])

    if pitcher_id not in df_arm_angle["pitcher"]:
        # No reference geometry anywhere: estimate the shoulder from height.
        # The timeout keeps a stalled stats API call from hanging the caller.
        data = requests.get(
            f'https://statsapi.mlb.com/api/v1/people?personIds={pitcher_id}',
            timeout=5,
        ).json()
        height_in = data['people'][0]['height']
        # The height string is split on ' and " into feet and inches.
        feet_part, inches_part = height_in.split("'")[:2]
        height = int(feet_part) * 12 + int(inches_part.split('"')[0])

        # NOTE(review): this branch overwrites release_pos_x/z with values
        # multiplied by 12, while the leaderboard branch leaves them untouched
        # -- confirm downstream consumers expect that difference.
        df_filter = (
            df_filter.with_columns(
                (pl.col("release_pos_x") * 12).alias("release_pos_x"),
                (pl.col("release_pos_z") * 12).alias("release_pos_z"),
                (pl.lit(height * 0.70)).alias("shoulder_pos"),
            )
            .with_columns(
                (pl.col("release_pos_z") - pl.col("shoulder_pos")).alias("Opp"),
                pl.col("release_pos_x").abs().alias("Adj"),
            )
            .with_columns(
                pl.arctan2(pl.col("Opp"), pl.col("Adj")).alias("arm_angle_rad")
            )
            .with_columns(
                pl.col("arm_angle_rad").degrees().alias("arm_angle")
            )
        )
    else:
        row = df_arm_angle.filter(pl.col("pitcher") == pitcher_id).select([
            "relative_shoulder_x", "shoulder_z", "relative_release_ball_x", "release_ball_z", "ball_angle"
        ]).row(0)
        shoulder_x, shoulder_z, rel_x, rel_z, ball_angle = row
        # Distance from shoulder to release point in the reference data.
        hyp = np.sqrt((rel_x - shoulder_x)**2 + (rel_z - shoulder_z)**2)

        df_filter = (
            df_filter.with_columns(
                (pl.col("release_pos_z") - shoulder_z).alias("Opp"),
                pl.lit(hyp).alias("Hyp"),
            )
            .with_columns(
                # Ratios outside [-1, 1] give NaN here and are mean-filled below.
                (pl.col("Opp") / pl.col("Hyp")).arcsin().alias("arm_angle_rad")
            )
            .with_columns(
                pl.col("arm_angle_rad").degrees().alias("arm_angle")
            )
        )

        # Adjust based on data source freshness.  ``ball_angle`` only exists in
        # this branch, so the blend must stay here: running it after the
        # height-based branch would reference an unbound name.
        if old_data:
            pitch_weight, reference_weight = 0.5, 0.5
        elif daily_check:
            pitch_weight, reference_weight = 0.25, 0.75
        else:
            pitch_weight, reference_weight = 0.0, 1
        df_filter = df_filter.with_columns(
            ((pl.col("arm_angle") * pitch_weight) + (ball_angle * reference_weight)).alias("arm_angle")
        )

    # Fill missing arm_angle values with mean
    cleaned_angle = df_filter["arm_angle"].fill_nan(None)
    valid_mean = cleaned_angle.drop_nulls().mean()
    if valid_mean is not None:
        cleaned_angle = cleaned_angle.fill_null(valid_mean)
    # With no valid value at all there is nothing to fill with; keep the nulls.
    df_filter = df_filter.with_columns(cleaned_angle)

    return df_filter
stuff_model/df_update.py CHANGED
@@ -1,650 +1,650 @@
1
- import polars as pl
2
- import numpy as np
3
- import joblib
4
-
5
# Pre-trained estimators, deserialised once when this module is imported.
# The paths are relative, so they resolve against the process working directory.
# NOTE(review): loaded_model is not referenced in the part of the file visible
# here -- confirm it is still needed before removing.
loaded_model = joblib.load('joblib_model/barrel_model.joblib')
# Fills in_zone where the feed's zone code is unusable (see df_update.update).
in_zone_model = joblib.load('joblib_model/in_zone_model_knn_20240410.joblib')
# Produces the 0-3 codes that update() labels heart/shadow/chase/waste.
attack_zone_model = joblib.load('joblib_model/model_attack_zone.joblib')
# predict_proba output is weighted by wOBA constants in update().
xwoba_model = joblib.load('joblib_model/xwoba_model.joblib')
# Regressors applied to the 'x' / 'y' columns to back-fill missing px / pz.
px_model = joblib.load('joblib_model/linear_reg_model_x.joblib')
pz_model = joblib.load('joblib_model/linear_reg_model_z.joblib')
11
-
12
-
13
- class df_update:
14
- def __init__(self):
15
- pass
16
-
17
def update(self, df_clone: pl.DataFrame) -> pl.DataFrame:
    """Return a copy of ``df_clone`` enriched with per-pitch derived columns.

    The steps, in order: outcome flags and wOBA weights, model-based
    back-filling of pitch location / zone / attack zone, expected wOBA from
    launch angle and speed, one-hot batted-ball trajectory columns, pitch
    grouping, approach angles, handedness mirroring, spray direction and
    per-hit-type indicators.

    Args:
        df_clone: Pitch-level frame; it is cloned, not modified.  The code
            reads, among others, ``event_type``, ``play_code``,
            ``play_description``, ``launch_speed``, ``launch_angle``,
            ``zone``, ``px``/``pz``, ``x``/``y``, ``sz_top``/``sz_bot``,
            ``trajectory``, ``pitch_type``, the ``vx0..az`` kinematics,
            ``pitcher_hand``, ``batter_hand``, ``hit_x`` and ``hit_y``.

    Returns:
        The enriched frame.
    """
    df = df_clone.clone()

    hit_codes = ['single',
                 'double','home_run', 'triple']

    ab_codes = ['single', 'strikeout', 'field_out',
                'grounded_into_double_play', 'fielders_choice', 'force_out',
                'double', 'field_error', 'home_run', 'triple',
                'double_play',
                'fielders_choice_out', 'strikeout_double_play',
                'other_out','triple_play']

    obp_true_codes = ['single', 'walk',
                      'double','home_run', 'triple',
                      'hit_by_pitch', 'intent_walk']

    obp_codes = ['single', 'strikeout', 'walk', 'field_out',
                 'grounded_into_double_play', 'fielders_choice', 'force_out',
                 'double', 'sac_fly', 'field_error', 'home_run', 'triple',
                 'hit_by_pitch', 'double_play', 'intent_walk',
                 'fielders_choice_out', 'strikeout_double_play',
                 'sac_fly_double_play',
                 'other_out','triple_play']

    bip_codes = ['In play, no out', 'In play, run(s)','In play, out(s)']

    # Barrel: missing exit velocity -> False, rule satisfied -> True, else null.
    conditions_barrel = [
        df['launch_speed'].is_null(),
        (df['launch_speed'] * 1.5 - df['launch_angle'] >= 117) &
        (df['launch_speed'] + df['launch_angle'] >= 124) &
        (df['launch_speed'] >= 98) &
        (df['launch_angle'] >= 4) & (df['launch_angle'] <= 50)
    ]
    choices_barrel = [False, True]

    conditions_tb = [
        (df['event_type'] == 'single'),
        (df['event_type'] == 'double'),
        (df['event_type'] == 'triple'),
        (df['event_type'] == 'home_run')
    ]
    choices_tb = [1, 2, 3, 4]

    conditions_woba = [
        df['event_type'].is_in(['strikeout', 'field_out', 'sac_fly', 'force_out', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']),
        df['event_type'] == 'walk',
        df['event_type'] == 'hit_by_pitch',
        df['event_type'] == 'single',
        df['event_type'] == 'double',
        df['event_type'] == 'triple',
        df['event_type'] == 'home_run'
    ]
    choices_woba = [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]

    woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch', 'double', 'sac_fly', 'force_out', 'home_run', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'triple', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']

    # Outcome flags, cleaned strike-zone bounds and model location estimates.
    df = df.with_columns([
        pl.when(df['type_ab'].is_not_null()).then(1).otherwise(0).alias('pa'),
        pl.when(df['is_pitch']).then(1).otherwise(0).alias('pitches'),
        pl.when(df['sz_top'] == 0).then(None).otherwise(df['sz_top']).alias('sz_top'),
        pl.when(df['sz_bot'] == 0).then(None).otherwise(df['sz_bot']).alias('sz_bot'),
        pl.when(df['zone'] > 0).then(df['zone'] < 10).otherwise(None).alias('in_zone'),
        pl.Series(px_model.predict(df[['x']].fill_null(0).to_numpy())[:, 0]).alias('px_predict'),
        pl.Series(pz_model.predict(df[['y']].fill_null(0).to_numpy())[:, 0]).alias('pz_predict'),

        pl.when(df['event_type'].is_in(hit_codes)).then(True).otherwise(False).alias('hits'),
        pl.when(df['event_type'].is_in(ab_codes)).then(True).otherwise(False).alias('ab'),
        pl.when(df['event_type'].is_in(obp_true_codes)).then(True).otherwise(False).alias('on_base'),
        pl.when(df['event_type'].is_in(obp_codes)).then(True).otherwise(False).alias('obp'),
        pl.when(df['play_description'].is_in(bip_codes)).then(True).otherwise(False).alias('bip'),
        pl.when(conditions_barrel[0]).then(choices_barrel[0]).when(conditions_barrel[1]).then(choices_barrel[1]).otherwise(None).alias('barrel'),
        pl.when(df['launch_angle'].is_null()).then(False).when((df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)).then(True).otherwise(None).alias('sweet_spot'),
        pl.when(df['launch_speed'].is_null()).then(False).when(df['launch_speed'] >= 94.5).then(True).otherwise(None).alias('hard_hit'),
        pl.when(conditions_tb[0]).then(choices_tb[0]).when(conditions_tb[1]).then(choices_tb[1]).when(conditions_tb[2]).then(choices_tb[2]).when(conditions_tb[3]).then(choices_tb[3]).otherwise(None).alias('tb'),
        pl.when(conditions_woba[0]).then(choices_woba[0]).when(conditions_woba[1]).then(choices_woba[1]).when(conditions_woba[2]).then(choices_woba[2]).when(conditions_woba[3]).then(choices_woba[3]).when(conditions_woba[4]).then(choices_woba[4]).when(conditions_woba[5]).then(choices_woba[5]).when(conditions_woba[6]).then(choices_woba[6]).otherwise(None).alias('woba'),
        pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T')).then(1).otherwise(0).alias('whiffs'),
        pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T') | (df['play_code'] == 'C')).then(1).otherwise(0).alias('csw'),
        pl.when(pl.col('is_swing').cast(pl.Boolean)).then(1).otherwise(0).alias('swings'),
        # 'k' and 'bb' are recomputed further down; these first definitions
        # are kept so the columns stay at the same position in the frame.
        pl.col('event_type').is_in(['strikeout','strikeout_double_play']).alias('k'),
        pl.col('event_type').is_in(['walk', 'intent_walk']).alias('bb'),
        # Placeholders; the *_final / woba_pred columns are filled in below.
        pl.lit(None).alias('attack_zone'),
        pl.lit(None).alias('woba_pred'),
        pl.lit(None).alias('woba_pred_contact')
    ])

    df = df.with_columns([
        pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('woba_codes'),
        pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('xwoba_codes'),
        pl.when((pl.col('tb') >= 0)).then(df['woba']).otherwise(None).alias('woba_contact'),
    ])

    # NOTE(review): the zone models run before px/pz are back-filled from
    # px_predict/pz_predict, so rows with missing px/pz are scored with 0
    # substituted -- confirm that ordering is intended.
    df = df.with_columns([
        pl.Series(in_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('in_zone_predict'),
        pl.Series(attack_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('attack_zone_predict'),
    ])

    df = df.with_columns([
        pl.when(pl.col('px').is_null()).then(pl.col('px_predict')).otherwise(pl.col('px')).alias('px'),
        pl.when(pl.col('pz').is_null()).then(pl.col('pz_predict')).otherwise(pl.col('pz')).alias('pz'),
        pl.when(pl.col('in_zone').is_null()).then(pl.col('in_zone_predict')).otherwise(pl.col('in_zone')).alias('in_zone_final'),
    ])

    df = df.with_columns([
        pl.when(df['launch_speed'].is_null()).then(None).otherwise(df['barrel']).alias('barrel'),
        pl.lit('average').alias('average'),
        pl.when(pl.col('in_zone_final') == False).then(True).otherwise(False).alias('out_zone'),
        pl.when((pl.col('in_zone_final') == True) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('zone_swing'),
        pl.when((pl.col('in_zone_final') == True) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('zone_contact'),
        pl.when((pl.col('in_zone_final') == False) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('ozone_swing'),
        pl.when((pl.col('in_zone_final') == False) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('ozone_contact'),
        pl.when(pl.col('event_type').str.contains('strikeout')).then(True).otherwise(False).alias('k'),
        pl.when(pl.col('event_type').is_in(['walk', 'intent_walk'])).then(True).otherwise(False).alias('bb'),
        pl.when(pl.col('attack_zone').is_null()).then(pl.col('attack_zone_predict')).otherwise(pl.col('attack_zone')).alias('attack_zone_final'),
    ])

    # Attack-zone codes 0..3 are labelled heart / shadow / chase / waste.
    df = df.with_columns([
        (df['k'].cast(pl.Float32) - df['bb'].cast(pl.Float32)).alias('k_minus_bb'),
        (df['bb'].cast(pl.Float32) - df['k'].cast(pl.Float32)).alias('bb_minus_k'),
        (df['launch_speed'] > 0).alias('bip_div'),
        (df['attack_zone_final'] == 0).alias('heart'),
        (df['attack_zone_final'] == 1).alias('shadow'),
        (df['attack_zone_final'] == 2).alias('chase'),
        (df['attack_zone_final'] == 3).alias('waste'),
        ((df['attack_zone_final'] == 0) & (df['swings'] == 1)).alias('heart_swing'),
        ((df['attack_zone_final'] == 1) & (df['swings'] == 1)).alias('shadow_swing'),
        ((df['attack_zone_final'] == 2) & (df['swings'] == 1)).alias('chase_swing'),
        ((df['attack_zone_final'] == 3) & (df['swings'] == 1)).alias('waste_swing'),
        ((df['attack_zone_final'] == 0) & (df['whiffs'] == 1)).alias('heart_whiff'),
        ((df['attack_zone_final'] == 1) & (df['whiffs'] == 1)).alias('shadow_whiff'),
        ((df['attack_zone_final'] == 2) & (df['whiffs'] == 1)).alias('chase_whiff'),
        ((df['attack_zone_final'] == 3) & (df['whiffs'] == 1)).alias('waste_whiff')
    ])

    # Expected wOBA: class probabilities weighted by the out/1B/2B/3B/HR
    # values that also appear in choices_woba.
    df = df.with_columns([
        pl.Series(
            [sum(x) for x in xwoba_model.predict_proba(df[['launch_angle', 'launch_speed']].fill_null(0).to_numpy()[:]) * ([0, 0.881, 1.254, 1.589, 2.048])]
        ).alias('woba_pred_predict')
    ])

    # Outcomes that never involve contact get their fixed wOBA value.
    df = df.with_columns([
        pl.when(pl.col('event_type').is_in(['walk'])).then(0.689)
        .when(pl.col('event_type').is_in(['hit_by_pitch'])).then(0.720)
        .when(pl.col('event_type').is_in(['strikeout', 'strikeout_double_play'])).then(0)
        .otherwise(pl.col('woba_pred_predict')).alias('woba_pred_predict')
    ])

    df = df.with_columns([
        pl.when(pl.col('woba_codes').is_null()).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred'),
        pl.when(pl.col('bip')!=1).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred_contact'),
    ])

    # Fold bunt trajectories into the regular ones; empty string becomes null.
    df = df.with_columns([
        pl.when(pl.col('trajectory').is_in(['bunt_popup'])).then(pl.lit('popup'))
        .when(pl.col('trajectory').is_in(['bunt_grounder'])).then(pl.lit('ground_ball'))
        .when(pl.col('trajectory').is_in(['bunt_line_drive'])).then(pl.lit('line_drive'))
        .when(pl.col('trajectory').is_in([''])).then(pl.lit(None))
        .otherwise(pl.col('trajectory')).alias('trajectory')
    ])

    # Create one-hot encoded columns for the trajectory column
    dummy_df = df.select(pl.col('trajectory')).to_dummies()

    # The former identity rename (each name mapped to itself) was removed: it
    # changed nothing when all four columns existed and raised when one was
    # absent, before the loop below could add the missing column.

    # Ensure the columns are present in the DataFrame
    for col in ['trajectory_fly_ball', 'trajectory_ground_ball', 'trajectory_line_drive', 'trajectory_popup']:
        if col not in dummy_df.columns:
            dummy_df = dummy_df.with_columns(pl.lit(0).alias(col))

    # Join the one-hot encoded columns back to the original DataFrame
    df = df.hstack(dummy_df)

    # Check if 'trajectory_null' column exists and drop it
    if 'trajectory_null' in df.columns:
        df = df.drop('trajectory_null')

    pitch_cat = {'FA': None,
                 'FF': 'Fastball',
                 'FT': 'Fastball',
                 'FC': 'Fastball',
                 'FS': 'Off-Speed',
                 'FO': 'Off-Speed',
                 'SI': 'Fastball',
                 'ST': 'Breaking',
                 'SL': 'Breaking',
                 'CU': 'Breaking',
                 'KC': 'Breaking',
                 'SC': 'Off-Speed',
                 'GY': 'Off-Speed',
                 'SV': 'Breaking',
                 'CS': 'Breaking',
                 'CH': 'Off-Speed',
                 'KN': 'Off-Speed',
                 'EP': 'Breaking',
                 'UN': None,
                 'IN': None,
                 'PO': None,
                 'AB': None,
                 'AS': None,
                 'NP': None}
    df = df.with_columns(
        df["pitch_type"]
        .replace(pitch_cat)
        .fill_null("Unknown")
        .alias("pitch_group")
    )

    # Constant-acceleration kinematics: velocity at y = 17/12, then the time
    # taken to get there, then the other two velocity components at that time.
    df = df.with_columns([
        (-(pl.col('vy0')**2 - (2 * pl.col('ay') * (pl.col('y0') - 17/12)))**0.5).alias('vy_f'),
    ])

    df = df.with_columns([
        ((pl.col('vy_f') - pl.col('vy0')) / pl.col('ay')).alias('t'),
    ])

    df = df.with_columns([
        (pl.col('vz0') + (pl.col('az') * pl.col('t'))).alias('vz_f'),
        (pl.col('vx0') + (pl.col('ax') * pl.col('t'))).alias('vx_f')
    ])

    # Vertical / horizontal approach angles in degrees.
    df = df.with_columns([
        (-np.arctan(pl.col('vz_f') / pl.col('vy_f')) * (180 / np.pi)).alias('vaa'),
        (-np.arctan(pl.col('vx_f') / pl.col('vy_f')) * (180 / np.pi)).alias('haa')
    ])

    # Mirror horizontal acceleration for left-handed pitchers
    df = df.with_columns(
        pl.when(pl.col('pitcher_hand') == 'L')
        .then(-pl.col('ax'))
        .otherwise(pl.col('ax'))
        .alias('ax')
    )

    # Mirror horizontal break for left-handed pitchers
    df = df.with_columns(
        pl.when(pl.col('pitcher_hand') == 'L')
        .then(-pl.col('hb'))
        .otherwise(pl.col('hb'))
        .alias('hb')
    )

    # Horizontal release point.  NOTE(review): unlike ax/hb above, x0 is kept
    # for 'L' and negated for everyone else -- confirm the sign convention.
    df = df.with_columns(
        pl.when(pl.col('pitcher_hand') == 'L')
        .then(pl.col('x0'))
        .otherwise(-pl.col('x0'))
        .alias('x0')
    )

    # is_swing / is_bip become copies of the derived swings / bip columns.
    df = df.with_columns([
        pl.when(df['swings'].is_null()).then(None).otherwise(df['swings']).alias('is_swing'),
        pl.when(df['bip'].is_null()).then(None).otherwise(df['bip']).alias('is_bip')])

    df = df.with_columns([
        (np.arctan((pl.col("hit_x")*-1 + 125.42) / (198.27 - pl.col("hit_y"))) * 180 / np.pi * 0.75).alias("spray_angle")
    ])

    # Flip the sign for left-handed batters so the cut-offs below apply to both.
    df = df.with_columns([
        pl.when(pl.col("batter_hand") == "L")
        .then(-pl.col("spray_angle"))
        .otherwise(pl.col("spray_angle"))
        .alias("adj_spray_angle")
    ]).drop("spray_angle")

    df = df.with_columns([
        pl.when(pl.col("adj_spray_angle").is_not_null() & (pl.col("adj_spray_angle") < -15))
        .then(pl.lit("oppo"))
        .when(pl.col("adj_spray_angle").is_not_null() & (pl.col("adj_spray_angle") > 15))
        .then(pl.lit("pull"))
        .when(pl.col("adj_spray_angle").is_not_null())
        .then(pl.lit("straight"))
        .otherwise(None)  # Keep null if adj_spray_angle is null
        .alias("hit_direction")
    ])

    df = df.with_columns([
        pl.when(pl.col("hit_direction") == "oppo").then(1).otherwise(None).alias("oppo"),
        pl.when(pl.col("hit_direction") == "pull").then(1).otherwise(None).alias("pull"),
        pl.when(pl.col("hit_direction") == "straight").then(1).otherwise(None).alias("straight")
    ])

    df = df.with_columns([
        pl.when(pl.col("event_type") == "single").then(1).otherwise(0).alias("single"),
        pl.when(pl.col("event_type") == "double").then(1).otherwise(0).alias("double"),
        pl.when(pl.col("event_type") == "triple").then(1).otherwise(0).alias("triple"),
        pl.when(pl.col("event_type") == "home_run").then(1).otherwise(0).alias("home_run")
    ])

    return df
372
-
373
- # Assuming df is your Polars DataFrame
374
def update_summary(self, df: pl.DataFrame, pitcher: bool = True) -> pl.DataFrame:
    """
    Aggregate per-pitch columns into one summary row per pitcher or batter.

    Parameters:
    df (pl.DataFrame): Pitch-level frame carrying the derived columns that are
        summed below (``pa``, ``ab``, ``whiffs``, ``heart`` ...) in addition to
        ``launch_speed``, ``launch_angle``, ``extension``, ``start_speed`` and
        ``pitch_type``.
    pitcher (bool): Group by ``pitcher_id``/``pitcher_name`` when True,
        by ``batter_id``/``batter_name`` when False.

    Returns:
    pl.DataFrame: One row per player with the raw totals followed by the rate
        columns derived from them.
    """
    position = 'pitcher' if pitcher else 'batter'

    def total(source: str, alias: str = None) -> pl.Expr:
        """Sum ``source`` and name the result ``alias`` (default: ``source``)."""
        return pl.col(source).sum().alias(alias or source)

    def ratio(numerator: str, denominator: str, alias: str) -> pl.Expr:
        """Divide two summary columns and name the result ``alias``."""
        return (pl.col(numerator) / pl.col(denominator)).alias(alias)

    # The attack-zone denominators must count the column the heart / shadow /
    # chase / waste flags were derived from.  The raw ``attack_zone`` column
    # is an all-null placeholder in frames built by ``update``, so counting it
    # gives 0 wherever ``count`` skips nulls.  Frames without the ``_final``
    # column keep the previous behaviour.
    zone_source = 'attack_zone_final' if 'attack_zone_final' in df.columns else 'attack_zone'

    fly_or_line = (pl.col('trajectory_fly_ball') == 1) | (pl.col('trajectory_line_drive') == 1)

    # Output column order is part of the result; keep the specs in this order.
    aggregations = [
        *(total(*spec) for spec in [
            ('pa',), ('ab',), ('obp', 'obp_pa'), ('hits',), ('on_base',),
            ('k',), ('bb',), ('k_minus_bb',), ('bb_minus_k',), ('csw',),
            ('bip',), ('bip_div',), ('tb',), ('woba',), ('woba_contact',),
            ('woba_pred', 'xwoba'), ('woba_pred_contact', 'xwoba_contact'),
            ('woba_codes',), ('xwoba_codes',),
            ('hard_hit',), ('barrel',), ('sweet_spot',),
        ]),
        pl.col('launch_speed').max().alias('max_launch_speed'),
        pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
        pl.col('launch_speed').mean().alias('launch_speed'),
        pl.col('launch_angle').mean().alias('launch_angle'),
        *(total(*spec) for spec in [
            ('is_pitch', 'pitches'), ('swings',), ('in_zone',), ('out_zone',),
            ('whiffs',), ('zone_swing',), ('zone_contact',),
            ('ozone_swing',), ('ozone_contact',),
            ('trajectory_ground_ball', 'ground_ball'),
            ('trajectory_line_drive', 'line_drive'),
            ('trajectory_fly_ball', 'fly_ball'),
            ('trajectory_popup', 'pop_up'),
        ]),
        pl.col(zone_source).count().alias('attack_zone'),
        *(total(*spec) for spec in [
            ('heart',), ('shadow',), ('chase',), ('waste',),
            ('heart_swing',), ('shadow_swing',), ('chase_swing',), ('waste_swing',),
            ('heart_whiff',), ('shadow_whiff',), ('chase_whiff',), ('waste_whiff',),
            ('pull',), ('straight',), ('oppo',),
        ]),
        fly_or_line.sum().alias('fly_line_bip'),
        (pl.col('pull') & fly_or_line).sum().alias('pull_fly_ball'),
        *(total(*spec) for spec in [
            ('single',), ('double',), ('triple',), ('home_run',),
        ]),
        (pl.col('extension').mean()).alias('extension'),
        (pl.col('start_speed').filter(pl.col('pitch_type').is_in(['FF','SI'])).mean().alias('avg_start_speed_ff')),
    ]

    # Group by position_id and position_name, then aggregate various statistics
    df_summ = df.group_by([f'{position}_id', f'{position}_name']).agg(aggregations)

    # Rates computed from the totals above: (numerator, denominator, alias).
    rates_before_ops = [
        ('hits', 'ab', 'avg'),
        ('on_base', 'obp_pa', 'obp'),
        ('tb', 'ab', 'slg'),
    ]
    rates_after_ops = [
        ('k', 'pa', 'k_percent'),
        ('bb', 'pa', 'bb_percent'),
        ('k_minus_bb', 'pa', 'k_minus_bb_percent'),
        ('bb_minus_k', 'pa', 'bb_minus_k_percent'),
        ('bb', 'k', 'bb_over_k_percent'),
        ('csw', 'pitches', 'csw_percent'),
        ('sweet_spot', 'bip_div', 'sweet_spot_percent'),
        ('woba', 'woba_codes', 'woba_percent'),
        ('woba_contact', 'bip', 'woba_percent_contact'),
        ('hard_hit', 'bip_div', 'hard_hit_percent'),
        ('barrel', 'bip_div', 'barrel_percent'),
        ('zone_contact', 'zone_swing', 'zone_contact_percent'),
        ('zone_swing', 'in_zone', 'zone_swing_percent'),
        ('in_zone', 'pitches', 'zone_percent'),
        ('ozone_swing', 'out_zone', 'chase_percent'),
        ('ozone_contact', 'ozone_swing', 'chase_contact'),
        ('swings', 'pitches', 'swing_percent'),
        ('whiffs', 'swings', 'whiff_rate'),
        ('whiffs', 'pitches', 'swstr_rate'),
        ('ground_ball', 'bip', 'ground_ball_percent'),
        ('line_drive', 'bip', 'line_drive_percent'),
        ('fly_ball', 'bip', 'fly_ball_percent'),
        ('pop_up', 'bip', 'pop_up_percent'),
        ('heart', 'attack_zone', 'heart_zone_percent'),
        ('shadow', 'attack_zone', 'shadow_zone_percent'),
        ('chase', 'attack_zone', 'chase_zone_percent'),
        ('waste', 'attack_zone', 'waste_zone_percent'),
        ('heart_swing', 'heart', 'heart_zone_swing_percent'),
        ('shadow_swing', 'shadow', 'shadow_zone_swing_percent'),
        ('chase_swing', 'chase', 'chase_zone_swing_percent'),
        ('waste_swing', 'waste', 'waste_zone_swing_percent'),
        ('heart_whiff', 'heart_swing', 'heart_zone_whiff_percent'),
        ('shadow_whiff', 'shadow_swing', 'shadow_zone_whiff_percent'),
        ('chase_whiff', 'chase_swing', 'chase_zone_whiff_percent'),
        ('waste_whiff', 'waste_swing', 'waste_zone_whiff_percent'),
        ('xwoba', 'xwoba_codes', 'xwoba_percent'),
        ('xwoba_contact', 'bip', 'xwoba_percent_contact'),
        ('pull', 'bip', 'pull_percent'),
        ('straight', 'bip', 'straight_percent'),
        ('oppo', 'bip', 'oppo_percent'),
        ('pull_fly_ball', 'fly_line_bip', 'pulled_fly_ball_percent'),
    ]

    # Add calculated columns to the summary DataFrame
    df_summ = df_summ.with_columns([
        *(ratio(*spec) for spec in rates_before_ops),
        # OPS is the one rate that is a sum of two ratios.
        (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
        *(ratio(*spec) for spec in rates_after_ops),
    ])

    return df_summ
513
-
514
-
515
-
516
-
517
-
518
-
519
- # Assuming df is your Polars DataFrame
520
- def update_summary_select(self, df: pl.DataFrame, selection: list) -> pl.DataFrame:
521
- """
522
- Update summary statistics for pitchers or batters.
523
-
524
- Parameters:
525
- df (pl.DataFrame): The input Polars DataFrame containing player statistics.
526
- pitcher (bool): A flag indicating whether to calculate statistics for pitchers (True) or batters (False).
527
-
528
- Returns:
529
- pl.DataFrame: A Polars DataFrame with aggregated and calculated summary statistics.
530
- """
531
-
532
- # Group by position_id and position_name, then aggregate various statistics
533
- df_summ = df.group_by(selection).agg([
534
- pl.col('pa').sum().alias('pa'),
535
- pl.col('ab').sum().alias('ab'),
536
- pl.col('obp').sum().alias('obp_pa'),
537
- pl.col('hits').sum().alias('hits'),
538
- pl.col('on_base').sum().alias('on_base'),
539
- pl.col('k').sum().alias('k'),
540
- pl.col('bb').sum().alias('bb'),
541
- pl.col('k_minus_bb').sum().alias('k_minus_bb'),
542
- pl.col('bb_minus_k').sum().alias('bb_minus_k'),
543
- pl.col('csw').sum().alias('csw'),
544
- pl.col('bip').sum().alias('bip'),
545
- pl.col('bip_div').sum().alias('bip_div'),
546
- pl.col('tb').sum().alias('tb'),
547
- pl.col('woba').sum().alias('woba'),
548
- pl.col('woba_contact').sum().alias('woba_contact'),
549
- pl.col('woba_pred').sum().alias('xwoba'),
550
- pl.col('woba_pred_contact').sum().alias('xwoba_contact'),
551
- pl.col('woba_codes').sum().alias('woba_codes'),
552
- pl.col('xwoba_codes').sum().alias('xwoba_codes'),
553
- pl.col('hard_hit').sum().alias('hard_hit'),
554
- pl.col('barrel').sum().alias('barrel'),
555
- pl.col('sweet_spot').sum().alias('sweet_spot'),
556
- pl.col('launch_speed').max().alias('max_launch_speed'),
557
- pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
558
- pl.col('launch_speed').mean().alias('launch_speed'),
559
- pl.col('launch_angle').mean().alias('launch_angle'),
560
- pl.col('is_pitch').sum().alias('pitches'),
561
- pl.col('swings').sum().alias('swings'),
562
- pl.col('in_zone').sum().alias('in_zone'),
563
- pl.col('out_zone').sum().alias('out_zone'),
564
- pl.col('whiffs').sum().alias('whiffs'),
565
- pl.col('zone_swing').sum().alias('zone_swing'),
566
- pl.col('zone_contact').sum().alias('zone_contact'),
567
- pl.col('ozone_swing').sum().alias('ozone_swing'),
568
- pl.col('ozone_contact').sum().alias('ozone_contact'),
569
- pl.col('trajectory_ground_ball').sum().alias('ground_ball'),
570
- pl.col('trajectory_line_drive').sum().alias('line_drive'),
571
- pl.col('trajectory_fly_ball').sum().alias('fly_ball'),
572
- pl.col('trajectory_popup').sum().alias('pop_up'),
573
- pl.col('attack_zone').count().alias('attack_zone'),
574
- pl.col('heart').sum().alias('heart'),
575
- pl.col('shadow').sum().alias('shadow'),
576
- pl.col('chase').sum().alias('chase'),
577
- pl.col('waste').sum().alias('waste'),
578
- pl.col('heart_swing').sum().alias('heart_swing'),
579
- pl.col('shadow_swing').sum().alias('shadow_swing'),
580
- pl.col('chase_swing').sum().alias('chase_swing'),
581
- pl.col('waste_swing').sum().alias('waste_swing'),
582
- pl.col('heart_whiff').sum().alias('heart_whiff'),
583
- pl.col('shadow_whiff').sum().alias('shadow_whiff'),
584
- pl.col('chase_whiff').sum().alias('chase_whiff'),
585
- pl.col('waste_whiff').sum().alias('waste_whiff'),
586
- pl.col('pull').sum().alias('pull'),
587
- pl.col('straight').sum().alias('straight'),
588
- pl.col('oppo').sum().alias('oppo'),
589
- ((pl.col('trajectory_fly_ball') == 1) | (pl.col('trajectory_line_drive') == 1)).sum().alias('fly_line_bip'),
590
- (pl.col('pull') & ((pl.col('trajectory_fly_ball') == 1) | (pl.col('trajectory_line_drive') == 1))).sum().alias('pull_fly_ball'),
591
- pl.col('single').sum().alias('single'),
592
- pl.col('double').sum().alias('double'),
593
- pl.col('triple').sum().alias('triple'),
594
- pl.col('home_run').sum().alias('home_run'),
595
- (pl.col('extension').mean()).alias('extension'),
596
- (pl.col('start_speed').filter(pl.col('pitch_type').is_in(['FF','SI','FC'])).mean().alias('avg_start_speed_ff')),
597
-
598
-
599
- ])
600
-
601
- # Add calculated columns to the summary DataFrame
602
- df_summ = df_summ.with_columns([
603
- (pl.col('hits') / pl.col('ab')).alias('avg'),
604
- (pl.col('on_base') / pl.col('obp_pa')).alias('obp'),
605
- (pl.col('tb') / pl.col('ab')).alias('slg'),
606
- (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
607
- (pl.col('k') / pl.col('pa')).alias('k_percent'),
608
- (pl.col('bb') / pl.col('pa')).alias('bb_percent'),
609
- (pl.col('k_minus_bb') / pl.col('pa')).alias('k_minus_bb_percent'),
610
- (pl.col('bb_minus_k') / pl.col('pa')).alias('bb_minus_k_percent'),
611
- (pl.col('bb') / pl.col('k')).alias('bb_over_k_percent'),
612
- (pl.col('csw') / pl.col('pitches')).alias('csw_percent'),
613
- (pl.col('sweet_spot') / pl.col('bip_div')).alias('sweet_spot_percent'),
614
- (pl.col('woba') / pl.col('woba_codes')).alias('woba_percent'),
615
- (pl.col('woba_contact') / pl.col('bip')).alias('woba_percent_contact'),
616
- (pl.col('hard_hit') / pl.col('bip_div')).alias('hard_hit_percent'),
617
- (pl.col('barrel') / pl.col('bip_div')).alias('barrel_percent'),
618
- (pl.col('zone_contact') / pl.col('zone_swing')).alias('zone_contact_percent'),
619
- (pl.col('zone_swing') / pl.col('in_zone')).alias('zone_swing_percent'),
620
- (pl.col('in_zone') / pl.col('pitches')).alias('zone_percent'),
621
- (pl.col('ozone_swing') / (pl.col('pitches') - pl.col('in_zone'))).alias('chase_percent'),
622
- (pl.col('ozone_contact') / pl.col('ozone_swing')).alias('chase_contact'),
623
- (pl.col('swings') / pl.col('pitches')).alias('swing_percent'),
624
- (pl.col('whiffs') / pl.col('swings')).alias('whiff_rate'),
625
- (pl.col('whiffs') / pl.col('pitches')).alias('swstr_rate'),
626
- (pl.col('ground_ball') / pl.col('bip')).alias('ground_ball_percent'),
627
- (pl.col('line_drive') / pl.col('bip')).alias('line_drive_percent'),
628
- (pl.col('fly_ball') / pl.col('bip')).alias('fly_ball_percent'),
629
- (pl.col('pop_up') / pl.col('bip')).alias('pop_up_percent'),
630
- (pl.col('heart') / pl.col('attack_zone')).alias('heart_zone_percent'),
631
- (pl.col('shadow') / pl.col('attack_zone')).alias('shadow_zone_percent'),
632
- (pl.col('chase') / pl.col('attack_zone')).alias('chase_zone_percent'),
633
- (pl.col('waste') / pl.col('attack_zone')).alias('waste_zone_percent'),
634
- (pl.col('heart_swing') / pl.col('heart')).alias('heart_zone_swing_percent'),
635
- (pl.col('shadow_swing') / pl.col('shadow')).alias('shadow_zone_swing_percent'),
636
- (pl.col('chase_swing') / pl.col('chase')).alias('chase_zone_swing_percent'),
637
- (pl.col('waste_swing') / pl.col('waste')).alias('waste_zone_swing_percent'),
638
- (pl.col('heart_whiff') / pl.col('heart_swing')).alias('heart_zone_whiff_percent'),
639
- (pl.col('shadow_whiff') / pl.col('shadow_swing')).alias('shadow_zone_whiff_percent'),
640
- (pl.col('chase_whiff') / pl.col('chase_swing')).alias('chase_zone_whiff_percent'),
641
- (pl.col('waste_whiff') / pl.col('waste_swing')).alias('waste_zone_whiff_percent'),
642
- (pl.col('xwoba') / pl.col('xwoba_codes')).alias('xwoba_percent'),
643
- (pl.col('xwoba_contact') / pl.col('bip')).alias('xwoba_percent_contact'),
644
- (pl.col('pull') / pl.col('bip')).alias('pull_percent'),
645
- (pl.col('straight') / pl.col('bip')).alias('straight_percent'),
646
- (pl.col('oppo') / pl.col('bip')).alias('oppo_percent'),
647
- (pl.col('pull_fly_ball') / pl.col('fly_line_bip')).alias('pulled_fly_ball_percent'),
648
- ])
649
-
650
  return df_summ
 
1
+ import polars as pl
2
+ import numpy as np
3
+ import joblib
4
+
5
# Pre-trained estimators, loaded once at import time in the order listed.
# joblib.load unpickles its input, so these files must come from a trusted source.
(
    loaded_model,
    in_zone_model,
    attack_zone_model,
    xwoba_model,
    px_model,
    pz_model,
) = (
    joblib.load(model_path)
    for model_path in (
        'joblib_model/barrel_model.joblib',
        'joblib_model/in_zone_model_knn_20240410.joblib',
        'joblib_model/model_attack_zone.joblib',
        'joblib_model/xwoba_model.joblib',
        'joblib_model/linear_reg_model_x.joblib',
        'joblib_model/linear_reg_model_z.joblib',
    )
)
11
+
12
+
13
+ class df_update:
14
+ def __init__(self):
15
+ pass
16
+
17
+ def update(self, df_clone: pl.DataFrame):
18
+
19
+ df = df_clone.clone()
20
+ # Assuming px_model is defined and df is your DataFrame
21
+ hit_codes = ['single',
22
+ 'double','home_run', 'triple']
23
+
24
+ ab_codes = ['single', 'strikeout', 'field_out',
25
+ 'grounded_into_double_play', 'fielders_choice', 'force_out',
26
+ 'double', 'field_error', 'home_run', 'triple',
27
+ 'double_play',
28
+ 'fielders_choice_out', 'strikeout_double_play',
29
+ 'other_out','triple_play']
30
+
31
+
32
+ obp_true_codes = ['single', 'walk',
33
+ 'double','home_run', 'triple',
34
+ 'hit_by_pitch', 'intent_walk']
35
+
36
+ obp_codes = ['single', 'strikeout', 'walk', 'field_out',
37
+ 'grounded_into_double_play', 'fielders_choice', 'force_out',
38
+ 'double', 'sac_fly', 'field_error', 'home_run', 'triple',
39
+ 'hit_by_pitch', 'double_play', 'intent_walk',
40
+ 'fielders_choice_out', 'strikeout_double_play',
41
+ 'sac_fly_double_play',
42
+ 'other_out','triple_play']
43
+
44
+
45
+ contact_codes = ['In play, no out',
46
+ 'Foul', 'In play, out(s)',
47
+ 'In play, run(s)',
48
+ 'Foul Bunt']
49
+
50
+ bip_codes = ['In play, no out', 'In play, run(s)','In play, out(s)']
51
+
52
+
53
+ conditions_barrel = [
54
+ df['launch_speed'].is_null(),
55
+ (df['launch_speed'] * 1.5 - df['launch_angle'] >= 117) &
56
+ (df['launch_speed'] + df['launch_angle'] >= 124) &
57
+ (df['launch_speed'] >= 98) &
58
+ (df['launch_angle'] >= 4) & (df['launch_angle'] <= 50)
59
+ ]
60
+ choices_barrel = [False, True]
61
+
62
+ conditions_tb = [
63
+ (df['event_type'] == 'single'),
64
+ (df['event_type'] == 'double'),
65
+ (df['event_type'] == 'triple'),
66
+ (df['event_type'] == 'home_run')
67
+ ]
68
+ choices_tb = [1, 2, 3, 4]
69
+
70
+
71
+ conditions_woba = [
72
+ df['event_type'].is_in(['strikeout', 'field_out', 'sac_fly', 'force_out', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']),
73
+ df['event_type'] == 'walk',
74
+ df['event_type'] == 'hit_by_pitch',
75
+ df['event_type'] == 'single',
76
+ df['event_type'] == 'double',
77
+ df['event_type'] == 'triple',
78
+ df['event_type'] == 'home_run'
79
+ ]
80
+ choices_woba = [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
81
+
82
+ woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch', 'double', 'sac_fly', 'force_out', 'home_run', 'grounded_into_double_play', 'fielders_choice', 'field_error', 'triple', 'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'sac_fly_double_play', 'other_out']
83
+
84
+ pitch_cat = {'FA': 'Fastball',
85
+ 'FF': 'Fastball',
86
+ 'FT': 'Fastball',
87
+ 'FC': 'Fastball',
88
+ 'FS': 'Off-Speed',
89
+ 'FO': 'Off-Speed',
90
+ 'SI': 'Fastball',
91
+ 'ST': 'Breaking',
92
+ 'SL': 'Breaking',
93
+ 'CU': 'Breaking',
94
+ 'KC': 'Breaking',
95
+ 'SC': 'Off-Speed',
96
+ 'GY': 'Off-Speed',
97
+ 'SV': 'Breaking',
98
+ 'CS': 'Breaking',
99
+ 'CH': 'Off-Speed',
100
+ 'KN': 'Off-Speed',
101
+ 'EP': 'Breaking',
102
+ 'UN': None,
103
+ 'IN': None,
104
+ 'PO': None,
105
+ 'AB': None,
106
+ 'AS': None,
107
+ 'NP': None}
108
+
109
+
110
+ df = df.with_columns([
111
+ pl.when(df['type_ab'].is_not_null()).then(1).otherwise(0).alias('pa'),
112
+ pl.when(df['is_pitch']).then(1).otherwise(0).alias('pitches'),
113
+ pl.when(df['sz_top'] == 0).then(None).otherwise(df['sz_top']).alias('sz_top'),
114
+ pl.when(df['sz_bot'] == 0).then(None).otherwise(df['sz_bot']).alias('sz_bot'),
115
+ pl.when(df['zone'] > 0).then(df['zone'] < 10).otherwise(None).alias('in_zone'),
116
+ pl.Series(px_model.predict(df[['x']].fill_null(0).to_numpy())[:, 0]).alias('px_predict'),
117
+ pl.Series(pz_model.predict(df[['y']].fill_null(0).to_numpy())[:, 0]).alias('pz_predict'),
118
+
119
+ pl.when(df['event_type'].is_in(hit_codes)).then(True).otherwise(False).alias('hits'),
120
+ pl.when(df['event_type'].is_in(ab_codes)).then(True).otherwise(False).alias('ab'),
121
+ pl.when(df['event_type'].is_in(obp_true_codes)).then(True).otherwise(False).alias('on_base'),
122
+ pl.when(df['event_type'].is_in(obp_codes)).then(True).otherwise(False).alias('obp'),
123
+ pl.when(df['play_description'].is_in(bip_codes)).then(True).otherwise(False).alias('bip'),
124
+ pl.when(conditions_barrel[0]).then(choices_barrel[0]).when(conditions_barrel[1]).then(choices_barrel[1]).otherwise(None).alias('barrel'),
125
+ pl.when(df['launch_angle'].is_null()).then(False).when((df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)).then(True).otherwise(None).alias('sweet_spot'),
126
+ pl.when(df['launch_speed'].is_null()).then(False).when(df['launch_speed'] >= 94.5).then(True).otherwise(None).alias('hard_hit'),
127
+ pl.when(conditions_tb[0]).then(choices_tb[0]).when(conditions_tb[1]).then(choices_tb[1]).when(conditions_tb[2]).then(choices_tb[2]).when(conditions_tb[3]).then(choices_tb[3]).otherwise(None).alias('tb'),
128
+ pl.when(conditions_woba[0]).then(choices_woba[0]).when(conditions_woba[1]).then(choices_woba[1]).when(conditions_woba[2]).then(choices_woba[2]).when(conditions_woba[3]).then(choices_woba[3]).when(conditions_woba[4]).then(choices_woba[4]).when(conditions_woba[5]).then(choices_woba[5]).when(conditions_woba[6]).then(choices_woba[6]).otherwise(None).alias('woba'),
129
+ pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T')).then(1).otherwise(0).alias('whiffs'),
130
+ pl.when((df['play_code'] == 'S') | (df['play_code'] == 'W') | (df['play_code'] == 'T') | (df['play_code'] == 'C')).then(1).otherwise(0).alias('csw'),
131
+ pl.when(pl.col('is_swing').cast(pl.Boolean)).then(1).otherwise(0).alias('swings'),
132
+ pl.col('event_type').is_in(['strikeout','strikeout_double_play']).alias('k'),
133
+ pl.col('event_type').is_in(['walk', 'intent_walk']).alias('bb'),
134
+ pl.lit(None).alias('attack_zone'),
135
+ pl.lit(None).alias('woba_pred'),
136
+ pl.lit(None).alias('woba_pred_contact')
137
+
138
+ ])
139
+
140
+
141
+
142
+
143
+ df = df.with_columns([
144
+ pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('woba_codes'),
145
+ pl.when(df['event_type'].is_in(woba_codes)).then(1).otherwise(None).alias('xwoba_codes'),
146
+ pl.when((pl.col('tb') >= 0)).then(df['woba']).otherwise(None).alias('woba_contact'),
147
+
148
+
149
+ ])
150
+
151
+ df = df.with_columns([
152
+ pl.Series(in_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('in_zone_predict'),
153
+ pl.Series(attack_zone_model.predict(df[['px','pz','sz_top','sz_bot']].fill_null(0).to_numpy())[:]).alias('attack_zone_predict'),
154
+
155
+ ])
156
+
157
+ df = df.with_columns([
158
+ pl.when(pl.col('px').is_null()).then(pl.col('px_predict')).otherwise(pl.col('px')).alias('px'),
159
+ pl.when(pl.col('pz').is_null()).then(pl.col('pz_predict')).otherwise(pl.col('pz')).alias('pz'),
160
+ pl.when(pl.col('in_zone').is_null()).then(pl.col('in_zone_predict')).otherwise(pl.col('in_zone')).alias('in_zone_final'),
161
+ ])
162
+
163
+ df = df.with_columns([
164
+ pl.when(df['launch_speed'].is_null()).then(None).otherwise(df['barrel']).alias('barrel'),
165
+ pl.lit('average').alias('average'),
166
+ pl.when(pl.col('in_zone_final') == False).then(True).otherwise(False).alias('out_zone'),
167
+ pl.when((pl.col('in_zone_final') == True) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('zone_swing'),
168
+ pl.when((pl.col('in_zone_final') == True) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('zone_contact'),
169
+ pl.when((pl.col('in_zone_final') == False) & (pl.col('swings') == 1)).then(True).otherwise(False).alias('ozone_swing'),
170
+ pl.when((pl.col('in_zone_final') == False) & (pl.col('swings') == 1) & (pl.col('whiffs') == 0)).then(True).otherwise(False).alias('ozone_contact'),
171
+ pl.when(pl.col('event_type').str.contains('strikeout')).then(True).otherwise(False).alias('k'),
172
+ pl.when(pl.col('event_type').is_in(['walk', 'intent_walk'])).then(True).otherwise(False).alias('bb'),
173
+ pl.when(pl.col('attack_zone').is_null()).then(pl.col('attack_zone_predict')).otherwise(pl.col('attack_zone')).alias('attack_zone_final'),
174
+
175
+
176
+ ])
177
+
178
+ df = df.with_columns([
179
+ (df['k'].cast(pl.Float32) - df['bb'].cast(pl.Float32)).alias('k_minus_bb'),
180
+ (df['bb'].cast(pl.Float32) - df['k'].cast(pl.Float32)).alias('bb_minus_k'),
181
+ (df['launch_speed'] > 0).alias('bip_div'),
182
+ (df['attack_zone_final'] == 0).alias('heart'),
183
+ (df['attack_zone_final'] == 1).alias('shadow'),
184
+ (df['attack_zone_final'] == 2).alias('chase'),
185
+ (df['attack_zone_final'] == 3).alias('waste'),
186
+ ((df['attack_zone_final'] == 0) & (df['swings'] == 1)).alias('heart_swing'),
187
+ ((df['attack_zone_final'] == 1) & (df['swings'] == 1)).alias('shadow_swing'),
188
+ ((df['attack_zone_final'] == 2) & (df['swings'] == 1)).alias('chase_swing'),
189
+ ((df['attack_zone_final'] == 3) & (df['swings'] == 1)).alias('waste_swing'),
190
+ ((df['attack_zone_final'] == 0) & (df['whiffs'] == 1)).alias('heart_whiff'),
191
+ ((df['attack_zone_final'] == 1) & (df['whiffs'] == 1)).alias('shadow_whiff'),
192
+ ((df['attack_zone_final'] == 2) & (df['whiffs'] == 1)).alias('chase_whiff'),
193
+ ((df['attack_zone_final'] == 3) & (df['whiffs'] == 1)).alias('waste_whiff')
194
+ ])
195
+
196
+
197
+ [0, 0.689, 0.720, 0.881, 1.254, 1.589, 2.048]
198
+
199
+ df = df.with_columns([
200
+ pl.Series(
201
+ [sum(x) for x in xwoba_model.predict_proba(df[['launch_angle', 'launch_speed']].fill_null(0).to_numpy()[:]) * ([0, 0.881, 1.254, 1.589, 2.048])]
202
+ ).alias('woba_pred_predict')
203
+ ])
204
+
205
+ df = df.with_columns([
206
+ pl.when(pl.col('event_type').is_in(['walk'])).then(0.689)
207
+ .when(pl.col('event_type').is_in(['hit_by_pitch'])).then(0.720)
208
+ .when(pl.col('event_type').is_in(['strikeout', 'strikeout_double_play'])).then(0)
209
+ .otherwise(pl.col('woba_pred_predict')).alias('woba_pred_predict')
210
+ ])
211
+
212
+ df = df.with_columns([
213
+ pl.when(pl.col('woba_codes').is_null()).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred'),
214
+ pl.when(pl.col('bip')!=1).then(None).otherwise(pl.col('woba_pred_predict')).alias('woba_pred_contact'),
215
+ ])
216
+
217
+ df = df.with_columns([
218
+ pl.when(pl.col('trajectory').is_in(['bunt_popup'])).then(pl.lit('popup'))
219
+ .when(pl.col('trajectory').is_in(['bunt_grounder'])).then(pl.lit('ground_ball'))
220
+ .when(pl.col('trajectory').is_in(['bunt_line_drive'])).then(pl.lit('line_drive'))
221
+ .when(pl.col('trajectory').is_in([''])).then(pl.lit(None))
222
+ .otherwise(pl.col('trajectory')).alias('trajectory')
223
+ ])
224
+
225
+
226
+ # Create one-hot encoded columns for the trajectory column
227
+ dummy_df = df.select(pl.col('trajectory')).to_dummies()
228
+
229
+ # Rename the one-hot encoded columns
230
+ dummy_df = dummy_df.rename({
231
+ 'trajectory_fly_ball': 'trajectory_fly_ball',
232
+ 'trajectory_ground_ball': 'trajectory_ground_ball',
233
+ 'trajectory_line_drive': 'trajectory_line_drive',
234
+ 'trajectory_popup': 'trajectory_popup'
235
+ })
236
+
237
+ # Ensure the columns are present in the DataFrame
238
+ for col in ['trajectory_fly_ball', 'trajectory_ground_ball', 'trajectory_line_drive', 'trajectory_popup']:
239
+ if col not in dummy_df.columns:
240
+ dummy_df = dummy_df.with_columns(pl.lit(0).alias(col))
241
+
242
+ # Join the one-hot encoded columns back to the original DataFrame
243
+ df = df.hstack(dummy_df)
244
+
245
+ # Check if 'trajectory_null' column exists and drop it
246
+ if 'trajectory_null' in df.columns:
247
+ df = df.drop('trajectory_null')
248
+
249
+
250
+ pitch_cat = {'FA': None,
251
+ 'FF': 'Fastball',
252
+ 'FT': 'Fastball',
253
+ 'FC': 'Fastball',
254
+ 'FS': 'Off-Speed',
255
+ 'FO': 'Off-Speed',
256
+ 'SI': 'Fastball',
257
+ 'ST': 'Breaking',
258
+ 'SL': 'Breaking',
259
+ 'CU': 'Breaking',
260
+ 'KC': 'Breaking',
261
+ 'SC': 'Off-Speed',
262
+ 'GY': 'Off-Speed',
263
+ 'SV': 'Breaking',
264
+ 'CS': 'Breaking',
265
+ 'CH': 'Off-Speed',
266
+ 'KN': 'Off-Speed',
267
+ 'EP': 'Breaking',
268
+ 'UN': None,
269
+ 'IN': None,
270
+ 'PO': None,
271
+ 'AB': None,
272
+ 'AS': None,
273
+ 'NP': None}
274
+ df = df.with_columns(
275
+ df["pitch_type"]
276
+ .replace(pitch_cat)
277
+ .fill_null("Unknown")
278
+ .alias("pitch_group")
279
+ )
280
+
281
+ df = df.with_columns([
282
+
283
+ (-(pl.col('vy0')**2 - (2 * pl.col('ay') * (pl.col('y0') - 17/12)))**0.5).alias('vy_f'),
284
+ ])
285
+
286
+ df = df.with_columns([
287
+ ((pl.col('vy_f') - pl.col('vy0')) / pl.col('ay')).alias('t'),
288
+ ])
289
+
290
+ df = df.with_columns([
291
+ (pl.col('vz0') + (pl.col('az') * pl.col('t'))).alias('vz_f'),
292
+ (pl.col('vx0') + (pl.col('ax') * pl.col('t'))).alias('vx_f')
293
+ ])
294
+
295
+ df = df.with_columns([
296
+ (-np.arctan(pl.col('vz_f') / pl.col('vy_f')) * (180 / np.pi)).alias('vaa'),
297
+ (-np.arctan(pl.col('vx_f') / pl.col('vy_f')) * (180 / np.pi)).alias('haa')
298
+ ])
299
+
300
+ # Mirror horizontal break for left-handed pitchers
301
+ df = df.with_columns(
302
+ pl.when(pl.col('pitcher_hand') == 'L')
303
+ .then(-pl.col('ax'))
304
+ .otherwise(pl.col('ax'))
305
+ .alias('ax')
306
+ )
307
+
308
+ # Mirror horizontal break for left-handed pitchers
309
+ df = df.with_columns(
310
+ pl.when(pl.col('pitcher_hand') == 'L')
311
+ .then(-pl.col('hb'))
312
+ .otherwise(pl.col('hb'))
313
+ .alias('hb')
314
+ )
315
+
316
+ # Mirror horizontal release point for left-handed pitchers
317
+ df = df.with_columns(
318
+ pl.when(pl.col('pitcher_hand') == 'L')
319
+ .then(pl.col('x0'))
320
+ .otherwise(-pl.col('x0'))
321
+ .alias('x0')
322
+ )
323
+
324
+ df = df.with_columns([
325
+ pl.when(df['swings'].is_null()).then(None).otherwise(df['swings']).alias('is_swing'),
326
+ pl.when(df['bip'].is_null()).then(None).otherwise(df['bip']).alias('is_bip')])
327
+
328
+
329
+
330
+
331
+ df = df.with_columns([
332
+ (np.arctan((pl.col("hit_x")*-1 + 125.42) / (198.27 - pl.col("hit_y"))) * 180 / np.pi * 0.75).alias("spray_angle")
333
+ ])
334
+
335
+ df = df.with_columns([
336
+ pl.when(pl.col("batter_hand") == "L")
337
+ .then(-pl.col("spray_angle"))
338
+ .otherwise(pl.col("spray_angle"))
339
+ .alias("adj_spray_angle")
340
+ ]).drop("spray_angle")
341
+
342
+
343
+ df = df.with_columns([
344
+ pl.when(pl.col("adj_spray_angle").is_not_null() & (pl.col("adj_spray_angle") < -15))
345
+ .then(pl.lit("oppo"))
346
+ .when(pl.col("adj_spray_angle").is_not_null() & (pl.col("adj_spray_angle") > 15))
347
+ .then(pl.lit("pull"))
348
+ .when(pl.col("adj_spray_angle").is_not_null())
349
+ .then(pl.lit("straight"))
350
+ .otherwise(None) # Keep null if adj_spray_angle is null
351
+ .alias("hit_direction")
352
+ ])
353
+
354
+ df = df.with_columns([
355
+ pl.when(pl.col("hit_direction") == "oppo").then(1).otherwise(None).alias("oppo"),
356
+ pl.when(pl.col("hit_direction") == "pull").then(1).otherwise(None).alias("pull"),
357
+ pl.when(pl.col("hit_direction") == "straight").then(1).otherwise(None).alias("straight")
358
+ ])
359
+
360
+ df = df.with_columns([
361
+ pl.when(pl.col("event_type") == "single").then(1).otherwise(0).alias("single"),
362
+ pl.when(pl.col("event_type") == "double").then(1).otherwise(0).alias("double"),
363
+ pl.when(pl.col("event_type") == "triple").then(1).otherwise(0).alias("triple"),
364
+ pl.when(pl.col("event_type") == "home_run").then(1).otherwise(0).alias("home_run")
365
+ ])
366
+
367
+
368
+
369
+
370
+
371
+ return df
372
+
373
+ # Assuming df is your Polars DataFrame
374
+ def update_summary(self, df: pl.DataFrame, pitcher: bool = True) -> pl.DataFrame:
375
+ """
376
+ Update summary statistics for pitchers or batters.
377
+
378
+ Parameters:
379
+ df (pl.DataFrame): The input Polars DataFrame containing player statistics.
380
+ pitcher (bool): A flag indicating whether to calculate statistics for pitchers (True) or batters (False).
381
+
382
+ Returns:
383
+ pl.DataFrame: A Polars DataFrame with aggregated and calculated summary statistics.
384
+ """
385
+
386
+ # Determine the position based on the pitcher flag
387
+ if pitcher:
388
+ position = 'pitcher'
389
+ else:
390
+ position = 'batter'
391
+
392
+ # Group by position_id and position_name, then aggregate various statistics
393
+ df_summ = df.group_by([f'{position}_id', f'{position}_name']).agg([
394
+ pl.col('pa').sum().alias('pa'),
395
+ pl.col('ab').sum().alias('ab'),
396
+ pl.col('obp').sum().alias('obp_pa'),
397
+ pl.col('hits').sum().alias('hits'),
398
+ pl.col('on_base').sum().alias('on_base'),
399
+ pl.col('k').sum().alias('k'),
400
+ pl.col('bb').sum().alias('bb'),
401
+ pl.col('k_minus_bb').sum().alias('k_minus_bb'),
402
+ pl.col('bb_minus_k').sum().alias('bb_minus_k'),
403
+ pl.col('csw').sum().alias('csw'),
404
+ pl.col('bip').sum().alias('bip'),
405
+ pl.col('bip_div').sum().alias('bip_div'),
406
+ pl.col('tb').sum().alias('tb'),
407
+ pl.col('woba').sum().alias('woba'),
408
+ pl.col('woba_contact').sum().alias('woba_contact'),
409
+ pl.col('woba_pred').sum().alias('xwoba'),
410
+ pl.col('woba_pred_contact').sum().alias('xwoba_contact'),
411
+ pl.col('woba_codes').sum().alias('woba_codes'),
412
+ pl.col('xwoba_codes').sum().alias('xwoba_codes'),
413
+ pl.col('hard_hit').sum().alias('hard_hit'),
414
+ pl.col('barrel').sum().alias('barrel'),
415
+ pl.col('sweet_spot').sum().alias('sweet_spot'),
416
+ pl.col('launch_speed').max().alias('max_launch_speed'),
417
+ pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
418
+ pl.col('launch_speed').mean().alias('launch_speed'),
419
+ pl.col('launch_angle').mean().alias('launch_angle'),
420
+ pl.col('is_pitch').sum().alias('pitches'),
421
+ pl.col('swings').sum().alias('swings'),
422
+ pl.col('in_zone').sum().alias('in_zone'),
423
+ pl.col('out_zone').sum().alias('out_zone'),
424
+ pl.col('whiffs').sum().alias('whiffs'),
425
+ pl.col('zone_swing').sum().alias('zone_swing'),
426
+ pl.col('zone_contact').sum().alias('zone_contact'),
427
+ pl.col('ozone_swing').sum().alias('ozone_swing'),
428
+ pl.col('ozone_contact').sum().alias('ozone_contact'),
429
+ pl.col('trajectory_ground_ball').sum().alias('ground_ball'),
430
+ pl.col('trajectory_line_drive').sum().alias('line_drive'),
431
+ pl.col('trajectory_fly_ball').sum().alias('fly_ball'),
432
+ pl.col('trajectory_popup').sum().alias('pop_up'),
433
+ pl.col('attack_zone').count().alias('attack_zone'),
434
+ pl.col('heart').sum().alias('heart'),
435
+ pl.col('shadow').sum().alias('shadow'),
436
+ pl.col('chase').sum().alias('chase'),
437
+ pl.col('waste').sum().alias('waste'),
438
+ pl.col('heart_swing').sum().alias('heart_swing'),
439
+ pl.col('shadow_swing').sum().alias('shadow_swing'),
440
+ pl.col('chase_swing').sum().alias('chase_swing'),
441
+ pl.col('waste_swing').sum().alias('waste_swing'),
442
+ pl.col('heart_whiff').sum().alias('heart_whiff'),
443
+ pl.col('shadow_whiff').sum().alias('shadow_whiff'),
444
+ pl.col('chase_whiff').sum().alias('chase_whiff'),
445
+ pl.col('waste_whiff').sum().alias('waste_whiff'),
446
+ pl.col('pull').sum().alias('pull'),
447
+ pl.col('straight').sum().alias('straight'),
448
+ pl.col('oppo').sum().alias('oppo'),
449
+ ((pl.col('trajectory_fly_ball') == 1) | (pl.col('trajectory_line_drive') == 1)).sum().alias('fly_line_bip'),
450
+ (pl.col('pull') & ((pl.col('trajectory_fly_ball') == 1) | (pl.col('trajectory_line_drive') == 1))).sum().alias('pull_fly_ball'),
451
+ pl.col('single').sum().alias('single'),
452
+ pl.col('double').sum().alias('double'),
453
+ pl.col('triple').sum().alias('triple'),
454
+ pl.col('home_run').sum().alias('home_run'),
455
+ (pl.col('extension').mean()).alias('extension'),
456
+ (pl.col('start_speed').filter(pl.col('pitch_type').is_in(['FF','SI'])).mean().alias('avg_start_speed_ff')),
457
+
458
+
459
+
460
+ ])
461
+
462
+ # Add calculated columns to the summary DataFrame
463
+ df_summ = df_summ.with_columns([
464
+ (pl.col('hits') / pl.col('ab')).alias('avg'),
465
+ (pl.col('on_base') / pl.col('obp_pa')).alias('obp'),
466
+ (pl.col('tb') / pl.col('ab')).alias('slg'),
467
+ (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
468
+ (pl.col('k') / pl.col('pa')).alias('k_percent'),
469
+ (pl.col('bb') / pl.col('pa')).alias('bb_percent'),
470
+ (pl.col('k_minus_bb') / pl.col('pa')).alias('k_minus_bb_percent'),
471
+ (pl.col('bb_minus_k') / pl.col('pa')).alias('bb_minus_k_percent'),
472
+ (pl.col('bb') / pl.col('k')).alias('bb_over_k_percent'),
473
+ (pl.col('csw') / pl.col('pitches')).alias('csw_percent'),
474
+ (pl.col('sweet_spot') / pl.col('bip_div')).alias('sweet_spot_percent'),
475
+ (pl.col('woba') / pl.col('woba_codes')).alias('woba_percent'),
476
+ (pl.col('woba_contact') / pl.col('bip')).alias('woba_percent_contact'),
477
+ (pl.col('hard_hit') / pl.col('bip_div')).alias('hard_hit_percent'),
478
+ (pl.col('barrel') / pl.col('bip_div')).alias('barrel_percent'),
479
+ (pl.col('zone_contact') / pl.col('zone_swing')).alias('zone_contact_percent'),
480
+ (pl.col('zone_swing') / pl.col('in_zone')).alias('zone_swing_percent'),
481
+ (pl.col('in_zone') / pl.col('pitches')).alias('zone_percent'),
482
+ (pl.col('ozone_swing') / (pl.col('out_zone'))).alias('chase_percent'),
483
+ (pl.col('ozone_contact') / pl.col('ozone_swing')).alias('chase_contact'),
484
+ (pl.col('swings') / pl.col('pitches')).alias('swing_percent'),
485
+ (pl.col('whiffs') / pl.col('swings')).alias('whiff_rate'),
486
+ (pl.col('whiffs') / pl.col('pitches')).alias('swstr_rate'),
487
+ (pl.col('ground_ball') / pl.col('bip')).alias('ground_ball_percent'),
488
+ (pl.col('line_drive') / pl.col('bip')).alias('line_drive_percent'),
489
+ (pl.col('fly_ball') / pl.col('bip')).alias('fly_ball_percent'),
490
+ (pl.col('pop_up') / pl.col('bip')).alias('pop_up_percent'),
491
+ (pl.col('heart') / pl.col('attack_zone')).alias('heart_zone_percent'),
492
+ (pl.col('shadow') / pl.col('attack_zone')).alias('shadow_zone_percent'),
493
+ (pl.col('chase') / pl.col('attack_zone')).alias('chase_zone_percent'),
494
+ (pl.col('waste') / pl.col('attack_zone')).alias('waste_zone_percent'),
495
+ (pl.col('heart_swing') / pl.col('heart')).alias('heart_zone_swing_percent'),
496
+ (pl.col('shadow_swing') / pl.col('shadow')).alias('shadow_zone_swing_percent'),
497
+ (pl.col('chase_swing') / pl.col('chase')).alias('chase_zone_swing_percent'),
498
+ (pl.col('waste_swing') / pl.col('waste')).alias('waste_zone_swing_percent'),
499
+ (pl.col('heart_whiff') / pl.col('heart_swing')).alias('heart_zone_whiff_percent'),
500
+ (pl.col('shadow_whiff') / pl.col('shadow_swing')).alias('shadow_zone_whiff_percent'),
501
+ (pl.col('chase_whiff') / pl.col('chase_swing')).alias('chase_zone_whiff_percent'),
502
+ (pl.col('waste_whiff') / pl.col('waste_swing')).alias('waste_zone_whiff_percent'),
503
+ (pl.col('xwoba') / pl.col('xwoba_codes')).alias('xwoba_percent'),
504
+ (pl.col('xwoba_contact') / pl.col('bip')).alias('xwoba_percent_contact'),
505
+ (pl.col('pull') / pl.col('bip')).alias('pull_percent'),
506
+ (pl.col('straight') / pl.col('bip')).alias('straight_percent'),
507
+ (pl.col('oppo') / pl.col('bip')).alias('oppo_percent'),
508
+ (pl.col('pull_fly_ball') / pl.col('fly_line_bip')).alias('pulled_fly_ball_percent'),
509
+
510
+ ])
511
+
512
+ return df_summ
513
+
514
+
515
+
516
+
517
+
518
+
519
+ # Assuming df is your Polars DataFrame
520
def update_summary_select(self, df: pl.DataFrame, selection: list) -> pl.DataFrame:
    """
    Aggregate rows of ``df`` into one summary row per group and derive rate stats.

    Parameters:
    df (pl.DataFrame): The input Polars DataFrame containing the per-row counting
        columns summed below (e.g. 'pa', 'ab', 'swings', 'whiffs', 'in_zone', 'out_zone').
    selection (list): Column names handed to ``group_by``; one output row is
        produced for each distinct combination of these columns.

    Returns:
    pl.DataFrame: A Polars DataFrame with the grouping columns, the aggregated
        totals, and the calculated rate columns.
    """

    # Group by the caller-supplied columns, then aggregate various statistics
    df_summ = df.group_by(selection).agg([
        pl.col('pa').sum().alias('pa'),
        pl.col('ab').sum().alias('ab'),
        pl.col('obp').sum().alias('obp_pa'),  # summed 'obp' is used as the OBP denominator below
        pl.col('hits').sum().alias('hits'),
        pl.col('on_base').sum().alias('on_base'),
        pl.col('k').sum().alias('k'),
        pl.col('bb').sum().alias('bb'),
        pl.col('k_minus_bb').sum().alias('k_minus_bb'),
        pl.col('bb_minus_k').sum().alias('bb_minus_k'),
        pl.col('csw').sum().alias('csw'),
        pl.col('bip').sum().alias('bip'),
        pl.col('bip_div').sum().alias('bip_div'),
        pl.col('tb').sum().alias('tb'),
        pl.col('woba').sum().alias('woba'),
        pl.col('woba_contact').sum().alias('woba_contact'),
        pl.col('woba_pred').sum().alias('xwoba'),
        pl.col('woba_pred_contact').sum().alias('xwoba_contact'),
        pl.col('woba_codes').sum().alias('woba_codes'),
        pl.col('xwoba_codes').sum().alias('xwoba_codes'),
        pl.col('hard_hit').sum().alias('hard_hit'),
        pl.col('barrel').sum().alias('barrel'),
        pl.col('sweet_spot').sum().alias('sweet_spot'),
        # launch_speed is summarised three ways: max, 0.90 quantile and mean
        pl.col('launch_speed').max().alias('max_launch_speed'),
        pl.col('launch_speed').quantile(0.90).alias('launch_speed_90'),
        pl.col('launch_speed').mean().alias('launch_speed'),
        pl.col('launch_angle').mean().alias('launch_angle'),
        pl.col('is_pitch').sum().alias('pitches'),
        pl.col('swings').sum().alias('swings'),
        pl.col('in_zone').sum().alias('in_zone'),
        pl.col('out_zone').sum().alias('out_zone'),
        pl.col('whiffs').sum().alias('whiffs'),
        pl.col('zone_swing').sum().alias('zone_swing'),
        pl.col('zone_contact').sum().alias('zone_contact'),
        pl.col('ozone_swing').sum().alias('ozone_swing'),
        pl.col('ozone_contact').sum().alias('ozone_contact'),
        pl.col('trajectory_ground_ball').sum().alias('ground_ball'),
        pl.col('trajectory_line_drive').sum().alias('line_drive'),
        pl.col('trajectory_fly_ball').sum().alias('fly_ball'),
        pl.col('trajectory_popup').sum().alias('pop_up'),
        # count() (not sum): number of non-null attack_zone values, used as the zone-share denominator
        pl.col('attack_zone').count().alias('attack_zone'),
        pl.col('heart').sum().alias('heart'),
        pl.col('shadow').sum().alias('shadow'),
        pl.col('chase').sum().alias('chase'),
        pl.col('waste').sum().alias('waste'),
        pl.col('heart_swing').sum().alias('heart_swing'),
        pl.col('shadow_swing').sum().alias('shadow_swing'),
        pl.col('chase_swing').sum().alias('chase_swing'),
        pl.col('waste_swing').sum().alias('waste_swing'),
        pl.col('heart_whiff').sum().alias('heart_whiff'),
        pl.col('shadow_whiff').sum().alias('shadow_whiff'),
        pl.col('chase_whiff').sum().alias('chase_whiff'),
        pl.col('waste_whiff').sum().alias('waste_whiff'),
        pl.col('pull').sum().alias('pull'),
        pl.col('straight').sum().alias('straight'),
        pl.col('oppo').sum().alias('oppo'),
        # Rows flagged as either a fly ball or a line drive
        ((pl.col('trajectory_fly_ball') == 1) | (pl.col('trajectory_line_drive') == 1)).sum().alias('fly_line_bip'),
        # Rows that are pulled AND a fly ball / line drive
        # NOTE(review): the `&` assumes 'pull' is a boolean column - confirm upstream
        (pl.col('pull') & ((pl.col('trajectory_fly_ball') == 1) | (pl.col('trajectory_line_drive') == 1))).sum().alias('pull_fly_ball'),
        pl.col('single').sum().alias('single'),
        pl.col('double').sum().alias('double'),
        pl.col('triple').sum().alias('triple'),
        pl.col('home_run').sum().alias('home_run'),
        (pl.col('extension').mean()).alias('extension'),
        # Mean start_speed restricted to pitch types FF, SI and FC
        (pl.col('start_speed').filter(pl.col('pitch_type').is_in(['FF','SI','FC'])).mean().alias('avg_start_speed_ff')),
    ])

    # Add calculated columns to the summary DataFrame
    df_summ = df_summ.with_columns([
        (pl.col('hits') / pl.col('ab')).alias('avg'),
        (pl.col('on_base') / pl.col('obp_pa')).alias('obp'),
        (pl.col('tb') / pl.col('ab')).alias('slg'),
        (pl.col('on_base') / pl.col('obp_pa') + pl.col('tb') / pl.col('ab')).alias('ops'),
        (pl.col('k') / pl.col('pa')).alias('k_percent'),
        (pl.col('bb') / pl.col('pa')).alias('bb_percent'),
        (pl.col('k_minus_bb') / pl.col('pa')).alias('k_minus_bb_percent'),
        (pl.col('bb_minus_k') / pl.col('pa')).alias('bb_minus_k_percent'),
        (pl.col('bb') / pl.col('k')).alias('bb_over_k_percent'),
        (pl.col('csw') / pl.col('pitches')).alias('csw_percent'),
        (pl.col('sweet_spot') / pl.col('bip_div')).alias('sweet_spot_percent'),
        (pl.col('woba') / pl.col('woba_codes')).alias('woba_percent'),
        (pl.col('woba_contact') / pl.col('bip')).alias('woba_percent_contact'),
        (pl.col('hard_hit') / pl.col('bip_div')).alias('hard_hit_percent'),
        (pl.col('barrel') / pl.col('bip_div')).alias('barrel_percent'),
        (pl.col('zone_contact') / pl.col('zone_swing')).alias('zone_contact_percent'),
        (pl.col('zone_swing') / pl.col('in_zone')).alias('zone_swing_percent'),
        (pl.col('in_zone') / pl.col('pitches')).alias('zone_percent'),
        # Divide by the aggregated out_zone total (as the sibling summary does) rather than
        # pitches - in_zone, which would also count any pitch flagged neither in nor out of zone.
        (pl.col('ozone_swing') / (pl.col('out_zone'))).alias('chase_percent'),
        (pl.col('ozone_contact') / pl.col('ozone_swing')).alias('chase_contact'),
        (pl.col('swings') / pl.col('pitches')).alias('swing_percent'),
        (pl.col('whiffs') / pl.col('swings')).alias('whiff_rate'),
        (pl.col('whiffs') / pl.col('pitches')).alias('swstr_rate'),
        (pl.col('ground_ball') / pl.col('bip')).alias('ground_ball_percent'),
        (pl.col('line_drive') / pl.col('bip')).alias('line_drive_percent'),
        (pl.col('fly_ball') / pl.col('bip')).alias('fly_ball_percent'),
        (pl.col('pop_up') / pl.col('bip')).alias('pop_up_percent'),
        (pl.col('heart') / pl.col('attack_zone')).alias('heart_zone_percent'),
        (pl.col('shadow') / pl.col('attack_zone')).alias('shadow_zone_percent'),
        (pl.col('chase') / pl.col('attack_zone')).alias('chase_zone_percent'),
        (pl.col('waste') / pl.col('attack_zone')).alias('waste_zone_percent'),
        (pl.col('heart_swing') / pl.col('heart')).alias('heart_zone_swing_percent'),
        (pl.col('shadow_swing') / pl.col('shadow')).alias('shadow_zone_swing_percent'),
        (pl.col('chase_swing') / pl.col('chase')).alias('chase_zone_swing_percent'),
        (pl.col('waste_swing') / pl.col('waste')).alias('waste_zone_swing_percent'),
        (pl.col('heart_whiff') / pl.col('heart_swing')).alias('heart_zone_whiff_percent'),
        (pl.col('shadow_whiff') / pl.col('shadow_swing')).alias('shadow_zone_whiff_percent'),
        (pl.col('chase_whiff') / pl.col('chase_swing')).alias('chase_zone_whiff_percent'),
        (pl.col('waste_whiff') / pl.col('waste_swing')).alias('waste_zone_whiff_percent'),
        (pl.col('xwoba') / pl.col('xwoba_codes')).alias('xwoba_percent'),
        (pl.col('xwoba_contact') / pl.col('bip')).alias('xwoba_percent_contact'),
        (pl.col('pull') / pl.col('bip')).alias('pull_percent'),
        (pl.col('straight') / pl.col('bip')).alias('straight_percent'),
        (pl.col('oppo') / pl.col('bip')).alias('oppo_percent'),
        (pl.col('pull_fly_ball') / pl.col('fly_line_bip')).alias('pulled_fly_ball_percent'),
    ])

    return df_summ
stuff_model/feature_engineering.py CHANGED
@@ -82,8 +82,8 @@ def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
82
  # Group by pitcher_id and year, then aggregate to calculate average speed and usage percentage
83
  df_agg = df_filtered.group_by(['pitcher_id', 'year', 'pitch_type']).agg([
84
  pl.col('start_speed').mean().alias('avg_fastball_speed'),
85
- pl.col('az').mean().alias('avg_fastball_az'),
86
- pl.col('ax').mean().alias('avg_fastball_ax'),
87
  pl.len().alias('count')
88
  ])
89
 
@@ -104,25 +104,25 @@ def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
104
 
105
  # If no fastball, use the fastest pitch for avg_fastball_az
106
  df = df.with_columns(
107
- pl.when(pl.col('avg_fastball_az').is_null())
108
- .then(pl.col('az').max().over('pitcher_id'))
109
- .otherwise(pl.col('avg_fastball_az'))
110
- .alias('avg_fastball_az')
111
  )
112
 
113
  # If no fastball, use the fastest pitch for avg_fastball_ax
114
  df = df.with_columns(
115
- pl.when(pl.col('avg_fastball_ax').is_null())
116
- .then(pl.col('ax').max().over('pitcher_id'))
117
- .otherwise(pl.col('avg_fastball_ax'))
118
- .alias('avg_fastball_ax')
119
  )
120
 
121
  # Calculate pitch differentials
122
  df = df.with_columns(
123
  (pl.col('start_speed') - pl.col('avg_fastball_speed')).alias('speed_diff'),
124
- (pl.col('az') - pl.col('avg_fastball_az')).alias('az_diff'),
125
- (pl.col('ax') - pl.col('avg_fastball_ax')).abs().alias('ax_diff')
126
  )
127
 
128
  # Cast the year column to integer type
 
82
  # Group by pitcher_id and year, then aggregate to calculate average speed and usage percentage
83
  df_agg = df_filtered.group_by(['pitcher_id', 'year', 'pitch_type']).agg([
84
  pl.col('start_speed').mean().alias('avg_fastball_speed'),
85
+ pl.col('ivb').mean().alias('avg_fastball_ivb'),
86
+ pl.col('hb').mean().alias('avg_fastball_hb'),
87
  pl.len().alias('count')
88
  ])
89
 
 
104
 
105
  # If no fastball average is available, fall back to the pitcher's maximum ivb for avg_fastball_ivb
106
  df = df.with_columns(
107
+ pl.when(pl.col('avg_fastball_ivb').is_null())
108
+ .then(pl.col('ivb').max().over('pitcher_id'))
109
+ .otherwise(pl.col('avg_fastball_ivb'))
110
+ .alias('avg_fastball_ivb')
111
  )
112
 
113
  # If no fastball average is available, fall back to the pitcher's maximum hb for avg_fastball_hb
114
  df = df.with_columns(
115
+ pl.when(pl.col('avg_fastball_hb').is_null())
116
+ .then(pl.col('hb').max().over('pitcher_id'))
117
+ .otherwise(pl.col('avg_fastball_hb'))
118
+ .alias('avg_fastball_hb')
119
  )
120
 
121
  # Calculate pitch differentials
122
  df = df.with_columns(
123
  (pl.col('start_speed') - pl.col('avg_fastball_speed')).alias('speed_diff'),
124
+ (pl.col('ivb') - pl.col('avg_fastball_ivb')).alias('ivb_diff'),
125
+ (pl.col('hb') - pl.col('avg_fastball_hb')).abs().alias('hb_diff')
126
  )
127
 
128
  # Cast the year column to integer type
stuff_model/pitcher_arm_angles_2024.csv CHANGED
The diff for this file is too large to render. See raw diff
 
stuff_model/pitcher_arm_angles_2025.csv CHANGED
The diff for this file is too large to render. See raw diff
 
stuff_model/stuff_apply.py CHANGED
@@ -1,57 +1,57 @@
1
- import polars as pl
2
- import joblib
3
-
4
- model = joblib.load('stuff_model/lgbm_model_2020_2024.joblib')
5
- # Read the values from the text file
6
- with open('stuff_model/target_stats.txt', 'r') as file:
7
- lines = file.readlines()
8
- target_mean = float(lines[0].strip())
9
- target_std = float(lines[1].strip())
10
-
11
- # Define the features to be used for training
12
- features = ['start_speed',
13
- 'spin_rate',
14
- 'extension',
15
- 'az',
16
- 'ax',
17
- 'x0',
18
- 'z0',
19
- 'speed_diff',
20
- 'az_diff',
21
- 'ax_diff']
22
-
23
-
24
- def stuff_apply(df:pl.DataFrame) -> pl.DataFrame:
25
- # Filter the dataframe to include only the rows for the year 2024 and drop rows with null values in the specified features and target column
26
- # df_test = df.drop_nulls(subset=features)
27
- df_test = df.clone()
28
-
29
- # Predict the target values for the 2024 data using the trained model
30
- df_test = df_test.with_columns(
31
- pl.Series(name="target", values=model.predict(df_test[features].to_numpy()))
32
- )
33
- # Standardize the target column to create a z-score
34
- df_test = df_test.with_columns(
35
- ((pl.col('target') - target_mean) / target_std).alias('target_zscore')
36
- )
37
-
38
- # Convert the z-score to tj_stuff_plus
39
- df_test = df_test.with_columns(
40
- (100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')
41
- )
42
-
43
- df_pitch_types = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')
44
-
45
- # Join the pitch type statistics with the main DataFrame based on pitch_type
46
- df_pitch_all = df_test.join(df_pitch_types, left_on='pitch_type', right_on='pitch_type')
47
-
48
- # Normalize pitch_grade values to a range between -0.5 and 0.5 based on the percentiles
49
- df_pitch_all = df_pitch_all.with_columns(
50
- ((pl.col('tj_stuff_plus') - pl.col('mean')) / pl.col('std')).alias('pitch_grade')
51
- )
52
-
53
- # Scale the pitch_grade values to a range between 20 and 80
54
- df_pitch_all = df_pitch_all.with_columns(
55
- (pl.col('pitch_grade') * 10 + 50).clip(20, 80)
56
- )
57
  return df_pitch_all
 
1
+ import polars as pl
2
+ import joblib
3
+
4
# Load the serialized model once at import time; stuff_apply() below calls model.predict on it
model = joblib.load('stuff_model/stuff_model.joblib')
# Read the two standardization constants from the text file:
# line 1 is parsed as target_mean, line 2 as target_std
with open('stuff_model/target_stats.txt', 'r') as file:
    lines = file.readlines()
    target_mean = float(lines[0].strip())
    target_std = float(lines[1].strip())

# Feature columns, in the order they are passed to model.predict in stuff_apply()
# NOTE(review): presumably this must match the column order the model was trained with - confirm
features = ['start_speed',
            'spin_rate',
            'extension',
            'ivb',
            'hb',
            'x0',
            'z0',
            'speed_diff',
            'ivb_diff',
            'hb_diff']
22
+
23
+
24
def stuff_apply(df: pl.DataFrame) -> pl.DataFrame:
    """
    Score every row of ``df`` with the module-level model and attach stuff grades.

    Parameters:
    df (pl.DataFrame): Input rows; must contain every column named in ``features``
        plus 'pitch_type'.

    Returns:
    pl.DataFrame: The joined frame with 'target', 'target_zscore', 'tj_stuff_plus'
        and 'pitch_grade' columns added. Rows whose pitch_type has no match in
        'stuff_model/tj_stuff_plus_pitch.csv' are absent (default join).
    """
    # Work on a copy of the input; rows with null feature values are not dropped
    scored = df.clone()

    # Raw model output for every row, using the feature columns in their declared order
    raw_prediction = model.predict(scored[features].to_numpy())

    # Each derived column depends on the previous one, so they are added in separate steps:
    #   target        -> raw prediction
    #   target_zscore -> standardized with target_mean / target_std
    #   tj_stuff_plus -> 100 minus 10 points per z-score unit
    scored = (
        scored
        .with_columns(pl.Series(name="target", values=raw_prediction))
        .with_columns(((pl.col('target') - target_mean) / target_std).alias('target_zscore'))
        .with_columns((100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus'))
    )

    # Per-pitch-type reference table
    # NOTE(review): presumably supplies the 'mean' and 'std' columns used below - confirm against the CSV
    pitch_baselines = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')

    # Attach the pitch-type statistics to each scored row
    graded = scored.join(pitch_baselines, left_on='pitch_type', right_on='pitch_type')

    graded = graded.with_columns(
        # z-score of tj_stuff_plus relative to its pitch type's mean/std
        ((pl.col('tj_stuff_plus') - pl.col('mean')) / pl.col('std')).alias('pitch_grade')
    ).with_columns(
        # Rescale to a 50-centered grade clipped to [20, 80]; the expression has no alias,
        # so it keeps the name 'pitch_grade' and overwrites the z-score column
        (pl.col('pitch_grade') * 10 + 50).clip(20, 80)
    )
    return graded
stuff_model/stuff_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a13ce41054747c57d91a025e4ee2375a806f01431e686e0f30691fc0244ebae9
3
+ size 3104666
stuff_model/target_stats.txt CHANGED
@@ -1,2 +1,2 @@
1
- 0.0036819646648982335
2
- 0.006743548730130907
 
1
+ 0.0027508874807158695
2
+ 0.0060355533615784425