Update stuff_model/stuff_apply.py: divide by std/2 instead of std when computing pitch_grade
Browse files- stuff_model/stuff_apply.py +56 -56
stuff_model/stuff_apply.py
CHANGED
|
@@ -1,57 +1,57 @@
|
|
| 1 |
-
import polars as pl
|
| 2 |
-
import joblib
|
| 3 |
-
|
| 4 |
-
model = joblib.load('stuff_model/stuff_model.joblib')
|
| 5 |
-
# Read the target mean (first line) and target standard deviation (second line) from the stats file
|
| 6 |
-
with open('stuff_model/target_stats.txt', 'r') as file:
|
| 7 |
-
lines = file.readlines()
|
| 8 |
-
target_mean = float(lines[0].strip())
|
| 9 |
-
target_std = float(lines[1].strip())
|
| 10 |
-
|
| 11 |
-
# Feature columns passed to the model for prediction, in this order
|
| 12 |
-
features = ['start_speed',
|
| 13 |
-
'spin_rate',
|
| 14 |
-
'extension',
|
| 15 |
-
'ivb',
|
| 16 |
-
'hb',
|
| 17 |
-
'x0',
|
| 18 |
-
'z0',
|
| 19 |
-
'speed_diff',
|
| 20 |
-
'ivb_diff',
|
| 21 |
-
'hb_diff']
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def stuff_apply(df:pl.DataFrame) -> pl.DataFrame:
|
| 25 |
-
# Work on a clone of the input; no year filter is applied and the null-dropping step is left disabled (commented out)
|
| 26 |
-
# df_test = df.drop_nulls(subset=features)
|
| 27 |
-
df_test = df.clone()
|
| 28 |
-
|
| 29 |
-
# Predict the target value for every row using the loaded model
|
| 30 |
-
df_test = df_test.with_columns(
|
| 31 |
-
pl.Series(name="target", values=model.predict(df_test[features].to_numpy()))
|
| 32 |
-
)
|
| 33 |
-
# Standardize the target column to create a z-score
|
| 34 |
-
df_test = df_test.with_columns(
|
| 35 |
-
((pl.col('target') - target_mean) / target_std).alias('target_zscore')
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
# Convert the z-score to tj_stuff_plus
|
| 39 |
-
df_test = df_test.with_columns(
|
| 40 |
-
(100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')
|
| 41 |
-
)
|
| 42 |
-
|
| 43 |
-
df_pitch_types = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')
|
| 44 |
-
|
| 45 |
-
# Join the pitch type statistics onto the main DataFrame by pitch_type (polars joins are inner by default, so rows without a matching pitch_type are dropped)
|
| 46 |
-
df_pitch_all = df_test.join(df_pitch_types, left_on='pitch_type', right_on='pitch_type')
|
| 47 |
-
|
| 48 |
-
# Standardize tj_stuff_plus with the 'mean' and 'std' columns (presumably the per-pitch-type statistics from the join) to get pitch_grade as a z-score
|
| 49 |
-
df_pitch_all = df_pitch_all.with_columns(
|
| 50 |
-
((pl.col('tj_stuff_plus') - pl.col('mean')) / pl.col('std')).alias('pitch_grade')
|
| 51 |
-
)
|
| 52 |
-
|
| 53 |
-
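# Map pitch_grade onto a scale centered at 50 (10 points per unit) and clip it to the 20-80 range; the expression has no alias, so the result replaces the pitch_grade column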
|
| 54 |
-
df_pitch_all = df_pitch_all.with_columns(
|
| 55 |
-
(pl.col('pitch_grade') * 10 + 50).clip(20, 80)
|
| 56 |
-
)
|
| 57 |
return df_pitch_all
|
|
|
|
| 1 |
+
import polars as pl
|
| 2 |
+
import joblib
|
| 3 |
+
|
| 4 |
+
model = joblib.load('stuff_model/stuff_model.joblib')
|
| 5 |
+
# Read the target mean (first line) and target standard deviation (second line) from the stats file
|
| 6 |
+
with open('stuff_model/target_stats.txt', 'r') as file:
|
| 7 |
+
lines = file.readlines()
|
| 8 |
+
target_mean = float(lines[0].strip())
|
| 9 |
+
target_std = float(lines[1].strip())
|
| 10 |
+
|
| 11 |
+
# Feature columns passed to the model for prediction, in this order
|
| 12 |
+
features = ['start_speed',
|
| 13 |
+
'spin_rate',
|
| 14 |
+
'extension',
|
| 15 |
+
'ivb',
|
| 16 |
+
'hb',
|
| 17 |
+
'x0',
|
| 18 |
+
'z0',
|
| 19 |
+
'speed_diff',
|
| 20 |
+
'ivb_diff',
|
| 21 |
+
'hb_diff']
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def stuff_apply(df:pl.DataFrame) -> pl.DataFrame:
|
| 25 |
+
# Work on a clone of the input; no year filter is applied and the null-dropping step is left disabled (commented out)
|
| 26 |
+
# df_test = df.drop_nulls(subset=features)
|
| 27 |
+
df_test = df.clone()
|
| 28 |
+
|
| 29 |
+
# Predict the target value for every row using the loaded model
|
| 30 |
+
df_test = df_test.with_columns(
|
| 31 |
+
pl.Series(name="target", values=model.predict(df_test[features].to_numpy()))
|
| 32 |
+
)
|
| 33 |
+
# Standardize the target column to create a z-score
|
| 34 |
+
df_test = df_test.with_columns(
|
| 35 |
+
((pl.col('target') - target_mean) / target_std).alias('target_zscore')
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# Convert the z-score to tj_stuff_plus
|
| 39 |
+
df_test = df_test.with_columns(
|
| 40 |
+
(100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
df_pitch_types = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')
|
| 44 |
+
|
| 45 |
+
# Join the pitch type statistics onto the main DataFrame by pitch_type (polars joins are inner by default, so rows without a matching pitch_type are dropped)
|
| 46 |
+
df_pitch_all = df_test.join(df_pitch_types, left_on='pitch_type', right_on='pitch_type')
|
| 47 |
+
|
| 48 |
+
# Standardize tj_stuff_plus with the 'mean' and 'std' columns (presumably the per-pitch-type statistics from the join); dividing by std/2 makes pitch_grade twice the z-score
|
| 49 |
+
df_pitch_all = df_pitch_all.with_columns(
|
| 50 |
+
((pl.col('tj_stuff_plus') - pl.col('mean')) / (pl.col('std')/2)).alias('pitch_grade')
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
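# Map pitch_grade onto a scale centered at 50 (10 points per unit) and clip it to the 20-80 range; the expression has no alias, so the result replaces the pitch_grade column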
|
| 54 |
+
df_pitch_all = df_pitch_all.with_columns(
|
| 55 |
+
(pl.col('pitch_grade') * 10 + 50).clip(20, 80)
|
| 56 |
+
)
|
| 57 |
return df_pitch_all
|