nesticot committed on
Commit
ea869a3
·
verified ·
1 Parent(s): c6fa9f6

Update stuff_model/stuff_apply.py

Browse files
Files changed (1) hide show
  1. stuff_model/stuff_apply.py +56 -56
stuff_model/stuff_apply.py CHANGED
@@ -1,57 +1,57 @@
1
- import polars as pl
2
- import joblib
3
-
4
- model = joblib.load('stuff_model/stuff_model.joblib')
5
- # Read the values from the text file
6
- with open('stuff_model/target_stats.txt', 'r') as file:
7
- lines = file.readlines()
8
- target_mean = float(lines[0].strip())
9
- target_std = float(lines[1].strip())
10
-
11
- # Feature columns passed to the model for prediction, in this order
12
- features = ['start_speed',
13
- 'spin_rate',
14
- 'extension',
15
- 'ivb',
16
- 'hb',
17
- 'x0',
18
- 'z0',
19
- 'speed_diff',
20
- 'ivb_diff',
21
- 'hb_diff']
22
-
23
-
24
- def stuff_apply(df:pl.DataFrame) -> pl.DataFrame:
25
- # Work on a copy of the input; no filtering is applied here (the null-dropping step below is commented out)
26
- # df_test = df.drop_nulls(subset=features)
27
- df_test = df.clone()
28
-
29
- # Predict the target value for every row using the loaded model
30
- df_test = df_test.with_columns(
31
- pl.Series(name="target", values=model.predict(df_test[features].to_numpy()))
32
- )
33
- # Standardize the target column to create a z-score
34
- df_test = df_test.with_columns(
35
- ((pl.col('target') - target_mean) / target_std).alias('target_zscore')
36
- )
37
-
38
- # Convert the z-score to tj_stuff_plus
39
- df_test = df_test.with_columns(
40
- (100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')
41
- )
42
-
43
- df_pitch_types = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')
44
-
45
- # Join the pitch type statistics with the main DataFrame based on pitch_type
46
- df_pitch_all = df_test.join(df_pitch_types, left_on='pitch_type', right_on='pitch_type')
47
-
48
- # Standardize tj_stuff_plus against the per-pitch-type mean and std to get a z-score stored as pitch_grade
49
- df_pitch_all = df_pitch_all.with_columns(
50
- ((pl.col('tj_stuff_plus') - pl.col('mean')) / pl.col('std')).alias('pitch_grade')
51
- )
52
-
53
- # Rescale pitch_grade to 50 + 10 * pitch_grade and clip it to the 20-80 range (the result overwrites pitch_grade)
54
- df_pitch_all = df_pitch_all.with_columns(
55
- (pl.col('pitch_grade') * 10 + 50).clip(20, 80)
56
- )
57
  return df_pitch_all
 
1
+ import polars as pl
2
+ import joblib
3
+
4
# Load the fitted model once at import time. The path is relative, so it is
# resolved against the current working directory.
# NOTE: joblib.load unpickles the file; only load model files you trust.
model = joblib.load('stuff_model/stuff_model.joblib')

# target_stats.txt holds two numbers, one per line: the value used as the
# mean (line 1) and the value used as the standard deviation (line 2) when
# standardizing the model output in stuff_apply.
# An explicit encoding keeps parsing independent of the platform locale.
with open('stuff_model/target_stats.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

target_mean = float(lines[0].strip())
target_std = float(lines[1].strip())

# Columns handed to model.predict, in exactly this order.
# NOTE(review): presumably this must match the column order the model was
# trained with -- confirm against the training script.
features = [
    'start_speed',
    'spin_rate',
    'extension',
    'ivb',
    'hb',
    'x0',
    'z0',
    'speed_diff',
    'ivb_diff',
    'hb_diff',
]
22
+
23
+
24
def stuff_apply(df: pl.DataFrame) -> pl.DataFrame:
    """Score every row of ``df`` with the stuff model and grade it per pitch type.

    The input frame is not modified. The returned frame contains the input
    columns plus:

    * ``target``        -- raw model prediction for the row.
    * ``target_zscore`` -- ``(target - target_mean) / target_std``.
    * ``tj_stuff_plus`` -- ``100 - 10 * target_zscore`` (a lower predicted
      target gives a higher score).
    * the columns of ``stuff_model/tj_stuff_plus_pitch.csv`` (at least
      ``mean`` and ``std``), matched on ``pitch_type``.
    * ``pitch_grade``   -- ``tj_stuff_plus`` standardized against the
      pitch-type ``mean`` and half the pitch-type ``std``, rescaled to
      ``50 + 10 * value`` and clipped to the 20-80 range.

    Args:
        df: Frame containing every column listed in ``features`` and a
            ``pitch_type`` column.

    Returns:
        A new frame with the columns described above. The join uses polars'
        default (inner) strategy, so rows whose ``pitch_type`` has no entry
        in the CSV are not present in the result.
    """
    # Nulls are intentionally not dropped here; every input row is scored.
    predictions = model.predict(df[features].to_numpy())

    # Each step reads a column created by the previous one, so they must be
    # separate with_columns calls (expressions inside one call cannot see
    # each other's outputs). with_columns returns a new frame, which is why
    # no explicit copy of ``df`` is needed.
    scored = (
        df.with_columns(pl.Series(name="target", values=predictions))
        .with_columns(
            ((pl.col('target') - target_mean) / target_std).alias('target_zscore')
        )
        .with_columns(
            (100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')
        )
    )

    # Per-pitch-type mean/std of tj_stuff_plus, read on every call.
    pitch_type_stats = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')
    graded = scored.join(pitch_type_stats, on='pitch_type')

    # Dividing by std / 2 makes one pitch-type standard deviation worth
    # 2 units here, i.e. 20 points on the final scale.
    standardized = (pl.col('tj_stuff_plus') - pl.col('mean')) / (pl.col('std') / 2)

    # Map to the 20-80 scale centred on 50 and name the result explicitly.
    return graded.with_columns(
        (standardized * 10 + 50).clip(20, 80).alias('pitch_grade')
    )