Spaces:
Running
Running
Upload stuff_model/feature_engineering.py with huggingface_hub
Browse files
stuff_model/feature_engineering.py
CHANGED
|
@@ -7,6 +7,29 @@ def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
|
|
| 7 |
pl.col('game_date').str.slice(0, 4).alias('year')
|
| 8 |
)
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
df = df.with_columns([
|
| 11 |
|
| 12 |
(-(pl.col('vy0')**2 - (2 * pl.col('ay') * (pl.col('y0') - 17/12)))**0.5).alias('vy_f'),
|
|
@@ -69,7 +92,7 @@ def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
|
|
| 69 |
df_agg = df_agg.unique(subset=['pitcher_id', 'year'], keep='first')
|
| 70 |
|
| 71 |
# Join the aggregated data with the main DataFrame
|
| 72 |
-
df = df.join(df_agg, on=['pitcher_id', 'year'])
|
| 73 |
|
| 74 |
# If no fastball, use the fastest pitch for avg_fastball_speed
|
| 75 |
df = df.with_columns(
|
|
@@ -90,7 +113,7 @@ def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
|
|
| 90 |
# If no fastball, use the fastest pitch for avg_fastball_ax
|
| 91 |
df = df.with_columns(
|
| 92 |
pl.when(pl.col('avg_fastball_ax').is_null())
|
| 93 |
-
.then(pl.col('ax').max().over('
|
| 94 |
.otherwise(pl.col('avg_fastball_ax'))
|
| 95 |
.alias('avg_fastball_ax')
|
| 96 |
)
|
|
@@ -113,27 +136,8 @@ def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
|
|
| 113 |
pl.lit('All').alias('all')
|
| 114 |
])
|
| 115 |
|
| 116 |
-
# Calculate mound_to_release as 60.5 - extension
|
| 117 |
-
df = df.with_columns([
|
| 118 |
-
(60.5 - df["extension"]).alias("release_pos_y")
|
| 119 |
-
])
|
| 120 |
-
|
| 121 |
-
# Calculate delta time (Δt)
|
| 122 |
-
delta_t = (df["release_pos_y"] - df["y0"]) / df["vy0"]
|
| 123 |
-
# print((df["vx0"] * delta_t + 0.5 * df["ax"] * delta_t ** 2))
|
| 124 |
-
# Corrected back-calculation of release_pos_x and release_pos_z
|
| 125 |
|
| 126 |
-
|
| 127 |
-
df = df.with_columns(
|
| 128 |
-
pl.when(pl.col('pitcher_hand')== 'R')
|
| 129 |
-
.then(df["x0"] - df["vx0"] * delta_t - 0.5 * df["ax"] * delta_t ** 2)
|
| 130 |
-
.otherwise(df["x0"] + df["vx0"] * delta_t + 0.5 * df["ax"] * delta_t ** 2)
|
| 131 |
-
.alias('release_pos_x')
|
| 132 |
-
)
|
| 133 |
|
| 134 |
-
df = df.with_columns([
|
| 135 |
-
(df["z0"] + df["vz0"] * delta_t + 0.5 * df["az"] * delta_t ** 2).alias("release_pos_z")
|
| 136 |
-
])
|
| 137 |
|
| 138 |
|
| 139 |
|
|
|
|
| 7 |
pl.col('game_date').str.slice(0, 4).alias('year')
|
| 8 |
)
|
| 9 |
|
| 10 |
+
# Calculate release_pos_y as 60.5 - extension
|
| 11 |
+
df = df.with_columns([
|
| 12 |
+
(60.5 - df["extension"]).alias("release_pos_y")
|
| 13 |
+
])
|
| 14 |
+
|
| 15 |
+
# Calculate delta time (Δt): the y-offset from y0 to release_pos_y divided by vy0
|
| 16 |
+
delta_t = (df["release_pos_y"] - df["y0"]) / df["vy0"]
|
| 17 |
+
# print((df["vx0"] * delta_t + 0.5 * df["ax"] * delta_t ** 2))
|
| 18 |
+
# Corrected back-calculation of release_pos_x and release_pos_z
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
df = df.with_columns(
|
| 22 |
+
pl.when(pl.col('pitcher_hand')== 'R')
|
| 23 |
+
.then((df["x0"] + df["vx0"] * delta_t + 0.5 * df["ax"] * delta_t ** 2)*-1)
|
| 24 |
+
.otherwise(df["x0"] + df["vx0"] * delta_t + 0.5 * df["ax"] * delta_t ** 2)
|
| 25 |
+
.alias('release_pos_x')
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
df = df.with_columns([
|
| 29 |
+
(df["z0"] + df["vz0"] * delta_t + 0.5 * df["az"] * delta_t ** 2).alias("release_pos_z")
|
| 30 |
+
])
|
| 31 |
+
|
| 32 |
+
|
| 33 |
df = df.with_columns([
|
| 34 |
|
| 35 |
(-(pl.col('vy0')**2 - (2 * pl.col('ay') * (pl.col('y0') - 17/12)))**0.5).alias('vy_f'),
|
|
|
|
| 92 |
df_agg = df_agg.unique(subset=['pitcher_id', 'year'], keep='first')
|
| 93 |
|
| 94 |
# Join the aggregated data with the main DataFrame
|
| 95 |
+
df = df.join(df_agg, on=['pitcher_id', 'year'],how='left')
|
| 96 |
|
| 97 |
# If no fastball, use the fastest pitch for avg_fastball_speed
|
| 98 |
df = df.with_columns(
|
|
|
|
| 113 |
# If no fastball, fall back to the pitcher's maximum ax for avg_fastball_ax (NOTE: this is max ax, not the ax of the fastest pitch)
|
| 114 |
df = df.with_columns(
|
| 115 |
pl.when(pl.col('avg_fastball_ax').is_null())
|
| 116 |
+
.then(pl.col('ax').max().over('pitcher_id'))
|
| 117 |
.otherwise(pl.col('avg_fastball_ax'))
|
| 118 |
.alias('avg_fastball_ax')
|
| 119 |
)
|
|
|
|
| 136 |
pl.lit('All').alias('all')
|
| 137 |
])
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
|