ymlin105 committed on
Commit
aa92081
·
1 Parent(s): 062e752

feat: standardize feature engineering and push new production model

Browse files
models/rossmann_production_model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19f7950c78166df1b87f4b5e5db12b0782e6cd96a2f51502ddef7af41a225e7d
3
  size 53873380
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59d3bda897c6d9678abfce7bd5b631183319efbdf275113d83463ad3d52f706e
3
  size 53873380
scripts/train_production_model.py CHANGED
@@ -33,17 +33,16 @@ def run_production_training():
33
  df_feat = pipeline.run_feature_engineering(df_raw)
34
 
35
  # 3. Define Final Feature Set
36
- # Include Store ID and Store Metadata
37
- from sklearn.preprocessing import LabelEncoder
38
- le = LabelEncoder()
39
- for col in ['StoreType', 'Assortment']:
40
- if col in df_feat.columns:
41
- df_feat[col] = le.fit_transform(df_feat[col].astype(str))
42
 
43
  feature_cols = [
44
  'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
45
  'Year', 'Month', 'Day', 'IsWeekend', 'DayOfMonth',
46
- 'CompetitionDistance', 'CompetitionOpenTime', 'StoreType', 'Assortment'
47
  ] + [c for c in df_feat.columns if 'fourier' in c or 'easter' in c]
48
 
49
  # 4. Final Training (using all available data to create the 'Gold' model)
 
33
  df_feat = pipeline.run_feature_engineering(df_raw)
34
 
35
  # 3. Define Final Feature Set
36
+ # Consistent Encoding with app.py
37
+ if 'StoreType' in df_feat.columns:
38
+ df_feat['StoreType'] = df_feat['StoreType'].astype(str).map({'a':1, 'b':2, 'c':3, 'd':4}).fillna(0)
39
+ if 'Assortment' in df_feat.columns:
40
+ df_feat['Assortment'] = df_feat['Assortment'].astype(str).map({'a':1, 'b':2, 'c':3}).fillna(0)
 
41
 
42
  feature_cols = [
43
  'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
44
  'Year', 'Month', 'Day', 'IsWeekend', 'DayOfMonth',
45
+ 'CompetitionDistance', 'StoreType', 'Assortment'
46
  ] + [c for c in df_feat.columns if 'fourier' in c or 'easter' in c]
47
 
48
  # 4. Final Training (using all available data to create the 'Gold' model)
src/features.py CHANGED
@@ -22,7 +22,7 @@ class DateTransformation(FeatureEngineeringStrategy):
22
  df_transformed['Year'] = df_transformed[date_col].dt.year
23
  df_transformed['Month'] = df_transformed[date_col].dt.month
24
  df_transformed['Day'] = df_transformed[date_col].dt.day
25
- df_transformed['DayOfWeek'] = df_transformed[date_col].dt.dayofweek
26
  df_transformed['IsWeekend'] = (df_transformed[date_col].dt.dayofweek >= 5).astype(int)
27
  df_transformed['DayOfMonth'] = df_transformed[date_col].dt.day
28
  return df_transformed
 
22
  df_transformed['Year'] = df_transformed[date_col].dt.year
23
  df_transformed['Month'] = df_transformed[date_col].dt.month
24
  df_transformed['Day'] = df_transformed[date_col].dt.day
25
+ df_transformed['DayOfWeek'] = df_transformed[date_col].dt.dayofweek + 1
26
  df_transformed['IsWeekend'] = (df_transformed[date_col].dt.dayofweek >= 5).astype(int)
27
  df_transformed['DayOfMonth'] = df_transformed[date_col].dt.day
28
  return df_transformed