Harveyntt committed on
Commit
42b6dee
·
verified ·
1 Parent(s): e2ef0cb

Upload feature_engineering_live.py

Browse files

Add missing feature_engineering_live.py script

Files changed (1) hide show
  1. feature_engineering_live.py +130 -0
feature_engineering_live.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+
5
def _live_value(summary, feature, candidates):
    """Return the first usable live reading for *feature* as a float, or None.

    Each alias in *candidates* is looked up at the top level of *summary*,
    then ``summary['main'][feature]`` is tried (OpenWeather-like nesting).
    Entries that are ``None`` or NaN count as missing, so a later alias can
    still supply the value instead of the lookup stopping at the first key
    that merely exists.
    """
    raw_values = [summary.get(key) for key in candidates]
    nested = summary.get('main')
    # Only consult 'main' when it really is a mapping; any other value stored
    # under that key is ignored rather than indexed.
    if isinstance(nested, dict):
        raw_values.append(nested.get(feature))
    for raw in raw_values:
        if raw is None:
            continue
        value = float(raw)
        if not np.isnan(value):
            return value
    return None


def create_live_feature_vector(live_daily_summary: dict, historical_data: pd.DataFrame) -> pd.DataFrame:
    """Create a single-row DataFrame of features suitable for the 5-day models.

    This is a pragmatic, reduced-feature implementation: it builds a row with
    the same columns (and order) as ``historical_data``, fills the most
    important features from ``live_daily_summary`` plus recent history, and
    falls back to the last historical row for everything else.

    Note: The full project used ~157 features. Only a subset is engineered
    here (direct weather readings, calendar fields, a few lags and rolling
    statistics); all remaining columns are copied from the last historical
    day so models expecting the same schema are less likely to fail.

    Args:
        live_daily_summary: Mapping with today's readings. Recognised keys
            are the aliases listed in ``mappings`` below, optionally nested
            under ``'main'``. ``None`` is treated as an empty mapping, and
            values that are ``None``/NaN are treated as missing.
        historical_data: Non-empty DataFrame whose columns define the output
            schema and whose last rows are the most recent days.

    Returns:
        A one-row DataFrame with ``historical_data.columns`` as columns,
        float64 values, indexed by the current local timestamp.

    Raises:
        ValueError: If ``historical_data`` is ``None`` or empty, or if a live
            value is a string that cannot be parsed as a number.
        TypeError: If a live value is of a type ``float()`` cannot convert.
    """
    if historical_data is None or historical_data.empty:
        raise ValueError("historical_data must be a non-empty DataFrame")

    summary = live_daily_summary if live_daily_summary is not None else {}
    columns = historical_data.columns
    # Last historical row: baseline for every feature not engineered below.
    template = historical_data.iloc[-1]
    # All-NaN float row sharing the historical column order.
    today_row = pd.Series(index=columns, dtype="float64")

    def template_value(col):
        """Last historical value of *col* as float, NaN if not numeric."""
        try:
            return float(template[col])
        except (TypeError, ValueError):
            return np.nan

    # Basic direct mappings (only applied when the target column exists)
    mappings = {
        'temp': ['temp', 'temperature', 'avg_temp'],
        'feelslike': ['feelslike', 'feels_like'],
        'humidity': ['humidity'],
        'precip': ['precip', 'precipitation', 'rain'],
        'windspeed': ['windspeed', 'wind_speed', 'windspd'],
        'cloudcover': ['cloudcover', 'clouds', 'cloud_percent'],
    }
    for feature, candidates in mappings.items():
        if feature not in columns:
            continue
        value = _live_value(summary, feature, candidates)
        if value is not None:
            today_row[feature] = value

    # Without a live temperature, reuse the last historical one *before* the
    # rolling statistics are computed, so they still get a "today" sample.
    if 'temp' in columns and pd.isna(today_row['temp']):
        today_row['temp'] = template_value('temp')

    # Temporal features. The clock is read once so the calendar fields and
    # the row index cannot disagree when the call straddles midnight.
    now = pd.Timestamp.now()
    calendar = {'year': now.year, 'month': now.month, 'day_of_year': now.dayofyear}
    for col, value in calendar.items():
        if col in columns:
            today_row[col] = value

    # Lag features (use recent historical days)
    def safe_hist(col, offset=1):
        """Value of *col* ``offset`` rows from the end of history, or NaN."""
        try:
            return float(historical_data[col].iloc[-offset])
        except (KeyError, IndexError, TypeError, ValueError):
            return np.nan

    lag_specs = {
        'temp_lag_1': ('temp', 1),
        'temp_lag_2': ('temp', 2),
        'humidity_lag_1': ('humidity', 1),
    }
    for target, (source, offset) in lag_specs.items():
        if target in columns:
            today_row[target] = safe_hist(source, offset)

    # Rolling windows: last (window - 1) historical days plus today's value
    # when one is available.
    reducers = {'mean': pd.Series.mean, 'std': pd.Series.std, 'sum': pd.Series.sum}

    def rolling_stat(col, window=7, stat='mean'):
        """Aggregate *col* over a window ending today; NaN when not computable."""
        try:
            # tail(0) is empty, so window=1 uses today's value only.
            hist_vals = historical_data[col].dropna().tail(window - 1).astype(float)
            today_val = today_row[col]
            if not np.isnan(today_val):
                combined = pd.concat([hist_vals, pd.Series([today_val])], ignore_index=True)
            else:
                combined = hist_vals
            reducer = reducers.get(stat)
            if combined.empty or reducer is None:
                return np.nan
            return float(reducer(combined))
        except (KeyError, IndexError, TypeError, ValueError):
            return np.nan

    rolling_specs = {
        'temp_roll_7d_mean': ('temp', 7, 'mean'),
        'temp_roll_7d_std': ('temp', 7, 'std'),
        'temp_roll_14d_std': ('temp', 14, 'std'),
        # Falls back to the template below if 'precip' has no history column.
        'precip_roll_7d_sum': ('precip', 7, 'sum'),
    }
    for target, (source, window, stat) in rolling_specs.items():
        if target in columns:
            today_row[target] = rolling_stat(source, window=window, stat=stat)

    # Fill other columns conservatively using the last historical values
    for col in today_row.index[today_row.isna().to_numpy()]:
        today_row[col] = template_value(col)

    # Convert to single-row DataFrame indexed by the current timestamp
    today_df = pd.DataFrame([today_row])
    today_df.index = [now]

    # Reorder columns to match historical_data (already aligned) and return
    return today_df.reindex(columns=columns)