aromidvar committed on
Commit
014122c
·
verified ·
1 Parent(s): 66f2b32

Update core/train_eval.py

Browse files
Files changed (1) hide show
  1. core/train_eval.py +178 -68
core/train_eval.py CHANGED
@@ -1,70 +1,180 @@
1
def preprocess_data(df, features, target, window_size=30, horizon=1):
    """Select, reduce and scale features, then cut the data into sliding windows.

    Steps, in order:
      1. Keep the requested ``features`` that exist in ``df`` and make sure the
         target and a fixed list of "critical" columns are present.
      2. With more than 100 rows, rank the non-target columns with a
         RandomForestRegressor and keep the 10 most important ones (plus the
         target and any critical columns).
      3. Drop non-critical columns whose absolute correlation with an earlier
         column exceeds 0.85.
      4. If more than 10 columns remain, compress the non-target columns with
         PCA (95% explained variance) and keep the target as its own column.
      5. Standard-scale everything and build ``window_size``-long input windows
         with ``horizon``-long target slices.

    Args:
        df: DataFrame holding the feature and target columns.
        features: Requested feature column names; names missing from ``df``
            are silently skipped.
        target: Name of the column to predict; must be a column of ``df``.
        window_size: Number of consecutive rows per input window.
        horizon: Number of future target values per sample.

    Returns:
        Tuple ``(X, y, scaler, updated_features, target_idx)`` where ``X`` has
        one window per sample, ``y`` holds the scaled target values following
        each window, ``scaler`` is the fitted StandardScaler,
        ``updated_features`` names the columns of the scaled matrix in order
        and ``target_idx`` is the target's position in that list.

    Raises:
        ValueError: If no requested feature exists, the target column is
            missing, the target is lost during preprocessing, or there are too
            few rows for a single window.
    """
    try:
        # Validate features
        available_columns = df.columns.tolist()
        valid_features = [f for f in features if f in available_columns]
        if not valid_features:
            raise ValueError(f"No valid features found. Available: {available_columns}, Requested: {features}")
        if target not in available_columns:
            raise ValueError(f"Target {target} not in DataFrame columns: {available_columns}")

        data_df = df[valid_features].copy()
        logging.debug(f"Initial features: {valid_features}, Shape: {data_df.shape}")

        # Protect critical features: they are re-added if the caller left them
        # out and are exempt from the selection/correlation pruning below.
        critical_features = [target, 'Volume', 'rsi_14', 'macdh_12_26_9', 'adx_14']
        for feat in critical_features:
            if feat in df.columns and feat not in data_df.columns:
                data_df[feat] = df[feat]

        # Feature importance with RandomForest
        if len(data_df) > 100:
            rf_inputs = data_df.drop([target], axis=1, errors='ignore')
            rf_target = data_df[target]
            rf = RandomForestRegressor(n_estimators=100, random_state=42)
            rf.fit(rf_inputs, rf_target)
            importance = pd.Series(rf.feature_importances_, index=rf_inputs.columns)
            top_features = importance.sort_values(ascending=False).index[:10].tolist()
            if target not in top_features:
                top_features.append(target)
            for feat in critical_features:
                if feat in df.columns and feat not in top_features:
                    top_features.append(feat)
            data_df = data_df[top_features]
            logging.info(f"Selected top features by RandomForest: {top_features}")

        # Remove highly correlated features (upper triangle only, so each pair
        # is inspected once and the earlier column of the pair survives).
        corr_matrix = data_df.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [
            column
            for column in upper.columns
            if column not in critical_features and (upper[column] > 0.85).any()
        ]
        data_df = data_df.drop(columns=to_drop)
        logging.info(f"Dropped correlated features: {to_drop}")

        # PCA with 95% variance
        updated_features = data_df.columns.tolist()
        if len(updated_features) > 10:
            # Only the non-target columns are projected. Projecting the target
            # as well would remove it from the matrix, leaving ``target_idx``
            # pointing at an unrelated (or non-existent) component.
            input_columns = [col for col in updated_features if col != target]
            pca = PCA(n_components=0.95)
            components = pca.fit_transform(data_df[input_columns])
            reduced_df = pd.DataFrame(
                components,
                columns=[f'pca_{i}' for i in range(pca.n_components_)],
                index=data_df.index,
            )
            reduced_df[target] = data_df[target].values
            data_df = reduced_df
            # Keep the reported column list in sync with the actual matrix.
            updated_features = data_df.columns.tolist()
            logging.info(f"Applied PCA: {pca.n_components_} components, explained variance: {sum(pca.explained_variance_ratio_):.2f}")

        data = data_df.values.astype(float)
        scaler = StandardScaler()
        scaled = scaler.fit_transform(data)
        if target not in updated_features:
            raise ValueError("Target not in features after preprocessing.")
        target_idx = updated_features.index(target)

        # Sliding windows: rows [i, i + window_size) are the input, the next
        # ``horizon`` target values are the label.
        n_samples = len(scaled) - window_size - horizon + 1
        X = np.array([scaled[i:i + window_size] for i in range(n_samples)])
        y = np.array([
            scaled[i + window_size:i + window_size + horizon, target_idx]
            for i in range(n_samples)
        ])
        if X.shape[0] == 0 or y.shape[0] == 0:
            raise ValueError(f"Insufficient data: len={len(scaled)}, window={window_size}, horizon={horizon}")

        logging.info(f"Preprocessed data: X.shape={X.shape}, y.shape={y.shape}, Features: {updated_features}")
        return X, y, scaler, updated_features, target_idx
    except Exception as e:
        logging.error(f"Preprocessing error: {e}")
        raise
 
1
+ # core/train_eval.py
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+ from torch import nn, optim
6
+ from sklearn.preprocessing import MinMaxScaler
7
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
8
+ from torch.utils.data import DataLoader, TensorDataset
9
+ from torchsummary import summary
10
+ from core.data import preprocess_data
11
+ import logging
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+
15
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are flattened first, so a column vector and a flat vector of
    the same size can be compared. Entries whose true value is zero are
    excluded because the relative error is undefined there.

    Args:
        y_true: Observed values (any array-like).
        y_pred: Predicted values with the same number of elements.

    Returns:
        The MAPE over the non-zero true values, or NaN when no such value
        exists.

    Raises:
        ValueError: If the inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same size, got {y_true.size} and {y_pred.size}"
        )
    non_zero = np.abs(y_true) > 0
    if not non_zero.any():
        return np.nan
    relative_error = (y_true[non_zero] - y_pred[non_zero]) / y_true[non_zero]
    return np.mean(np.abs(relative_error)) * 100
21
+
22
def directional_accuracy(y_true, y_pred):
    """Return the fraction of steps where prediction and truth move the same way.

    The inputs are flattened before differencing; without that, a column
    vector of shape ``(n, 1)`` would be differenced along its length-1 axis
    and yield an empty result. A step counts as a hit when the signs of the
    two first differences are equal (two flat steps also count as equal).

    Args:
        y_true: Observed series (any array-like).
        y_pred: Predicted series with the same number of elements.

    Returns:
        A value in ``[0, 1]``, or NaN when the series has fewer than two
        points.

    Raises:
        ValueError: If the inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same size, got {y_true.size} and {y_pred.size}"
        )
    true_diff = np.diff(y_true)
    pred_diff = np.diff(y_pred)
    if true_diff.size == 0:
        return np.nan
    return np.mean(np.sign(true_diff) == np.sign(pred_diff))
26
+
27
def mase(y_true, y_pred, y_train):
    """Return the mean absolute scaled error of a forecast.

    The forecast's mean absolute error is divided by the mean absolute error
    of a one-step naive forecast (previous value) over ``y_train``.

    Args:
        y_true: Observed values for the evaluated period.
        y_pred: Predicted values for the evaluated period.
        y_train: Training series used to compute the naive baseline error.

    Returns:
        The scaled error, or NaN when the baseline error is zero or
        ``y_train`` has fewer than two points.
    """
    forecast_error = mean_absolute_error(y_true, y_pred)

    # The naive baseline needs at least one consecutive pair of points.
    if len(y_train) > 1:
        naive_error = mean_absolute_error(y_train[1:], y_train[:-1])
    else:
        naive_error = np.nan

    # A perfectly flat training series gives a zero baseline; avoid dividing by it.
    if naive_error == 0:
        return np.nan
    return forecast_error / naive_error
31
+
32
def train_and_evaluate(
    df,
    features,
    target,
    model_cls,
    horizon=1,
    hidden=64,
    layers=1,
    epochs=50,
    lr=0.001,
    beta1=0.9,
    beta2=0.999,
    weight_decay=0.01,
    dropout=0.2,
    window=30,
    test_split=0.2,
    device="cuda" if torch.cuda.is_available() else "cpu",
    verbose=True
):
    """Train ``model_cls`` on windowed data and evaluate it on the held-out tail.

    The samples returned by ``preprocess_data`` are split in order (no
    shuffling across splits): the last ``test_split`` fraction is the test
    set, and the last 10% of the remainder is the validation set. Training
    uses AdamW with MSE loss, halves the learning rate when the validation
    loss plateaus, and stops early after 10 epochs without improvement. The
    weights with the lowest validation loss are restored before evaluation.

    Args:
        df: DataFrame containing the ``features`` columns.
        features: Column names fed to the model; must include ``target``.
        target: Name of the column to forecast.
        model_cls: Callable building the model; it is invoked with the keyword
            arguments ``input_size``, ``hidden_size``, ``num_layers``,
            ``output_size`` and ``dropout``.
        horizon: Number of future steps the model outputs.
        hidden: Hidden size passed to ``model_cls``.
        layers: Number of layers passed to ``model_cls``.
        epochs: Maximum number of training epochs.
        lr: AdamW learning rate.
        beta1: First AdamW beta coefficient.
        beta2: Second AdamW beta coefficient.
        weight_decay: AdamW weight decay.
        dropout: Dropout value passed to ``model_cls``.
        window: Number of rows per input window.
        test_split: Fraction of samples reserved for testing.
        device: Torch device string the model and batches are moved to.
        verbose: When true, print progress every 10 epochs.

    Returns:
        A dict with the keys ``model_summary``, ``train_loss``, ``val_loss``,
        ``metrics``, ``forecast``, ``actual`` and ``latest_prediction``. When
        the train or test split is empty, the dict holds zeroed metrics and
        empty lists (and no ``model_summary``). When any exception occurs it
        is logged and an empty dict is returned.
    """
    result = {}
    try:
        X, y, scaler = preprocess_data(df, features, target, window, horizon)
        if X is None:
            raise ValueError("Preprocessing failed.")
        target_idx = features.index(target)
        # NOTE(review): assumes ``scaler`` was fitted on exactly the columns in
        # ``features``, in that order - confirm against core.data.
        n_features = len(features)

        # Ordered split: train | validation | test.
        split = int(len(X) * (1 - test_split))
        val_split = int(split * 0.9)
        X_train, X_val, X_test = X[:val_split], X[val_split:split], X[split:]
        y_train, y_val, y_test = y[:val_split], y[val_split:split], y[split:]

        if len(X_train) == 0 or len(X_test) == 0:
            result["metrics"] = {k: 0.0 for k in ["R2", "ExplainedVariance", "MDA", "RMSE", "MAE", "MAPE", "MASE"]}
            result["forecast"] = []
            result["actual"] = []
            result["latest_prediction"] = []
            result["train_loss"] = []
            result["val_loss"] = []
            return result

        def make_loader(inputs, labels, shuffle):
            # Wrap a pair of arrays in a float32 DataLoader with batch size 32.
            dataset = TensorDataset(
                torch.tensor(inputs, dtype=torch.float32),
                torch.tensor(labels, dtype=torch.float32),
            )
            return DataLoader(dataset, batch_size=32, shuffle=shuffle)

        train_loader = make_loader(X_train, y_train, shuffle=True)
        val_loader = make_loader(X_val, y_val, shuffle=False)
        test_loader = make_loader(X_test, y_test, shuffle=False)

        input_dim = X_train.shape[2]
        model = model_cls(input_size=input_dim, hidden_size=hidden, num_layers=layers, output_size=horizon, dropout=dropout).to(device)
        # NOTE(review): ``summary`` comes from torchsummary; verify that it
        # returns text rather than only printing, otherwise this stores "None".
        result["model_summary"] = str(summary(model, (window, input_dim)))

        optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=verbose)
        loss_fn = nn.MSELoss()

        train_losses = []
        val_losses = []
        best_val_loss = float('inf')
        patience = 10
        counter = 0
        best_model_state = None

        for epoch in range(epochs):
            model.train()
            epoch_loss = 0.0
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                out = model(xb)
                loss = loss_fn(out, yb)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            train_losses.append(epoch_loss / len(train_loader))

            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    out = model(xb)
                    loss = loss_fn(out, yb)
                    val_loss += loss.item()
            if len(val_loader) > 0:
                val_loss /= len(val_loader)
            val_losses.append(val_loss)
            scheduler.step(val_loss)

            if verbose and (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                counter = 0
                # state_dict() hands back references to the live parameter
                # tensors, so they must be cloned; otherwise later optimizer
                # steps overwrite the "best" snapshot and restoring it is a no-op.
                best_model_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        if best_model_state is not None:
            model.load_state_dict(best_model_state)

        result["train_loss"] = train_losses
        result["val_loss"] = val_losses

        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for xb, yb in test_loader:
                xb = xb.to(device)
                out = model(xb).cpu().numpy()
                preds.append(out)
                targets.append(yb.numpy())

        preds = np.concatenate(preds, axis=0)
        targets = np.concatenate(targets, axis=0)

        def inverse_y(values):
            # Undo the scaling of target values. The scaler works on full
            # feature rows, so the values are placed in the target column of a
            # zero matrix. Flattening first keeps the row count equal to the
            # number of values, which also covers horizon > 1 (all horizons
            # are then pooled into one flat series).
            flat = np.asarray(values, dtype=float).reshape(-1)
            dummy = np.zeros((flat.shape[0], n_features))
            dummy[:, target_idx] = flat
            return scaler.inverse_transform(dummy)[:, target_idx]

        preds_inv = inverse_y(preds)
        targets_inv = inverse_y(targets)
        y_train_inv = inverse_y(y_train.flatten())

        result["metrics"] = {
            "R2": r2_score(targets_inv, preds_inv),
            "ExplainedVariance": explained_variance_score(targets_inv, preds_inv),
            "MDA": directional_accuracy(targets_inv, preds_inv),
            "RMSE": np.sqrt(mean_squared_error(targets_inv, preds_inv)),
            "MAE": mean_absolute_error(targets_inv, preds_inv),
            "MAPE": mean_absolute_percentage_error(targets_inv, preds_inv),
            "MASE": mase(targets_inv, preds_inv, y_train_inv)
        }

        result["forecast"] = preds_inv
        result["actual"] = targets_inv

        # Forecast beyond the data: feed the most recent ``window`` rows.
        latest_window = df[features].tail(window).values
        latest_scaled = scaler.transform(latest_window)
        latest_input = torch.tensor(latest_scaled.reshape(1, window, -1), dtype=torch.float32).to(device)
        with torch.no_grad():
            future_pred = model(latest_input).cpu().numpy().flatten()
        result["latest_prediction"] = inverse_y(future_pred).tolist()

        return result
    except Exception as e:
        # Best-effort API: callers get an empty dict on failure. The traceback
        # is logged so the failure is still diagnosable.
        logging.error(f"Train eval error: {e}", exc_info=True)
        return {}