Spaces:

aromidvar
/

MarketPredictionPro

Sleeping

App Files Community

aromidvar committed on Sep 17, 2025

Commit

950b4c7

verified ·

1 Parent(s): df473db

Update core/train_eval.py

Browse files

Files changed (1) hide show

core/train_eval.py +81 -95

core/train_eval.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# core/train_eval.py
 import numpy as np
 import pandas as pd
 import torch
@@ -40,7 +40,6 @@ def mean_absolute_percentage_error(y_true, y_pred):
         return np.nan
     return np.mean(np.abs((y_true[non_zero] - y_pred[non_zero]) / y_true[non_zero])) * 100
 def directional_accuracy(y_true, y_pred):
     true_diff = np.diff(y_true)
     pred_diff = np.diff(y_pred)
@@ -49,7 +48,6 @@ def directional_accuracy(y_true, y_pred):
         return np.nan
     return np.mean(np.sign(true_diff) == np.sign(pred_diff))
 def mase(y_true, y_pred, y_train):
     mae_val = mean_absolute_error(y_true, y_pred)
     naive_mae = mean_absolute_error(y_train[1:], y_train[:-1]) if len(y_train) > 1 else np.nan
@@ -58,7 +56,6 @@ def mase(y_true, y_pred, y_train):
         return np.nan
     return mae_val / naive_mae
 def compute_volatility(y_pred):
     returns = np.diff(y_pred) / y_pred[:-1]
     if len(returns) == 0:
@@ -66,7 +63,6 @@ def compute_volatility(y_pred):
         return np.nan
     return np.std(returns) * np.sqrt(252)
 def compute_sharpe_ratio(y_pred, risk_free_rate=0.01):
     returns = np.diff(y_pred) / y_pred[:-1]
     if len(returns) == 0:
@@ -79,7 +75,6 @@ def compute_sharpe_ratio(y_pred, risk_free_rate=0.01):
         return np.nan
     return (mean_return - risk_free_rate) / std_return
 def compute_precision_recall(y_true, y_pred):
     true_diff = np.sign(np.diff(y_true))
     pred_diff = np.sign(np.diff(y_pred))
@@ -90,7 +85,6 @@ def compute_precision_recall(y_true, y_pred):
     recall = recall_score(true_diff > 0, pred_diff > 0, zero_division=0)
     return precision, recall
 # ---------------- Feature selection ----------------
 def select_features(df, features, target, selector_method, importance_threshold):
     logging.info(
@@ -104,6 +98,7 @@ def select_features(df, features, target, selector_method, importance_threshold)
             rf.fit(X, y)
             importances = pd.Series(rf.feature_importances_, index=features)
             selected_features = importances[importances >= importance_threshold].index.tolist()
             return selected_features if selected_features else features
         except Exception as e:
             logging.error(f"RandomForest feature selection failed: {str(e)}")
@@ -117,24 +112,17 @@ def select_features(df, features, target, selector_method, importance_threshold)
             pca = PCA(n_components=n_components)
             pca.fit(X_scaled)
             explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
-            n_selected = (
-                sum(explained_variance_ratio < 0.95) + 1
-                if any(explained_variance_ratio < 0.95)
-                else n_components
-            )
             selected_features = features[:n_selected]
             return selected_features if selected_features else features
         except Exception as e:
             logging.error(f"PCA feature selection failed: {str(e)}")
             return features
     else:
-        logging.warning(
-            f"Unsupported selector_method: {selector_method}, using all features"
-        )
         return features
-# ---------------- Training ----------------
 def train_and_evaluate(
     df,
     features,
@@ -154,131 +142,128 @@ def train_and_evaluate(
     selector_method="RandomForest",
     importance_threshold=0.0,
     scheduler_type="None",
-    device="cpu",
-    verbose=True,
 ):
     try:
         from .data import preprocess_data
-        selected_features = select_features(
-            df, features, target, selector_method, importance_threshold
-        )
         logging.info(f"Selected features: {selected_features}")
-        # --- MUST unpack preprocess_data properly (avoid tuple misuse) ---
-        (
-            X,
-            y,
-            feature_scaler,
-            target_scaler,
-            full_features,
-            target_idx,
-            pca,
-            updated_feature_cols,
-        ) = preprocess_data(df, selected_features, target, window, horizon)
-        X = np.asarray(X)
-        y = np.asarray(y)
-        if X.ndim != 3:
-            raise ValueError(f"Preprocessed X must be 3D (samples, window, features). Got shape: {X.shape}")
-        if y.ndim == 1:
-            # ensure y has shape (samples, horizon)
-            y = y.reshape(-1, horizon)
         if X.shape[0] < 10:
             return {"error": f"Insufficient data samples: {X.shape[0]}"}
-        # Train/test split (simple slice to preserve time order)
         train_size = int((1 - test_split) * len(X))
         X_train, X_test = X[:train_size], X[train_size:]
         y_train, y_test = y[:train_size], y[train_size:]
-        # Build datasets (do NOT move to device here; move in training loop)
-        train_dataset = TensorDataset(
-            torch.tensor(X_train, dtype=torch.float32),
-            torch.tensor(y_train, dtype=torch.float32),
-        )
-        test_dataset = TensorDataset(
-            torch.tensor(X_test, dtype=torch.float32),
-            torch.tensor(y_test, dtype=torch.float32),
-        )
         train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
         test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
         input_size = X.shape[2]
-        model = model_cls(
-            input_size=input_size,
-            hidden_size=hidden,
-            num_layers=layers,
-            output_size=horizon,
-            dropout=dropout,
-        ).to(device)
         if verbose and summary:
             try:
                 output = StringIO()
                 sys.stdout = output
-                # summary expects (channels, seq_len) for some models, here we show (seq_len, features)
                 summary(model, input_size=(window, input_size))
                 sys.stdout = sys.__stdout__
-                logging.debug(output.getvalue())
             except Exception as e:
                 logging.warning(f"Failed to generate model summary: {str(e)}")
-        optimizer = optim.Adam(
-            model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay
-        )
         criterion = nn.MSELoss()
         scheduler = None
         if scheduler_type == "ReduceLROnPlateau":
-            scheduler = lr_scheduler.ReduceLROnPlateau(
-                optimizer, mode="min", factor=0.5, patience=10, verbose=verbose
-            )
-        train_losses, val_losses = [], []
-        # ---------------- Training loop ----------------
         for epoch in range(epochs):
             model.train()
-            running_loss = 0.0
             for batch_X, batch_y in train_loader:
-                batch_X = batch_X.to(device)
-                batch_y = batch_y.to(device)
                 optimizer.zero_grad()
-                outputs = model(batch_X)
                 loss = criterion(outputs, batch_y)
                 loss.backward()
                 optimizer.step()
-                running_loss += loss.item() * batch_X.size(0)
-            epoch_train_loss = running_loss / len(train_loader.dataset)
-            train_losses.append(epoch_train_loss)
-            # validation
             model.eval()
-            running_val = 0.0
             with torch.no_grad():
                 for batch_X, batch_y in test_loader:
-                    batch_X = batch_X.to(device)
-                    batch_y = batch_y.to(device)
-                    outputs = model(batch_X)
-                    v_loss = criterion(outputs, batch_y)
-                    running_val += v_loss.item() * batch_X.size(0)
-            epoch_val_loss = running_val / len(test_loader.dataset)
-            val_losses.append(epoch_val_loss)
             if scheduler:
-                scheduler.step(epoch_val_loss)
-            logging.debug(f"Epoch {epoch+1}/{epochs} train={epoch_train_loss:.6f} val={epoch_val_loss:.6f}")
         # ---------------- Evaluation ----------------
         model.eval()
         with torch.no_grad():
             X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
-            y_pred_scaled = model(X_test_tensor).cpu().numpy()
         y_test_unscaled = target_scaler.inverse_transform(y_test.reshape(-1, horizon)).flatten()
         y_pred_unscaled = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, horizon)).flatten()
@@ -307,6 +292,7 @@ def train_and_evaluate(
         # Latest prediction (use last window from original X)
         latest_data = torch.tensor(X[-1:], dtype=torch.float32).to(device)
         with torch.no_grad():
             latest_prediction_scaled = model(latest_data).cpu().numpy()
             latest_prediction = target_scaler.inverse_transform(
                 latest_prediction_scaled.reshape(-1, horizon)

+# core/train_eval.py
 import numpy as np
 import pandas as pd
 import torch
         return np.nan
     return np.mean(np.abs((y_true[non_zero] - y_pred[non_zero]) / y_true[non_zero])) * 100
 def directional_accuracy(y_true, y_pred):
     true_diff = np.diff(y_true)
     pred_diff = np.diff(y_pred)
         return np.nan
     return np.mean(np.sign(true_diff) == np.sign(pred_diff))
 def mase(y_true, y_pred, y_train):
     mae_val = mean_absolute_error(y_true, y_pred)
     naive_mae = mean_absolute_error(y_train[1:], y_train[:-1]) if len(y_train) > 1 else np.nan
         return np.nan
     return mae_val / naive_mae
 def compute_volatility(y_pred):
     returns = np.diff(y_pred) / y_pred[:-1]
     if len(returns) == 0:
         return np.nan
     return np.std(returns) * np.sqrt(252)
 def compute_sharpe_ratio(y_pred, risk_free_rate=0.01):
     returns = np.diff(y_pred) / y_pred[:-1]
     if len(returns) == 0:
         return np.nan
     return (mean_return - risk_free_rate) / std_return
 def compute_precision_recall(y_true, y_pred):
     true_diff = np.sign(np.diff(y_true))
     pred_diff = np.sign(np.diff(y_pred))
     recall = recall_score(true_diff > 0, pred_diff > 0, zero_division=0)
     return precision, recall
 # ---------------- Feature selection ----------------
 def select_features(df, features, target, selector_method, importance_threshold):
     logging.info(
             rf.fit(X, y)
             importances = pd.Series(rf.feature_importances_, index=features)
             selected_features = importances[importances >= importance_threshold].index.tolist()
+            logging.debug(f"RandomForest selected features: {selected_features}, importances: {importances.to_dict()}")
             return selected_features if selected_features else features
         except Exception as e:
             logging.error(f"RandomForest feature selection failed: {str(e)}")
             pca = PCA(n_components=n_components)
             pca.fit(X_scaled)
             explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
+            n_selected = sum(explained_variance_ratio < 0.95) + 1 if any(explained_variance_ratio < 0.95) else n_components
             selected_features = features[:n_selected]
+            logging.debug(f"PCA selected features: {selected_features}, explained variance: {explained_variance_ratio.tolist()}")
             return selected_features if selected_features else features
         except Exception as e:
             logging.error(f"PCA feature selection failed: {str(e)}")
             return features
     else:
+        logging.warning(f"Unsupported selector_method: {selector_method}, using all features")
         return features
 def train_and_evaluate(
     df,
     features,
     selector_method="RandomForest",
     importance_threshold=0.0,
     scheduler_type="None",
+    device='cpu',
+    verbose=True
 ):
     try:
+        logging.info(f"Starting train_and_evaluate: model={model_cls.__name__}, features={len(features)}, window={window}, horizon={horizon}, scheduler={scheduler_type}, selector_method={selector_method}")
         from .data import preprocess_data
+        selected_features = select_features(df, features, target, selector_method, importance_threshold)
         logging.info(f"Selected features: {selected_features}")
+        X, y, feature_scaler, target_scaler, full_features, target_idx, pca, updated_feature_cols = preprocess_data(df, selected_features, target, window, horizon)
+        logging.debug(f"Preprocess: type(X)={type(X)}, X_shape={X.shape if isinstance(X, np.ndarray) else 'not ndarray'}, type(y)={type(y)}, y_shape={y.shape if isinstance(y, np.ndarray) else 'not ndarray'}")
         if X.shape[0] < 10:
+            logging.error(f"Insufficient data samples: {X.shape[0]}")
             return {"error": f"Insufficient data samples: {X.shape[0]}"}
         train_size = int((1 - test_split) * len(X))
         X_train, X_test = X[:train_size], X[train_size:]
         y_train, y_test = y[:train_size], y[train_size:]
+        logging.debug(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
+        logging.debug(f"X_train type: {type(X_train)}, shape: {X_train.shape if isinstance(X_train, np.ndarray) else 'not ndarray'}")
+        logging.debug(f"X_test type: {type(X_test)}, shape: {X_test.shape if isinstance(X_test, np.ndarray) else 'not ndarray'}")
+        train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32).to(device),
+                                     torch.tensor(y_train, dtype=torch.float32).to(device))
+        test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32).to(device),
+                                    torch.tensor(y_test, dtype=torch.float32).to(device))
         train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
         test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
+        # Debug DataLoader output
+        for batch_X, batch_y in train_loader:
+            logging.debug(f"DataLoader train batch: X_type={type(batch_X)}, X_shape={batch_X.shape}, y_type={type(batch_y)}, y_shape={batch_y.shape}")
+            break
+        for batch_X, batch_y in test_loader:
+            logging.debug(f"DataLoader test batch: X_type={type(batch_X)}, X_shape={batch_X.shape}, y_type={type(batch_y)}, y_shape={batch_y.shape}")
+            break
         input_size = X.shape[2]
+        model = model_cls(input_size=input_size, hidden_size=hidden, num_layers=layers, output_size=horizon, dropout=dropout).to(device)
+        logging.debug(f"Model initialized: {model_cls.__name__}, input_size={input_size}, hidden={hidden}, layers={layers}")
         if verbose and summary:
             try:
                 output = StringIO()
                 sys.stdout = output
                 summary(model, input_size=(window, input_size))
                 sys.stdout = sys.__stdout__
+                logging.debug(f"Model summary:\n{output.getvalue()}")
             except Exception as e:
                 logging.warning(f"Failed to generate model summary: {str(e)}")
+        optimizer = optim.Adam(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
         criterion = nn.MSELoss()
         scheduler = None
         if scheduler_type == "ReduceLROnPlateau":
+            scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=verbose)
+            logging.debug("Initialized ReduceLROnPlateau scheduler")
+        elif scheduler_type != "None":
+            logging.warning(f"Unsupported scheduler type: {scheduler_type}, using None")
+        train_losses = []
+        val_losses = []
         for epoch in range(epochs):
             model.train()
+            train_loss = 0.0
             for batch_X, batch_y in train_loader:
+                logging.debug(f"Training Batch_X type: {type(batch_X)}, shape: {batch_X.shape}")
+                logging.debug(f"Training Batch_Y type: {type(batch_y)}, shape: {batch_y.shape}")
                 optimizer.zero_grad()
+                logging.debug(f"Training input to model: type={type(batch_X)}, shape={batch_X.shape}")
+                try:
+                    outputs = model(batch_X)
+                    logging.debug(f"Training model output shape: {outputs.shape}")
+                except Exception as e:
+                    logging.error(f"Training model forward error: {str(e)}, batch_X_type={type(batch_X)}, batch_X_shape={batch_X.shape}")
+                    raise
                 loss = criterion(outputs, batch_y)
                 loss.backward()
                 optimizer.step()
+                train_loss += loss.item() * batch_X.size(0)
+            train_loss /= len(train_loader.dataset)
+            train_losses.append(train_loss)
             model.eval()
+            val_loss = 0.0
             with torch.no_grad():
                 for batch_X, batch_y in test_loader:
+                    logging.debug(f"Validation Batch_X type: {type(batch_X)}, shape: {batch_X.shape}")
+                    logging.debug(f"Validation Batch_Y type: {type(batch_y)}, shape: {batch_y.shape}")
+                    logging.debug(f"Validation input to model: type={type(batch_X)}, shape={batch_X.shape}")
+                    try:
+                        outputs = model(batch_X)
+                        logging.debug(f"Validation model output shape: {outputs.shape}")
+                    except Exception as e:
+                        logging.error(f"Validation model forward error: {str(e)}, batch_X_type={type(batch_X)}, batch_X_shape={batch_X.shape}")
+                        raise
+                    loss = criterion(outputs, batch_y)
+                    val_loss += loss.item() * batch_X.size(0)
+                val_loss /= len(test_loader.dataset)
+                val_losses.append(val_loss)
             if scheduler:
+                scheduler.step(val_loss)
+                current_lr = optimizer.param_groups[0]['lr']
+                logging.debug(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}, LR: {current_lr}")
+            else:
+                logging.debug(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")
         # ---------------- Evaluation ----------------
         model.eval()
         with torch.no_grad():
             X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
+            logging.debug(f"Eval model call: type={type(X_test_tensor)}, shape={X_test_tensor.shape}")
+            try:
+                y_pred_scaled = model(X_test_tensor).cpu().numpy()
+                logging.debug(f"Eval model output shape: {y_pred_scaled.shape}")
+            except Exception as e:
+                logging.error(f"Eval model forward error: {str(e)}, X_test_type={type(X_test_tensor)}, X_test_shape={X_test_tensor.shape}")
+                raise
         y_test_unscaled = target_scaler.inverse_transform(y_test.reshape(-1, horizon)).flatten()
         y_pred_unscaled = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, horizon)).flatten()
         # Latest prediction (use last window from original X)
         latest_data = torch.tensor(X[-1:], dtype=torch.float32).to(device)
         with torch.no_grad():
+            logging.debug(f"Latest prediction input: type={type(latest_data)}, shape={latest_data.shape}")
             latest_prediction_scaled = model(latest_data).cpu().numpy()
             latest_prediction = target_scaler.inverse_transform(
                 latest_prediction_scaled.reshape(-1, horizon)