Spaces:
Sleeping
Sleeping
Prepare project for Hugging Face Space deployment - Add app.py with Gradio interface - Update requirements.txt with torch dependencies - Configure LFS for large files (models, data) - Update README with comprehensive documentation
d2173d1
| """ | |
| Train the LSTM Anomaly Detection Model | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import torch | |
| from torch.utils.data import TensorDataset, DataLoader | |
| from pathlib import Path | |
| import pickle | |
| from anomaly_detector import AnomalyDetectionModel | |
def load_data(data_dir='data/processed'):
    """Read the preprocessed training artefacts from ``data_dir``.

    The directory must contain ``train.csv``, ``val.csv`` and
    ``feature_columns.pkl``.

    Args:
        data_dir: Directory holding the three files above.

    Returns:
        A ``(train_df, val_df, feature_columns)`` tuple: the two CSV files as
        DataFrames plus whatever object was pickled into
        ``feature_columns.pkl``.
    """
    root = Path(data_dir)

    # CSVs are read first, then the pickle, in that order.
    train_df, val_df = (
        pd.read_csv(root / csv_name) for csv_name in ('train.csv', 'val.csv')
    )

    # NOTE(review): unpickling runs arbitrary code; only point this at
    # feature_columns.pkl files produced by a trusted preprocessing step.
    with (root / 'feature_columns.pkl').open('rb') as handle:
        feature_columns = pickle.load(handle)

    return train_df, val_df, feature_columns
def prepare_data_by_vehicle(df, feature_columns, sequence_length=50):
    """Build sliding-window sequences separately for every vehicle.

    Rows are grouped by the ``vehicle_id`` column so that no window ever
    spans two vehicles. Within a vehicle, every run of ``sequence_length``
    consecutive rows (in the order they appear in ``df``) becomes one
    sequence, labelled with the ``anomaly`` value of its last row.

    Args:
        df: DataFrame containing ``vehicle_id``, ``anomaly`` and every column
            named in ``feature_columns``.
        feature_columns: Column names used as model inputs.
        sequence_length: Number of consecutive rows per window (>= 1).

    Returns:
        ``(sequences, labels)`` where ``sequences`` has shape
        ``(n_windows, sequence_length, n_features)`` and ``labels`` has shape
        ``(n_windows,)``. Vehicles with fewer than ``sequence_length`` rows
        contribute nothing; if no vehicle qualifies, correctly shaped empty
        arrays are returned.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be >= 1, got {sequence_length}"
        )

    sequence_blocks = []
    label_blocks = []

    # One groupby pass instead of re-filtering the whole frame per vehicle.
    # sort=False keeps vehicles in first-appearance order.
    for _, vehicle_data in df.groupby('vehicle_id', sort=False):
        if len(vehicle_data) < sequence_length:
            continue  # too short to yield a single window

        features = vehicle_data[feature_columns].values
        labels = vehicle_data['anomaly'].values

        # sliding_window_view appends the window axis last, giving
        # (n_windows, n_features, sequence_length); move it next to the
        # window index to get (n_windows, sequence_length, n_features).
        windows = np.lib.stride_tricks.sliding_window_view(
            features, sequence_length, axis=0
        )
        sequence_blocks.append(np.moveaxis(windows, -1, 1))

        # The label of a window is the label of its final row.
        label_blocks.append(labels[sequence_length - 1:])

    if not sequence_blocks:
        # Keep rank and dtype so downstream tensor code sees a consistent
        # shape even when nothing could be built.
        empty_features = df.iloc[:0][feature_columns].values
        empty_sequences = np.empty(
            (0, sequence_length) + empty_features.shape[1:],
            dtype=empty_features.dtype,
        )
        return empty_sequences, df['anomaly'].values[:0]

    return np.concatenate(sequence_blocks), np.concatenate(label_blocks)
def _make_loader(sequences, labels, batch_size, shuffle):
    """Wrap numpy sequences/labels in a float32 ``DataLoader``."""
    dataset = TensorDataset(
        torch.as_tensor(sequences, dtype=torch.float32),
        torch.as_tensor(labels, dtype=torch.float32),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def train_model(epochs=20, batch_size=32, sequence_length=50,
                data_dir='data/processed',
                model_path='src/models/best_anomaly_detector.pth'):
    """Train the anomaly detection model and checkpoint the best epoch.

    Loads the preprocessed train/val data, builds per-vehicle sliding-window
    sequences, then runs ``epochs`` rounds of ``model.train_epoch`` followed
    by ``model.evaluate``. Whenever the validation loss improves,
    ``model.save(model_path)`` is called.

    Args:
        epochs: Number of training epochs.
        batch_size: Batch size for both data loaders.
        sequence_length: Window length passed to the sequence builder and to
            the model constructor.
        data_dir: Directory passed to ``load_data``.
        model_path: Destination passed to ``model.save``; its parent
            directory is created if missing.

    Returns:
        The model object as it stands after the final epoch. This is not
        necessarily the best-scoring state; that one is only saved through
        ``model.save``.

    Raises:
        ValueError: If no training or validation sequence could be built
            (every vehicle has fewer than ``sequence_length`` rows).
    """
    banner = "=" * 60
    rule = "-" * 60

    print(banner)
    print("TRAINING ANOMALY DETECTION MODEL")
    print(banner)

    # Load data
    print("\nLoading data...")
    train_df, val_df, feature_columns = load_data(data_dir)
    print(f"✓ Loaded train: {len(train_df)} records, val: {len(val_df)} records")
    print(f"✓ Features: {len(feature_columns)}")

    # Prepare sequences
    print("\nPreparing sequences...")
    X_train, y_train = prepare_data_by_vehicle(train_df, feature_columns, sequence_length)
    X_val, y_val = prepare_data_by_vehicle(val_df, feature_columns, sequence_length)
    print(f"✓ Train sequences: {X_train.shape}")
    print(f"✓ Val sequences: {X_val.shape}")

    # Fail early with a clear message instead of NaN statistics and an
    # opaque error from the training loop.
    for split_name, split_sequences in (("train", X_train), ("val", X_val)):
        if len(split_sequences) == 0:
            raise ValueError(
                f"No {split_name} sequences could be built: every vehicle has "
                f"fewer than sequence_length={sequence_length} rows"
            )

    print(f"✓ Train anomaly rate: {y_train.mean():.2%}")
    print(f"✓ Val anomaly rate: {y_val.mean():.2%}")

    # Only the training loader is shuffled; validation keeps its order.
    train_loader = _make_loader(X_train, y_train, batch_size, shuffle=True)
    val_loader = _make_loader(X_val, y_val, batch_size, shuffle=False)

    # Initialize model
    input_size = len(feature_columns)
    model = AnomalyDetectionModel(input_size, sequence_length)

    # Ensure the checkpoint directory exists before the first save.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)

    # Training loop
    print(f"\nTraining for {epochs} epochs...")
    print(rule)

    best_val_loss = float('inf')
    for epoch in range(epochs):
        train_loss = model.train_epoch(train_loader)
        val_loss, val_acc = model.evaluate(val_loader)

        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, "
              f"Val Acc: {val_acc:.4f}")

        # Save best model (lowest validation loss seen so far)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model.save(model_path)

    print(rule)
    print(f"\n✓ Training complete! Best val loss: {best_val_loss:.4f}")
    print(banner)

    return model
if __name__ == '__main__':
    # Script entry point: run one training session with explicit
    # hyperparameters and keep the trained model at module level.
    model = train_model(
        epochs=20,
        batch_size=32,
        sequence_length=50,
    )