"""
Train the LSTM Anomaly Detection Model
"""
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from pathlib import Path
import pickle
from anomaly_detector import AnomalyDetectionModel
def load_data(data_dir='data/processed'):
    """Load the preprocessed train/val splits and the feature-column list.

    Parameters
    ----------
    data_dir : directory containing ``train.csv``, ``val.csv`` and
        ``feature_columns.pkl`` (as written by the preprocessing step).

    Returns
    -------
    (train_df, val_df, feature_columns) : two DataFrames and the list of
        feature column names to feed the model.
    """
    base = Path(data_dir)
    # Load both CSV splits in one pass.
    frames = {split: pd.read_csv(base / f'{split}.csv') for split in ('train', 'val')}
    # The feature list was pickled by the preprocessing pipeline.
    with (base / 'feature_columns.pkl').open('rb') as handle:
        feature_columns = pickle.load(handle)
    return frames['train'], frames['val'], feature_columns
def prepare_data_by_vehicle(df, feature_columns, sequence_length=50):
    """Build fixed-length sliding-window sequences, grouped per vehicle.

    Windows never cross vehicle boundaries: each vehicle's rows are
    windowed independently, and each window of ``sequence_length``
    consecutive rows becomes one sample labeled with the ``anomaly``
    flag of the window's LAST row.

    Parameters
    ----------
    df : DataFrame with ``vehicle_id``, ``anomaly`` and the feature columns.
    feature_columns : list of column names used as model inputs.
    sequence_length : window length; vehicles with fewer rows contribute
        no samples.

    Returns
    -------
    (sequences, labels) : arrays of shape (N, sequence_length, F) and (N,).
        N may be 0, in which case correctly shaped empty arrays are
        returned.
    """
    all_sequences = []
    all_labels = []
    for vehicle_id in df['vehicle_id'].unique():
        vehicle_data = df[df['vehicle_id'] == vehicle_id]
        features = vehicle_data[feature_columns].values
        labels = vehicle_data['anomaly'].values
        # Slide the window over this vehicle only.
        for i in range(len(features) - sequence_length + 1):
            all_sequences.append(features[i:i + sequence_length])
            all_labels.append(labels[i + sequence_length - 1])
    if not all_sequences:
        # Bug fix: np.array([]) would be 1-D (shape (0,)) instead of the
        # documented (N, L, F) contract, breaking downstream tensor
        # construction and making labels.mean() NaN. Return properly
        # shaped empties instead.
        return (np.empty((0, sequence_length, len(feature_columns))),
                np.empty((0,)))
    return np.array(all_sequences), np.array(all_labels)
def train_model(epochs=20, batch_size=32, sequence_length=50,
                save_path='src/models/best_anomaly_detector.pth'):
    """Train the LSTM anomaly detector and checkpoint the best epoch.

    Parameters
    ----------
    epochs : number of training epochs.
    batch_size : mini-batch size for both data loaders.
    sequence_length : sliding-window length fed to the LSTM.
    save_path : file path where the best (lowest validation loss)
        weights are written. New parameter with the previously
        hard-coded value as default, so existing callers are unaffected.

    Returns
    -------
    The trained AnomalyDetectionModel. The in-memory model holds the
    last epoch's weights; the best epoch's weights are on disk at
    ``save_path``.
    """
    print("="*60)
    print("TRAINING ANOMALY DETECTION MODEL")
    print("="*60)

    # Load preprocessed splits and feature list.
    print("\nLoading data...")
    train_df, val_df, feature_columns = load_data()
    print(f"β Loaded train: {len(train_df)} records, val: {len(val_df)} records")
    print(f"β Features: {len(feature_columns)}")

    # Windowed sequences, grouped per vehicle so windows never span vehicles.
    print("\nPreparing sequences...")
    X_train, y_train = prepare_data_by_vehicle(train_df, feature_columns, sequence_length)
    X_val, y_val = prepare_data_by_vehicle(val_df, feature_columns, sequence_length)
    print(f"β Train sequences: {X_train.shape}")
    print(f"β Val sequences: {X_val.shape}")
    print(f"β Train anomaly rate: {y_train.mean():.2%}")
    print(f"β Val anomaly rate: {y_val.mean():.2%}")

    # Data loaders; only the training set is shuffled.
    train_dataset = TensorDataset(
        torch.FloatTensor(X_train),
        torch.FloatTensor(y_train)
    )
    val_dataset = TensorDataset(
        torch.FloatTensor(X_val),
        torch.FloatTensor(y_val)
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model input width is the feature count.
    input_size = len(feature_columns)
    model = AnomalyDetectionModel(input_size, sequence_length)

    # Fix: create the checkpoint directory up front — model.save() would
    # otherwise fail on the first improved epoch if it doesn't exist.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    # Training loop: checkpoint whenever validation loss improves.
    print(f"\nTraining for {epochs} epochs...")
    print("-"*60)
    best_val_loss = float('inf')
    for epoch in range(epochs):
        train_loss = model.train_epoch(train_loader)
        val_loss, val_acc = model.evaluate(val_loader)
        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, "
              f"Val Acc: {val_acc:.4f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model.save(save_path)
    print("-"*60)
    print(f"\nβ Training complete! Best val loss: {best_val_loss:.4f}")
    print("="*60)
    return model
# Script entry point: run a full training session with the defaults
# (20 epochs, batch size 32, 50-step windows).
if __name__ == '__main__':
    model = train_model(epochs=20, batch_size=32, sequence_length=50)