File size: 3,697 Bytes
d2173d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Train the LSTM Anomaly Detection Model
"""
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from pathlib import Path
import pickle
from anomaly_detector import AnomalyDetectionModel


def load_data(data_dir='data/processed'):
    """Read the preprocessed training artefacts from ``data_dir``.

    Args:
        data_dir: Directory containing ``train.csv``, ``val.csv`` and
            ``feature_columns.pkl``.

    Returns:
        A ``(train_df, val_df, feature_columns)`` tuple: the two CSV files
        as DataFrames and the unpickled ``feature_columns.pkl`` object.
    """
    root = Path(data_dir)

    # Train is read before val, so a missing train.csv is reported first.
    train_df, val_df = (
        pd.read_csv(root / csv_name) for csv_name in ('train.csv', 'val.csv')
    )

    # NOTE(review): pickle.load can execute arbitrary code; this assumes the
    # file was written by the project's own preprocessing step -- confirm.
    with (root / 'feature_columns.pkl').open('rb') as handle:
        feature_columns = pickle.load(handle)

    return train_df, val_df, feature_columns


def prepare_data_by_vehicle(df, feature_columns, sequence_length=50):
    """Build sliding-window sequences separately for each vehicle.

    Rows are grouped by ``vehicle_id`` so that no window spans two vehicles.
    Each window covers ``sequence_length`` consecutive rows of one vehicle
    (in the order they appear in ``df``) and is labelled with the
    ``anomaly`` value of the window's last row.

    Args:
        df: DataFrame with a ``vehicle_id`` column, an ``anomaly`` column and
            every column named in ``feature_columns``.
        feature_columns: Column names used as model inputs.
        sequence_length: Number of consecutive rows per window (>= 1).

    Returns:
        ``(sequences, labels)`` as NumPy arrays. ``sequences`` has shape
        ``(n_windows, sequence_length, len(feature_columns))`` and ``labels``
        has shape ``(n_windows,)``. When no vehicle has at least
        ``sequence_length`` rows, correctly-shaped empty arrays are returned.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be >= 1, got {sequence_length}"
        )

    all_sequences = []
    all_labels = []

    # One grouping pass instead of re-filtering the whole frame per vehicle.
    # sort=False keeps vehicles in first-appearance order, matching
    # iteration over df['vehicle_id'].unique().
    for _, vehicle_data in df.groupby('vehicle_id', sort=False):
        features = vehicle_data[feature_columns].values
        labels = vehicle_data['anomaly'].values

        # Vehicles with fewer than sequence_length rows yield no windows.
        for start in range(len(features) - sequence_length + 1):
            end = start + sequence_length
            all_sequences.append(features[start:end])
            all_labels.append(labels[end - 1])

    if not all_sequences:
        # np.array([]) would be rank-1; keep the 3-D layout callers expect.
        empty_sequences = np.empty((0, sequence_length, len(feature_columns)))
        return empty_sequences, np.empty((0,))

    return np.array(all_sequences), np.array(all_labels)


def train_model(epochs=20, batch_size=32, sequence_length=50,
                data_dir='data/processed',
                model_path='src/models/best_anomaly_detector.pth'):
    """Train the anomaly detection model and checkpoint the best epoch.

    Loads the preprocessed train/validation data, turns it into per-vehicle
    sliding-window sequences, and runs ``epochs`` rounds of
    ``model.train_epoch`` / ``model.evaluate``. Whenever the validation loss
    improves, the model is saved to ``model_path``.

    Args:
        epochs: Number of training epochs.
        batch_size: Batch size for both data loaders.
        sequence_length: Number of consecutive rows per input sequence.
        data_dir: Directory passed to ``load_data``.
        model_path: Destination of the best checkpoint. Its parent directory
            is created if it does not exist.

    Returns:
        The ``AnomalyDetectionModel`` instance after the final epoch (not
        necessarily the checkpointed best one).

    Raises:
        ValueError: If the train or validation split produces no sequences
            for the given ``sequence_length``.
    """
    banner = "=" * 60
    rule = "-" * 60

    print(banner)
    print("TRAINING ANOMALY DETECTION MODEL")
    print(banner)

    # Load data
    print("\nLoading data...")
    train_df, val_df, feature_columns = load_data(data_dir)
    print(f"✓ Loaded train: {len(train_df)} records, val: {len(val_df)} records")
    print(f"✓ Features: {len(feature_columns)}")

    # Prepare sequences
    print("\nPreparing sequences...")
    X_train, y_train = prepare_data_by_vehicle(train_df, feature_columns, sequence_length)
    X_val, y_val = prepare_data_by_vehicle(val_df, feature_columns, sequence_length)

    # Fail early with a clear message rather than training on empty loaders.
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError(
            f"No sequences of length {sequence_length} could be built "
            f"(train: {len(X_train)}, val: {len(X_val)}); "
            "each vehicle needs at least sequence_length rows."
        )

    print(f"✓ Train sequences: {X_train.shape}")
    print(f"✓ Val sequences: {X_val.shape}")
    print(f"✓ Train anomaly rate: {y_train.mean():.2%}")
    print(f"✓ Val anomaly rate: {y_val.mean():.2%}")

    # Create data loaders; as_tensor replaces the legacy FloatTensor
    # constructor and yields the same float32 tensors.
    train_dataset = TensorDataset(
        torch.as_tensor(X_train, dtype=torch.float32),
        torch.as_tensor(y_train, dtype=torch.float32),
    )
    val_dataset = TensorDataset(
        torch.as_tensor(X_val, dtype=torch.float32),
        torch.as_tensor(y_val, dtype=torch.float32),
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    input_size = len(feature_columns)
    model = AnomalyDetectionModel(input_size, sequence_length)

    # Create the checkpoint directory up front so the first save cannot fail
    # on a missing folder after an epoch of work has already been done.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)

    # Training loop
    print(f"\nTraining for {epochs} epochs...")
    print(rule)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        train_loss = model.train_epoch(train_loader)
        val_loss, val_acc = model.evaluate(val_loader)

        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, "
              f"Val Acc: {val_acc:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model.save(model_path)

    print(rule)
    print(f"\n✓ Training complete! Best val loss: {best_val_loss:.4f}")
    print(banner)

    return model


if __name__ == '__main__':
    # Script entry point: run a training session with explicit settings.
    model = train_model(
        epochs=20,
        batch_size=32,
        sequence_length=50,
    )