File size: 5,258 Bytes
0162f5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Feature Engineering for Schedule ML Model
Extract features from schedule data for training
"""
import numpy as np
from typing import Dict, List, Tuple
from datetime import datetime
from .config import CONFIG


class FeatureExtractor:
    """Turn raw schedule dictionaries into model features and a quality target.

    ``extract_from_schedule`` and ``calculate_target`` are stateless static
    methods operating on one schedule dict each. ``prepare_dataset`` applies
    both to a list of schedules and stacks the results into NumPy arrays whose
    column order follows ``CONFIG.FEATURES``.
    """

    @staticmethod
    def extract_from_schedule(schedule: Dict) -> Dict[str, float]:
        """Extract a flat feature mapping from a single schedule.

        Args:
            schedule: Schedule dict. The keys read are ``"trainsets"`` (a list
                of per-train dicts) and ``"generated_at"`` (ISO-8601 string).
                Both are optional.

        Returns:
            Dict with the keys ``num_trains``, ``num_available``,
            ``maintenance_count``, ``avg_readiness_score``,
            ``min_readiness_score``, ``total_mileage``, ``avg_mileage``,
            ``mileage_variance``, ``certificate_expiry_count``,
            ``branding_priority_sum``, ``time_of_day`` and ``day_of_week``.
        """
        features: Dict[str, float] = {}

        # Basic counts
        trainsets = schedule.get("trainsets", [])
        features["num_trains"] = len(trainsets)

        # Status counts: tally every status, then pick the ones of interest.
        status_counts: Dict[str, int] = {}
        for train in trainsets:
            status = train.get("status", "UNKNOWN")
            status_counts[status] = status_counts.get(status, 0) + 1

        features["num_available"] = (
            status_counts.get("REVENUE_SERVICE", 0)
            + status_counts.get("STANDBY", 0)
        )
        features["maintenance_count"] = status_counts.get("MAINTENANCE", 0)

        # Readiness scores (a train without a score contributes 0.0)
        readiness_scores = [t.get("readiness_score", 0.0) for t in trainsets]
        if readiness_scores:
            features["avg_readiness_score"] = np.mean(readiness_scores)
            features["min_readiness_score"] = np.min(readiness_scores)
        else:
            features["avg_readiness_score"] = 0.0
            features["min_readiness_score"] = 0.0

        # Mileage statistics (np.var is the population variance, ddof=0)
        mileages = [t.get("cumulative_km", 0) for t in trainsets]
        if mileages:
            features["total_mileage"] = sum(mileages)
            features["avg_mileage"] = np.mean(mileages)
            features["mileage_variance"] = np.var(mileages)
        else:
            features["total_mileage"] = 0
            features["avg_mileage"] = 0
            features["mileage_variance"] = 0

        # Certificate expiry: count certificates flagged as expired/expiring.
        problem_statuses = ("EXPIRED", "EXPIRING_SOON")
        certificate_issues = 0
        for train in trainsets:
            certs = train.get("fitness_certificates", {})
            # Guard against null / non-dict values the same way ``branding``
            # is guarded below, so one malformed train does not raise.
            if not isinstance(certs, dict):
                continue
            for cert_data in certs.values():
                if not isinstance(cert_data, dict):
                    continue
                if cert_data.get("status", "VALID") in problem_statuses:
                    certificate_issues += 1
        features["certificate_expiry_count"] = certificate_issues

        # Branding priority: sum of per-train priority weights.
        priority_map = {"CRITICAL": 4, "HIGH": 3, "MEDIUM": 2, "LOW": 1, "NONE": 0}
        branding_score = 0
        for train in trainsets:
            branding = train.get("branding", {})
            if isinstance(branding, dict):
                priority = branding.get("exposure_priority", "NONE")
                branding_score += priority_map.get(priority, 0)
        features["branding_priority_sum"] = branding_score

        # Time features. Only the failures that a missing, non-string or
        # malformed timestamp can produce are caught; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        raw_timestamp = schedule.get("generated_at", "")
        try:
            generated_at = datetime.fromisoformat(
                raw_timestamp.replace("+05:30", "")
            )
        except (AttributeError, TypeError, ValueError):
            # Fallback defaults when the timestamp is unusable.
            features["time_of_day"] = 12
            features["day_of_week"] = 0
        else:
            features["time_of_day"] = generated_at.hour
            features["day_of_week"] = generated_at.weekday()

        return features

    @staticmethod
    def calculate_target(schedule: Dict) -> float:
        """Calculate the quality score used as the target variable.

        The score is a weighted sum of five components read from
        ``schedule["optimization_metrics"]`` and ``schedule["fleet_summary"]``;
        missing values fall back to the defaults shown below.

        Args:
            schedule: Schedule dict.

        Returns:
            The summed score, capped at 100.0.
        """
        metrics = schedule.get("optimization_metrics", {})
        fleet_summary = schedule.get("fleet_summary", {})

        # Component 1: Readiness (weight 30)
        readiness_points = metrics.get("avg_readiness_score", 0.0) * 30

        # Component 2: Availability (percentage scaled to weight 25)
        availability = fleet_summary.get("availability_percent", 0.0)
        availability_points = (availability / 100) * 25

        # Component 3: Mileage balance (weight 20); a coefficient above 1
        # contributes nothing rather than a negative amount.
        mileage_var = metrics.get("mileage_variance_coefficient", 1.0)
        mileage_points = max(0, (1 - mileage_var) * 20)

        # Component 4: Branding compliance (weight 15)
        branding_points = metrics.get("branding_sla_compliance", 0.0) * 15

        # Component 5: No violations (10 points, minus 2 per violation,
        # floored at 0)
        violations = metrics.get("fitness_expiry_violations", 0)
        violation_points = max(0, 10 - violations * 2)

        # Summed in the original component order, starting from 0.0.
        score = 0.0
        score += readiness_points
        score += availability_points
        score += mileage_points
        score += branding_points
        score += violation_points

        return min(100.0, score)

    def prepare_dataset(self, schedules: List[Dict]) -> Tuple[np.ndarray, np.ndarray]:
        """Prepare the feature matrix and target vector for a list of schedules.

        Each item may be the schedule dict itself or a wrapper holding it under
        the ``"schedule"`` key. A schedule whose feature/target extraction
        raises is reported on stdout and skipped.

        Args:
            schedules: Schedule dicts (or wrappers) to convert.

        Returns:
            ``(X, y)`` where ``X`` has shape ``(n_samples, n_features)`` with
            columns ordered as ``CONFIG.FEATURES`` and ``y`` has shape
            ``(n_samples,)``. With no usable schedules ``X`` is still 2-D,
            i.e. ``(0, n_features)``.
        """
        # Resolve the column order once; also needed to shape an empty matrix.
        feature_names = list(CONFIG.FEATURES)  # type: ignore

        X = []
        y = []

        for schedule_data in schedules:
            schedule = schedule_data.get("schedule", schedule_data)

            try:
                features = self.extract_from_schedule(schedule)
                target = self.calculate_target(schedule)

                # Convert to feature vector in correct order
                feature_vector = [features.get(name, 0.0) for name in feature_names]

                X.append(feature_vector)
                y.append(target)
            except Exception as e:
                print(f"Error extracting features: {e}")
                continue

        # Explicit shape keeps X two-dimensional even when every schedule was
        # skipped (np.array([]) alone would be 1-D with shape (0,)).
        feature_matrix = np.array(X).reshape(len(X), len(feature_names))
        return feature_matrix, np.array(y)