# dbehavior_labeler.py
# Load UL models and predict driving style
import os, logging, pickle
import warnings
import joblib
import numpy as np
import pandas as pd
from scipy.signal import medfilt
# Import download functionality
import sys
sys.path.append(os.path.dirname(__file__))
from dbehavior_download import download_latest_models
log = logging.getLogger("dbehavior-labeler")
log.setLevel(logging.INFO)
# Suppress version compatibility warnings in production
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.base")
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost.core")
MODEL_DIR = os.getenv("MODEL_DIR", "/app/models/ul")
LE_PATH = os.path.join(MODEL_DIR, "label_encoder_ul.pkl")
SC_PATH = os.path.join(MODEL_DIR, "scaler_ul.pkl")
XGB_PATH = os.path.join(MODEL_DIR, "xgb_drivestyle_ul.pkl")
# Columns that must never be fed to the model as features: labels, session
# metadata, and post-hoc efficiency/route fields. "Fuel Efficiciency" keeps
# the upstream data source's spelling — presumably matches the raw column
# name; TODO confirm.
# NOTE(review): SAFE_DROP is not referenced elsewhere in this module
# (engineer_features defines its own NON_SENSOR_COLS) — possibly kept for
# external importers; verify before removing.
SAFE_DROP = {
    "timestamp","driving_style","ul_drivestyle","gt_drivestyle",
    "session_id","imported_at","record_index",
    "Fuel Efficiciency", "Fuel Efficiency (L/100KM)", "Distance", "Fuel Used", "Route",
    "Fuel consumed", "Fuel consumed (total)", "RoadType", "Road Style", "Road Type"
}
def infer_base_interval_seconds(ts: pd.Series) -> float:
    """Estimate the base sampling interval in seconds from timestamp gaps.

    Trims the extreme 5% of gaps at each end, snaps the remainder to a
    10 ms grid, and takes the most frequent value (median as fallback).
    Returns 1.0 when no usable gaps exist; never returns less than 1 ms.
    """
    if ts.size < 2:
        return 1.0
    gaps = ts.sort_values().diff().dropna().dt.total_seconds()
    gaps = gaps[gaps > 0]
    if gaps.empty:
        return 1.0
    lo, hi = gaps.quantile([0.05, 0.95])
    trimmed = gaps[(gaps >= lo) & (gaps <= hi)]
    # Quantize to 10 ms so nearly-equal gaps collapse into one mode bucket.
    snapped = (trimmed / 0.01).round() * 0.01
    modes = snapped.mode()
    estimate = float(modes.iloc[0]) if not modes.empty else float(trimmed.median())
    return max(estimate, 1e-3)
def rows_for(seconds, med_dt):
    """Convert a duration in seconds into a rolling-window row count.

    Uses *med_dt* (clamped to at least 1 ms) as the per-row period and
    enforces a minimum window of 3 rows so rolling stats stay meaningful.
    """
    period = med_dt if med_dt > 1e-3 else 1e-3
    rows = int(round(seconds / period))
    return rows if rows > 3 else 3
def safe_numeric(df: pd.DataFrame, skip=frozenset()) -> pd.DataFrame:
    """Return a copy of *df* with every column outside *skip* coerced to numeric.

    Unparseable values become NaN (``errors="coerce"``); the input frame is
    never mutated.

    Args:
        df: Input frame; may contain mixed string/numeric columns.
        skip: Collection of column names to leave untouched (metadata, labels).

    Returns:
        A new DataFrame with numeric-coerced columns.
    """
    # frozenset() default replaces the original mutable-default `skip=set()`
    # anti-pattern; behavior is identical since `skip` is only ever read.
    out = df.copy()
    for col in out.columns:
        if col in skip:
            continue
        out[col] = pd.to_numeric(out[col], errors="coerce")
    return out
def engineer_features(df):
    """
    Recreate the exact feature engineering pipeline used during training.

    Steps: sentinel cleanup -> timestamp parsing -> numeric coercion and
    median fill (sensor columns only) -> sampling-period estimation ->
    kinematics (ACCEL/JERK/THROTTLE_D) -> multi-horizon rolling stats ->
    ratio features -> quantile-based event rates -> magnitude features.

    Args:
        df: Raw telemetry DataFrame; may contain a "timestamp" column and
            OBD-style sensor columns (SPEED, RPM, ENGINE_LOAD, THROTTLE_POS,
            MAF, ...). Missing signals are tolerated.

    Returns:
        A new DataFrame with the original columns plus all derived features,
        with no remaining NaN values.
    """
    log.info("🔧 Starting feature engineering...")
    fe = df.copy()
    # Clean sentinel values — presumably device error codes written into
    # sensor channels (TODO confirm against the logger's spec). Note this
    # replaces them across ALL columns, not just sensor ones.
    log.info("🧹 Cleaning sentinel values...")
    SENTINELS = {-22, -40, 255}
    fe.replace(list(SENTINELS), np.nan, inplace=True)
    # Ensure timestamp is proper datetime (coerce failures to NaT, force UTC)
    if "timestamp" in fe.columns:
        fe["timestamp"] = pd.to_datetime(fe["timestamp"], errors="coerce", utc=True)
    # Define non-sensor columns that should be excluded from feature engineering
    # These are metadata/calculated fields that shouldn't be used as features
    # (set literal tolerates the duplicated entries at the end)
    NON_SENSOR_COLS = {
        "timestamp", "driving_style", "ul_drivestyle", "gt_drivestyle",
        "session_id", "imported_at", "record_index",
        "Fuel Efficiciency", "Fuel Efficiency (L/100KM)", "Distance", "Fuel Used", "Route",
        "Fuel consumed", "Fuel consumed (total)", "RoadType", "Road Style", "Road Type",
        "Fuel consumed (total)", "Road Type", "Road Style"
    }
    # Convert all numeric columns safely, excluding non-sensor columns
    fe = safe_numeric(fe, skip=NON_SENSOR_COLS)
    # Fill NaN values with median for sensor columns only
    sensor_cols = [c for c in fe.select_dtypes(include=[np.number]).columns
                   if c not in NON_SENSOR_COLS]
    fe[sensor_cols] = fe[sensor_cols].fillna(fe[sensor_cols].median())
    # Estimate sampling period from timestamps; assume 1 Hz when absent
    if "timestamp" in fe.columns:
        base_sec = infer_base_interval_seconds(fe["timestamp"])
    else:
        base_sec = 1.0
    med_dt = base_sec
    # Define window sizes (must match training)
    W1 = rows_for(1.0, med_dt)
    W2 = rows_for(2.0, med_dt)
    W5 = rows_for(5.0, med_dt)
    W8 = rows_for(8.0, med_dt)
    log.info(f"Using window sizes: W1={W1}, W2={W2}, W5={W5}, W8={W8}")
    # Base signals - use available columns
    available_base = [c for c in ["SPEED", "RPM", "ENGINE_LOAD", "ABSOLUTE_LOAD", "THROTTLE_POS", "MAF"]
                      if c in fe.columns]
    log.info(f"Available base signals: {available_base}")
    # Kinematics: first/second difference of SPEED scaled by the sample period
    if "SPEED" in fe.columns:
        fe["ACCEL"] = fe["SPEED"].diff() / max(med_dt, 1e-3)
        fe["JERK"] = fe["ACCEL"].diff() / max(med_dt, 1e-3)
    else:
        fe["ACCEL"] = 0.0
        fe["JERK"] = 0.0
    # Throttle change rate
    if "THROTTLE_POS" in fe.columns:
        fe["THROTTLE_D"] = fe["THROTTLE_POS"].diff() / max(med_dt, 1e-3)
    else:
        fe["THROTTLE_D"] = 0.0
    # Rolling stats at multiple horizons (CRITICAL - this creates most features)
    def add_roll(col):
        # Adds centered rolling mean/std/min/max of `col` at each window size.
        if col not in fe.columns:
            return
        # Safety check to prevent infinite recursion - only skip if it already has rolling suffix
        if any(col.endswith(suffix) for suffix in ['_mean_w1', '_std_w1', '_mean_w2', '_std_w2', '_mean_w5', '_std_w5', '_mean_w8', '_std_w8', '_min_w1', '_max_w1']):
            return  # Skip if already a rolling feature
        for w, tag in [(W1, "w1"), (W2, "w2"), (W5, "w5"), (W8, "w8")]:
            fe[f"{col}_mean_{tag}"] = fe[col].rolling(w, min_periods=1, center=True).mean()
            fe[f"{col}_std_{tag}"] = fe[col].rolling(w, min_periods=1, center=True).std()
            fe[f"{col}_min_{tag}"] = fe[col].rolling(w, min_periods=1, center=True).min()
            fe[f"{col}_max_{tag}"] = fe[col].rolling(w, min_periods=1, center=True).max()
    # Apply rolling features to all base signals and derived signals
    all_signals = available_base + ["ACCEL", "JERK", "THROTTLE_D"]
    for col in all_signals:
        add_roll(col)
    # Additional derived features that might have been used in training
    # (zero denominators become NaN instead of inf via replace(0, nan))
    if {"MAF", "RPM"}.issubset(fe.columns):
        fe["AIRFLOW_PER_RPM"] = fe["MAF"] / fe["RPM"].replace(0, np.nan)
        add_roll("AIRFLOW_PER_RPM")
    if {"ENGINE_LOAD", "THROTTLE_POS"}.issubset(fe.columns):
        fe["LOAD_THROTTLE_RATIO"] = fe["ENGINE_LOAD"] / fe["THROTTLE_POS"].replace(0, np.nan)
        add_roll("LOAD_THROTTLE_RATIO")
    # Event rates based on quantiles
    def q(x, p, default):
        # Quantile of the ABSOLUTE value of column x, or `default` if missing.
        return fe[x].abs().quantile(p) if x in fe.columns else default
    if "ACCEL" in fe.columns:
        # NOTE(review): a_neg uses quantile of |ACCEL|, so it is non-negative;
        # the `ACCEL < a_neg` gate below captures values under that small
        # positive threshold rather than strong decelerations — presumably
        # matches training; confirm against the training notebook.
        a_pos = q("ACCEL", 0.85, 0.5)
        a_neg = q("ACCEL", 0.15, -0.5)
        evt = pd.DataFrame(index=fe.index)
        evt["pos_accel_rate_w5"] = (fe["ACCEL"] > a_pos).rolling(W5, min_periods=1, center=True).mean()
        evt["neg_accel_rate_w5"] = (fe["ACCEL"] < a_neg).rolling(W5, min_periods=1, center=True).mean()
        fe = pd.concat([fe, evt], axis=1)
    if "THROTTLE_D" in fe.columns:
        thr_q = q("THROTTLE_D", 0.85, 1.0)
        fe["thr_change_rate_w5"] = (fe["THROTTLE_D"].abs() > thr_q).rolling(W5, min_periods=1, center=True).mean()
    # Magnitude features
    if "ACCEL" in fe.columns:
        fe["ACCEL_MAG"] = fe["ACCEL"].abs()
        add_roll("ACCEL_MAG")
    # Fill any remaining NaN values (backfill, then forward-fill, then zero)
    fe = fe.bfill().ffill().fillna(0)
    log.info(f"Engineered features shape: {fe.shape}")
    log.info(f"Total features created: {len(fe.columns)}")
    return fe
def detect_idle_episodes(fe):
    """
    Detect idle episodes using the same logic as during training.

    A row is idle when ALL gates pass: low speed, low throttle, low
    acceleration variability, low RPM variability, and low airflow — each
    gate thresholded at a low quantile of its own signal. The boolean mask
    is then smoothed with a kernel-5 median filter to remove 1-2 sample
    blips.

    Args:
        fe: Engineered feature frame (output of engineer_features).

    Returns:
        Boolean numpy array, True where the row is classified as idle.
    """
    def pick(*names, default=None):
        # First available column among `names`; constant fallback otherwise.
        for n in names:
            if n in fe.columns:
                return fe[n]
        return pd.Series(default, index=fe.index, dtype=float)
    # Use rolling means for stability
    speed_mean = pick("SPEED_mean_w5", "SPEED", default=0.0)
    thr_mean = pick("THROTTLE_POS_mean_w5", "THROTTLE_POS", default=0.0)
    acc_std = pick("ACCEL_std_w5", "ACCEL_std_w2", "ACCEL_std_w1", default=0.0)
    rpm_std = pick("RPM_std_w5", "RPM", default=0.0)
    if rpm_std.sum() > 0:  # Only apply rolling std if there's actual data
        # NOTE(review): when the raw RPM column was picked (no *_std_w5), this
        # converts it to a rolling std; when *_std_w5 was picked it becomes a
        # std-of-std — presumably intentional, matching training; confirm.
        rpm_std = rpm_std.rolling(5, min_periods=1).std()
    maf_mean = pick("MAF_mean_w5", "MAF", default=0.0)
    # Quantile-based gating: thresholds adapt to each trip's own distribution
    s_gate = speed_mean <= speed_mean.quantile(0.15)
    t_gate = thr_mean.fillna(0) <= thr_mean.quantile(0.20)
    a_gate = acc_std.fillna(0) <= acc_std.quantile(0.25)
    r_gate = rpm_std.fillna(0) <= rpm_std.quantile(0.25)
    m_gate = maf_mean.fillna(0) <= maf_mean.quantile(0.20)
    idle_mask = (s_gate & t_gate & a_gate & r_gate & m_gate)
    # Smooth the mask (median filter suppresses isolated flips)
    idle_mask = medfilt(idle_mask.astype(int), kernel_size=5).astype(bool)
    return idle_mask
def _load_any(path):
    """Deserialize a model artifact from *path*.

    Tries joblib first, falls back to raw pickle, then strips deprecated
    XGBoost label-encoder attributes so models trained with older XGBoost
    versions can predict under newer ones.

    SECURITY NOTE: joblib/pickle execute arbitrary code on load — only call
    this on trusted model files (here: artifacts fetched by
    download_latest_models into MODEL_DIR).

    Args:
        path: Filesystem path to a .pkl artifact.

    Returns:
        The deserialized object (label encoder, scaler, or classifier).
    """
    # Suppress version compatibility warnings for production
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
        warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
        warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")
        warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")
        try:
            model = joblib.load(path)
        except Exception:
            # joblib failed (e.g. plain-pickle artifact); retry with pickle
            with open(path, "rb") as f:
                model = pickle.load(f)
    # Fix XGBoost compatibility issues for older trained models
    if hasattr(model, 'get_booster'):  # This is an XGBoost model
        # Remove deprecated use_label_encoder attribute that causes issues in newer XGBoost versions
        if hasattr(model, '__dict__'):
            # Remove all deprecated attributes that cause issues
            deprecated_attrs = [
                'use_label_encoder', '_le', '_label_encoder',
                'use_label_encoder_', '_le_', '_label_encoder_'
            ]
            for attr in deprecated_attrs:
                model.__dict__.pop(attr, None)
        # Set use_label_encoder to False for newer XGBoost versions
        # (newer releases reject the param entirely, hence the broad catch)
        if hasattr(model, 'set_params'):
            try:
                model.set_params(use_label_encoder=False)
            except Exception:
                pass
    return model
class ULLabeler:
    """Singleton wrapper around the unsupervised driving-style model stack.

    Loads three artifacts (label encoder, scaler, XGBoost classifier) from
    MODEL_DIR, optionally refreshing them via download_latest_models, and
    exposes predict_df/predict_csv to label telemetry rows with a driving
    style, overriding predictions with "Idle" where idle episodes are
    detected.
    """
    # Cached singleton instance managed by ULLabeler.get()
    _instance = None

    def __init__(self, auto_download: bool = True):
        """Load model artifacts; raise FileNotFoundError if any is missing.

        Args:
            auto_download: When True, attempt to fetch the latest model
                version first; failures are logged and ignored (best-effort).
        """
        # Auto-download latest models if enabled
        if auto_download:
            log.info("🔄 Checking for latest model version...")
            try:
                download_latest_models()
            except Exception as e:
                log.warning(f"⚠️ Failed to download latest models: {e}")
        if not (os.path.exists(LE_PATH) and os.path.exists(SC_PATH) and os.path.exists(XGB_PATH)):
            raise FileNotFoundError("Model files not found. Ensure download.py ran successfully.")
        self.le = _load_any(LE_PATH)    # label encoder (class name <-> int)
        self.scal = _load_any(SC_PATH)  # feature scaler fitted at training
        self.clf = _load_any(XGB_PATH)  # XGBoost driving-style classifier
        # Additional XGBoost compatibility fixes
        self._fix_xgb_compatibility()
        # Try to discover expected feature names from scaler or model
        self.expected = None
        if hasattr(self.scal, "feature_names_in_"):
            self.expected = list(self.scal.feature_names_in_)
        elif hasattr(self.clf, "feature_names_in_"):
            self.expected = list(self.clf.feature_names_in_)
        log.info(f"ULLabeler ready | expected_features={len(self.expected) if self.expected else 'unknown'}")

    def _fix_xgb_compatibility(self):
        """Fix XGBoost compatibility issues with older trained models.

        Best-effort: every step is wrapped so a failure only logs a warning
        and never blocks loading.
        """
        try:
            # Check if this is an XGBoost classifier
            if hasattr(self.clf, 'get_booster'):
                # Remove deprecated attributes that cause issues in newer XGBoost versions
                deprecated_attrs = [
                    'use_label_encoder', '_le', '_label_encoder',
                    'use_label_encoder_', '_le_', '_label_encoder_'
                ]
                for attr in deprecated_attrs:
                    if hasattr(self.clf, attr):
                        try:
                            delattr(self.clf, attr)
                        except (AttributeError, TypeError):
                            pass
                # Set use_label_encoder to False for newer XGBoost versions
                if hasattr(self.clf, 'set_params'):
                    try:
                        self.clf.set_params(use_label_encoder=False)
                    except Exception:
                        pass
                # Ensure the model is properly configured for prediction
                if hasattr(self.clf, 'n_classes_') and self.clf.n_classes_ is None:
                    # Try to infer number of classes from the label encoder
                    if hasattr(self.le, 'classes_'):
                        self.clf.n_classes_ = len(self.le.classes_)
                # For newer XGBoost versions, ensure the model is properly initialized
                # NOTE(review): this branch assigns None to an attribute that is
                # already None — effectively a no-op, and _le was deleted above;
                # likely dead code, kept as-is for safety.
                if hasattr(self.clf, '_le') and self.clf._le is None:
                    self.clf._le = None
            log.info("XGBoost compatibility fixes applied successfully")
        except Exception as e:
            log.warning(f"XGBoost compatibility fix failed: {e}")

    @classmethod
    def get(cls, auto_download: bool = True):
        """Return the shared ULLabeler instance, creating it on first call.

        NOTE(review): not thread-safe — concurrent first calls may construct
        two instances; confirm the serving framework calls this serially.
        """
        if cls._instance is None:
            cls._instance = ULLabeler(auto_download=auto_download)
        return cls._instance

    def _prepare(self, df: pd.DataFrame):
        """
        Prepare features using the exact same pipeline as training.

        Args:
            df: Raw telemetry DataFrame.

        Returns:
            Tuple (Xs, engineered_df): scaled feature matrix (ndarray) and
            the full engineered DataFrame (used later for idle detection).
        """
        log.info("🔧 Starting feature engineering pipeline...")
        # Step 1: Engineer features to match training set
        engineered_df = engineer_features(df)
        # Step 2: Get the feature names the model expects
        try:
            # Try to get feature names from model
            training_columns = self.clf.get_booster().feature_names
            if training_columns is None:
                # Try to get from scaler
                if hasattr(self.scal, 'feature_names_in_'):
                    training_columns = self.scal.feature_names_in_.tolist()
                else:
                    raise ValueError("Cannot determine feature names")
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
        # should be `except Exception:` — left unchanged here.
        except:
            # Final fallback - use all numeric columns from engineered data
            training_columns = engineered_df.select_dtypes(include=[np.number]).columns.tolist()
        log.info(f"Model expects {len(training_columns)} features")
        # Step 3: Align features with what model expects
        missing_in_data = set(training_columns) - set(engineered_df.columns)
        extra_in_data = set(engineered_df.columns) - set(training_columns)
        log.info(f"Missing features in data: {len(missing_in_data)}")
        log.info(f"Extra features in data: {len(extra_in_data)}")
        # Define non-sensor features that should be excluded
        NON_SENSOR_FEATURES = {
            "Fuel consumed", "Fuel Efficiency (L/100KM)", "RoadType", "Road Style", "Road Type",
            "Fuel consumed (total)", "Fuel Efficiciency", "Distance", "Fuel Used", "Route"
        }
        # Create final feature matrix
        X_final = pd.DataFrame(index=engineered_df.index)
        # Add expected features, handling missing features appropriately
        for col in training_columns:
            if col in engineered_df.columns:
                X_final[col] = engineered_df[col]
            elif col in NON_SENSOR_FEATURES:
                # These are metadata/calculated features that shouldn't be used
                log.info(f"ℹ️ Excluding non-sensor feature: {col}")
                X_final[col] = 0.0  # Fill with 0 but don't warn
            else:
                # This is a missing sensor-derived feature
                X_final[col] = 0.0
                log.warning(f"⚠️ Added missing sensor feature: {col} (filled with 0)")
        # Ensure correct order (XGBoost is sensitive to column ordering)
        X_final = X_final[training_columns]
        log.info(f"Final feature matrix shape: {X_final.shape}")
        # Step 4: Scale features; degrade to raw values if the scaler rejects
        # the matrix (e.g. feature-name mismatch across versions)
        try:
            Xs = self.scal.transform(X_final)
        except Exception as e:
            log.warning(f"Scaler transform failed ({e}); using raw features.")
            Xs = X_final.values
        return Xs, engineered_df

    def predict_df(self, df: pd.DataFrame) -> np.ndarray:
        """
        Predict driving styles with proper feature engineering and idle detection.

        Args:
            df: Raw telemetry DataFrame (one row per sample).

        Returns:
            Array of driving-style labels, one per input row, with idle rows
            forced to "Idle".

        Raises:
            Re-raises any failure of feature preparation or model prediction.
        """
        try:
            log.info("🔧 Starting UL prediction pipeline...")
            # Step 1: Prepare features using training pipeline
            log.info("📊 Step 1: Feature engineering...")
            Xs, engineered_df = self._prepare(df)
            log.info("✅ Feature engineering completed")
            # Step 2: Make predictions
            log.info("🎯 Step 2: Making predictions...")
            try:
                predictions_encoded = self.clf.predict(Xs)
                log.info("✅ Model predictions completed")
            except Exception as e:
                log.error(f"❌ Model prediction failed: {e}")
                raise
            # Step 3: Convert encoded predictions to labels
            log.info("🏷️ Step 3: Converting predictions to labels...")
            try:
                predictions_labels = self.le.inverse_transform(predictions_encoded)
                log.info("✅ Label conversion completed")
            except Exception:
                # Label encoder mismatch: fall back to raw encoded outputs
                predictions_labels = predictions_encoded
                log.info("✅ Using raw predictions as labels")
            # Step 4: Detect idle episodes and override predictions
            log.info("🔍 Step 4: Detecting idle episodes...")
            try:
                idle_mask = detect_idle_episodes(engineered_df)
                idle_count = idle_mask.sum()
                total_count = len(idle_mask)
                log.info(f"✅ Idle detection completed: {idle_count} ({idle_count/total_count*100:.1f}%)")
            except Exception as e:
                log.error(f"❌ Idle detection failed: {e}")
                # Fallback: no idle detection
                idle_mask = np.zeros(len(predictions_labels), dtype=bool)
                log.warning("⚠️ Using fallback: no idle detection")
            # Step 5: Override predictions for idle samples
            log.info("🔄 Step 5: Applying idle overrides...")
            final_predictions = predictions_labels.copy()
            final_predictions[idle_mask] = "Idle"
            log.info("✅ Idle overrides applied")
            # Step 6: Log prediction distribution
            log.info("📊 Step 6: Logging results...")
            style_proportions = pd.Series(final_predictions).value_counts(normalize=True).sort_index()
            log.info("📊 FINAL PREDICTION RESULTS:")
            for style, prop in style_proportions.items():
                count = (final_predictions == style).sum()
                log.info(f" {style:15}: {prop:.3f} ({prop*100:.1f}%) [{count} samples]")
            log.info("✅ UL prediction pipeline completed successfully")
            return final_predictions
        except Exception as e:
            log.error(f"❌ UL prediction pipeline failed: {e}")
            import traceback
            log.error(f"❌ Traceback: {traceback.format_exc()}")
            raise

    def predict_csv(self, csv_path: str) -> pd.DataFrame:
        """Read a CSV of telemetry, predict styles, and return a labeled copy.

        Args:
            csv_path: Path to a CSV readable by pandas.

        Returns:
            Copy of the input frame with a "driving_style" column appended.
        """
        df = pd.read_csv(csv_path)
        y = self.predict_df(df)
        out = df.copy()
        out["driving_style"] = y
        return out