LiamKhoaLe commited on
Commit
bc3c386
·
1 Parent(s): 5503637

Update fuel efficiency model fusion

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.dockerignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ diagram
2
+ OBD
3
+ *.md
4
+ data.json
5
+ organize.py
6
+ bulk_mongo_upload.py
Dockerfile CHANGED
@@ -28,11 +28,14 @@ RUN mkdir -p $HOME/app/logs \
28
  $HOME/app/cache \
29
  $HOME/app/cache/obd_data \
30
  $HOME/app/cache/obd_data/plots \
31
- $HOME/app/models/ul
 
32
 
33
- # ── Environment variables for HuggingFace model ──
34
  ENV MODEL_DIR=$HOME/app/models/ul
35
  ENV HF_MODEL_REPO=BinKhoaLe1812/Driver_Behavior_OBD
 
 
36
 
37
  # ── Models will be downloaded at runtime when app starts ──
38
 
 
28
  $HOME/app/cache \
29
  $HOME/app/cache/obd_data \
30
  $HOME/app/cache/obd_data/plots \
31
+ $HOME/app/models/ul \
32
+ $HOME/app/models/efficiency
33
 
34
+ # ── Environment variables for HuggingFace models ──
35
  ENV MODEL_DIR=$HOME/app/models/ul
36
  ENV HF_MODEL_REPO=BinKhoaLe1812/Driver_Behavior_OBD
37
+ ENV EFFICIENCY_MODEL_DIR=$HOME/app/models/efficiency
38
+ ENV HF_EFFICIENCY_MODEL_REPO=BinKhoaLe1812/Fuel_Efficiency_OBD
39
 
40
  # ── Models will be downloaded at runtime when app starts ──
41
 
OBD/DrivingAggressivenessScorer.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Dict, List, Tuple
6
+ from datetime import datetime
7
+
8
+
9
class DrivingAggressivenessScorer:
    """Scores driving aggressiveness (0-100) from OBD-II telemetry.

    Each telemetry parameter is min-max normalized against persisted
    per-parameter bounds, combined into a weighted per-row score, and the
    per-row scores are aggregated into one drive-level score with
    penalties for frequent or intense high-score spikes.
    """

    # Fallback weights used when the caller supplies none.  The original
    # code read `self.weights` before it was ever assigned (AttributeError
    # on every no-argument construction); these defaults mirror the
    # parameters used elsewhere in the project.
    # NOTE(review): the exact default split is an assumption — confirm the
    # intended values with the model owner.
    DEFAULT_WEIGHTS: Dict[str, float] = {
        'RPM': 0.25,
        'THROTTLE_POS': 0.30,
        'ENGINE_LOAD': 0.20,
        'MAF': 0.10,
        'SPEED': 0.10,
        'INTAKE_PRESSURE': 0.05,
    }

    def __init__(self, bounds_file: str = 'obd_bounds.json', weights: Dict = None):
        """Create a scorer.

        Args:
            bounds_file: JSON file where per-parameter min/max bounds are
                persisted between runs.
            weights: optional {parameter: weight} mapping; defaults to
                DEFAULT_WEIGHTS.  Weights are renormalized to sum to 1.
        """
        self.bounds_file = Path(bounds_file)
        # Bug fix: fall back to the class-level defaults instead of reading
        # the not-yet-assigned instance attribute.  Copy so callers' dicts
        # are never mutated by the normalization below.
        self.weights: Dict[str, float] = dict(weights) if weights else dict(self.DEFAULT_WEIGHTS)
        self.bounds = self._load_bounds()

        # Keep the weighted sum on a 0-1 scale even for ad-hoc weights.
        weight_sum = sum(self.weights.values())
        if not np.isclose(weight_sum, 1.0):
            print(f"Warning: Weights sum to {weight_sum:.3f}, normalizing to 1.0")
            self.weights = {k: v / weight_sum for k, v in self.weights.items()}

    def _load_bounds(self) -> Dict:
        """Load persisted bounds; empty dict when nothing has been saved yet.

        Bug fix: the original fell through and returned None when the file
        was missing, which crashed every later ``self.bounds[...]`` access.
        """
        if self.bounds_file.exists():
            with open(self.bounds_file, 'r') as f:
                return json.load(f)
        return {}

    def _save_bounds(self):
        """Persist the current bounds dictionary to JSON."""
        with open(self.bounds_file, 'w') as f:
            json.dump(self.bounds, f, indent=2)
        print(f"✓ Bounds updated and saved to {self.bounds_file}")

    def update_bounds(self, df: pd.DataFrame) -> bool:
        """Widen stored bounds with extremes observed in `df`.

        Returns True when any bound changed (and was persisted).
        """
        updated = False
        for param in self.weights.keys():
            if param not in df.columns:
                continue
            # Bug fix: cast to plain float — numpy scalars returned by
            # Series.min()/max() are not JSON serializable by json.dump.
            data_min = float(df[param].min())
            data_max = float(df[param].max())

            if param not in self.bounds:
                # First observation of this parameter defines its bounds
                # (previously a KeyError).
                self.bounds[param] = {'min': data_min, 'max': data_max}
                updated = True
                print(f"  New bounds for {param}: {data_min:.2f} to {data_max:.2f}")
                continue

            if data_min < self.bounds[param]['min']:
                self.bounds[param]['min'] = data_min
                updated = True
                print(f"  New MIN for {param}: {data_min:.2f}")

            if data_max > self.bounds[param]['max']:
                self.bounds[param]['max'] = data_max
                updated = True
                print(f"  New MAX for {param}: {data_max:.2f}")

        if updated:
            self._save_bounds()
        return updated

    def normalize_value(self, value: float, param: str) -> float:
        """Min-max normalize `value` into [0, 1] using the bounds for `param`.

        Unknown parameters or degenerate bounds (max <= min) score 0.0.
        """
        if param not in self.bounds:
            return 0.0

        min_val = self.bounds[param]['min']
        max_val = self.bounds[param]['max']

        if max_val <= min_val:
            # A single observed value carries no spread information.
            return 0.0

        normalized = (value - min_val) / (max_val - min_val)
        return float(np.clip(normalized, 0.0, 1.0))

    def calculate_row_score(self, row: pd.Series) -> float:
        """Weighted aggressiveness score (0-100) for a single telemetry row.

        Parameters missing from the row (or NaN) contribute nothing.
        """
        weighted_score = 0.0
        for param, weight in self.weights.items():
            if param in row and pd.notna(row[param]):
                weighted_score += self.normalize_value(row[param], param) * weight
        # Convert the 0-1 weighted sum to a 0-100 scale.
        return weighted_score * 100

    def calculate_drive_scores(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of `df` with an 'aggressiveness_score' column added."""
        df = df.copy()
        df['aggressiveness_score'] = df.apply(self.calculate_row_score, axis=1)
        return df

    def calculate_aggregate_score(self, scores: np.ndarray) -> Dict:
        """Summarize per-row scores into a single drive-level score.

        Blends the mean with the 75th percentile, then adds penalties when
        high-score spikes are intense (p95/p99) or frequent.

        Raises:
            ValueError: if `scores` is empty (previously produced NaN /
                division-by-zero warnings).
        """
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            raise ValueError("Cannot aggregate an empty score array")

        mean_score = np.mean(scores)
        median_score = np.median(scores)
        std_score = np.std(scores)

        # Percentile analysis for spike detection.
        p75 = np.percentile(scores, 75)
        p90 = np.percentile(scores, 90)
        p95 = np.percentile(scores, 95)
        p99 = np.percentile(scores, 99)
        max_score = np.max(scores)

        # Aggressive spikes (scores >= 70).
        spike_threshold = 70
        spike_count = np.sum(scores >= spike_threshold)
        spike_percentage = (spike_count / len(scores)) * 100

        # Extreme spikes (scores >= 85).
        extreme_threshold = 85
        extreme_count = np.sum(scores >= extreme_threshold)
        extreme_percentage = (extreme_count / len(scores)) * 100

        # Penalty grows with both spike intensity and spike frequency.
        spike_penalty = 0.0
        if p95 > 70:
            spike_penalty += (p95 - 70) * 0.3
        if p99 > 80:
            spike_penalty += (p99 - 80) * 0.5
        if spike_percentage > 5:
            spike_penalty += (spike_percentage - 5) * 2.0
        if extreme_percentage > 2:
            spike_penalty += (extreme_percentage - 2) * 3.0

        # Blend central tendency with upper-quartile behaviour, then penalize.
        base_score = (mean_score * 0.7) + (p75 * 0.3)
        final_score = np.clip(base_score + spike_penalty, 0, 100)

        # Cast to plain floats so the result is JSON-friendly.
        return {
            'final_score': round(float(final_score), 2),
            'mean_score': round(float(mean_score), 2),
            'median_score': round(float(median_score), 2),
            'std_score': round(float(std_score), 2),
            'p75_score': round(float(p75), 2),
            'p90_score': round(float(p90), 2),
            'p95_score': round(float(p95), 2),
            'p99_score': round(float(p99), 2),
            'max_score': round(float(max_score), 2),
            'spike_percentage': round(float(spike_percentage), 2),
            'extreme_percentage': round(float(extreme_percentage), 2),
            'spike_penalty': round(float(spike_penalty), 2),
        }

    def analyze_drive(self, csv_path: str, update_bounds: bool = True) -> Tuple[pd.DataFrame, Dict]:
        """Load a drive-log CSV, optionally widen bounds, and score every row.

        Returns:
            (scored DataFrame, aggregate summary dict).
        """
        print(f"\n{'='*60}")
        print(f"ANALYZING DRIVE: {csv_path}")
        print(f"{'='*60}")

        df = pd.read_csv(csv_path)
        print(f"✓ Loaded {len(df)} data points")

        if update_bounds:
            print("\nUpdating bounds...")
            self.update_bounds(df)

        print("\nCalculating aggressiveness scores...")
        df_scored = self.calculate_drive_scores(df)

        aggregate = self.calculate_aggregate_score(df_scored['aggressiveness_score'].values)
        return df_scored, aggregate

    def get_current_bounds(self) -> Dict:
        """Return the live bounds dictionary (not a copy)."""
        return self.bounds

    def print_bounds(self):
        """Pretty-print the stored min/max bounds for every weighted parameter."""
        print("\nCurrent Parameter Bounds:")
        print("-" * 50)
        for param in self.weights.keys():
            if param not in self.bounds:
                # Parameter not yet observed (previously a KeyError).
                print(f"{param:20s}: (no bounds recorded)")
                continue
            min_val = self.bounds[param]['min']
            max_val = self.bounds[param]['max']
            print(f"{param:20s}: {min_val:8.2f} to {max_val:8.2f}")
174
+
175
+
176
if __name__ == "__main__":
    # Demo entry point: score one sample drive log and persist the results.
    aggressiveness_scorer = DrivingAggressivenessScorer()

    # Run the full analysis pipeline on the sample log.
    scored_frame, summary = aggressiveness_scorer.analyze_drive('obd_data_log_20251012_121810.csv')

    # Persist the per-row scores alongside the original telemetry.
    output_path = 'obd_data_scored.csv'
    scored_frame.to_csv(output_path, index=False)
    print(f"✓ Scored data saved to {output_path}")

    # Show the bounds learned so far.
    aggressiveness_scorer.print_bounds()
OBD/configScorer.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ from driving_aggressiveness_scorer import DrivingAggressivenessScorer
3
+ from driving_analyzer import visualize_drive, compare_drives
4
+
5
+
6
def load_config(config_path: str = 'config.yaml') -> dict:
    """Load configuration from YAML file."""
    try:
        with open(config_path, 'r') as cfg_handle:
            parsed = yaml.safe_load(cfg_handle)
    except FileNotFoundError:
        print(f"Config file not found: {config_path}")
        print("Using default configuration.")
        return None
    return parsed
15
+
16
+
17
def create_scorer_from_config(config_path: str = 'config.yaml') -> DrivingAggressivenessScorer:
    """Create scorer instance from configuration file.

    Falls back to the scorer's built-in defaults when the config file is
    missing or empty.
    """
    # NOTE(review): this module imports the scorer from
    # `driving_aggressiveness_scorer` — confirm that module name matches
    # the scorer's actual filename.
    config = load_config(config_path)

    if not config:
        scorer = DrivingAggressivenessScorer()
        print("✓ Scorer initialized with default settings")
        return scorer

    weights = config.get('weights', None)
    bounds_file = config.get('bounds', {}).get('file', 'obd_bounds.json')
    scorer = DrivingAggressivenessScorer(bounds_file=bounds_file, weights=weights)
    print(f"✓ Scorer initialized with config from {config_path}")
    return scorer
31
+
32
+
33
+ # Quick start examples
34
+ if __name__ == "__main__":
35
+
36
+ # METHOD 1: Use with config file (recommended)
37
+ print("\n" + "="*60)
38
+ print("METHOD 1: Config-based scoring")
39
+ print("="*60)
40
+ scorer = create_scorer_from_config('config.yaml')
41
+ df_scored, results = scorer.analyze_drive('obd_data_log_20251012_121810.csv')
42
+ visualize_drive(df_scored, results, save_path='drive_analysis_config.png')
43
+
44
+
45
+ # METHOD 2: Use with custom weights (no config file)
46
+ print("\n" + "="*60)
47
+ print("METHOD 2: Custom weights")
48
+ print("="*60)
49
+ custom_weights = {
50
+ 'RPM': 0.20,
51
+ 'THROTTLE_POS': 0.35, # More emphasis on throttle
52
+ 'ENGINE_LOAD': 0.25,
53
+ 'MAF': 0.10,
54
+ 'SPEED': 0.05,
55
+ 'INTAKE_PRESSURE': 0.05
56
+ }
57
+ scorer_custom = DrivingAggressivenessScorer(weights=custom_weights)
58
+ df_scored2, results2 = scorer_custom.analyze_drive('obd_data_log_20251012_121810.csv')
59
+
60
+
61
+ # METHOD 3: Analyze without updating bounds (testing)
62
+ print("\n" + "="*60)
63
+ print("METHOD 3: Analysis without updating bounds")
64
+ print("="*60)
65
+ scorer_test = DrivingAggressivenessScorer()
66
+ df_test, results_test = scorer_test.analyze_drive(
67
+ 'obd_data_log_20251012_121810.csv',
68
+ update_bounds=False # Don't update global bounds
69
+ )
70
+
71
+
72
+ # METHOD 4: Quick comparison script
73
+ print("\n" + "="*60)
74
+ print("METHOD 4: Compare multiple drives")
75
+ print("="*60)
76
+ """
77
+ # Uncomment when you have multiple CSV files:
78
+ comparison = compare_drives(scorer, [
79
+ 'obd_data_log_20251012_121810.csv',
80
+ 'obd_data_log_20251013_101234.csv',
81
+ 'obd_data_log_20251014_155030.csv'
82
+ ])
83
+ """
84
+
85
+
86
+ print("\n" + "="*60)
87
+ print("SETUP COMPLETE!")
88
+ print("="*60)
89
+ print("\nYour system is ready to:")
90
+ print(" 1. Analyze individual drives")
91
+ print(" 2. Compare multiple drives")
92
+ print(" 3. Batch process folders")
93
+ print(" 4. Dynamically update bounds")
94
+ print(" 5. Generate visualizations")
95
+ print("\nBounds file: obd_bounds.json")
96
+ print("Config file: config.yaml")
97
+ print("="*60 + "\n")
OBD/obd_analyzer.py CHANGED
@@ -24,18 +24,22 @@ KPH_TO_MPS = 1 / 3.6
24
  G_ACCELERATION = 9.80665
25
  MIN_MOVING_SPEED_KPH = 2 # have to be moving
26
 
27
- AGGRESSIVE_RPM_ENTRY_THRESHOLD = 2700
 
28
  AGGRESSIVE_THROTTLE_ENTRY_THRESHOLD = 40
29
- AGGRESSIVE_RPM_HOLD_THRESHOLD = 2300
30
- HARSH_BRAKING_THRESHOLD_G = -0.25
31
 
32
- # roc
33
- AGGRESSIVE_RPM_ROC_THRESHOLD = 500
34
- AGGRESSIVE_THROTTLE_ROC_THRESHOLD = 45
35
- POSITIVE_ACCEL_FOR_ROC_CHECK_G = 0.1
36
 
37
- MODERATE_RPM_THRESHOLD = 2100
38
- MODERATE_THROTTLE_THRESHOLD = 25
 
 
 
39
 
40
  MIN_DATA_POINTS_FOR_ROC = 2
41
 
@@ -67,15 +71,26 @@ def load_and_preprocess_data(csv_filepath):
67
  # Handle empty DataFrame after potential filtering or if it was empty to begin with
68
  return df # Or handle error appropriately
69
 
70
- numeric_cols = ['SPEED', 'RPM', 'THROTTLE_POS']
71
- for col in numeric_cols:
72
- if col in df.columns:
73
- df[col] = pd.to_numeric(df[col], errors='coerce')
74
- else:
75
- print(f"Warning: Column {col} not found. It will be filled with NaN.")
 
 
 
 
 
 
76
  df[col] = np.nan
77
-
78
- df[numeric_cols] = df[numeric_cols].fillna(method='ffill').fillna(0)
 
 
 
 
 
79
 
80
  if 'SPEED' in df.columns:
81
  df['SPEED_mps'] = df['SPEED'] * KPH_TO_MPS
@@ -115,8 +130,8 @@ def load_and_preprocess_data(csv_filepath):
115
  return df
116
 
117
  def classify_driving_style_stateful(df):
118
- if df.empty or not all(col in df.columns for col in ['RPM', 'THROTTLE_POS', 'SPEED', 'acceleration_g']):
119
- print("Warning: Missing one or more required columns for stateful classification (RPM, THROTTLE_POS, SPEED, acceleration_g).")
120
  return pd.Series([DRIVING_STYLE_UNKNOWN] * len(df), index=df.index, dtype=str)
121
 
122
  driving_styles = [DRIVING_STYLE_UNKNOWN] * len(df)
@@ -130,45 +145,63 @@ def classify_driving_style_stateful(df):
130
  rpm_roc = df.loc[i, 'RPM_roc']
131
  throttle_roc = df.loc[i, 'THROTTLE_roc']
132
 
133
- row_style = DRIVING_STYLE_PASSIVE
134
  is_moving = speed_kph > MIN_MOVING_SPEED_KPH
135
 
136
- is_hard_braking_trigger = accel_g < HARSH_BRAKING_THRESHOLD_G and is_moving
137
-
138
- is_high_abs_rpm_throttle_trigger = (rpm > AGGRESSIVE_RPM_ENTRY_THRESHOLD and
139
- throttle > AGGRESSIVE_THROTTLE_ENTRY_THRESHOLD and
140
- is_moving)
141
-
142
- is_actively_accelerating = accel_g > POSITIVE_ACCEL_FOR_ROC_CHECK_G
143
-
144
- is_high_roc_trigger = (is_moving and
145
- is_actively_accelerating and
146
- (rpm_roc > AGGRESSIVE_RPM_ROC_THRESHOLD or
147
- throttle_roc > AGGRESSIVE_THROTTLE_ROC_THRESHOLD))
148
 
149
- is_currently_aggressive_event = is_hard_braking_trigger or is_high_abs_rpm_throttle_trigger or is_high_roc_trigger
 
 
 
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  if current_style == DRIVING_STYLE_AGGRESSIVE:
152
- if is_currently_aggressive_event:
153
  row_style = DRIVING_STYLE_AGGRESSIVE
154
- elif rpm > AGGRESSIVE_RPM_HOLD_THRESHOLD and is_moving:
155
  row_style = DRIVING_STYLE_AGGRESSIVE
156
- else:
157
  if (rpm > MODERATE_RPM_THRESHOLD or throttle > MODERATE_THROTTLE_THRESHOLD) and is_moving:
158
  row_style = DRIVING_STYLE_MODERATE
159
  else:
160
  row_style = DRIVING_STYLE_PASSIVE
161
- else:
162
  if is_currently_aggressive_event:
163
- row_style = DRIVING_STYLE_AGGRESSIVE
164
- else:
165
  if (rpm > MODERATE_RPM_THRESHOLD or throttle > MODERATE_THROTTLE_THRESHOLD) and is_moving:
166
  row_style = DRIVING_STYLE_MODERATE
167
  else:
168
  row_style = DRIVING_STYLE_PASSIVE
169
 
170
  driving_styles[i] = row_style
171
- current_style = row_style
172
 
173
  print("Stateful driving style classification complete.")
174
  return pd.Series(driving_styles, index=df.index)
@@ -206,7 +239,7 @@ def main():
206
  print(f"Error saving output CSV to {args.output_csv}: {e}")
207
  else:
208
  print("\n--- First 20 Rows of Analyzed Data (showing key fields) ---")
209
- display_cols = ['timestamp', 'SPEED', 'RPM', 'THROTTLE_POS', 'acceleration_g', 'driving_style_analyzed']
210
  display_cols = [col for col in display_cols if col in df.columns]
211
  if display_cols: print(df[display_cols].head(20))
212
  else: print("Key display columns not found in DataFrame.")
 
24
  G_ACCELERATION = 9.80665
25
  MIN_MOVING_SPEED_KPH = 2 # have to be moving
26
 
27
+ VERY_HIGH_RPM_AGGRESSIVE_THRESHOLD = 3500
28
+ AGGRESSIVE_RPM_ENTRY_THRESHOLD = 2900
29
  AGGRESSIVE_THROTTLE_ENTRY_THRESHOLD = 40
30
+ AGGRESSIVE_RPM_HOLD_THRESHOLD = 2400
31
+ HARSH_BRAKING_THRESHOLD_G = -0.25
32
 
33
+ HIGH_RPM_FOR_ROC_AGGRESSIVE_THRESHOLD = 2300
34
+ AGGRESSIVE_RPM_ROC_THRESHOLD = 500
35
+ AGGRESSIVE_THROTTLE_ROC_THRESHOLD = 45
36
+ POSITIVE_ACCEL_FOR_ROC_CHECK_G = 0.1
37
 
38
+ MIN_SPEED_FOR_HOLDING_GEAR_CHECK_KPH = 15
39
+ LOW_G_FOR_HOLDING_GEAR = 0.1
40
+
41
+ MODERATE_RPM_THRESHOLD = 2100
42
+ MODERATE_THROTTLE_THRESHOLD = 25
43
 
44
  MIN_DATA_POINTS_FOR_ROC = 2
45
 
 
71
  # Handle empty DataFrame after potential filtering or if it was empty to begin with
72
  return df # Or handle error appropriately
73
 
74
+ # Define all possible numeric columns from current fuel efficiency logging
75
+ all_numeric_cols = ['SPEED', 'RPM', 'THROTTLE_POS', 'MAF', 'ENGINE_LOAD', 'INTAKE_PRESSURE',
76
+ 'SHORT_FUEL_TRIM_1', 'SHORT_FUEL_TRIM_2', 'LONG_FUEL_TRIM_1', 'LONG_FUEL_TRIM_2']
77
+
78
+ # Only process columns that exist in the dataframe
79
+ numeric_cols = [col for col in all_numeric_cols if col in df.columns]
80
+ required_cols = ['SPEED', 'RPM', 'THROTTLE_POS'] # Essential for driving style analysis
81
+
82
+ # Ensure required columns exist
83
+ for col in required_cols:
84
+ if col not in df.columns:
85
+ print(f"Warning: Required column {col} not found. It will be filled with NaN.")
86
  df[col] = np.nan
87
+
88
+ # Convert all numeric columns to numeric type
89
+ for col in numeric_cols:
90
+ df[col] = pd.to_numeric(df[col], errors='coerce')
91
+
92
+ # Fill missing values for all numeric columns
93
+ df[numeric_cols] = df[numeric_cols].ffill().fillna(0)
94
 
95
  if 'SPEED' in df.columns:
96
  df['SPEED_mps'] = df['SPEED'] * KPH_TO_MPS
 
130
  return df
131
 
132
  def classify_driving_style_stateful(df):
133
+ if df.empty or not all(col in df.columns for col in ['RPM', 'THROTTLE_POS', 'SPEED', 'acceleration_g', 'RPM_roc', 'THROTTLE_roc']):
134
+ print("Warning: Missing required columns for stateful classification.")
135
  return pd.Series([DRIVING_STYLE_UNKNOWN] * len(df), index=df.index, dtype=str)
136
 
137
  driving_styles = [DRIVING_STYLE_UNKNOWN] * len(df)
 
145
  rpm_roc = df.loc[i, 'RPM_roc']
146
  throttle_roc = df.loc[i, 'THROTTLE_roc']
147
 
148
+ row_style = DRIVING_STYLE_PASSIVE # Default for this row
149
  is_moving = speed_kph > MIN_MOVING_SPEED_KPH
150
 
151
+ # --- Define Aggressive Triggers for this specific row ---
152
+ # 1. Absolute very high RPM
153
+ trigger_very_high_rpm = (rpm > VERY_HIGH_RPM_AGGRESSIVE_THRESHOLD and is_moving)
 
 
 
 
 
 
 
 
 
154
 
155
+ # 2. High RPM + High Throttle (user's primary combo)
156
+ trigger_high_rpm_throttle = (rpm > AGGRESSIVE_RPM_ENTRY_THRESHOLD and
157
+ throttle > AGGRESSIVE_THROTTLE_ENTRY_THRESHOLD and
158
+ is_moving)
159
 
160
+ # 3. RoC-based (RPM or Throttle) during active acceleration, with RPM already elevated
161
+ is_actively_accelerating = accel_g > POSITIVE_ACCEL_FOR_ROC_CHECK_G
162
+ trigger_high_roc = (is_moving and is_actively_accelerating and
163
+ rpm > HIGH_RPM_FOR_ROC_AGGRESSIVE_THRESHOLD and
164
+ (rpm_roc > AGGRESSIVE_RPM_ROC_THRESHOLD or
165
+ throttle_roc > AGGRESSIVE_THROTTLE_ROC_THRESHOLD))
166
+
167
+ # 4. Holding gear aggressively (high RPM, moving, but low change in speed)
168
+ trigger_holding_gear = (rpm > AGGRESSIVE_RPM_HOLD_THRESHOLD and # Using hold RPM as base for this check
169
+ is_moving and
170
+ speed_kph > MIN_SPEED_FOR_HOLDING_GEAR_CHECK_KPH and
171
+ abs(accel_g) < LOW_G_FOR_HOLDING_GEAR)
172
+
173
+ # 5. Hard braking
174
+ trigger_hard_braking = (accel_g < HARSH_BRAKING_THRESHOLD_G and is_moving)
175
+
176
+ # Combine all triggers for the current row
177
+ is_currently_aggressive_event = (trigger_very_high_rpm or
178
+ trigger_high_rpm_throttle or
179
+ trigger_high_roc or
180
+ trigger_holding_gear or
181
+ trigger_hard_braking)
182
+
183
+ # --- Stateful Logic ---
184
  if current_style == DRIVING_STYLE_AGGRESSIVE:
185
+ if is_currently_aggressive_event: # Re-triggered by a new event this row
186
  row_style = DRIVING_STYLE_AGGRESSIVE
187
+ elif rpm > AGGRESSIVE_RPM_HOLD_THRESHOLD and is_moving: # Maintain based on RPM hold
188
  row_style = DRIVING_STYLE_AGGRESSIVE
189
+ else: # Conditions to stay aggressive not met, transition out
190
  if (rpm > MODERATE_RPM_THRESHOLD or throttle > MODERATE_THROTTLE_THRESHOLD) and is_moving:
191
  row_style = DRIVING_STYLE_MODERATE
192
  else:
193
  row_style = DRIVING_STYLE_PASSIVE
194
+ else: # current_style is Passive or Moderate
195
  if is_currently_aggressive_event:
196
+ row_style = DRIVING_STYLE_AGGRESSIVE # Enter aggressive state
197
+ else: # Not an aggressive event, classify as Moderate or Passive
198
  if (rpm > MODERATE_RPM_THRESHOLD or throttle > MODERATE_THROTTLE_THRESHOLD) and is_moving:
199
  row_style = DRIVING_STYLE_MODERATE
200
  else:
201
  row_style = DRIVING_STYLE_PASSIVE
202
 
203
  driving_styles[i] = row_style
204
+ current_style = row_style # Update the overall state for the next iteration
205
 
206
  print("Stateful driving style classification complete.")
207
  return pd.Series(driving_styles, index=df.index)
 
239
  print(f"Error saving output CSV to {args.output_csv}: {e}")
240
  else:
241
  print("\n--- First 20 Rows of Analyzed Data (showing key fields) ---")
242
+ display_cols = ['timestamp', 'SPEED', 'RPM', 'THROTTLE_POS', 'acceleration_g', 'RPM_roc', 'THROTTLE_roc', 'driving_style_analyzed']
243
  display_cols = [col for col in display_cols if col in df.columns]
244
  if display_cols: print(df[display_cols].head(20))
245
  else: print("Key display columns not found in DataFrame.")
OBD/obd_logger.py CHANGED
@@ -3,80 +3,54 @@ import time
3
  import datetime
4
  import csv
5
  import os
6
- from collections import deque
7
- import numpy as np
8
  import shutil
9
  import subprocess
10
-
11
- DRIVING_STYLE_PASSIVE = "Passive"
12
- DRIVING_STYLE_MODERATE = "Moderate"
13
- DRIVING_STYLE_AGGRESSIVE = "Aggressive"
14
- DRIVING_STYLE_UNKNOWN = "UNKNOWN_STYLE"
15
-
16
- ROAD_TYPE_LOCAL = "Local"
17
- ROAD_TYPE_MAIN = "Main"
18
- ROAD_TYPE_HIGHWAY = "Highway"
19
- ROAD_TYPE_UNKNOWN = "UNKNOWN_ROAD"
20
-
21
- TRAFFIC_CONDITION_LIGHT = "Light"
22
- TRAFFIC_CONDITION_MODERATE = "Moderate"
23
- TRAFFIC_CONDITION_HEAVY = "Heavy"
24
- TRAFFIC_CONDITION_UNKNOWN = "UNKNOWN_TRAFFIC"
25
-
26
- # Rolling Average Configuration
27
- ROLLING_WINDOW_SIZE = 20 # 6 seconds
28
- MIN_SAMPLES_FOR_CLASSIFICATION = 10
29
-
30
- # ROC needs tuning
31
- SHORT_ROC_WINDOW_SIZE = 3
32
- MIN_SAMPLES_FOR_ROC_CHECK = SHORT_ROC_WINDOW_SIZE
33
- ROC_THROTTLE_AGGRESSIVE_THRESHOLD = 25.0
34
- ROC_RPM_AGGRESSIVE_THRESHOLD = 700.0
35
- ROC_SPEED_AGGRESSIVE_THRESHOLD = 8.0
36
- MIN_RPM_FOR_AGGRESSIVE_TRIGGER = 1000.0
37
- AGGRESSIVE_EVENT_COOLDOWN_SAMPLES = 15
38
-
39
- HIGH_FREQUENCY_PIDS = [
40
- obd.commands.RPM,
41
- obd.commands.THROTTLE_POS,
42
- obd.commands.SPEED,
43
  ]
44
 
45
- LOW_FREQUENCY_PIDS_POOL = [
46
- obd.commands.FUEL_PRESSURE,
47
- obd.commands.ENGINE_LOAD,
48
- obd.commands.COOLANT_TEMP,
49
- obd.commands.INTAKE_TEMP,
50
- obd.commands.TIMING_ADVANCE,
51
- obd.commands.MAF,
52
- obd.commands.INTAKE_PRESSURE,
53
- obd.commands.SHORT_FUEL_TRIM_1,
54
- obd.commands.LONG_FUEL_TRIM_1,
55
- obd.commands.SHORT_FUEL_TRIM_2,
56
  obd.commands.LONG_FUEL_TRIM_2,
57
- obd.commands.COMMANDED_EQUIV_RATIO,
58
- obd.commands.O2_B1S2,
59
- obd.commands.O2_B2S2,
60
- obd.commands.O2_S1_WR_VOLTAGE,
61
- obd.commands.COMMANDED_EGR,
62
  ]
63
 
 
 
 
64
  ALL_PIDS_TO_LOG = HIGH_FREQUENCY_PIDS + LOW_FREQUENCY_PIDS_POOL
65
 
66
  CSV_FILENAME_BASE = "obd_data_log"
67
- # Define new structured log directories relative to the OBD_Logger/OBD directory
68
- LOGS_BASE_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "logs") # Corrected: Up two levels to Base, then into logs
69
- ORIGINAL_CSV_DIR = os.path.join(LOGS_BASE_DIR, "OriginalCSV")
70
- DUPLICATE_CSV_DIR = os.path.join(LOGS_BASE_DIR, "DuplicateCSV")
71
 
72
- WIFI_ADAPTER_HOST = "192.168.0.10"
73
- WIFI_ADAPTER_PORT = 35000
74
-
75
- WIFI_PROTOCOL = "6"
76
- USE_WIFI_SETTINGS = False # using socat to mimic serial connection
77
 
78
  def get_pid_value(connection, pid_command):
79
- """Queries a PID and returns its value, or None if not available or error."""
80
  try:
81
  response = connection.query(pid_command, force=True)
82
  if response.is_null() or response.value is None:
@@ -87,47 +61,98 @@ def get_pid_value(connection, pid_command):
87
  except Exception as e:
88
  print(f"Error querying {pid_command.name}: {e}")
89
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- def perform_logging_session():
92
- connection = None
93
- print("Starting OBD-II Data Logger...")
94
- print("Classifications (Style, Road, Traffic) will be determined automatically.")
 
 
95
 
96
-
97
- initial_driving_style = ""
98
- initial_road_type = ""
99
- initial_traffic_condition = ""
100
 
101
- BASE_LOG_INTERVAL = .3 # for high frequency data
102
- LOW_FREQUENCY_GROUP_POLL_INTERVAL = 90.0 # Interval in seconds to poll one group of LF PIDs
103
- NUM_LOW_FREQUENCY_GROUPS = 3
104
-
105
- # Prepare Low-Frequency PID groups
106
- low_frequency_pid_groups = []
107
- if LOW_FREQUENCY_PIDS_POOL:
108
- chunk_size = (len(LOW_FREQUENCY_PIDS_POOL) + NUM_LOW_FREQUENCY_GROUPS - 1) // NUM_LOW_FREQUENCY_GROUPS
109
- for i in range(0, len(LOW_FREQUENCY_PIDS_POOL), chunk_size):
110
- low_frequency_pid_groups.append(LOW_FREQUENCY_PIDS_POOL[i:i + chunk_size])
111
 
112
- if not low_frequency_pid_groups: # Handle case with no LF PIDs
113
- low_frequency_pid_groups.append([])
114
- NUM_LOW_FREQUENCY_GROUPS = 1
115
-
116
- last_low_frequency_group_poll_time = time.monotonic()
117
- current_low_frequency_group_index = 0
118
 
119
  current_pid_values = {pid.name: '' for pid in ALL_PIDS_TO_LOG}
120
 
121
- # Create log directories
122
- for dir_path in [ORIGINAL_CSV_DIR, DUPLICATE_CSV_DIR]: # Add ANALYZED_OUTPUT_DIR if used
123
  try:
124
  os.makedirs(dir_path, exist_ok=True)
125
  print(f"Ensured directory exists: {dir_path}")
126
  except OSError as e:
127
  print(f"Error creating directory {dir_path}: {e}. Attempting to use current directory.")
128
- # Fallback logic may be needed if creation fails critically
129
- if dir_path == ORIGINAL_CSV_DIR: # Critical for saving original log
130
- print("Cannot create original log directory. Exiting.")
131
  return None
132
 
133
  current_session_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -135,81 +160,72 @@ def perform_logging_session():
135
  original_csv_filepath = os.path.join(ORIGINAL_CSV_DIR, csv_file_name_only)
136
 
137
  try:
138
- if USE_WIFI_SETTINGS:
139
- print(f"Attempting to connect to WiFi adapter at {WIFI_ADAPTER_HOST}:{WIFI_ADAPTER_PORT} using protocol {WIFI_PROTOCOL}...")
140
- connection = obd.OBD(protocol=WIFI_PROTOCOL,
141
- host=WIFI_ADAPTER_HOST,
142
- port=WIFI_ADAPTER_PORT,
143
- fast=False,
144
- timeout=30)
145
- else:
146
- print("Attempting to connect via socat PTY /dev/ttys011...")
147
- connection = obd.OBD("/dev/ttys086", fast=True, timeout=30) # Auto-scan for USB/Bluetooth
148
-
149
- if not connection.is_connected():
150
- print("Failed to connect to OBD-II adapter.")
151
- print(f"Connection status: {connection.status()}")
152
- return None
153
-
154
- print(f"Successfully connected to OBD-II adapter: {connection.port_name()}")
155
- print(f"Adapter status: {connection.status()}")
156
- print(f"Supported PIDs (sample):")
157
- supported_commands = connection.supported_commands
158
- for i, cmd in enumerate(supported_commands):
159
- print(f" - {cmd.name}")
160
- if not supported_commands:
161
- print("No commands")
162
 
163
  # Creating initial full PID sample to have fully populated rows from beginning
164
  print("\nPerforming initial full PID sample...")
165
  initial_log_entry = {
166
- 'timestamp': datetime.datetime.now().isoformat(),
167
- 'driving_style': initial_driving_style,
168
- 'road_type': initial_road_type,
169
- 'traffic_condition': initial_traffic_condition
170
  }
171
 
172
- print("Polling initial High-Frequency PIDs...")
173
- for pid_command in HIGH_FREQUENCY_PIDS:
174
- value = get_pid_value(connection, pid_command)
175
- current_pid_values[pid_command.name] = value if value is not None else ''
176
- initial_log_entry[pid_command.name] = current_pid_values[pid_command.name]
177
-
178
- print("Polling initial Low-Frequency PIDs (all groups)...")
179
- if low_frequency_pid_groups and low_frequency_pid_groups[0]: # Check if there are any LF PIDs
180
- for group in low_frequency_pid_groups:
181
- for pid_command in group:
182
- value = get_pid_value(connection, pid_command)
183
- current_pid_values[pid_command.name] = value if value is not None else ''
184
- initial_log_entry[pid_command.name] = current_pid_values[pid_command.name]
185
- else:
186
- print("No Low-Frequency PIDs to poll for initial sample.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  for pid_obj in ALL_PIDS_TO_LOG:
189
  if pid_obj.name not in initial_log_entry:
190
- initial_log_entry[pid_obj.name] = '' # Default to empty if somehow missed
 
 
 
 
 
 
 
191
 
192
  except Exception as e:
193
  print(f"An error occurred during connection or initial PID sample: {e}")
194
  if connection and connection.is_connected():
195
  connection.close()
196
- return None
197
 
198
  file_exists = os.path.isfile(original_csv_filepath)
199
  try:
200
  with open(original_csv_filepath, 'a', newline='') as csvfile:
201
- # Add new columns for analyzer output, they will be empty initially from logger
202
- header_names = ['timestamp',
203
- 'driving_style', 'road_type', 'traffic_condition', # Original placeholder columns
204
- 'driving_style_analyzed', 'road_type_analyzed', 'traffic_condition_analyzed' # For analyzer
205
- ] + [pid.name for pid in ALL_PIDS_TO_LOG]
206
-
207
- # Remove duplicates if any PID name is already in the first part
208
- processed_headers = []
209
- for item in header_names:
210
- if item not in processed_headers:
211
- processed_headers.append(item)
212
- header_names = processed_headers
213
 
214
  writer = csv.DictWriter(csvfile, fieldnames=header_names)
215
 
@@ -218,74 +234,106 @@ def perform_logging_session():
218
  print(f"Created new CSV file: {original_csv_filepath} with headers: {header_names}")
219
 
220
  if initial_log_entry:
221
- # Add placeholder columns for analyzer to the initial entry
222
- initial_log_entry['driving_style_analyzed'] = ''
223
- initial_log_entry['road_type_analyzed'] = ''
224
- initial_log_entry['traffic_condition_analyzed'] = ''
225
  writer.writerow(initial_log_entry)
226
  csvfile.flush()
227
- print(f"Logged initial full sample. Style: {initial_driving_style}, Road: {initial_road_type}, Traffic: {initial_traffic_condition}.")
228
 
229
- last_low_frequency_group_poll_time = time.monotonic()
230
- current_low_frequency_group_index = 0
231
 
232
- print(f"\nLogging high-frequency data every {BASE_LOG_INTERVAL} second(s).")
233
- print(f"Polling one group of low-frequency PIDs every {LOW_FREQUENCY_GROUP_POLL_INTERVAL} second(s).")
234
- print(f"Low-frequency PIDs divided into {len(low_frequency_pid_groups)} groups.")
235
 
236
- log_count = 0
237
- while True:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  loop_start_time = time.monotonic()
239
  current_datetime = datetime.datetime.now()
240
  timestamp_iso = current_datetime.isoformat()
241
 
242
- hf_reads = 0
243
- for pid_command in HIGH_FREQUENCY_PIDS:
244
- value = get_pid_value(connection, pid_command)
245
- current_pid_values[pid_command.name] = value if value is not None else ''
246
- if value is not None:
247
- hf_reads += 1
248
 
249
- lf_reads_this_cycle = 0
250
- lf_group_polled_this_cycle = "None"
251
- if low_frequency_pid_groups and (time.monotonic() - last_low_frequency_group_poll_time) >= LOW_FREQUENCY_GROUP_POLL_INTERVAL:
252
- group_to_poll = low_frequency_pid_groups[current_low_frequency_group_index]
253
- lf_group_polled_this_cycle = f"Group {current_low_frequency_group_index + 1}/{len(low_frequency_pid_groups)}"
254
-
255
- for pid_command in group_to_poll:
 
256
  value = get_pid_value(connection, pid_command)
257
  current_pid_values[pid_command.name] = value if value is not None else ''
258
  if value is not None:
259
- lf_reads_this_cycle +=1
260
- else:
261
- print(f"Warning: Could not read LF PID {pid_command.name}")
 
 
 
 
 
 
262
 
263
- last_low_frequency_group_poll_time = time.monotonic()
264
- current_low_frequency_group_index = (current_low_frequency_group_index + 1) % len(low_frequency_pid_groups)
265
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
  final_log_entry = {
268
- 'timestamp': timestamp_iso,
269
- 'driving_style': initial_driving_style,
270
- 'road_type': initial_road_type,
271
- 'traffic_condition': initial_traffic_condition,
272
- 'driving_style_analyzed': '',
273
- 'road_type_analyzed': '',
274
- 'traffic_condition_analyzed': ''
275
  }
276
- # Add all PID values for this cycle from current_pid_values
277
  for pid_obj in ALL_PIDS_TO_LOG:
278
  final_log_entry[pid_obj.name] = current_pid_values.get(pid_obj.name, '')
279
 
 
 
 
 
 
 
280
  writer.writerow(final_log_entry)
281
  csvfile.flush()
282
 
283
  log_count += 1
284
  if log_count % 10 == 0:
285
- status_msg = f"Logged entry {log_count} - HF PIDs Read: {hf_reads}/{len(HIGH_FREQUENCY_PIDS)}"
286
- if lf_reads_this_cycle > 0 or lf_group_polled_this_cycle != "None":
287
- status_msg += f" - LF PIDs ({lf_group_polled_this_cycle}) Read: {lf_reads_this_cycle}/unknown_total_for_group_easily"
288
- print(status_msg)
 
 
289
 
290
  elapsed_time_in_loop = time.monotonic() - loop_start_time
291
  sleep_duration = max(0, BASE_LOG_INTERVAL - elapsed_time_in_loop)
@@ -296,79 +344,188 @@ def perform_logging_session():
296
  except Exception as e:
297
  print(f"An error occurred during logging: {e}")
298
  finally:
299
- if connection and connection.is_connected():
300
- print("Closing OBD-II connection.")
301
- connection.close()
302
- print(f"Data logging stopped. Original CSV file '{original_csv_filepath}' saved.")
303
 
304
- return original_csv_filepath
305
 
306
- def duplicate_csv(original_filepath):
307
- if not original_filepath or not os.path.exists(original_filepath):
308
- print(f"Error: Original CSV not found for duplication: {original_filepath}")
309
  return None
310
 
311
- # Ensure DUPLICATE_CSV_DIR exists (it should have been created by perform_logging_session)
312
- os.makedirs(DUPLICATE_CSV_DIR, exist_ok=True)
 
313
 
314
- # Get just the filename from the original path
315
- original_filename = os.path.basename(original_filepath)
316
- base, ext = os.path.splitext(original_filename)
317
 
318
- # Construct new filename for the duplicate
319
- duplicate_filename = f"{base}_to_analyze{ext}" # Suffix to distinguish
320
- duplicate_filepath = os.path.join(DUPLICATE_CSV_DIR, duplicate_filename)
321
 
322
  try:
323
- shutil.copy2(original_filepath, duplicate_filepath)
324
- print(f"Successfully duplicated CSV to: {duplicate_filepath}")
325
- return duplicate_filepath
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  except Exception as e:
327
- print(f"Error duplicating CSV {original_filepath} to {duplicate_filepath}: {e}")
 
 
328
  return None
329
 
330
- def run_analyzer_on_csv(csv_to_analyze_path):
331
- if not csv_to_analyze_path or not os.path.exists(csv_to_analyze_path):
332
- print(f"Error: Analyzer input CSV not found: {csv_to_analyze_path}")
333
- return
334
 
335
- # Analyzer script is in the same directory as this logger script
336
- analyzer_script_path = os.path.join(os.path.dirname(__file__), "obd_analyzer.py")
 
337
 
338
- if not os.path.exists(analyzer_script_path):
339
- print(f"CRITICAL Error: Analyzer script not found at {analyzer_script_path}")
340
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
- analyzed_file_basename = os.path.basename(csv_to_analyze_path).replace("_to_analyze.csv", "_final_analyzed.csv")
343
- final_output_path = os.path.join(DUPLICATE_CSV_DIR, analyzed_file_basename)
344
 
345
- command = [
346
- "python",
347
- analyzer_script_path,
348
- csv_to_analyze_path,
349
- "--output_csv",
350
- final_output_path
351
- ]
 
 
 
 
 
 
 
352
 
353
- print(f"Running analyzer: {' '.join(command)}")
354
  try:
355
- process = subprocess.run(command, check=True, capture_output=True, text=True, cwd=os.path.dirname(__file__))
356
- print("Analyzer Output:\n", process.stdout)
357
- if process.stderr: print("Analyzer Errors:\n", process.stderr)
358
- print(f"Analyzer finished. Output saved to {final_output_path}")
359
- except subprocess.CalledProcessError as e:
360
- print(f"Error running analyzer: {e}\nStdout: {e.stdout}\nStderr: {e.stderr}")
361
- except FileNotFoundError:
362
- print(f"Error: 'python' or analyzer script not found ({analyzer_script_path}).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
- if __name__ == "__main__":
365
- original_log_file = perform_logging_session()
366
 
367
- if original_log_file and os.path.exists(original_log_file):
368
- duplicated_log_file = duplicate_csv(original_log_file)
369
-
370
- if duplicated_log_file:
371
- run_analyzer_on_csv(duplicated_log_file)
372
- print(f"Process complete. Original log: {original_log_file}, Analyzed log copy: {duplicated_log_file}")
373
- else:
374
- print("OBD logging did not produce a valid CSV file. Skipping analysis.")
 
3
  import datetime
4
  import csv
5
  import os
 
 
6
  import shutil
7
  import subprocess
8
+ import sys
9
+ import select
10
+
11
+ try:
12
+ from logging_wrapper import auto_score_on_completion
13
+ SCORING_AVAILABLE = True
14
+ print("Auto-scoring module loaded")
15
+ except ImportError:
16
+ SCORING_AVAILABLE = False
17
+ print("Auto-scoring module not found - scoring will be skipped")
18
+
19
+
20
+ CRITICAL_FUEL_PIDS = [
21
+ obd.commands.RPM,
22
+ obd.commands.SPEED,
23
+ obd.commands.THROTTLE_POS,
24
+ obd.commands.MAF,
25
+ ]
26
+
27
+ SECONDARY_FUEL_PIDS = [
28
+ obd.commands.ENGINE_LOAD,
29
+ obd.commands.INTAKE_PRESSURE,
 
 
 
 
 
 
 
 
 
 
 
30
  ]
31
 
32
+ TERTIARY_FUEL_PIDS = [
33
+ obd.commands.SHORT_FUEL_TRIM_1,
34
+ obd.commands.SHORT_FUEL_TRIM_2,
35
+ obd.commands.LONG_FUEL_TRIM_1,
 
 
 
 
 
 
 
36
  obd.commands.LONG_FUEL_TRIM_2,
 
 
 
 
 
37
  ]
38
 
39
+ HIGH_FREQUENCY_PIDS = CRITICAL_FUEL_PIDS
40
+ LOW_FREQUENCY_PIDS_POOL = SECONDARY_FUEL_PIDS + TERTIARY_FUEL_PIDS
41
+
42
  ALL_PIDS_TO_LOG = HIGH_FREQUENCY_PIDS + LOW_FREQUENCY_PIDS_POOL
43
 
44
  CSV_FILENAME_BASE = "obd_data_log"
45
+ LOGS_BASE_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "logs")
46
+ FUEL_LOGS_DIR = os.path.join(LOGS_BASE_DIR, "FuelLogs")
47
+ ANALYSED_LOGS_DIR = os.path.join(LOGS_BASE_DIR, "analysedLogsAutomated")
 
48
 
49
+ SCORED_LOGS_DIR = os.path.join(LOGS_BASE_DIR, "ScoredLogs")
50
+ ORIGINAL_CSV_DIR = FUEL_LOGS_DIR
 
 
 
51
 
52
  def get_pid_value(connection, pid_command):
53
+ """Queries a PID and returns its value"""
54
  try:
55
  response = connection.query(pid_command, force=True)
56
  if response.is_null() or response.value is None:
 
61
  except Exception as e:
62
  print(f"Error querying {pid_command.name}: {e}")
63
  return None
64
+
65
+ def calculate_fuel_metrics(csv_path):
66
+ """Calculate fuel consumption and efficiency from MAF and SPEED data."""
67
+ try:
68
+ df = pd.read_csv(csv_path)
69
+
70
+ # Constants
71
+ AFR = 14.7 # Air-Fuel Ratio for petrol
72
+ FUEL_DENSITY = 737 # g/L for petrol
73
+
74
+ # Calculate time delta between rows (in seconds)
75
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
76
+ df['time_delta'] = df['timestamp'].diff().dt.total_seconds()
77
+ df.loc[0, 'time_delta'] = 0 # First row has no previous row
78
+
79
+ # Calculate instantaneous fuel rate (L/hr) from MAF
80
+ df['fuel_rate_L_per_hr'] = (df['MAF'] * 3600) / (AFR * FUEL_DENSITY)
81
+
82
+ # Calculate fuel used in this time interval (L)
83
+ df['fuel_used_interval'] = (df['fuel_rate_L_per_hr'] / 3600) * df['time_delta']
84
+
85
+ # Calculate distance traveled in this interval (km)
86
+ df['distance_interval'] = (df['SPEED'] / 3600) * df['time_delta']
87
+
88
+ # Calculate cumulative values
89
+ df['Fuel_Used'] = df['fuel_used_interval'].cumsum()
90
+ df['Distance'] = df['distance_interval'].cumsum()
91
+
92
+ # Calculate fuel efficiency (L/100km)
93
+ df['Fuel_efficiency (L/100km)'] = np.where(
94
+ df['Distance'] > 0,
95
+ (df['Fuel_Used'] / df['Distance']) * 100,
96
+ 0
97
+ )
98
+
99
+ df['Fuel_Used'] = df['Fuel_Used'].round(3)
100
+ df['Distance'] = df['Distance'].round(2)
101
+ df['Fuel_efficiency (L/100km)'] = df['Fuel_efficiency (L/100km)'].round(2)
102
+
103
+ # Drop intermediate calculation columns
104
+ df = df.drop(columns=['time_delta', 'fuel_rate_L_per_hr',
105
+ 'fuel_used_interval', 'distance_interval'])
106
+
107
+ # Save back to CSV
108
+ df.to_csv(csv_path, index=False)
109
+
110
+ # Print summary
111
+ total_fuel = df['Fuel_Used'].iloc[-1]
112
+ total_distance = df['Distance'].iloc[-1]
113
+ avg_efficiency = df['Fuel_efficiency (L/100km)'].iloc[-1]
114
+
115
+ print(f"Total Fuel Used: {total_fuel:.3f} L")
116
+ print(f"Total Distance: {total_distance:.2f} km")
117
+ print(f"Average Efficiency: {avg_efficiency:.2f} L/100km")
118
+
119
+ return csv_path
120
+
121
+ except Exception as e:
122
+ print(f"Error calculating fuel metrics: {e}")
123
+ import traceback
124
+ traceback.print_exc()
125
+ return None
126
+
127
 
128
+ def perform_logging_session(connection):
129
+ """Perform a single logging session with an existing OBD connection."""
130
+ print(f"\nStarting new fuel efficiency logging session")
131
+ print("Commands:")
132
+ print(" - Type 'next' and press Enter to finish this drive and start a new one")
133
+ print(" - Type 'quit' and press Enter to stop all logging")
134
 
 
 
 
 
135
 
136
+ CRITICAL_PID_INTERVAL = 0.65
137
+ SECONDARY_PID_INTERVAL = 2.0
138
+ TERTIARY_PID_INTERVAL = 5.0
 
 
 
 
 
 
 
139
 
140
+ last_critical_poll_time = time.monotonic() - CRITICAL_PID_INTERVAL
141
+ last_secondary_poll_time = time.monotonic() - SECONDARY_PID_INTERVAL
142
+ last_tertiary_poll_time = time.monotonic() - TERTIARY_PID_INTERVAL
143
+
144
+ BASE_LOG_INTERVAL = CRITICAL_PID_INTERVAL
 
145
 
146
  current_pid_values = {pid.name: '' for pid in ALL_PIDS_TO_LOG}
147
 
148
+ for dir_path in [FUEL_LOGS_DIR, ANALYSED_LOGS_DIR, SCORED_LOGS_DIR]:
 
149
  try:
150
  os.makedirs(dir_path, exist_ok=True)
151
  print(f"Ensured directory exists: {dir_path}")
152
  except OSError as e:
153
  print(f"Error creating directory {dir_path}: {e}. Attempting to use current directory.")
154
+ if dir_path == FUEL_LOGS_DIR:
155
+ print("Cannot create fuel log directory. Exiting.")
 
156
  return None
157
 
158
  current_session_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
160
  original_csv_filepath = os.path.join(ORIGINAL_CSV_DIR, csv_file_name_only)
161
 
162
  try:
163
+ if not connection or not connection.is_connected():
164
+ print("OBD connection not available")
165
+ return None, "quit"
166
+
167
+ print(f"Using existing OBD connection: {connection.port_name()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  # Creating initial full PID sample to have fully populated rows from beginning
170
  print("\nPerforming initial full PID sample...")
171
  initial_log_entry = {
172
+ 'timestamp': datetime.datetime.now().isoformat()
 
 
 
173
  }
174
 
175
+ print("Polling initial Critical Fuel PIDs...")
176
+ for pid_command in CRITICAL_FUEL_PIDS:
177
+ try:
178
+ value = get_pid_value(connection, pid_command)
179
+ current_pid_values[pid_command.name] = value if value is not None else ''
180
+ initial_log_entry[pid_command.name] = current_pid_values[pid_command.name]
181
+ except Exception as e:
182
+ print(f"Warning: Failed to get {pid_command.name}: {e}")
183
+ current_pid_values[pid_command.name] = ''
184
+ initial_log_entry[pid_command.name] = ''
185
+
186
+ print("Polling initial Secondary Fuel PIDs...")
187
+ for pid_command in SECONDARY_FUEL_PIDS:
188
+ try:
189
+ value = get_pid_value(connection, pid_command)
190
+ current_pid_values[pid_command.name] = value if value is not None else ''
191
+ initial_log_entry[pid_command.name] = current_pid_values[pid_command.name]
192
+ except Exception as e:
193
+ print(f"Warning: Failed to get {pid_command.name}: {e}")
194
+ current_pid_values[pid_command.name] = ''
195
+ initial_log_entry[pid_command.name] = ''
196
+
197
+ print("Polling initial Tertiary Fuel PIDs...")
198
+ for pid_command in TERTIARY_FUEL_PIDS:
199
+ try:
200
+ value = get_pid_value(connection, pid_command)
201
+ current_pid_values[pid_command.name] = value if value is not None else ''
202
+ initial_log_entry[pid_command.name] = current_pid_values[pid_command.name]
203
+ except Exception as e:
204
+ print(f"Warning: Failed to get {pid_command.name}: {e}")
205
+ current_pid_values[pid_command.name] = ''
206
+ initial_log_entry[pid_command.name] = ''
207
 
208
  for pid_obj in ALL_PIDS_TO_LOG:
209
  if pid_obj.name not in initial_log_entry:
210
+ initial_log_entry[pid_obj.name] = ''
211
+
212
+ # Empty driving style and fuel columns
213
+ initial_log_entry['Driving_style'] = ''
214
+ initial_log_entry['Fuel_efficiency (L/100km)'] = ''
215
+ initial_log_entry['Distance'] = ''
216
+ initial_log_entry['Fuel_Used'] = ''
217
+ initial_log_entry['Route'] = ''
218
 
219
  except Exception as e:
220
  print(f"An error occurred during connection or initial PID sample: {e}")
221
  if connection and connection.is_connected():
222
  connection.close()
223
+ return None, "quit"
224
 
225
  file_exists = os.path.isfile(original_csv_filepath)
226
  try:
227
  with open(original_csv_filepath, 'a', newline='') as csvfile:
228
+ header_names = ['timestamp'] + [pid.name for pid in ALL_PIDS_TO_LOG] + ['Driving_style', 'Fuel_efficiency (L/100km)', 'Distance', 'Fuel_Used', 'Route']
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  writer = csv.DictWriter(csvfile, fieldnames=header_names)
231
 
 
234
  print(f"Created new CSV file: {original_csv_filepath} with headers: {header_names}")
235
 
236
  if initial_log_entry:
 
 
 
 
237
  writer.writerow(initial_log_entry)
238
  csvfile.flush()
239
+ print(f"Logged initial full sample with all fuel efficiency PIDs.")
240
 
241
+ log_count = 0
242
+ user_stop_requested = False
243
 
244
+ print(f"Started logging")
 
 
245
 
246
+ while not user_stop_requested:
247
+ if log_count % 100 == 0 and log_count > 0:
248
+ print(f"Debug: Main loop running, iteration {log_count}")
249
+
250
+ # Check for non-blocking input
251
+ if select.select([sys.stdin], [], [], 0.0)[0]:
252
+ user_command = sys.stdin.readline().strip().lower()
253
+ if user_command == "next":
254
+ print("\nUser typed 'next'. Finishing current drive...")
255
+ user_stop_requested = True
256
+ break
257
+ elif user_command == "quit":
258
+ print("\nUser typed 'quit'. Stopping all logging...")
259
+ user_stop_requested = True
260
+ return original_csv_filepath, "quit"
261
+ else:
262
+ print(f"Input detected: '{user_command}'. Type 'next' or 'quit'.", end='\r')
263
+
264
  loop_start_time = time.monotonic()
265
  current_datetime = datetime.datetime.now()
266
  timestamp_iso = current_datetime.isoformat()
267
 
268
+ critical_reads = 0
269
+ secondary_reads = 0
270
+ tertiary_reads = 0
 
 
 
271
 
272
+ # Always poll critical PIDs (highest frequency)
273
+ if (time.monotonic() - last_critical_poll_time) >= CRITICAL_PID_INTERVAL:
274
+ if not connection or not connection.is_connected():
275
+ print("\nOBD connection lost during logging. Ending session.")
276
+ user_stop_requested = True
277
+ break
278
+
279
+ for pid_command in CRITICAL_FUEL_PIDS:
280
  value = get_pid_value(connection, pid_command)
281
  current_pid_values[pid_command.name] = value if value is not None else ''
282
  if value is not None:
283
+ critical_reads += 1
284
+ last_critical_poll_time = time.monotonic()
285
+
286
+ # Poll secondary PIDs at medium frequency
287
+ if (time.monotonic() - last_secondary_poll_time) >= SECONDARY_PID_INTERVAL:
288
+ if not connection or not connection.is_connected():
289
+ print("\nOBD connection lost during logging. Ending session.")
290
+ user_stop_requested = True
291
+ break
292
 
293
+ for pid_command in SECONDARY_FUEL_PIDS:
294
+ value = get_pid_value(connection, pid_command)
295
+ current_pid_values[pid_command.name] = value if value is not None else ''
296
+ if value is not None:
297
+ secondary_reads += 1
298
+ last_secondary_poll_time = time.monotonic()
299
+
300
+ # Poll tertiary PIDs at low frequency
301
+ if (time.monotonic() - last_tertiary_poll_time) >= TERTIARY_PID_INTERVAL:
302
+ if not connection or not connection.is_connected():
303
+ print("\nOBD connection lost during logging. Ending session.")
304
+ user_stop_requested = True
305
+ break
306
+
307
+ for pid_command in TERTIARY_FUEL_PIDS:
308
+ value = get_pid_value(connection, pid_command)
309
+ current_pid_values[pid_command.name] = value if value is not None else ''
310
+ if value is not None:
311
+ tertiary_reads += 1
312
+ last_tertiary_poll_time = time.monotonic()
313
 
314
  final_log_entry = {
315
+ 'timestamp': timestamp_iso
 
 
 
 
 
 
316
  }
 
317
  for pid_obj in ALL_PIDS_TO_LOG:
318
  final_log_entry[pid_obj.name] = current_pid_values.get(pid_obj.name, '')
319
 
320
+ final_log_entry['Driving_style'] = ''
321
+ final_log_entry['Fuel_efficiency (L/100km)'] = ''
322
+ final_log_entry['Distance'] = ''
323
+ final_log_entry['Fuel_Used'] = ''
324
+ final_log_entry['Route'] = ''
325
+
326
  writer.writerow(final_log_entry)
327
  csvfile.flush()
328
 
329
  log_count += 1
330
  if log_count % 10 == 0:
331
+ status_msg = f"Entry {log_count} - Critical: {critical_reads}/{len(CRITICAL_FUEL_PIDS)}"
332
+ if secondary_reads > 0:
333
+ status_msg += f" Secondary: {secondary_reads}/{len(SECONDARY_FUEL_PIDS)}"
334
+ if tertiary_reads > 0:
335
+ status_msg += f" Tertiary: {tertiary_reads}/{len(TERTIARY_FUEL_PIDS)}"
336
+ print(status_msg + " " * 20, end='\r')
337
 
338
  elapsed_time_in_loop = time.monotonic() - loop_start_time
339
  sleep_duration = max(0, BASE_LOG_INTERVAL - elapsed_time_in_loop)
 
344
  except Exception as e:
345
  print(f"An error occurred during logging: {e}")
346
  finally:
347
+ print(" " * 100, end='\r')
348
+ print(f"Drive completed - data saved to: {os.path.basename(original_csv_filepath)}")
 
 
349
 
350
+ return original_csv_filepath, "next"
351
 
352
+ def run_scorer_on_csv(original_csv_path):
353
+ if not SCORING_AVAILABLE:
354
+ print("Scoring module not available, skipping aggressiveness scoring")
355
  return None
356
 
357
+ if not original_csv_path or not os.path.exists(original_csv_path):
358
+ print(f"Error: Original CSV not found for scoring: {original_csv_path}")
359
+ return None
360
 
361
+ print(f"\nRunning aggressiveness scorer...")
 
 
362
 
363
+ original_filename = os.path.basename(original_csv_path)
364
+ base, ext = os.path.splitext(original_filename)
 
365
 
366
  try:
367
+ # Import and configure the scorer
368
+ from driving_aggressiveness_scorer import DrivingAggressivenessScorer
369
+ import json
370
+
371
+ # Initialize scorer with bounds file in logs directory
372
+ bounds_file = os.path.join(LOGS_BASE_DIR, 'obd_bounds.json')
373
+ scorer = DrivingAggressivenessScorer(bounds_file=bounds_file)
374
+
375
+ # Run analysis
376
+ df_scored, results = scorer.analyze_drive(str(original_csv_path), update_bounds=True)
377
+
378
+ df_scored['drive_score'] = results['final_score']
379
+
380
+ # Save scored CSV to ScoredLogs directory
381
+ scored_csv_path = os.path.join(SCORED_LOGS_DIR, f"{base}_scored{ext}")
382
+ df_scored.to_csv(scored_csv_path, index=False)
383
+ print(f"Scored CSV saved: {os.path.basename(scored_csv_path)}")
384
+
385
+ # Save summary JSON to ScoredLogs directory
386
+ summary_json_path = os.path.join(SCORED_LOGS_DIR, f"{base}_score_summary.json")
387
+ summary = {
388
+ 'timestamp': datetime.datetime.now().isoformat(),
389
+ 'original_file': str(original_csv_path),
390
+ 'scored_file': str(scored_csv_path),
391
+ 'results': results
392
+ }
393
+
394
+ with open(summary_json_path, 'w') as f:
395
+ json.dump(summary, f, indent=2)
396
+ print(f"Score summary saved: {os.path.basename(summary_json_path)}")
397
+
398
+ try:
399
+ from visualiseScorer import visualize_drive
400
+ visualization_path = os.path.join(SCORED_LOGS_DIR, f"{base}_visualization.png")
401
+ visualize_drive(df_scored, results, save_path=visualization_path)
402
+ print(f"Visualization saved: {os.path.basename(visualization_path)}")
403
+ except Exception as viz_error:
404
+ print(f"Warning: Could not generate visualization: {viz_error}")
405
+
406
+ # Print quick summary
407
+ print(f"Drive Score: {results['final_score']:.1f}/100")
408
+
409
+ return scored_csv_path
410
+
411
  except Exception as e:
412
+ print(f"Error running scorer: {e}")
413
+ import traceback
414
+ traceback.print_exc()
415
  return None
416
 
 
 
 
 
417
 
418
+ def initialize_obd_connection():
419
+ """Initialize OBD connection once for multiple sessions."""
420
+ connection = None
421
 
422
+ try:
423
+
424
+ print("Attempting to connect via socat PTY /dev/ttys006...")
425
+ connection = obd.OBD("/dev/ttys002", fast=True, timeout=30)
426
+
427
+ if not connection.is_connected():
428
+ print("Failed to connect to OBD-II adapter.")
429
+ print(f"Connection status: {connection.status()}")
430
+ return None
431
+
432
+ print(f"Successfully connected to OBD-II adapter: {connection.port_name()}")
433
+ print(f"Adapter status: {connection.status()}")
434
+ return connection
435
+
436
+ except Exception as e:
437
+ print(f"An error occurred during OBD connection: {e}")
438
+ return None
439
 
 
 
440
 
441
+ def main():
442
+ print("Fuel Efficiency OBD Logger - Multi-Session Mode")
443
+ if SCORING_AVAILABLE:
444
+ print("Aggressiveness scoring enabled")
445
+ print("=" * 50)
446
+
447
+ # Initialize OBD connection once
448
+ connection = initialize_obd_connection()
449
+ if not connection:
450
+ print("Could not establish OBD connection. Exiting.")
451
+ return
452
+
453
+ session_count = 0
454
+ logged_files = []
455
 
 
456
  try:
457
+ while True:
458
+ session_count += 1
459
+ print(f"\n📊 Session {session_count} ready to start")
460
+
461
+ # Check if connection is still available before starting new session
462
+ if not connection or not connection.is_connected():
463
+ print("OBD connection not available. Attempting to reconnect...")
464
+ connection = initialize_obd_connection()
465
+ if not connection:
466
+ print("Could not re-establish OBD connection. Exiting.")
467
+ break
468
+
469
+ result = perform_logging_session(connection)
470
+
471
+ if isinstance(result, tuple):
472
+ csv_file, command = result
473
+ else:
474
+ csv_file, command = result, "quit"
475
+
476
+ # Handle the result
477
+ if csv_file and os.path.exists(csv_file):
478
+ try:
479
+ with open(csv_file, 'r') as f:
480
+ lines = f.readlines()
481
+ if len(lines) > 1: # More than just the header
482
+ logged_files.append(csv_file)
483
+ print(f"Drive {session_count} saved: {os.path.basename(csv_file)}")
484
+
485
+ calculate_fuel_metrics(csv_file)
486
+
487
+ print(f"\nStarting aggressiveness scoring for drive {session_count}...")
488
+ scored_file = run_scorer_on_csv(csv_file)
489
+ if scored_file:
490
+ print(f"Aggressiveness scoring complete for drive {session_count}")
491
+ else:
492
+ print(f"Aggressiveness scoring failed for drive {session_count}, but drive data is still saved")
493
+
494
+ else:
495
+ print(f"⚠️ Drive {session_count} had no data, skipping analysis")
496
+ os.remove(csv_file)
497
+ except Exception as e:
498
+ print(f"Error checking file {csv_file}: {e}")
499
+
500
+ # Check if user wants to quit
501
+ if command == "quit":
502
+ print("\nStopping all logging as requested")
503
+ break
504
+
505
+ # Otherwise continue to next session
506
+ print(f"\n Ready for next drive (Session {session_count + 1})")
507
+
508
+ except KeyboardInterrupt:
509
+ print("\n Logging stopped by user (Ctrl+C)")
510
+
511
+ finally:
512
+ if connection and connection.is_connected():
513
+ print("Closing OBD-II connection...")
514
+ connection.close()
515
+
516
+ print("\n" + "=" * 50)
517
+ print(f"📈 LOGGING SUMMARY")
518
+ print(f"Total drives logged: {len(logged_files)}")
519
+ if logged_files:
520
+ print("📁 Files saved to:")
521
+ print(" - Raw logs: logs/FuelLogs/")
522
+ if SCORING_AVAILABLE:
523
+ print(" - Scored logs: logs/ScoredLogs/")
524
+ print("\n📝 Files created:")
525
+ for file in logged_files:
526
+ print(f" - {os.path.basename(file)}")
527
+ print("=" * 50)
528
 
 
 
529
 
530
+ if __name__ == "__main__":
531
+ main()
 
 
 
 
 
 
OBD/scorerConfig/scorerConfig.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ weights:
3
+ RPM: 0.25
4
+ THROTTLE_POS: 0.25
5
+ ENGINE_LOAD: 0.25
6
+ MAF: 0.25
7
+
8
+
9
+ # Spike Detection Thresholds
10
+ spike_thresholds:
11
+ moderate_spike: 65
12
+ extreme_spike: 85
13
+ spike_percentage_threshold: 3
14
+ extreme_percentage_threshold: 1
15
+
16
+ # Penalty Multipliers
17
+ penalty_multipliers:
18
+ p95_multiplier: 0.3
19
+ p99_multiplier: 0.5
20
+ spike_freq_multiplier: 2.0
21
+ extreme_freq_multiplier: 3.0
22
+
23
+ # Aggregate Score Calculation
24
+ aggregate_weights:
25
+ mean_weight: 0.7
26
+ p75_weight: 0.3
27
+
28
+ style_categories:
29
+ very_calm: [0, 20]
30
+ calm: [20, 40]
31
+ moderate: [40, 55]
32
+ aggressive: [55, 70]
33
+ very_aggressive: [70, 100]
34
+
35
+ bounds:
36
+ file: "obd_bounds.json"
37
+ auto_update: true # Automatically update bounds with new data
38
+
39
+
40
+ theoretical_maxes:
41
+ RPM: 6000
42
+ THROTTLE_POS: 100
43
+ ENGINE_LOAD: 100
44
+ MAF: 300
45
+ SPEED: 250
46
+ INTAKE_PRESSURE: 250
47
+
48
+ theoretical_mins:
49
+ RPM: 0
50
+ THROTTLE_POS: 0
51
+ ENGINE_LOAD: 0
52
+ MAF: 0
53
+ SPEED: 0
54
+ INTAKE_PRESSURE: 0
55
+
56
+ output:
57
+ save_scored_csv: true
58
+ visualization: true
59
+ verbose: true
OBD/visualiseScorer.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from driving_aggressiveness_scorer import DrivingAggressivenessScorer
5
+
6
+
7
+ def visualize_drive(df_scored: pd.DataFrame, results: dict, save_path: str = None):
8
+ """
9
+ Create comprehensive visualization of drive analysis.
10
+
11
+ Args:
12
+ df_scored: DataFrame with aggressiveness scores
13
+ results: Aggregate results dictionary
14
+ save_path: Optional path to save figure
15
+ """
16
+ fig, axes = plt.subplots(3, 2, figsize=(15, 12))
17
+ fig.suptitle(f"Drive Analysis - Score: {results['final_score']:.1f}/100",
18
+ fontsize=16, fontweight='bold')
19
+
20
+ # 1. Aggressiveness Score Over Time
21
+ ax = axes[0, 0]
22
+ ax.plot(df_scored['aggressiveness_score'], linewidth=1, color='#2E86AB')
23
+ ax.axhline(y=results['mean_score'], color='green', linestyle='--',
24
+ label=f"Mean: {results['mean_score']:.1f}")
25
+ ax.axhline(y=70, color='orange', linestyle='--', alpha=0.5, label='Spike Threshold')
26
+ ax.axhline(y=85, color='red', linestyle='--', alpha=0.5, label='Extreme Threshold')
27
+ ax.set_title('Aggressiveness Score Timeline')
28
+ ax.set_ylabel('Score (0-100)')
29
+ ax.set_xlabel('Sample Number')
30
+ ax.legend()
31
+ ax.grid(True, alpha=0.3)
32
+
33
+ # 2. Score Distribution
34
+ ax = axes[0, 1]
35
+ ax.hist(df_scored['aggressiveness_score'], bins=50, color='#A23B72', alpha=0.7, edgecolor='black')
36
+ ax.axvline(x=results['mean_score'], color='green', linestyle='--', linewidth=2, label='Mean')
37
+ ax.axvline(x=results['median_score'], color='blue', linestyle='--', linewidth=2, label='Median')
38
+ ax.set_title('Score Distribution')
39
+ ax.set_xlabel('Aggressiveness Score')
40
+ ax.set_ylabel('Frequency')
41
+ ax.legend()
42
+ ax.grid(True, alpha=0.3)
43
+
44
+ # 3. RPM vs Throttle Position (colored by score)
45
+ ax = axes[1, 0]
46
+ scatter = ax.scatter(df_scored['THROTTLE_POS'], df_scored['RPM'],
47
+ c=df_scored['aggressiveness_score'], cmap='RdYlGn_r',
48
+ s=10, alpha=0.6)
49
+ ax.set_title('RPM vs Throttle Position')
50
+ ax.set_xlabel('Throttle Position (%)')
51
+ ax.set_ylabel('RPM')
52
+ plt.colorbar(scatter, ax=ax, label='Aggressiveness')
53
+ ax.grid(True, alpha=0.3)
54
+
55
+ # 4. Speed vs Engine Load (colored by score)
56
+ ax = axes[1, 1]
57
+ scatter = ax.scatter(df_scored['SPEED'], df_scored['ENGINE_LOAD'],
58
+ c=df_scored['aggressiveness_score'], cmap='RdYlGn_r',
59
+ s=10, alpha=0.6)
60
+ ax.set_title('Speed vs Engine Load')
61
+ ax.set_xlabel('Speed (km/h)')
62
+ ax.set_ylabel('Engine Load (%)')
63
+ plt.colorbar(scatter, ax=ax, label='Aggressiveness')
64
+ ax.grid(True, alpha=0.3)
65
+
66
+ # 5. Key Metrics Over Time
67
+ ax = axes[2, 0]
68
+ ax2 = ax.twinx()
69
+
70
+ ln1 = ax.plot(df_scored['RPM'] / 100, label='RPM/100', color='#E63946', linewidth=0.8)
71
+ ln2 = ax.plot(df_scored['THROTTLE_POS'], label='Throttle %', color='#F77F00', linewidth=0.8)
72
+ ln3 = ax2.plot(df_scored['SPEED'], label='Speed', color='#06FFA5', linewidth=0.8)
73
+
74
+ ax.set_title('Key Metrics Timeline')
75
+ ax.set_xlabel('Sample Number')
76
+ ax.set_ylabel('RPM/100 & Throttle %')
77
+ ax2.set_ylabel('Speed (km/h)')
78
+
79
+ # Combine legends
80
+ lns = ln1 + ln2 + ln3
81
+ labs = [l.get_label() for l in lns]
82
+ ax.legend(lns, labs, loc='upper left')
83
+ ax.grid(True, alpha=0.3)
84
+
85
+ # 6. Score Statistics Summary
86
+ ax = axes[2, 1]
87
+ ax.axis('off')
88
+
89
+ stats_text = f"""
90
+ AGGREGATE SCORE BREAKDOWN
91
+ {'─' * 40}
92
+
93
+ Final Score: {results['final_score']:.1f} / 100
94
+
95
+ SCORE STATISTICS
96
+ Mean: {results['mean_score']:.1f}
97
+ Median: {results['median_score']:.1f}
98
+ Std Dev: {results['std_score']:.1f}
99
+
100
+ PERCENTILES
101
+ 75th: {results['p75_score']:.1f}
102
+ 90th: {results['p90_score']:.1f}
103
+ 95th: {results['p95_score']:.1f}
104
+ 99th: {results['p99_score']:.1f}
105
+ Max: {results['max_score']:.1f}
106
+
107
+ SPIKE ANALYSIS
108
+ Spikes (>70): {results['spike_percentage']:.1f}%
109
+ Extreme (>85): {results['extreme_percentage']:.1f}%
110
+ Spike Penalty: +{results['spike_penalty']:.1f}
111
+ """
112
+
113
+ ax.text(0.1, 0.95, stats_text, transform=ax.transAxes,
114
+ fontfamily='monospace', fontsize=10, verticalalignment='top',
115
+ bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))
116
+
117
+ plt.tight_layout()
118
+
119
+ if save_path:
120
+ plt.savefig(save_path, dpi=150, bbox_inches='tight')
121
+ print(f"✓ Visualization saved to {save_path}")
122
+ plt.close()
123
+ else:
124
+ plt.show()
125
+
126
+
127
+ def compare_drives(scorer: DrivingAggressivenessScorer, csv_paths: list):
128
+ """
129
+ Compare multiple drives side-by-side.
130
+
131
+ Args:
132
+ scorer: DrivingAggressivenessScorer instance
133
+ csv_paths: List of CSV file paths to compare
134
+ """
135
+ results_list = []
136
+
137
+ for csv_path in csv_paths:
138
+ _, results = scorer.analyze_drive(csv_path, update_bounds=True)
139
+ results['file'] = csv_path
140
+ results_list.append(results)
141
+
142
+ # Create comparison DataFrame
143
+ comparison_df = pd.DataFrame(results_list)
144
+
145
+ print("\n" + "="*80)
146
+ print("DRIVE COMPARISON")
147
+ print("="*80)
148
+ print(comparison_df[['file', 'final_score', 'mean_score',
149
+ 'spike_percentage', 'spike_penalty']].to_string(index=False))
150
+ print("="*80 + "\n")
151
+
152
+ return comparison_df
153
+
154
+
155
+ def batch_analyze_folder(folder_path: str, pattern: str = "*.csv"):
156
+
157
+ from pathlib import Path
158
+
159
+ scorer = DrivingAggressivenessScorer()
160
+ csv_files = list(Path(folder_path).glob(pattern))
161
+
162
+ if not csv_files:
163
+ print(f"No CSV files found in {folder_path}")
164
+ return
165
+
166
+ print(f"Found {len(csv_files)} CSV files")
167
+
168
+ all_results = []
169
+ for csv_file in csv_files:
170
+ try:
171
+ df_scored, results = scorer.analyze_drive(str(csv_file), update_bounds=True)
172
+ results['filename'] = csv_file.name
173
+ all_results.append(results)
174
+
175
+ # Save individual scored file
176
+ output_path = csv_file.parent / f"{csv_file.stem}_scored.csv"
177
+ df_scored.to_csv(output_path, index=False)
178
+
179
+ except Exception as e:
180
+ print(f"Error processing {csv_file}: {e}")
181
+ continue
182
+
183
+ summary_df = pd.DataFrame(all_results)
184
+ summary_path = Path(folder_path) / "drive_summary_report.csv"
185
+ summary_df.to_csv(summary_path, index=False)
186
+ print(f"\n✓ Summary report saved to {summary_path}")
187
+
188
+ return summary_df
189
+
190
+
191
+ def export_bounds_report(scorer: DrivingAggressivenessScorer, output_path: str = "bounds_report.txt"):
192
+ bounds = scorer.get_current_bounds()
193
+
194
+ report = []
195
+ report.append("="*60)
196
+ report.append("DRIVING AGGRESSIVENESS SCORER - BOUNDS REPORT")
197
+ report.append("="*60)
198
+ report.append(f"\nGenerated: {pd.Timestamp.now()}\n")
199
+
200
+ report.append("PARAMETER WEIGHTS:")
201
+ report.append("-"*60)
202
+ for param, weight in scorer.weights.items():
203
+ report.append(f"{param:20s}: {weight:.3f} ({weight*100:.1f}%)")
204
+
205
+ report.append("\n\nCURRENT BOUNDS:")
206
+ report.append("-"*60)
207
+ report.append(f"{'Parameter':<20s} {'Min':>12s} {'Max':>12s} {'Range':>12s}")
208
+ report.append("-"*60)
209
+
210
+ for param in scorer.weights.keys():
211
+ min_val = bounds[param]['min']
212
+ max_val = bounds[param]['max']
213
+ range_val = max_val - min_val
214
+ report.append(f"{param:<20s} {min_val:>12.2f} {max_val:>12.2f} {range_val:>12.2f}")
215
+
216
+ report.append("="*60)
217
+
218
+ report_text = "\n".join(report)
219
+
220
+ with open(output_path, 'w') as f:
221
+ f.write(report_text)
222
+
223
+ print(report_text)
224
+ print(f"\n✓ Report saved to {output_path}")
225
+
226
+
227
+ # Example usage
228
+ if __name__ == "__main__":
229
+ scorer = DrivingAggressivenessScorer()
230
+
231
+ csv_path = 'obd_data_log_20251012_121810.csv'
232
+ df_scored, results = scorer.analyze_drive(csv_path)
233
+ visualize_drive(df_scored, results, save_path='drive_analysis.png')
234
+
235
+
236
+ # Export bounds report
237
+ export_bounds_report(scorer)
README.md CHANGED
@@ -11,7 +11,7 @@ short_description: OBD-logging FastAPI server with data processing pipelines
11
 
12
  # OBD Logger
13
 
14
- A comprehensive OBD-II data logging and processing system built with FastAPI, featuring advanced data cleaning, Google Drive integration, MongoDB storage capabilities, and **Reinforcement Learning from Human Feedback (RLHF)** for driver behavior classification.
15
 
16
  ![System Architecture](diagram/diagram.svg)
17
 
@@ -24,6 +24,7 @@ A comprehensive OBD-II data logging and processing system built with FastAPI, fe
24
  - Firebase for structured data storage and querying
25
  - MongoDB Atlas for structured data storage and querying
26
  - **Driver Behavior Classification**: XGBoost-based ML model for driving style prediction
 
27
  - **RLHF Training System**: Continuous model improvement through human feedback
28
  - **Data Visualization**: Automatic generation of correlation heatmaps and trend plots
29
  - **RESTful API**: Comprehensive endpoints for data management and retrieval
@@ -45,6 +46,9 @@ The application is structured into modular components:
45
  - **`rlhf.py`**: Main RLHF training pipeline for continuous model improvement
46
  - **`OBD/`**: OBD-specific modules for data analysis and logging
47
  - **`utils/`**: Utility modules for model management and data processing
 
 
 
48
 
49
  ## Quick Start
50
 
@@ -58,8 +62,10 @@ The application is structured into modular components:
58
  - `FIREBASE_SERVICE_ACCOUNT_JSON`: Firebase connection string
59
  - `FIREBASE_ADMIN_JSON`: Firebase Admin SDK credentials
60
  - `HF_TOKEN`: Hugging Face authentication token
61
- - `HF_MODEL_REPO`: Hugging Face model repository (default: `BinKhoaLe1812/Driver_Behavior_OBD`)
62
- - `MODEL_DIR`: Local model directory (default: `/app/models/ul`)
 
 
63
 
64
  3. **Run the Application**:
65
  ```bash
 
11
 
12
  # OBD Logger
13
 
14
+ A comprehensive OBD-II data logging and processing system built with FastAPI, featuring advanced data cleaning, Google Drive integration, MongoDB storage capabilities, **Reinforcement Learning from Human Feedback (RLHF)** for driver behavior classification, and **fuel efficiency scoring** using machine learning models.
15
 
16
  ![System Architecture](diagram/diagram.svg)
17
 
 
24
  - Firebase for structured data storage and querying
25
  - MongoDB Atlas for structured data storage and querying
26
  - **Driver Behavior Classification**: XGBoost-based ML model for driving style prediction
27
+ - **Fuel Efficiency Scoring**: ML model for drive-level fuel efficiency prediction (0-100%)
28
  - **RLHF Training System**: Continuous model improvement through human feedback
29
  - **Data Visualization**: Automatic generation of correlation heatmaps and trend plots
30
  - **RESTful API**: Comprehensive endpoints for data management and retrieval
 
46
  - **`rlhf.py`**: Main RLHF training pipeline for continuous model improvement
47
  - **`OBD/`**: OBD-specific modules for data analysis and logging
48
  - **`utils/`**: Utility modules for model management and data processing
49
+ - **`efficiency/`**: Fuel efficiency model training and evaluation
50
+ - **`retrain.py`**: Train and upload fuel efficiency models to Hugging Face
51
+ - **`eval.py`**: Evaluate fuel efficiency on OBD data
52
 
53
  ## Quick Start
54
 
 
62
  - `FIREBASE_SERVICE_ACCOUNT_JSON`: Firebase connection string
63
  - `FIREBASE_ADMIN_JSON`: Firebase Admin SDK credentials
64
  - `HF_TOKEN`: Hugging Face authentication token
65
+ - `HF_MODEL_REPO`: Driver behavior model repository (default: `BinKhoaLe1812/Driver_Behavior_OBD`)
66
+ - `HF_EFFICIENCY_MODEL_REPO`: Fuel efficiency model repository (default: `BinKhoaLe1812/Fuel_Efficiency_OBD`)
67
+ - `MODEL_DIR`: Driver behavior model directory (default: `/app/models/ul`)
68
+ - `EFFICIENCY_MODEL_DIR`: Fuel efficiency model directory (default: `/app/models/efficiency`)
69
 
70
  3. **Run the Application**:
71
  ```bash
app.py CHANGED
@@ -15,8 +15,8 @@ import numpy as np
15
  import matplotlib.pyplot as plt
16
  import seaborn as sns
17
  from sklearn.preprocessing import MinMaxScaler, StandardScaler
18
- from sklearn.impute import KNNImputer
19
  # Utils
 
20
  import os, datetime, json, logging, re
21
  from datetime import timedelta
22
  import pathlib
@@ -29,7 +29,10 @@ from data.mongo_saver import MongoSaver, save_csv_to_mongo, save_dataframe_to_mo
29
  from data.firebase_saver import FirebaseSaver, save_csv_increment, save_dataframe_increment
30
 
31
  # UL Model
32
- from utils.ul_label import ULLabeler
 
 
 
33
 
34
  # RLHF Training
35
  from train import RLHFTrainer
@@ -58,6 +61,7 @@ os.makedirs(CLEANED_DIR, exist_ok=True)
58
  os.makedirs(PLOT_DIR, exist_ok=True)
59
 
60
  DRIVE_STYLE = [] # latest UL predictions (string labels) — overwritten each run
 
61
 
62
  # Init temp empty file
63
  if not os.path.exists(RAW_CSV):
@@ -78,7 +82,7 @@ async def startup_event():
78
  """Download models on app startup"""
79
  try:
80
  logger.info("🚀 Starting model download...")
81
- from utils.download import download_latest_models
82
 
83
  # Load .env file if it exists
84
  env_path = pathlib.Path(".env")
@@ -96,7 +100,20 @@ async def startup_event():
96
  if success:
97
  logger.info("✅ Models downloaded successfully on startup")
98
  else:
99
- logger.warning("⚠️ Model download failed on startup - some features may not work")
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  except Exception as e:
102
  logger.error(f"❌ Startup model download failed: {e}")
@@ -457,6 +474,26 @@ def _process_and_save(df, norm_ts):
457
  logger.info(f"✅ UL labels generated ({len(DRIVE_STYLE)}) → {labeled_path}")
458
  except Exception as e:
459
  logger.error(f"❌ UL labeling failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  # 10) Plots
461
  _plot_corr(df, norm_ts)
462
  _plot_trend(df, norm_ts)
@@ -528,28 +565,56 @@ def health():
528
  def models_status():
529
  """Check if models are loaded and available"""
530
  try:
531
- model_dir = pathlib.Path(os.getenv("MODEL_DIR", "/app/models/ul"))
532
- required_files = ["label_encoder_ul.pkl", "scaler_ul.pkl", "xgb_drivestyle_ul.pkl"]
 
533
 
534
- available_files = []
535
- missing_files = []
536
 
537
- for file in required_files:
538
- file_path = model_dir / file
539
  if file_path.exists():
540
- available_files.append(file)
541
  else:
542
- missing_files.append(file)
543
 
544
- status = "ready" if len(available_files) == len(required_files) else "loading"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
  return {
547
- "status": status,
548
- "model_directory": str(model_dir),
549
- "available_files": available_files,
550
- "missing_files": missing_files,
551
- "total_files": len(required_files),
552
- "loaded_files": len(available_files)
 
 
 
 
 
 
 
 
 
 
 
553
  }
554
  except Exception as e:
555
  return {
@@ -564,6 +629,17 @@ def models_status():
564
  def get_events():
565
  return PIPELINE_EVENTS
566
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
  # ────── Delete event from dashboard ──────────────
569
  @app.delete("/events/remove/{timestamp}")
@@ -845,7 +921,7 @@ async def get_latest_model_version():
845
  Get the latest model version information for the UI.
846
  """
847
  try:
848
- from utils.download import get_latest_version
849
 
850
  # Get the latest version from Hugging Face
851
  latest_version = get_latest_version()
@@ -872,4 +948,4 @@ async def get_latest_model_version():
872
  raise HTTPException(
873
  status_code=500,
874
  detail=f"Failed to get latest model version: {str(e)}"
875
- )
 
15
  import matplotlib.pyplot as plt
16
  import seaborn as sns
17
  from sklearn.preprocessing import MinMaxScaler, StandardScaler
 
18
  # Utils
19
+ from sklearn.impute import KNNImputer
20
  import os, datetime, json, logging, re
21
  from datetime import timedelta
22
  import pathlib
 
29
  from data.firebase_saver import FirebaseSaver, save_csv_increment, save_dataframe_increment
30
 
31
  # UL Model
32
+ from utils.dbehavior_labeler import ULLabeler
33
+
34
+ # Fuel Efficiency Model
35
+ from utils.efficiency_labeler import EfficiencyLabeler
36
 
37
  # RLHF Training
38
  from train import RLHFTrainer
 
61
  os.makedirs(PLOT_DIR, exist_ok=True)
62
 
63
  DRIVE_STYLE = [] # latest UL predictions (string labels) — overwritten each run
64
+ FUEL_EFFICIENCY = [] # latest fuel efficiency predictions (0-100%) — overwritten each run
65
 
66
  # Init temp empty file
67
  if not os.path.exists(RAW_CSV):
 
82
  """Download models on app startup"""
83
  try:
84
  logger.info("🚀 Starting model download...")
85
+ from utils.dbehavior_download import download_latest_models
86
 
87
  # Load .env file if it exists
88
  env_path = pathlib.Path(".env")
 
100
  if success:
101
  logger.info("✅ Models downloaded successfully on startup")
102
  else:
103
+ logger.warning("⚠️ Driver behavior model download failed - some features may not work")
104
+
105
+ # Download fuel efficiency models
106
+ from utils.efficiency_download import download_latest_efficiency_models
107
+ success_efficiency = download_latest_efficiency_models()
108
+ if success_efficiency:
109
+ logger.info("✅ Fuel efficiency models downloaded successfully")
110
+ else:
111
+ logger.warning("⚠️ Fuel efficiency model download failed - some features may not work")
112
+
113
+ if success_ul or success_efficiency:
114
+ logger.info("✅ At least one model type downloaded successfully")
115
+ else:
116
+ logger.warning("⚠️ All model downloads failed - some features may not work")
117
 
118
  except Exception as e:
119
  logger.error(f"❌ Startup model download failed: {e}")
 
474
  logger.info(f"✅ UL labels generated ({len(DRIVE_STYLE)}) → {labeled_path}")
475
  except Exception as e:
476
  logger.error(f"❌ UL labeling failed: {e}")
477
+
478
+ # 9.5) Fuel efficiency predictions
479
+ efficiency_path = None
480
+ try:
481
+ efficiency_labeler = EfficiencyLabeler.get()
482
+ efficiency_preds = efficiency_labeler.predict_df(df)
483
+ # update global FUEL_EFFICIENCY (overwrite if already exists)
484
+ global FUEL_EFFICIENCY
485
+ FUEL_EFFICIENCY = [float(p) for p in efficiency_preds]
486
+ # write efficiency CSV (fuel_efficiency column)
487
+ df_efficiency = df_for_persist.copy()
488
+ df_efficiency["fuel_efficiency"] = FUEL_EFFICIENCY
489
+ efficiency_path = os.path.join(CLEANED_DIR, f"cleaned_{norm_ts}_efficiency.csv")
490
+ df_efficiency.to_csv(efficiency_path, index=False)
491
+ df_for_persist = df_efficiency
492
+ # Update the global FUEL_EFFICIENCY list
493
+ logger.info(f"✅ Fuel efficiency scores generated ({len(FUEL_EFFICIENCY)}) → {efficiency_path}")
494
+ logger.info(f"📊 Drive efficiency: {FUEL_EFFICIENCY[0]:.1f}%" if FUEL_EFFICIENCY else "No efficiency score")
495
+ except Exception as e:
496
+ logger.error(f"❌ Fuel efficiency scoring failed: {e}")
497
  # 10) Plots
498
  _plot_corr(df, norm_ts)
499
  _plot_trend(df, norm_ts)
 
565
  def models_status():
566
  """Check if models are loaded and available"""
567
  try:
568
+ # Driver behavior model status
569
+ ul_model_dir = pathlib.Path(os.getenv("MODEL_DIR", "/app/models/ul"))
570
+ ul_required_files = ["label_encoder_ul.pkl", "scaler_ul.pkl", "xgb_drivestyle_ul.pkl"]
571
 
572
+ ul_available_files = []
573
+ ul_missing_files = []
574
 
575
+ for file in ul_required_files:
576
+ file_path = ul_model_dir / file
577
  if file_path.exists():
578
+ ul_available_files.append(file)
579
  else:
580
+ ul_missing_files.append(file)
581
 
582
+ ul_status = "ready" if len(ul_available_files) == len(ul_required_files) else "loading"
583
+
584
+ # Fuel efficiency model status
585
+ efficiency_model_dir = pathlib.Path(os.getenv("EFFICIENCY_MODEL_DIR", "/app/models/efficiency"))
586
+ efficiency_required_files = ["efficiency_model.joblib"]
587
+
588
+ efficiency_available_files = []
589
+ efficiency_missing_files = []
590
+
591
+ for file in efficiency_required_files:
592
+ file_path = efficiency_model_dir / file
593
+ if file_path.exists():
594
+ efficiency_available_files.append(file)
595
+ else:
596
+ efficiency_missing_files.append(file)
597
+
598
+ efficiency_status = "ready" if len(efficiency_available_files) == len(efficiency_required_files) else "loading"
599
 
600
  return {
601
+ "driver_behavior": {
602
+ "status": ul_status,
603
+ "model_directory": str(ul_model_dir),
604
+ "available_files": ul_available_files,
605
+ "missing_files": ul_missing_files,
606
+ "total_files": len(ul_required_files),
607
+ "loaded_files": len(ul_available_files)
608
+ },
609
+ "fuel_efficiency": {
610
+ "status": efficiency_status,
611
+ "model_directory": str(efficiency_model_dir),
612
+ "available_files": efficiency_available_files,
613
+ "missing_files": efficiency_missing_files,
614
+ "total_files": len(efficiency_required_files),
615
+ "loaded_files": len(efficiency_available_files)
616
+ },
617
+ "overall_status": "ready" if (ul_status == "ready" and efficiency_status == "ready") else "loading"
618
  }
619
  except Exception as e:
620
  return {
 
629
  def get_events():
630
  return PIPELINE_EVENTS
631
 
632
+ @app.get("/predictions/latest")
633
+ def get_latest_predictions():
634
+ """Get the latest driver behavior and fuel efficiency predictions"""
635
+ return {
636
+ "driver_behavior": DRIVE_STYLE,
637
+ "fuel_efficiency": FUEL_EFFICIENCY,
638
+ "timestamp": datetime.datetime.now().isoformat(),
639
+ "driver_behavior_count": len(DRIVE_STYLE),
640
+ "fuel_efficiency_count": len(FUEL_EFFICIENCY)
641
+ }
642
+
643
 
644
  # ────── Delete event from dashboard ──────────────
645
  @app.delete("/events/remove/{timestamp}")
 
921
  Get the latest model version information for the UI.
922
  """
923
  try:
924
+ from utils.dbehavior_download import get_latest_version
925
 
926
  # Get the latest version from Hugging Face
927
  latest_version = get_latest_version()
 
948
  raise HTTPException(
949
  status_code=500,
950
  detail=f"Failed to get latest model version: {str(e)}"
951
+ )
bulk_mongo_upload.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Bulk MongoDB Upload Script for Fuel Efficiency Data
4
+ Processes all pending CSV files and uploads them to MongoDB when WiFi is available.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import glob
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+
13
+ # Load environment variables from .env file
14
+ try:
15
+ from dotenv import load_dotenv
16
+ load_dotenv()
17
+ except ImportError:
18
+ print("⚠️ python-dotenv not installed. Using system environment variables only.")
19
+ print(" Install with: pip install python-dotenv")
20
+
21
+ # Add parent directory to path to import mongo_saver
22
+ current_dir = os.path.dirname(__file__)
23
+ sys.path.append(current_dir)
24
+
25
+ from mongo_saver import save_csv_to_mongo
26
+
27
def check_mongodb_config():
    """Check if MongoDB configuration is available.

    Returns:
        bool: True when MONGO_URI is set in the environment, False otherwise.
    """
    mongo_uri = os.getenv("MONGO_URI")
    if not mongo_uri:
        print("Error: MONGO_URI not found in .env file")
        return False

    # BUG FIX: message previously contained a stray ')' ("MongoDB URI configured)").
    # Deliberately do NOT echo the URI itself - it may embed credentials.
    print("MongoDB URI configured")
    return True
36
+
37
def find_pending_csv_files(logs_dir):
    """Find all OBD CSV files that haven't been uploaded yet."""
    fuel_logs_dir = os.path.join(logs_dir, "FuelLogs")
    if not os.path.exists(fuel_logs_dir):
        print(f"FuelLogs directory not found: {fuel_logs_dir}")
        return []

    # Match the logger's naming pattern, newest sessions first.
    matches = glob.glob(os.path.join(fuel_logs_dir, "obd_data_log_*.csv"))
    csv_files = sorted(matches, key=os.path.getmtime, reverse=True)

    print(f"Found {len(csv_files)} fuel efficiency CSV files to process")
    return csv_files
54
+
55
def create_session_id_from_filename(csv_filepath):
    """Generate a session ID from the CSV filename."""
    # obd_data_log_20231201_120000.csv -> fuel_efficiency_20231201_120000
    name = os.path.basename(csv_filepath)
    return name.replace('obd_data_log_', 'fuel_efficiency_').replace('.csv', '')
61
+
62
def upload_csv_files_to_mongo(csv_files, max_uploads=None):
    """Upload each CSV to MongoDB, moving successes into a 'processed' folder.

    Args:
        csv_files: CSV file paths to upload (expected newest first).
        max_uploads: optional cap on how many files to process this batch.
    """
    if not csv_files:
        print("No CSV files to upload")
        return

    # Limit uploads if specified
    if max_uploads:
        csv_files = csv_files[:max_uploads]
        print(f"Limiting upload to {max_uploads} files for this batch")

    upload_stats = {
        'successful': 0,
        'failed': 0,
        'total': len(csv_files)
    }

    print(f"Starting bulk upload of {len(csv_files)} fuel efficiency sessions...")
    print("=" * 60)

    for i, csv_file in enumerate(csv_files, 1):
        try:
            # Generate session ID
            session_id = create_session_id_from_filename(csv_file)
            filename = os.path.basename(csv_file)

            # BUG FIX: `filename` was computed but never interpolated into
            # these messages (they printed a literal placeholder instead).
            print(f"[{i}/{len(csv_files)}] Processing: {filename}")
            print(f"  Session ID: {session_id}")

            success = save_csv_to_mongo(csv_file, session_id)

            if success:
                upload_stats['successful'] += 1
                print("Upload successful")
                # Keep the source directory clean so reruns skip this file.
                move_to_processed_folder(csv_file)
            else:
                upload_stats['failed'] += 1
                print("Upload failed")

        except Exception as e:
            upload_stats['failed'] += 1
            print(f"Error processing {os.path.basename(csv_file)}: {e}")

        print("-" * 40)

    # Print summary
    print("=" * 60)
    print("BULK UPLOAD SUMMARY")
    print(f"Successful uploads: {upload_stats['successful']}")
    print(f"Failed uploads: {upload_stats['failed']}")
    print(f"Total processed: {upload_stats['total']}")

    success_rate = (upload_stats['successful'] / upload_stats['total']) * 100 if upload_stats['total'] > 0 else 0
    print(f"Success rate: {success_rate:.1f}%")
117
+
118
def move_to_processed_folder(csv_file):
    """Move a successfully uploaded CSV into a sibling 'processed' folder.

    Best-effort: failures are reported but never raised, so a move problem
    cannot abort the surrounding bulk upload.
    """
    try:
        # Create processed folder if it doesn't exist
        processed_dir = os.path.join(os.path.dirname(csv_file), "processed")
        os.makedirs(processed_dir, exist_ok=True)

        # Move file
        filename = os.path.basename(csv_file)
        new_path = os.path.join(processed_dir, filename)
        os.rename(csv_file, new_path)
        # BUG FIX: the filename was never interpolated into this message.
        print(f"Moved to processed folder: {filename}")

    except Exception as e:
        print(f"Could not move file to processed folder: {e}")
133
+
134
def main():
    """Run the interactive bulk upload: find pending CSVs, confirm, upload."""
    print("Fuel Efficiency Data - Bulk MongoDB Upload")
    print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # Check MongoDB configuration first
    if not check_mongodb_config():
        return

    # Find logs directory (relative to script location)
    logs_dir = os.path.abspath(os.path.join(current_dir, "..", "logs"))
    print(f"Searching for CSV files in: {logs_dir}")

    # Find pending CSV files
    csv_files = find_pending_csv_files(logs_dir)
    if not csv_files:
        print("No pending CSV files to upload - all caught up!")
        return

    # Show files to be processed
    print("\nFiles to upload:")
    for i, csv_file in enumerate(csv_files[:10], 1):  # Show first 10
        filename = os.path.basename(csv_file)
        mod_time = datetime.fromtimestamp(os.path.getmtime(csv_file))
        # BUG FIX: the filename was never interpolated into the listing.
        print(f"  {i}. {filename} (modified: {mod_time.strftime('%Y-%m-%d %H:%M')})")

    if len(csv_files) > 10:
        print(f"  ... and {len(csv_files) - 10} more files")

    # Confirm upload
    print(f"\nUpload {len(csv_files)} fuel efficiency sessions to MongoDB? (y/n): ", end="")
    response = input().strip().lower()
    if response not in ['y', 'yes']:
        print("Upload cancelled by user")
        return

    # Perform bulk upload
    upload_csv_files_to_mongo(csv_files)

    print(f"\nBulk upload completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
179
+
180
+ if __name__ == "__main__":
181
+ main()
data/mongo_saver.py CHANGED
@@ -53,7 +53,8 @@ class MongoSaver:
53
  self.mongo_uri,
54
  serverSelectionTimeoutMS=5000, # 5 second timeout
55
  connectTimeoutMS=10000, # 10 second connection timeout
56
- socketTimeoutMS=10000 # 10 second socket timeout
 
57
  )
58
 
59
  # Test connection
 
53
  self.mongo_uri,
54
  serverSelectionTimeoutMS=5000, # 5 second timeout
55
  connectTimeoutMS=10000, # 10 second connection timeout
56
+ socketTimeoutMS=10000, # 10 second socket timeout
57
+ tlsAllowInvalidCertificates=True # Fix for SSL certificate issues on macOS
58
  )
59
 
60
  # Test connection
efficiency/eval.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fuel Efficiency Model Evaluation Script
3
+ Integration-ready evaluation script for fuel efficiency scoring in the main pipeline
4
+ Based on the original eval.py but reformatted for system integration
5
+ """
6
+
7
+ import os
8
+ import glob
9
+ import joblib
10
+ import logging
11
+ import numpy as np
12
+ import pandas as pd
13
+ from pathlib import Path
14
+ from typing import List, Dict, Any, Optional, Tuple
15
+ from datetime import datetime
16
+
17
+ logger = logging.getLogger("efficiency-eval")
18
+ logger.setLevel(logging.INFO)
19
+ if not logger.handlers:
20
+ handler = logging.StreamHandler()
21
+ handler.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
22
+ logger.addHandler(handler)
23
+
24
+ # Constants
25
+ KMH_TO_MS = 1000.0/3600.0
26
+
27
+ class EfficiencyEvaluator:
28
+ """
29
+ Fuel efficiency evaluator for OBD data using trained model.
30
+ Provides drive-level efficiency scoring for integration into main pipeline.
31
+ """
32
+
33
+ def __init__(self, model_path: Optional[str] = None):
34
+ """
35
+ Initialize the evaluator.
36
+
37
+ Args:
38
+ model_path: Path to the trained model. If None, will try to load from default location.
39
+ """
40
+ self.model_path = model_path or self._find_model_path()
41
+ self.model_artifacts = None
42
+ self.metadata = None
43
+ self._load_model()
44
+
45
+ def _find_model_path(self) -> str:
46
+ """Find the model path from various possible locations"""
47
+ possible_paths = [
48
+ "./efficiency_export/efficiency_model.joblib",
49
+ "/app/models/efficiency/efficiency_model.joblib",
50
+ "./efficiency_model.joblib"
51
+ ]
52
+
53
+ for path in possible_paths:
54
+ if os.path.exists(path):
55
+ logger.info(f"📁 Found model at: {path}")
56
+ return path
57
+
58
+ # Try to download from Hugging Face
59
+ logger.warning("⚠️ Model not found locally, attempting download...")
60
+ try:
61
+ from utils.efficiency_download import download_latest_efficiency_models
62
+ success = download_latest_efficiency_models()
63
+ if success:
64
+ return "/app/models/efficiency/efficiency_model.joblib"
65
+ except Exception as e:
66
+ logger.error(f"❌ Failed to download model: {e}")
67
+
68
+ raise FileNotFoundError("Could not find or download efficiency model")
69
+
70
    def _load_model(self):
        """Load the efficiency model and metadata.

        Populates ``self.model_artifacts`` (the joblib bundle read by
        `_predict_drive`: estimators, scaler, feature schema, thresholds,
        calibration) and, when the sidecar JSON exists, ``self.metadata``.
        Re-raises any loading error after logging it.
        """
        try:
            logger.info(f"📥 Loading efficiency model from: {self.model_path}")

            # Load model artifacts
            self.model_artifacts = joblib.load(self.model_path)

            # Load metadata if available (sidecar JSON next to the joblib file)
            meta_path = self.model_path.replace("efficiency_model.joblib", "efficiency_meta.json")
            if os.path.exists(meta_path):
                import json
                with open(meta_path, 'r') as f:
                    self.metadata = json.load(f)

            logger.info(f"✅ Model loaded | kind: {self.model_artifacts.get('model_kind', 'unknown')}")
            logger.info(f"📊 Features: {len(self.model_artifacts.get('feature_names', []))}")

            if self.metadata:
                logger.info(f"📅 Training date: {self.metadata.get('training_date', 'unknown')}")
                logger.info(f"📈 OOF MAE: {self.metadata.get('oof_stats', {}).get('oof_mae_qmap', 'unknown')}")

        except Exception as e:
            logger.error(f"❌ Error loading model: {e}")
            raise
95
+
96
+ def _ensure_dt(self, s):
97
+ """Ensure datetime conversion"""
98
+ return pd.to_datetime(s, errors="coerce")
99
+
100
+ def _infer_base_interval_seconds(self, ts, fallback=1.0):
101
+ """Infer base interval from timestamps"""
102
+ ts = pd.to_datetime(ts, errors="coerce")
103
+ dt = ts.diff().dt.total_seconds().dropna()
104
+ med = float(np.nanmedian(dt)) if len(dt) else fallback
105
+ return fallback if (not np.isfinite(med) or med <= 0) else med
106
+
107
+ def _rows_for(self, seconds, base_sec):
108
+ """Calculate number of rows for given time window"""
109
+ return max(3, int(round(seconds / max(1e-3, base_sec))))
110
+
111
    def _add_basic_derivatives(self, d):
        """Add basic derivatives (acceleration, jerk, distance).

        Works on a copy of *d*: rows without a parseable timestamp are
        dropped, the frame is time-sorted, SPEED (assumed km/h - confirm
        against the OBD logger) is converted to SPEED_ms (m/s), and ACCEL,
        JERK and per-row distance dist_m are derived by finite differences.
        """
        d = d.copy()
        d["timestamp"] = self._ensure_dt(d["timestamp"])
        d = d.dropna(subset=["timestamp"]).sort_values("timestamp")
        # Median sampling interval drives the finite-difference denominators.
        base = self._infer_base_interval_seconds(d["timestamp"], 1.0)

        # Convert numeric columns
        for c in ["SPEED","RPM","MAF","ENGINE_LOAD","THROTTLE_POS"]:
            if c in d.columns:
                d[c] = pd.to_numeric(d[c], errors="coerce")

        # Convert speed to m/s
        if "SPEED_ms" not in d.columns:
            d["SPEED_ms"] = (d["SPEED"] * KMH_TO_MS) if "SPEED" in d.columns else np.nan

        # Calculate derivatives (1st/2nd difference over the base interval)
        d["ACCEL"] = d["SPEED_ms"].diff()/max(base,1e-3)
        d["JERK"] = d["ACCEL"].diff()/max(base,1e-3)

        # Calculate distance; gaps are clipped to 10x the base interval so a
        # long recording pause cannot inflate the travelled distance.
        dt = d["timestamp"].diff().dt.total_seconds().fillna(0).clip(lower=0, upper=10*base)
        d["dist_m"] = d["SPEED_ms"] * dt

        return d
136
+
137
    def _idle_rule(self, d, thr):
        """Apply idle detection rule.

        A row counts as idle when speed, throttle, load, MAF and |accel| are
        all at or below their thresholds from *thr*; a missing column is
        treated as "low" (the Python `True` broadcasts through the `&`).
        The 0/1 mask is smoothed with a centered 5-row rolling median to
        suppress single-sample flickers before being returned as bool.
        """
        speed_low = (d["SPEED_ms"].abs() <= thr.get("SPEED_IDLE_MPS", 0.6))
        thr_low = (d["THROTTLE_POS"] <= thr.get("THR_LOW_Q10", 0.0)) if "THROTTLE_POS" in d else True
        load_low = (d["ENGINE_LOAD"] <= thr.get("LOAD_LOW_Q15", 0.0)) if "ENGINE_LOAD" in d else True
        maf_low = (d["MAF"] <= thr.get("MAF_LOW_Q10", 0.0)) if "MAF" in d else True
        accel_low = (d["ACCEL"].abs() <= thr.get("ACCEL_LOW_Q20", 0.0))

        mask = (speed_low & thr_low & load_low & maf_low & accel_low).astype(int)
        k = 5  # smoothing window (rows)
        return (mask.rolling(k, center=True, min_periods=1).median().round().astype(bool)
                if len(mask) >= k else mask.astype(bool))
149
+
150
+ def _sharp_mask_from_thresholds(self, d, thr):
151
+ """Detect sharp acceleration/deceleration events"""
152
+ thr_a = thr.get("ACCEL_HIGH_Q85",
153
+ np.nanquantile(d["ACCEL"].abs().dropna(), 0.85) if d["ACCEL"].notna().any() else 0.3)
154
+ thr_j = thr.get("JERK_HIGH_Q90",
155
+ np.nanquantile(d["JERK"].abs().dropna(), 0.90) if d["JERK"].notna().any() else 0.5)
156
+ return (d["ACCEL"].abs() > thr_a) | (d["JERK"].abs() > thr_j)
157
+
158
+ def _q(self, s, p):
159
+ """Quantile helper function"""
160
+ s = pd.to_numeric(s, errors="coerce")
161
+ return float(np.nanquantile(s, p)) if s.notna().any() else 0.0
162
+
163
    def _agg_for_ml_drive(self, g, thr):
        """Aggregate drive-level features for ML model.

        Collapses the per-sample frame *g* into the single feature row the
        efficiency model was trained on, using training-time thresholds *thr*.
        Returns a plain dict keyed by feature name.
        """
        g = self._add_basic_derivatives(g.copy())
        base = self._infer_base_interval_seconds(g["timestamp"], 1.0)
        g["IDLE_RULE"] = self._idle_rule(g, thr)

        # Total drive time with gaps clipped to 10x the base interval.
        dt = g["timestamp"].diff().dt.total_seconds().fillna(0).clip(lower=0, upper=10*base)
        T = float(dt.sum())
        mins = max(1e-6, T/60)

        # Sharp events per minute: each rising+falling edge pair is one event.
        sharp = self._sharp_mask_from_thresholds(g, thr).values
        edges = np.flatnonzero(np.diff(np.r_[False, sharp, False]))
        sharp_freq_pm = (len(edges)//2)/mins

        # Fraction of samples at/above the training-set RPM/MAF 90th percentiles.
        rpm90, maf90 = thr.get("RPM90", np.nan), thr.get("MAF90", np.nan)
        frac_rpm90 = float((g["RPM"] >= rpm90).mean()) if ("RPM" in g and np.isfinite(rpm90)) else 0.0
        frac_maf90 = float((g["MAF"] >= maf90).mean()) if ("MAF" in g and np.isfinite(maf90)) else 0.0

        # Speed coefficient of variation over a ~10 s rolling window.
        W10 = self._rows_for(10, base)
        speed_cv = float((g["SPEED_ms"].rolling(W10,1).std()/(g["SPEED_ms"].rolling(W10,1).mean()+1e-6)).mean())

        return {
            "duration_min": max(1e-6, T/60),
            "distance_km": g["dist_m"].sum()/1000.0,
            "speed_mean": float(g["SPEED_ms"].mean()),
            "speed_q90": self._q(g["SPEED_ms"], 0.90),
            "speed_cv": speed_cv,
            "accel_q90": self._q(g["ACCEL"].abs(), 0.90),
            "jerk_q90": self._q(g["JERK"].abs(), 0.90),
            "sharp_freq_pm": sharp_freq_pm,
            "idle_frac": float(g["IDLE_RULE"].mean()),
            # Idle episodes per minute (edge pairs of the smoothed idle mask).
            "idle_epm": (len(np.flatnonzero(np.diff(np.r_[False, g['IDLE_RULE'].values, False])))//2)/mins,
            "rpm_q90": self._q(g["RPM"], 0.90) if "RPM" in g else 0.0,
            "maf_q90": self._q(g["MAF"], 0.90) if "MAF" in g else 0.0,
            "load_q85": self._q(g["ENGINE_LOAD"], 0.85) if "ENGINE_LOAD" in g else 0.0,
            "thr_q85": self._q(g["THROTTLE_POS"], 0.85) if "THROTTLE_POS" in g else 0.0,
            "frac_rpm90": frac_rpm90,
            "frac_maf90": frac_maf90,
            # Proxy for fuel burn intensity (high RPM combined with high airflow).
            "fuel_intensity": (self._q(g["RPM"], 0.90)*self._q(g["MAF"], 0.90)) if (("RPM" in g) and ("MAF" in g)) else 0.0
        }
203
+
204
+ def _align_to_schema(self, feats, art):
205
+ """Align features to model schema"""
206
+ x = pd.DataFrame([feats])
207
+ for c in art["feature_names"]:
208
+ if c not in x.columns:
209
+ x[c] = 0.0
210
+ x = x[art["feature_names"]]
211
+ if len(art["num_cols"]):
212
+ x.loc[:, art["num_cols"]] = art["scaler"].transform(x[art["num_cols"]])
213
+ return x
214
+
215
    def _predict_drive(self, df_drive):
        """Predict efficiency for a single drive.

        Returns:
            tuple: (calibrated_score, raw_model_output, feature_dict). The raw
            regressor output is mapped onto the 0-100 scale via the stored
            quantile mapping when present, otherwise simply clipped.
        """
        art = self.model_artifacts
        thr = art["thr"]

        feats = self._agg_for_ml_drive(df_drive, thr)
        x = self._align_to_schema(feats, art)

        # Get model (random forest or gradient boosting, chosen at train time)
        mdl = art["rf"] if art.get("model_kind") == "rf" else art["gbm"]
        raw = float(mdl.predict(x)[0])

        # Apply quantile-mapping calibration
        if art.get("calib", {}).get("type") == "qmap":
            rq = np.array(art["calib"]["rq"])
            yq = np.array(art["calib"]["yq"])

            # Ensure strictly increasing rq for stable interpolation
            # (np.interp requires monotonically increasing x-coordinates).
            for i in range(1, len(rq)):
                if rq[i] <= rq[i-1]:
                    rq[i] = rq[i-1] + 1e-6

            pred = float(np.clip(np.interp(raw, rq, yq), 0, 100))
        else:
            pred = float(np.clip(raw, 0, 100))

        return pred, raw, feats
242
+
243
    def predict_single_drive(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Predict fuel efficiency for a single drive.

        Args:
            df: DataFrame with OBD data including timestamp, SPEED, RPM, MAF, etc.
                (timestamp and SPEED are required; the other PIDs are optional).

        Returns:
            Dictionary containing efficiency prediction and metadata. On any
            failure a zeroed result with an "error" key is returned instead of
            raising, so callers in the pipeline never crash on a bad drive.
        """
        try:
            if self.model_artifacts is None:
                raise RuntimeError("Efficiency model not loaded")

            # Fewer than 5 samples cannot support the derivative features.
            if len(df) < 5:
                logger.warning("⚠️ Drive too short for efficiency prediction")
                return {
                    "efficiency_score": 0.0,
                    "raw_score": 0.0,
                    "duration_min": 0.0,
                    "distance_km": 0.0,
                    "note": "too short",
                    "features": {}
                }

            # Calculate basic drive metrics (duration / distance for reporting)
            g2 = self._add_basic_derivatives(df[["timestamp","SPEED"]].assign(
                RPM=df.get("RPM"), MAF=df.get("MAF"),
                ENGINE_LOAD=df.get("ENGINE_LOAD"), THROTTLE_POS=df.get("THROTTLE_POS")))

            dt = g2["timestamp"].diff().dt.total_seconds().fillna(0)
            mins = float(dt.sum())/60.0
            dist_km = float(pd.to_numeric(g2["dist_m"], errors="coerce").fillna(0).sum())/1000.0

            # Predict efficiency (calibrated 0-100 score + raw model output)
            efficiency_score, raw_score, features = self._predict_drive(df)

            logger.info(f"📊 Drive efficiency: {efficiency_score:.1f}% (raw: {raw_score:.3f})")

            return {
                "efficiency_score": round(efficiency_score, 1),
                "raw_score": round(raw_score, 3),
                "duration_min": round(mins, 2),
                "distance_km": round(dist_km, 3),
                "features": features,
                "timestamp": datetime.now().isoformat()
            }

        except Exception as e:
            logger.error(f"❌ Error predicting efficiency: {e}")
            return {
                "efficiency_score": 0.0,
                "raw_score": 0.0,
                "duration_min": 0.0,
                "distance_km": 0.0,
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }
301
+
302
def predict_batch(self, csv_files: List[str]) -> pd.DataFrame:
    """
    Predict efficiency for multiple CSV files (batch processing).

    Each file is treated as one drive; per-file failures are recorded as
    NaN rows rather than aborting the batch.

    Args:
        csv_files: List of CSV file paths

    Returns:
        DataFrame with one row per input file, sorted by drive_id.
    """
    logger.info(f"📊 Processing {len(csv_files)} CSV files...")

    rows = []
    for i, csv_path in enumerate(csv_files, start=1):
        try:
            # Load CSV — assumes a "timestamp" column exists (TODO confirm schema)
            df = pd.read_csv(csv_path)
            df["source_file"] = os.path.basename(csv_path)
            df["drive_id"] = i
            df["timestamp"] = self._ensure_dt(df["timestamp"])
            df = df.dropna(subset=["timestamp"]).sort_values("timestamp")

            # Too few samples to derive features — emit a NaN placeholder row.
            if len(df) < 5:
                rows.append({
                    "source_file": os.path.basename(csv_path),
                    "drive_id": i,
                    "duration_min": np.nan,
                    "distance_km": np.nan,
                    "pred_efficiency_ml": np.nan,
                    "raw": np.nan,
                    "note": "too short"
                })
                continue

            # Predict efficiency
            result = self.predict_single_drive(df)

            rows.append({
                "source_file": os.path.basename(csv_path),
                "drive_id": i,
                "duration_min": result["duration_min"],
                "distance_km": result["distance_km"],
                "pred_efficiency_ml": result["efficiency_score"],
                "raw": result["raw_score"]
            })

        except Exception as e:
            # Keep the batch alive; record the failure for this file only.
            logger.error(f"❌ Error processing {csv_path}: {e}")
            rows.append({
                "source_file": os.path.basename(csv_path),
                "drive_id": i,
                "duration_min": np.nan,
                "distance_km": np.nan,
                "pred_efficiency_ml": np.nan,
                "raw": np.nan,
                "error": str(e)
            })

    pred_df = pd.DataFrame(rows).sort_values("drive_id").reset_index(drop=True)

    # Calculate statistics (only over rows that actually produced a score)
    valid_preds = pred_df["pred_efficiency_ml"].dropna()
    if len(valid_preds) > 0:
        logger.info(f"📊 Batch results: {len(valid_preds)} valid predictions")
        logger.info(f"📈 Efficiency range: {valid_preds.min():.1f}% - {valid_preds.max():.1f}%")
        logger.info(f"📊 Mean efficiency: {valid_preds.mean():.1f}%")
        logger.info(f"📊 Std efficiency: {valid_preds.std():.1f}%")

    return pred_df
371
+
372
def get_model_info(self) -> Dict[str, Any]:
    """Return a summary of the currently loaded efficiency model."""
    art = self.model_artifacts
    if art is None:
        return {"error": "Model not loaded"}

    features = art.get("feature_names", [])
    info = {
        "model_kind": art.get("model_kind", "unknown"),
        "feature_count": len(features),
        "features": features,
        "calibration_type": art.get("calib", {}).get("type", "none"),
        "oof_stats": art.get("oof_stats", {}),
        "metadata": self.metadata,
        "model_path": self.model_path
    }
    return info
386
+
387
def evaluate_csv_files(csv_directory: str = "./") -> pd.DataFrame:
    """
    Convenience function to evaluate all CSV files in a directory.

    Args:
        csv_directory: Directory containing CSV files

    Returns:
        DataFrame with efficiency predictions (empty when no CSVs found)
    """
    # Find CSV files
    csv_patterns = [
        os.path.join(csv_directory, "*.csv"),
        os.path.join("/content", "*.csv")  # For Colab compatibility
    ]

    csv_files = []
    for pattern in csv_patterns:
        csv_files.extend(glob.glob(pattern))

    # Deduplicate: the two patterns overlap when csv_directory is /content,
    # which would otherwise score the same drive twice.
    csv_files = sorted({p for p in csv_files if os.path.isfile(p)})

    if not csv_files:
        logger.warning("⚠️ No CSV files found")
        return pd.DataFrame()

    # Initialize evaluator and process files
    evaluator = EfficiencyEvaluator()
    return evaluator.predict_batch(csv_files)
416
+
417
def main():
    """Main function for command-line usage.

    Returns a process exit code: 0 on success, 1 when no CSVs were found
    or evaluation failed.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate fuel efficiency model")
    parser.add_argument("--csv-dir", default="./", help="Directory containing CSV files")
    parser.add_argument("--model-path", help="Path to efficiency model file")
    parser.add_argument("--output", help="Output CSV file path")

    args = parser.parse_args()

    try:
        # Initialize evaluator (loads the model; --model-path overrides default)
        evaluator = EfficiencyEvaluator(model_path=args.model_path)

        # Print model info
        info = evaluator.get_model_info()
        print(f"📊 Model info: {info}")

        # Evaluate CSV files
        results_df = evaluate_csv_files(args.csv_dir)

        if len(results_df) > 0:
            print("\n=== Batch Efficiency Scores (per CSV / drive) ===")
            print(results_df.to_string(index=False))

            # Save results if output path specified
            if args.output:
                results_df.to_csv(args.output, index=False)
                print(f"\n💾 Results saved to: {args.output}")
        else:
            print("❌ No valid CSV files found for evaluation")
            return 1

        return 0

    except Exception as e:
        print(f"❌ Evaluation failed: {e}")
        return 1

if __name__ == "__main__":
    exit(main())
efficiency/retrain.py ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fuel Efficiency Model Retraining Script
3
+ Reproducible training script for fuel efficiency model with Hugging Face integration
4
+ Based on the original retrain.py but reformatted for system integration
5
+ """
6
+
7
+ import os
8
+ import glob
9
+ import json
10
+ import math
11
+ import joblib
12
+ import warnings
13
+ import logging
14
+ import numpy as np
15
+ import pandas as pd
16
+ from pathlib import Path
17
+ from typing import Dict, List, Tuple, Optional, Any
18
+ from datetime import datetime
19
+
20
+ # ML imports
21
+ from sklearn.preprocessing import StandardScaler
22
+ from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
23
+ from sklearn.model_selection import GroupKFold
24
+ from sklearn.metrics import mean_absolute_error
25
+ from sklearn.linear_model import Ridge
26
+
27
+ # Hugging Face integration
28
+ from huggingface_hub import HfApi, Repository
29
+
30
# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Setup logging — handler is only attached once so repeated imports
# don't produce duplicate log lines.
logger = logging.getLogger("efficiency-retrain")
logger.setLevel(logging.INFO)
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
    logger.addHandler(handler)

# Constants
SEED = 42  # RNG seed for reproducible training
KMH_TO_MS = 1000.0/3600.0  # km/h -> m/s conversion factor
np.random.seed(SEED)
45
+
46
class EfficiencyModelTrainer:
    """
    Fuel efficiency model trainer with Hugging Face integration.
    Handles data loading, feature engineering, model training, and model upload.
    """

    def __init__(self,
                 csv_directory: str = "./",
                 export_directory: str = "./efficiency_export",
                 repo_id: str = "BinKhoaLe1812/Fuel_Efficiency_OBD"):
        """
        Initialize the trainer.

        Args:
            csv_directory: Directory containing CSV files for training
            export_directory: Directory to save trained model artifacts
            repo_id: Hugging Face repository ID for model upload
        """
        self.csv_directory = csv_directory
        self.export_directory = Path(export_directory)
        self.repo_id = repo_id
        # Token comes from the environment; without it training still runs
        # locally but the Hugging Face upload step is skipped.
        self.hf_token = os.getenv("HF_TOKEN")

        # Create export directory
        self.export_directory.mkdir(parents=True, exist_ok=True)

        # Initialize HF API if token available
        self.hf_api = None
        if self.hf_token:
            self.hf_api = HfApi(token=self.hf_token)
            logger.info(f"✅ Hugging Face API initialized for {repo_id}")
        else:
            logger.warning("⚠️ HF_TOKEN not set - model will not be uploaded to Hugging Face")
79
+
80
def load_training_data(self) -> pd.DataFrame:
    """Load and preprocess training data from CSV files.

    Globs CSVs from the configured directory (plus /content for Colab),
    tags each file as one drive, and returns a single time-sorted frame
    with derived columns added.

    Raises:
        RuntimeError: when no CSV file is found or none can be loaded.
    """
    logger.info("📊 Loading training data...")

    # Find CSV files
    csv_patterns = [
        os.path.join(self.csv_directory, "*.csv"),
        os.path.join("/content", "*.csv")  # For Colab compatibility
    ]

    csvs = []
    for pattern in csv_patterns:
        csvs.extend(glob.glob(pattern))

    csvs = sorted([p for p in csvs if os.path.isfile(p)])

    if not csvs:
        raise RuntimeError("No CSV logs found for training")

    logger.info(f"📁 Found {len(csvs)} CSV files")

    # Load and combine CSV files; each file becomes one drive (drive_id = i)
    frames = []
    for i, p in enumerate(csvs, start=1):
        try:
            d = pd.read_csv(p)
            d["source_file"] = os.path.basename(p)
            d["drive_id"] = i
            frames.append(d)
            logger.info(f"✅ Loaded {os.path.basename(p)} ({len(d)} rows)")
        except Exception as e:
            # Skip unreadable files instead of failing the whole run.
            logger.warning(f"⚠️ Failed to load {p}: {e}")

    if not frames:
        raise RuntimeError("No valid CSV files could be loaded")

    # Combine all data, drop rows without a parseable timestamp, and add
    # speed/accel/jerk/distance derivatives.
    df = pd.concat(frames, ignore_index=True)
    df["timestamp"] = self._ensure_dt(df["timestamp"])
    df = df.dropna(subset=["timestamp"]).sort_values(["drive_id", "timestamp"]).reset_index(drop=True)
    df = self._add_basic_derivatives(df)

    logger.info(f"📊 Combined dataset: {len(df)} rows, {df['drive_id'].nunique()} drives")
    return df
124
+
125
+ def _ensure_dt(self, s):
126
+ """Ensure datetime conversion"""
127
+ return pd.to_datetime(s, errors="coerce")
128
+
129
+ def _infer_base_interval_seconds(self, ts, fallback=1.0):
130
+ """Infer base interval from timestamps"""
131
+ ts = pd.to_datetime(ts, errors="coerce")
132
+ dt = ts.diff().dt.total_seconds().dropna()
133
+ med = float(np.nanmedian(dt)) if len(dt) else fallback
134
+ return fallback if (not np.isfinite(med) or med <= 0) else med
135
+
136
+ def _rows_for(self, seconds, base_sec):
137
+ """Calculate number of rows for given time window"""
138
+ return max(3, int(round(seconds / max(1e-3, base_sec))))
139
+
140
def _add_basic_derivatives(self, d):
    """Add basic derivatives (acceleration, jerk, distance).

    Works on a copy: converts sensor columns to numeric, derives SPEED_ms,
    ACCEL (m/s²), JERK (m/s³) from finite differences at the inferred
    sampling period, and per-row distance dist_m.
    """
    d = d.copy()
    d["timestamp"] = self._ensure_dt(d["timestamp"])
    d = d.dropna(subset=["timestamp"]).sort_values("timestamp")
    base = self._infer_base_interval_seconds(d["timestamp"], 1.0)

    # Convert numeric columns (bad values become NaN rather than raising)
    for c in ["SPEED","RPM","MAF","ENGINE_LOAD","THROTTLE_POS"]:
        if c in d.columns:
            d[c] = pd.to_numeric(d[c], errors="coerce")

    # Convert speed to m/s (SPEED assumed km/h per KMH_TO_MS)
    if "SPEED_ms" not in d.columns:
        d["SPEED_ms"] = (d["SPEED"] * KMH_TO_MS) if "SPEED" in d.columns else np.nan

    # Calculate derivatives using the median sampling period as dt
    d["ACCEL"] = d["SPEED_ms"].diff()/max(base,1e-3)
    d["JERK"] = d["ACCEL"].diff()/max(base,1e-3)

    # Calculate distance; gaps longer than 10x the base interval are capped
    # so log dropouts don't inflate distance.
    dt = d["timestamp"].diff().dt.total_seconds().fillna(0).clip(lower=0, upper=10*base)
    d["dist_m"] = d["SPEED_ms"] * dt

    return d
165
+
166
def _idle_rule(self, d, thr):
    """Apply idle detection rule.

    A sample is idle when speed, throttle, load, MAF and |accel| are all
    below their fleet thresholds; columns missing from *d* are treated as
    satisfied (True). The raw mask is smoothed with a centred rolling
    median (k=5) to suppress single-sample flicker.
    """
    speed_low = (d["SPEED_ms"].abs() <= thr.get("SPEED_IDLE_MPS", 0.6))
    thr_low = (d["THROTTLE_POS"] <= thr.get("THR_LOW_Q10", 0.0)) if "THROTTLE_POS" in d else True
    load_low = (d["ENGINE_LOAD"] <= thr.get("LOAD_LOW_Q15", 0.0)) if "ENGINE_LOAD" in d else True
    maf_low = (d["MAF"] <= thr.get("MAF_LOW_Q10", 0.0)) if "MAF" in d else True
    accel_low = (d["ACCEL"].abs() <= thr.get("ACCEL_LOW_Q20", 0.0))

    mask = (speed_low & thr_low & load_low & maf_low & accel_low).astype(int)
    k = 5
    return (mask.rolling(k, center=True, min_periods=1).median().round().astype(bool)
            if len(mask) >= k else mask.astype(bool))
178
+
179
+ def _sharp_mask_from_thresholds(self, d, thr):
180
+ """Detect sharp acceleration/deceleration events"""
181
+ thr_a = thr.get("ACCEL_HIGH_Q85",
182
+ np.nanquantile(d["ACCEL"].abs().dropna(), 0.85) if d["ACCEL"].notna().any() else 0.3)
183
+ thr_j = thr.get("JERK_HIGH_Q90",
184
+ np.nanquantile(d["JERK"].abs().dropna(), 0.90) if d["JERK"].notna().any() else 0.5)
185
+ return (d["ACCEL"].abs() > thr_a) | (d["JERK"].abs() > thr_j)
186
+
187
+ def _run_lengths(self, mask):
188
+ """Calculate run lengths from boolean mask"""
189
+ m = np.asarray(mask, dtype=bool)
190
+ if m.size == 0:
191
+ return np.array([], int), np.array([], int)
192
+ dm = np.diff(np.r_[False, m, False].astype(int))
193
+ starts = np.where(dm == 1)[0]
194
+ ends = np.where(dm == -1)[0]
195
+ return starts, (ends - starts)
196
+
197
+ def _penalty(self, series):
198
+ """Calculate penalty function for efficiency scoring"""
199
+ arr = pd.to_numeric(series, errors="coerce").fillna(0).values
200
+ if arr.size == 0:
201
+ return pd.Series([], dtype=float, index=series.index)
202
+ q25, q50, q75 = np.quantile(arr, [0.25, 0.50, 0.75])
203
+ s = (q75-q25)/1.349 if (q75 > q25) else (np.std(arr) if np.std(arr) > 0 else 1.0)
204
+ return pd.Series(1/(1+np.exp(-(arr - q50)/max(1e-6, s))), index=series.index)
205
+
206
def compute_fleet_thresholds(self, df: pd.DataFrame) -> Dict[str, float]:
    """Compute fleet-wide thresholds for feature engineering.

    Quantile thresholds are only emitted for sensor columns that are
    present and have at least one non-NaN value; downstream code uses
    .get() with defaults for the rest.
    """
    logger.info("🔧 Computing fleet thresholds...")

    thr = {}

    # RPM threshold
    if "RPM" in df and df["RPM"].notna().any():
        thr["RPM90"] = float(np.nanquantile(df["RPM"], 0.90))

    # MAF threshold
    if "MAF" in df and df["MAF"].notna().any():
        thr["MAF90"] = float(np.nanquantile(df["MAF"], 0.90))

    # Throttle position thresholds
    if "THROTTLE_POS" in df and df["THROTTLE_POS"].notna().any():
        thr["THR_LOW_Q10"] = float(np.nanquantile(df["THROTTLE_POS"], 0.10))
        thr["THR_Q85"] = float(np.nanquantile(df["THROTTLE_POS"], 0.85))

    # Engine load thresholds
    if "ENGINE_LOAD" in df and df["ENGINE_LOAD"].notna().any():
        thr["LOAD_LOW_Q15"] = float(np.nanquantile(df["ENGINE_LOAD"], 0.15))
        thr["LOAD_Q85"] = float(np.nanquantile(df["ENGINE_LOAD"], 0.85))

    # Acceleration and jerk thresholds — derived on a rebuilt frame so the
    # ACCEL/JERK columns exist regardless of the input's state.
    tmpd = self._add_basic_derivatives(df[["timestamp","SPEED"]].assign(
        RPM=df.get("RPM"), MAF=df.get("MAF"),
        THROTTLE_POS=df.get("THROTTLE_POS"), ENGINE_LOAD=df.get("ENGINE_LOAD")))

    thr["ACCEL_LOW_Q20"] = float(np.nanquantile(tmpd["ACCEL"].abs().dropna(), 0.20)) if tmpd["ACCEL"].notna().any() else 0.05
    thr["ACCEL_HIGH_Q85"] = float(np.nanquantile(tmpd["ACCEL"].abs().dropna(), 0.85)) if tmpd["ACCEL"].notna().any() else 0.3
    thr["JERK_HIGH_Q90"] = float(np.nanquantile(tmpd["JERK"].abs().dropna(), 0.90)) if tmpd["JERK"].notna().any() else 0.5
    thr["SPEED_IDLE_MPS"] = 0.6  # fixed idle-speed cutoff in m/s

    logger.info(f"✅ Computed {len(thr)} fleet thresholds")
    return thr
242
+
243
def create_algorithmic_teacher(self, df: pd.DataFrame, thr: Dict[str, float]) -> pd.DataFrame:
    """Create algorithmic teacher labels for training.

    Aggregates each drive into behavior statistics (sharp events, idling,
    speed variability, high-load fractions), converts them into sigmoid
    penalties, and fits a least-squares combination of those penalties to
    a fuel-use proxy. The resulting 'efficiency_algo' column (0-100) is
    the supervision target for the ML model.

    Note: mutates *df* in place by adding an IDLE_RULE column.
    """
    logger.info("🎯 Creating algorithmic teacher labels...")

    # Apply idle rule to all drives
    df["IDLE_RULE"] = False
    for gid, g in df.groupby("drive_id", sort=True):
        df.loc[g.index, "IDLE_RULE"] = self._idle_rule(g, thr)

    # Extract thresholds
    thr_accel, thr_jerk = thr["ACCEL_HIGH_Q85"], thr["JERK_HIGH_Q90"]
    thr_rpm90, thr_maf90 = thr.get("RPM90", np.nan), thr.get("MAF90", np.nan)

    # Process each drive (drives with < 5 samples are skipped)
    drv = []
    for gid, g in df.groupby("drive_id", sort=True):
        if len(g) < 5:
            continue

        base = self._infer_base_interval_seconds(g["timestamp"], 1.0)
        dt_s = g["timestamp"].diff().dt.total_seconds().fillna(0).clip(lower=0, upper=10*base)
        T = float(dt_s.sum())
        mins = max(1e-6, T/60)

        # Sharp acceleration analysis: event frequency per minute and the
        # fraction of drive time spent inside sharp events.
        sharp = self._sharp_mask_from_thresholds(g, thr).values
        st, ln = self._run_lengths(sharp)
        freq_pm = len(ln)/mins
        dur_frac = (ln.sum()*base)/max(1e-6, T)

        # Peak analysis: per-event overshoot above the thresholds, capped at 1.5
        peaks = []
        for a, b in zip(st, ln):
            seg = g.iloc[a:a+b]
            pa = float(np.nanmax(np.abs(seg["ACCEL"])))
            pj = float(np.nanmax(np.abs(seg["JERK"])))
            over_a = max(0.0, (pa-thr_accel)/max(1e-6, thr_accel))
            over_j = max(0.0, (pj-thr_jerk)/max(1e-6, thr_jerk))
            peaks.append(min(1.5, 0.7*over_a + 0.3*over_j))

        sharp_mag = float(np.mean(peaks)) if peaks else 0.0

        # Idle analysis: fraction of samples idle, median idle-run seconds,
        # and idle episodes per minute.
        idle_frac = float(g["IDLE_RULE"].mean())
        sti, lni = self._run_lengths(g["IDLE_RULE"].values)
        idle_med_s = float(np.median(lni)*base if len(lni) else 0.0)
        idle_epm = len(lni)/mins

        # Speed variability (rolling coefficient of variation over ~10 s)
        W10 = self._rows_for(10, base)
        speed_cv = float((g["SPEED_ms"].rolling(W10,1).std()/(g["SPEED_ms"].rolling(W10,1).mean()+1e-6)).mean())

        # High-load fractions relative to fleet quantiles
        frac_rpm90 = float((g["RPM"] >= thr_rpm90).mean()) if ("RPM" in g and np.isfinite(thr_rpm90)) else 0.0
        frac_maf90 = float((g["MAF"] >= thr_maf90).mean()) if ("MAF" in g and np.isfinite(thr_maf90)) else 0.0
        frac_load85 = float((g["ENGINE_LOAD"] >= thr.get("LOAD_Q85", np.inf)).mean()) if "ENGINE_LOAD" in g else 0.0
        frac_thr85 = float((g["THROTTLE_POS"] >= thr.get("THR_Q85", np.inf)).mean()) if "THROTTLE_POS" in g else 0.0

        # Efficiency proxy — weighted sum of fuel-hungry behavior fractions
        proxy = (0.80*frac_rpm90 + 0.60*frac_maf90 + 0.15*frac_load85 + 0.10*frac_thr85 + 0.10*idle_frac)

        drv.append(dict(
            drive_id=gid, duration_min=mins, distance_km=g["dist_m"].sum()/1000.0,
            freq_pm=freq_pm, dur_frac=dur_frac, sharp_mag=sharp_mag,
            idle_frac=idle_frac, idle_med_s=idle_med_s, idle_epm=idle_epm,
            speed_cv=speed_cv, frac_rpm90=frac_rpm90, frac_maf90=frac_maf90, proxy=proxy
        ))

    dfeat = pd.DataFrame(drv).set_index("drive_id")

    # Calculate penalty-based features (each mapped to (0,1) by _penalty)
    P = pd.DataFrame({
        "p_freq": self._penalty(dfeat["freq_pm"]),
        "p_dur": self._penalty(dfeat["dur_frac"]),
        "p_mag": self._penalty(dfeat["sharp_mag"]),
        "p_idle": 0.7*self._penalty(dfeat["idle_frac"]) + 0.3*self._penalty(dfeat["idle_med_s"]),
        "p_cv": self._penalty(dfeat["speed_cv"]),
        "p_rpm": self._penalty(dfeat["frac_rpm90"]),
        "p_maf": self._penalty(dfeat["frac_maf90"]),
    }, index=dfeat.index)

    # Calculate efficiency scores: fit penalty weights by least squares to
    # -log(1 - proxy), then map back so efficiency stays in (0, 100].
    proxy = dfeat["proxy"].clip(0, 1-1e-6)
    target_lin = -np.log(1 - proxy)
    w = np.linalg.lstsq(P.values, target_lin.values, rcond=None)[0]
    dfeat["ineff_model"] = 1 - np.exp(-P.values @ w)
    dfeat["efficiency_algo"] = 100*(1 - dfeat["ineff_model"])

    logger.info(f"✅ Teacher range: {dfeat['efficiency_algo'].min():.1f} → {dfeat['efficiency_algo'].max():.1f}")
    return dfeat
333
+
334
+ def _q(self, s, p):
335
+ """Quantile helper function"""
336
+ s = pd.to_numeric(s, errors="coerce")
337
+ return float(np.nanquantile(s, p)) if s.notna().any() else 0.0
338
+
339
def _agg_for_ml_drive(self, g, thr):
    """Aggregate drive-level features for the ML model.

    Re-derives accel/jerk/distance, applies the idle rule, and returns a
    flat dict of per-drive statistics. Keys must stay aligned with the
    feature schema stored in the model artifacts.
    """
    g = self._add_basic_derivatives(g.copy())
    base = self._infer_base_interval_seconds(g["timestamp"], 1.0)
    g["IDLE_RULE"] = self._idle_rule(g, thr)

    # Total drive time (gap-capped) and duration in minutes
    dt = g["timestamp"].diff().dt.total_seconds().fillna(0).clip(lower=0, upper=10*base)
    T = float(dt.sum())
    mins = max(1e-6, T/60)

    # Sharp-event frequency: edge pairs in the mask = number of runs
    sharp = self._sharp_mask_from_thresholds(g, thr).values
    edges = np.flatnonzero(np.diff(np.r_[False, sharp, False]))
    sharp_freq_pm = (len(edges)//2)/mins

    rpm90, maf90 = thr.get("RPM90", np.nan), thr.get("MAF90", np.nan)
    frac_rpm90 = float((g["RPM"] >= rpm90).mean()) if ("RPM" in g and np.isfinite(rpm90)) else 0.0
    frac_maf90 = float((g["MAF"] >= maf90).mean()) if ("MAF" in g and np.isfinite(maf90)) else 0.0

    # Rolling (~10 s) coefficient of variation of speed
    W10 = self._rows_for(10, base)
    speed_cv = float((g["SPEED_ms"].rolling(W10,1).std()/(g["SPEED_ms"].rolling(W10,1).mean()+1e-6)).mean())

    return {
        "duration_min": max(1e-6, T/60),
        "distance_km": g["dist_m"].sum()/1000.0,
        "speed_mean": float(g["SPEED_ms"].mean()),
        "speed_q90": self._q(g["SPEED_ms"], 0.90),
        "speed_cv": speed_cv,
        "accel_q90": self._q(g["ACCEL"].abs(), 0.90),
        "jerk_q90": self._q(g["JERK"].abs(), 0.90),
        "sharp_freq_pm": sharp_freq_pm,
        "idle_frac": float(g["IDLE_RULE"].mean()),
        "idle_epm": (len(np.flatnonzero(np.diff(np.r_[False, g['IDLE_RULE'].values, False])))//2)/mins,
        "rpm_q90": self._q(g["RPM"], 0.90) if "RPM" in g else 0.0,
        "maf_q90": self._q(g["MAF"], 0.90) if "MAF" in g else 0.0,
        "load_q85": self._q(g["ENGINE_LOAD"], 0.85) if "ENGINE_LOAD" in g else 0.0,
        "thr_q85": self._q(g["THROTTLE_POS"], 0.85) if "THROTTLE_POS" in g else 0.0,
        "frac_rpm90": frac_rpm90,
        "frac_maf90": frac_maf90,
        # Crude fuel-use surrogate: high-RPM x high-MAF quantiles
        "fuel_intensity": (self._q(g["RPM"], 0.90)*self._q(g["MAF"], 0.90)) if (("RPM" in g) and ("MAF" in g)) else 0.0
    }
379
+
380
def prepare_ml_data(self, df: pd.DataFrame, dfeat: pd.DataFrame, thr: Dict[str, float]) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, StandardScaler, List[str], List[str]]:
    """Prepare data for machine learning training.

    Aggregates each drive into one feature row, pairs it with the teacher
    label from *dfeat*, drops zero-variance features and scales the rest.

    Returns:
        (X, y, groups, scaler, num_cols, holdout_cols) — the original
        annotation claimed a 3-tuple, but six values are returned.
    """
    logger.info("🔧 Preparing ML training data...")

    rows, y, groups = [], [], []
    for gid, g in df.groupby("drive_id", sort=True):
        # Must match the < 5 cut used when building the teacher labels.
        if len(g) < 5:
            continue
        rows.append(self._agg_for_ml_drive(g, thr))
        y.append(float(dfeat.loc[gid, "efficiency_algo"]))
        # Group by source file so CV folds never split one file's drives.
        groups.append(g["source_file"].iloc[0] if "source_file" in g.columns else gid)

    X = pd.DataFrame(rows)
    y = np.asarray(y, float)
    groups = np.asarray(groups)

    # Remove zero-variance features (they carry no signal and break scaling)
    zv = X.std(numeric_only=True).fillna(0.0)
    drop_cols = list(zv[zv <= 1e-10].index)
    if drop_cols:
        X = X.drop(columns=drop_cols)
        logger.info(f"🗑️ Dropped zero-variance features: {drop_cols}")

    # Scale features; duration/distance stay unscaled (used as weights/meta)
    holdout_cols = ["duration_min", "distance_km"]
    num_cols = [c for c in X.columns if c not in holdout_cols]
    sc = StandardScaler().fit(X[num_cols])
    X[num_cols] = sc.transform(X[num_cols])

    logger.info(f"✅ Prepared ML data: {X.shape[0]} samples, {X.shape[1]} features")
    return X, y, groups, sc, num_cols, holdout_cols
411
+
412
def train_model(self, X: pd.DataFrame, y: np.ndarray, groups: np.ndarray) -> Tuple[Any, str, Dict[str, Any], Dict[str, Any]]:
    """Train the efficiency model with cross-validation.

    Runs grouped CV to collect out-of-fold predictions, builds a
    quantile-mapping calibration from them, then fits the final model on
    all data (falling back to RandomForest when the GBM trains flat).

    Returns:
        (model, model_kind, calib, oof_stats) — the original annotation
        claimed a 3-tuple, but four values are returned.
    """
    logger.info("🤖 Training efficiency model...")

    # Out-of-fold predictions for calibration; at least 2 folds, at most 5,
    # bounded by the number of distinct groups.
    gkf = GroupKFold(n_splits=min(5, max(2, len(np.unique(groups)))))
    oof_raw = np.zeros_like(y)

    for tr, va in gkf.split(X, y, groups):
        gbm_fold = HistGradientBoostingRegressor(
            loss="squared_error", max_depth=6, learning_rate=0.08, max_bins=255,
            early_stopping=True, random_state=SEED
        )
        # Weight drives by duration so short drives don't dominate.
        wtr = np.clip(X.iloc[tr]["duration_min"].values, 0.5, None)
        gbm_fold.fit(X.iloc[tr], y[tr], sample_weight=wtr)
        pred = gbm_fold.predict(X.iloc[va])

        if np.std(pred) < 1e-6:
            # Ridge rescue to enforce variability when the fold GBM is flat
            ridge = Ridge(alpha=1.0, random_state=SEED).fit(X.iloc[tr][X.columns[2:]], y[tr])
            pred = ridge.predict(X.iloc[va][X.columns[2:]])

        oof_raw[va] = pred

    # Calculate OOF statistics
    raw_std = float(np.std(oof_raw))
    y_std = float(np.std(y))
    corr = float(np.corrcoef(oof_raw, y)[0,1]) if len(y) > 1 else 1.0

    logger.info(f"📊 OOF: corr={corr:.3f} | raw_std={raw_std:.3f} | y_std={y_std:.3f}")

    # Quantile-mapping calibration: map raw-prediction quantiles onto
    # teacher-label quantiles.
    qs = np.linspace(0.05, 0.95, 19)
    rq = np.quantile(oof_raw, qs)
    yq = np.quantile(y, qs)

    # Ensure strictly increasing rq for stable interpolation
    for i in range(1, len(rq)):
        if rq[i] <= rq[i-1]:
            rq[i] = rq[i-1] + 1e-6

    calib = {"type": "qmap", "rq": rq.tolist(), "yq": yq.tolist()}

    def apply_calib_qmap(raw):
        # Interpolate raw score onto the label scale, clamped to [0, 100].
        return float(np.clip(np.interp(raw, rq, yq), 0, 100))

    oof_cal = np.array([apply_calib_qmap(r) for r in oof_raw], float)
    oof_mae = float(mean_absolute_error(y, oof_cal))

    logger.info(f"📊 OOF MAE (qmap): {oof_mae:.2f}")

    # Final model training on all data (no early stopping, fixed iterations)
    gbm = HistGradientBoostingRegressor(
        loss="squared_error", max_depth=6, learning_rate=0.08, max_bins=255,
        early_stopping=False, max_iter=400, random_state=SEED
    )
    w_all = np.clip(X["duration_min"].values, 0.5, None)
    gbm.fit(X, y, sample_weight=w_all)
    raw_all = gbm.predict(X)

    if np.std(raw_all) < 1e-6:
        # Constant predictions mean the GBM learned nothing — fall back.
        logger.warning("⚠️ Final GBM raw constant — switching to RandomForest")
        rf = RandomForestRegressor(n_estimators=600, min_samples_leaf=2, random_state=SEED, n_jobs=-1)
        rf.fit(X, y)
        model_kind, model = "rf", rf
    else:
        model_kind, model = "gbm", gbm

    oof_stats = {
        "oof_mae_qmap": oof_mae,
        "oof_corr": corr,
        "raw_std": raw_std,
        "y_std": y_std
    }

    logger.info(f"✅ Model training complete | kind: {model_kind}")
    return model, model_kind, calib, oof_stats
489
+
490
def save_model(self, model, model_kind: str, scaler, feature_names: List[str],
               num_cols: List[str], holdout_cols: List[str], thr: Dict[str, float],
               calib: Dict[str, Any], oof_stats: Dict[str, Any]) -> str:
    """Save the trained model and artifacts.

    Writes a joblib bundle (model + scaler + schema + thresholds +
    calibration) and a JSON metadata sidecar into the export directory.

    Returns:
        Path to the saved joblib file as a string.
    """
    logger.info("💾 Saving model artifacts...")

    # Prepare artifacts — exactly one of "gbm"/"rf" is non-None,
    # selected at load time via "model_kind".
    artifacts = {
        "scaler": scaler,
        "model_kind": model_kind,
        "gbm": model if model_kind == "gbm" else None,
        "rf": model if model_kind == "rf" else None,
        "feature_names": feature_names,
        "num_cols": num_cols,
        "holdout_cols": holdout_cols,
        "windowing": {"size_s": 120, "step_s": 60},  # For future use
        "thr": thr,
        "seed": SEED,
        "calib": calib,
        "oof_stats": oof_stats,
        "training_timestamp": datetime.now().isoformat(),
        "version": "1.0"  # Will be updated based on HF versioning
    }

    # Save model
    model_path = self.export_directory / "efficiency_model.joblib"
    joblib.dump(artifacts, model_path)

    # Save metadata (human/JSON-readable summary next to the binary bundle)
    metadata = {
        "model_type": "fuel_efficiency",
        "version": "1.0",
        "training_date": datetime.now().isoformat(),
        "model_kind": model_kind,
        "feature_count": len(feature_names),
        "oof_stats": oof_stats,
        "calibration_type": calib.get("type", "none")
    }

    meta_path = self.export_directory / "efficiency_meta.json"
    with open(meta_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    logger.info(f"✅ Model saved to {model_path}")
    logger.info(f"✅ Metadata saved to {meta_path}")

    return str(model_path)
537
+
538
def upload_to_huggingface(self, version: str = None) -> bool:
    """Upload the trained model to Hugging Face Hub.

    Files are pushed under a "{version}/..." prefix in the model repo.
    When *version* is None the next version is derived from the repo's
    existing contents.

    Returns:
        True on successful upload, False when the API is unavailable,
        the model file is missing, or the upload raises.
    """
    if not self.hf_api:
        logger.warning("⚠️ Hugging Face API not available - skipping upload")
        return False

    try:
        if version is None:
            version = self._get_next_version()

        logger.info(f"📤 Uploading model version {version} to Hugging Face...")

        # Upload model file
        model_path = self.export_directory / "efficiency_model.joblib"
        meta_path = self.export_directory / "efficiency_meta.json"

        if not model_path.exists():
            logger.error(f"❌ Model file not found: {model_path}")
            return False

        # Upload files (model first; metadata is optional)
        self.hf_api.upload_file(
            path_or_fileobj=str(model_path),
            path_in_repo=f"{version}/efficiency_model.joblib",
            repo_id=self.repo_id,
            repo_type="model"
        )

        if meta_path.exists():
            self.hf_api.upload_file(
                path_or_fileobj=str(meta_path),
                path_in_repo=f"{version}/efficiency_meta.json",
                repo_id=self.repo_id,
                repo_type="model"
            )

        logger.info(f"✅ Model {version} uploaded successfully to {self.repo_id}")
        return True

    except Exception as e:
        logger.error(f"❌ Error uploading to Hugging Face: {e}")
        return False
580
+
581
+ def _get_next_version(self) -> str:
582
+ """Get the next version number (1.0, 1.1, 1.2, ..., 1.9, 2.0, etc.)"""
583
+ try:
584
+ repo_files = self.hf_api.list_repo_files(
585
+ repo_id=self.repo_id,
586
+ repo_type="model"
587
+ )
588
+
589
+ # Find existing versions
590
+ versions = []
591
+ for f in repo_files:
592
+ if f.startswith('v') and '/' not in f:
593
+ try:
594
+ version_str = f[1:] # Remove 'v' prefix
595
+ major, minor = map(int, version_str.split('.'))
596
+ versions.append((major, minor))
597
+ except ValueError:
598
+ continue
599
+
600
+ if not versions:
601
+ return "v1.0"
602
+
603
+ # Sort and get next version
604
+ versions.sort(key=lambda x: (x[0], x[1]))
605
+ latest_major, latest_minor = versions[-1]
606
+
607
+ if latest_minor < 9:
608
+ return f"v{latest_major}.{latest_minor + 1}"
609
+ else:
610
+ return f"v{latest_major + 1}.0"
611
+
612
+ except Exception as e:
613
+ logger.warning(f"⚠️ Could not determine next version: {e}")
614
+ return "v1.0"
615
+
616
def train_and_upload(self, upload_to_hf: bool = True) -> Dict[str, Any]:
    """Complete training pipeline.

    Orchestrates: load data -> fleet thresholds -> teacher labels ->
    ML feature prep -> model training -> save -> optional HF upload.

    Returns:
        Result dict with "success": True and run stats, or
        {"success": False, "error": ...} when any stage raises.
    """
    try:
        logger.info("🚀 Starting fuel efficiency model training pipeline...")

        # Load data
        df = self.load_training_data()

        # Compute thresholds
        thr = self.compute_fleet_thresholds(df)

        # Create teacher labels
        dfeat = self.create_algorithmic_teacher(df, thr)

        # Prepare ML data
        X, y, groups, scaler, num_cols, holdout_cols = self.prepare_ml_data(df, dfeat, thr)

        # Train model
        model, model_kind, calib, oof_stats = self.train_model(X, y, groups)

        # Save model
        model_path = self.save_model(
            model, model_kind, scaler, list(X.columns),
            num_cols, holdout_cols, thr, calib, oof_stats
        )

        # Upload to Hugging Face (best-effort; failure does not fail the run)
        upload_success = False
        if upload_to_hf:
            upload_success = self.upload_to_huggingface()

        result = {
            "success": True,
            "model_path": model_path,
            "model_kind": model_kind,
            "oof_stats": oof_stats,
            "upload_success": upload_success,
            "training_samples": len(X),
            "feature_count": len(X.columns)
        }

        logger.info("✅ Training pipeline completed successfully")
        return result

    except Exception as e:
        logger.error(f"❌ Training pipeline failed: {e}")
        return {"success": False, "error": str(e)}
663
+
664
def main():
    """Main function for command-line usage.

    Returns a process exit code: 0 on success, 1 when training failed.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Train fuel efficiency model")
    parser.add_argument("--csv-dir", default="./", help="Directory containing CSV files")
    parser.add_argument("--export-dir", default="./efficiency_export", help="Export directory")
    parser.add_argument("--repo-id", default="BinKhoaLe1812/Fuel_Efficiency_OBD", help="Hugging Face repo ID")
    parser.add_argument("--no-upload", action="store_true", help="Skip Hugging Face upload")

    args = parser.parse_args()

    # Initialize trainer
    trainer = EfficiencyModelTrainer(
        csv_directory=args.csv_dir,
        export_directory=args.export_dir,
        repo_id=args.repo_id
    )

    # Train and upload
    result = trainer.train_and_upload(upload_to_hf=not args.no_upload)

    if result["success"]:
        print("✅ Training completed successfully!")
        print(f"📊 Model: {result['model_kind']}")
        print(f"📈 OOF MAE: {result['oof_stats']['oof_mae_qmap']:.2f}")
        print(f"📤 Upload: {'✅' if result['upload_success'] else '❌'}")
    else:
        print(f"❌ Training failed: {result['error']}")
        return 1

    return 0

if __name__ == "__main__":
    exit(main())
train/rlhf.py CHANGED
@@ -168,7 +168,7 @@ class RLHFTrainer:
168
  # First, try to download the latest model
169
  logger.info("🔄 Checking for latest model version...")
170
  try:
171
- from utils.download import download_latest_models
172
  download_latest_models()
173
  except Exception as e:
174
  logger.warning(f"⚠️ Failed to download latest models: {e}")
 
168
  # First, try to download the latest model
169
  logger.info("🔄 Checking for latest model version...")
170
  try:
171
+ from utils.dbehavior_download import download_latest_models
172
  download_latest_models()
173
  except Exception as e:
174
  logger.warning(f"⚠️ Failed to download latest models: {e}")
train/saver.py CHANGED
@@ -102,7 +102,7 @@ class ModelSaver:
102
  "performance_metrics": performance_metrics,
103
  "framework": "xgboost",
104
  "task": "driver_behavior_classification",
105
- "labels": ["aggressive", "normal", "conservative"], # Based on ul_label.py
106
  "features": "obd_sensor_data",
107
  "rlhf_metadata": rlhf_metadata or {}
108
  }
 
102
  "performance_metrics": performance_metrics,
103
  "framework": "xgboost",
104
  "task": "driver_behavior_classification",
105
+ "labels": ["aggressive", "normal", "conservative"], # Based on dbehavior_labeler.py
106
  "features": "obd_sensor_data",
107
  "rlhf_metadata": rlhf_metadata or {}
108
  }
utils/{download.py → dbehavior_download.py} RENAMED
File without changes
utils/{ul_label.py → dbehavior_labeler.py} RENAMED
@@ -1,4 +1,4 @@
1
- # ul_label.py
2
  # Load UL models and predict driving style
3
  import os, logging, pickle
4
  import warnings
 
1
+ # dbehavior_labeler.py
2
  # Load UL models and predict driving style
3
  import os, logging, pickle
4
  import warnings
utils/efficiency_download.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fuel Efficiency Model Downloader
3
+ Downloads the latest fuel efficiency model from Hugging Face Hub
4
+ Similar to utils/dbehavior_download.py but for fuel efficiency models
5
+ """
6
+
7
+ import os
8
+ import pathlib
9
+ import logging
10
+ from typing import Optional, List
11
+ from huggingface_hub import HfApi, hf_hub_download
12
+ import joblib
13
+
14
+ logger = logging.getLogger("efficiency-downloader")
15
+ logger.setLevel(logging.INFO)
16
+ if not logger.handlers:
17
+ handler = logging.StreamHandler()
18
+ handler.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
19
+ logger.addHandler(handler)
20
+
21
+ def load_env_file():
22
+ """Load .env file if it exists"""
23
+ env_path = pathlib.Path(".env")
24
+ if env_path.exists():
25
+ logger.info("📄 Loading .env file...")
26
+ with open(env_path, 'r') as f:
27
+ for line in f:
28
+ line = line.strip()
29
+ if line and not line.startswith('#') and '=' in line:
30
+ key, value = line.split('=', 1)
31
+ os.environ[key] = value
32
+ return True
33
+ return False
34
+
35
+ # Load .env file first before setting any environment variables
36
+ load_env_file()
37
+
38
+ # Configuration
39
+ EFFICIENCY_REPO_ID = os.getenv("HF_EFFICIENCY_MODEL_REPO", "BinKhoaLe1812/Fuel_Efficiency_OBD")
40
+ EFFICIENCY_MODEL_DIR = pathlib.Path(os.getenv("EFFICIENCY_MODEL_DIR", "/app/models/efficiency")).resolve()
41
+ EFFICIENCY_FILES = ["efficiency_model.joblib", "efficiency_meta.json"]
42
+
43
+ EFFICIENCY_MODEL_DIR.mkdir(parents=True, exist_ok=True)
44
+
45
+ def get_latest_efficiency_version():
46
+ """Get the latest fuel efficiency model version from Hugging Face repo"""
47
+ try:
48
+ hf_token = os.getenv("HF_TOKEN")
49
+ if not hf_token:
50
+ logger.warning("⚠️ HF_TOKEN not set, using default efficiency model files")
51
+ return None
52
+
53
+ api = HfApi(token=hf_token)
54
+ repo_files = api.list_repo_files(
55
+ repo_id=EFFICIENCY_REPO_ID,
56
+ repo_type="model"
57
+ )
58
+
59
+ logger.info(f"🔍 Checking efficiency repository files...")
60
+ logger.info(f"📁 Found {len(repo_files)} files in efficiency repository")
61
+
62
+ # Find version directories (v1.0, v1.1, etc.)
63
+ version_dirs = [f for f in repo_files if f.startswith('v') and '/' not in f]
64
+ logger.info(f"📦 Found efficiency version directories: {version_dirs}")
65
+
66
+ # Also check for version directories with files inside
67
+ version_dirs_with_files = []
68
+ for f in repo_files:
69
+ if f.startswith('v') and '/' in f:
70
+ version_dir = f.split('/')[0]
71
+ if version_dir not in version_dirs_with_files:
72
+ version_dirs_with_files.append(version_dir)
73
+
74
+ if version_dirs_with_files:
75
+ logger.info(f"📦 Found efficiency version directories with files: {version_dirs_with_files}")
76
+ version_dirs.extend(version_dirs_with_files)
77
+
78
+ versions = []
79
+
80
+ for v_dir in version_dirs:
81
+ try:
82
+ # Extract version number (e.g., "v1.0" -> 1.0)
83
+ version_str = v_dir[1:] # Remove 'v' prefix
84
+ major, minor = map(int, version_str.split('.'))
85
+ versions.append((major, minor, v_dir))
86
+ except ValueError:
87
+ logger.warning(f"⚠️ Could not parse version: {v_dir}")
88
+ continue
89
+
90
+ if not versions:
91
+ logger.warning("⚠️ No valid efficiency versions found")
92
+ return None
93
+
94
+ # Sort by major.minor version
95
+ versions.sort(key=lambda x: (x[0], x[1]))
96
+ latest_version = versions[-1][2] # Get the version string
97
+
98
+ logger.info(f"✅ Latest efficiency model version: {latest_version}")
99
+ return latest_version
100
+
101
+ except Exception as e:
102
+ logger.error(f"❌ Error getting latest efficiency version: {e}")
103
+ return None
104
+
105
+ def download_efficiency_model(version: Optional[str] = None) -> bool:
106
+ """Download the specified version of the fuel efficiency model"""
107
+ try:
108
+ hf_token = os.getenv("HF_TOKEN")
109
+ if not hf_token:
110
+ logger.error("❌ HF_TOKEN not set")
111
+ return False
112
+
113
+ if version is None:
114
+ version = get_latest_efficiency_version()
115
+ if version is None:
116
+ logger.error("❌ Could not determine latest efficiency version")
117
+ return False
118
+
119
+ logger.info(f"📥 Downloading efficiency model version: {version}")
120
+
121
+ # Download each required file
122
+ for filename in EFFICIENCY_FILES:
123
+ try:
124
+ file_path = hf_hub_download(
125
+ repo_id=EFFICIENCY_REPO_ID,
126
+ filename=f"{version}/{filename}",
127
+ token=hf_token,
128
+ local_dir=EFFICIENCY_MODEL_DIR,
129
+ local_dir_use_symlinks=False
130
+ )
131
+ logger.info(f"✅ Downloaded: {filename}")
132
+
133
+ except Exception as e:
134
+ logger.error(f"❌ Failed to download {filename}: {e}")
135
+ return False
136
+
137
+ logger.info(f"✅ Efficiency model {version} downloaded successfully")
138
+ return True
139
+
140
+ except Exception as e:
141
+ logger.error(f"❌ Error downloading efficiency model: {e}")
142
+ return False
143
+
144
+ def download_latest_efficiency_models() -> bool:
145
+ """Download the latest fuel efficiency model files"""
146
+ try:
147
+ logger.info("🚀 Starting efficiency model download...")
148
+
149
+ # Get latest version
150
+ latest_version = get_latest_efficiency_version()
151
+ if latest_version is None:
152
+ logger.error("❌ Could not determine latest efficiency version")
153
+ return False
154
+
155
+ # Download the model
156
+ success = download_efficiency_model(latest_version)
157
+ if success:
158
+ logger.info("✅ Latest efficiency model downloaded successfully")
159
+ else:
160
+ logger.error("❌ Failed to download latest efficiency model")
161
+
162
+ return success
163
+
164
+ except Exception as e:
165
+ logger.error(f"❌ Error in download_latest_efficiency_models: {e}")
166
+ return False
167
+
168
+ def load_efficiency_model():
169
+ """Load the efficiency model from local storage"""
170
+ try:
171
+ model_path = EFFICIENCY_MODEL_DIR / "efficiency_model.joblib"
172
+ meta_path = EFFICIENCY_MODEL_DIR / "efficiency_meta.json"
173
+
174
+ if not model_path.exists():
175
+ logger.error(f"❌ Efficiency model not found at {model_path}")
176
+ return None, None
177
+
178
+ # Load model
179
+ model_artifacts = joblib.load(model_path)
180
+
181
+ # Load metadata if available
182
+ metadata = None
183
+ if meta_path.exists():
184
+ import json
185
+ with open(meta_path, 'r') as f:
186
+ metadata = json.load(f)
187
+
188
+ logger.info("✅ Efficiency model loaded successfully")
189
+ return model_artifacts, metadata
190
+
191
+ except Exception as e:
192
+ logger.error(f"❌ Error loading efficiency model: {e}")
193
+ return None, None
194
+
195
+ def check_efficiency_model_exists() -> bool:
196
+ """Check if efficiency model files exist locally"""
197
+ model_path = EFFICIENCY_MODEL_DIR / "efficiency_model.joblib"
198
+ return model_path.exists()
199
+
200
+ if __name__ == "__main__":
201
+ # Test the download functionality
202
+ success = download_latest_efficiency_models()
203
+ if success:
204
+ print("✅ Efficiency model download test successful")
205
+ else:
206
+ print("❌ Efficiency model download test failed")
utils/efficiency_labeler.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fuel Efficiency Labeler
3
+ Provides fuel efficiency scoring for OBD data using the trained model
4
+ Similar to utils/dbehavior_labeler.py but for fuel efficiency scoring
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ import joblib
10
+ import numpy as np
11
+ import pandas as pd
12
+ from typing import List, Optional, Dict, Any, Tuple
13
+ from pathlib import Path
14
+
15
+ logger = logging.getLogger("efficiency-labeler")
16
+ logger.setLevel(logging.INFO)
17
+ if not logger.handlers:
18
+ handler = logging.StreamHandler()
19
+ handler.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
20
+ logger.addHandler(handler)
21
+
22
+ # Constants
23
+ KMH_TO_MS = 1000.0/3600.0
24
+ SEED = 42
25
+
26
+ class EfficiencyLabeler:
27
+ """
28
+ Fuel efficiency scorer for OBD data using machine learning model.
29
+ Provides drive-level efficiency scores (0-100%) for entire drives.
30
+ """
31
+
32
+ _instance = None
33
+ _model_artifacts = None
34
+ _metadata = None
35
+ _initialized = False
36
+
37
+ def __init__(self):
38
+ if not EfficiencyLabeler._initialized:
39
+ self._load_model()
40
+ EfficiencyLabeler._initialized = True
41
+
42
+ @classmethod
43
+ def get(cls):
44
+ """Get singleton instance"""
45
+ if cls._instance is None:
46
+ cls._instance = cls()
47
+ return cls._instance
48
+
49
+ def _load_model(self):
50
+ """Load the efficiency model and metadata"""
51
+ try:
52
+ from utils.efficiency_download import load_efficiency_model, check_efficiency_model_exists
53
+
54
+ # Check if model exists locally
55
+ if not check_efficiency_model_exists():
56
+ logger.warning("⚠️ Efficiency model not found locally, attempting download...")
57
+ from utils.efficiency_download import download_latest_efficiency_models
58
+ success = download_latest_efficiency_models()
59
+ if not success:
60
+ raise RuntimeError("Failed to download efficiency model")
61
+
62
+ # Load model
63
+ model_artifacts, metadata = load_efficiency_model()
64
+ if model_artifacts is None:
65
+ raise RuntimeError("Failed to load efficiency model")
66
+
67
+ EfficiencyLabeler._model_artifacts = model_artifacts
68
+ EfficiencyLabeler._metadata = metadata
69
+
70
+ logger.info(f"✅ Efficiency model loaded | kind: {model_artifacts.get('model_kind', 'unknown')}")
71
+ logger.info(f"📊 Model features: {len(model_artifacts.get('feature_names', []))}")
72
+
73
+ except Exception as e:
74
+ logger.error(f"❌ Error loading efficiency model: {e}")
75
+ raise
76
+
77
+ def _ensure_dt(self, s):
78
+ """Ensure datetime conversion"""
79
+ return pd.to_datetime(s, errors="coerce")
80
+
81
+ def _infer_base_interval_seconds(self, ts, fallback=1.0):
82
+ """Infer base interval from timestamps"""
83
+ ts = pd.to_datetime(ts, errors="coerce")
84
+ dt = ts.diff().dt.total_seconds().dropna()
85
+ med = float(np.nanmedian(dt)) if len(dt) else fallback
86
+ return fallback if (not np.isfinite(med) or med <= 0) else med
87
+
88
+ def _rows_for(self, seconds, base_sec):
89
+ """Calculate number of rows for given time window"""
90
+ return max(3, int(round(seconds / max(1e-3, base_sec))))
91
+
92
+ def _add_basic_derivatives(self, d):
93
+ """Add basic derivatives (acceleration, jerk, distance)"""
94
+ d = d.copy()
95
+ d["timestamp"] = self._ensure_dt(d["timestamp"])
96
+ d = d.dropna(subset=["timestamp"]).sort_values("timestamp")
97
+ base = self._infer_base_interval_seconds(d["timestamp"], 1.0)
98
+
99
+ # Convert numeric columns
100
+ for c in ["SPEED","RPM","MAF","ENGINE_LOAD","THROTTLE_POS"]:
101
+ if c in d.columns:
102
+ d[c] = pd.to_numeric(d[c], errors="coerce")
103
+
104
+ # Convert speed to m/s
105
+ if "SPEED_ms" not in d.columns:
106
+ d["SPEED_ms"] = (d["SPEED"] * KMH_TO_MS) if "SPEED" in d.columns else np.nan
107
+
108
+ # Calculate derivatives
109
+ d["ACCEL"] = d["SPEED_ms"].diff()/max(base,1e-3)
110
+ d["JERK"] = d["ACCEL"].diff()/max(base,1e-3)
111
+
112
+ # Calculate distance
113
+ dt = d["timestamp"].diff().dt.total_seconds().fillna(0).clip(lower=0, upper=10*base)
114
+ d["dist_m"] = d["SPEED_ms"] * dt
115
+
116
+ return d
117
+
118
+ def _idle_rule(self, d, thr):
119
+ """Apply idle detection rule"""
120
+ speed_low = (d["SPEED_ms"].abs() <= thr.get("SPEED_IDLE_MPS", 0.6))
121
+ thr_low = (d["THROTTLE_POS"] <= thr.get("THR_LOW_Q10", 0.0)) if "THROTTLE_POS" in d else True
122
+ load_low = (d["ENGINE_LOAD"] <= thr.get("LOAD_LOW_Q15", 0.0)) if "ENGINE_LOAD" in d else True
123
+ maf_low = (d["MAF"] <= thr.get("MAF_LOW_Q10", 0.0)) if "MAF" in d else True
124
+ accel_low = (d["ACCEL"].abs() <= thr.get("ACCEL_LOW_Q20", 0.0))
125
+
126
+ mask = (speed_low & thr_low & load_low & maf_low & accel_low).astype(int)
127
+ k = 5
128
+ return (mask.rolling(k, center=True, min_periods=1).median().round().astype(bool)
129
+ if len(mask) >= k else mask.astype(bool))
130
+
131
+ def _sharp_mask_from_thresholds(self, d, thr):
132
+ """Detect sharp acceleration/deceleration events"""
133
+ thr_a = thr.get("ACCEL_HIGH_Q85",
134
+ np.nanquantile(d["ACCEL"].abs().dropna(), 0.85) if d["ACCEL"].notna().any() else 0.3)
135
+ thr_j = thr.get("JERK_HIGH_Q90",
136
+ np.nanquantile(d["JERK"].abs().dropna(), 0.90) if d["JERK"].notna().any() else 0.5)
137
+ return (d["ACCEL"].abs() > thr_a) | (d["JERK"].abs() > thr_j)
138
+
139
+ def _agg_for_ml_drive(self, g, thr):
140
+ """Aggregate drive-level features for ML model"""
141
+ g = self._add_basic_derivatives(g.copy())
142
+ base = self._infer_base_interval_seconds(g["timestamp"], 1.0)
143
+ g["IDLE_RULE"] = self._idle_rule(g, thr)
144
+
145
+ dt = g["timestamp"].diff().dt.total_seconds().fillna(0).clip(lower=0, upper=10*base)
146
+ T = float(dt.sum())
147
+ mins = max(1e-6, T/60)
148
+
149
+ sharp = self._sharp_mask_from_thresholds(g, thr).values
150
+ edges = np.flatnonzero(np.diff(np.r_[False, sharp, False]))
151
+ sharp_freq_pm = (len(edges)//2)/mins
152
+
153
+ def q(s, p):
154
+ s = pd.to_numeric(s, errors="coerce")
155
+ return float(np.nanquantile(s, p)) if s.notna().any() else 0.0
156
+
157
+ rpm90, maf90 = thr.get("RPM90", np.nan), thr.get("MAF90", np.nan)
158
+ frac_rpm90 = float((g["RPM"] >= rpm90).mean()) if ("RPM" in g and np.isfinite(rpm90)) else 0.0
159
+ frac_maf90 = float((g["MAF"] >= maf90).mean()) if ("MAF" in g and np.isfinite(maf90)) else 0.0
160
+
161
+ W10 = self._rows_for(10, base)
162
+ speed_cv = float((g["SPEED_ms"].rolling(W10,1).std()/(g["SPEED_ms"].rolling(W10,1).mean()+1e-6)).mean())
163
+
164
+ return {
165
+ "duration_min": max(1e-6, T/60),
166
+ "distance_km": g["dist_m"].sum()/1000.0,
167
+ "speed_mean": float(g["SPEED_ms"].mean()),
168
+ "speed_q90": q(g["SPEED_ms"], 0.90),
169
+ "speed_cv": speed_cv,
170
+ "accel_q90": q(g["ACCEL"].abs(), 0.90),
171
+ "jerk_q90": q(g["JERK"].abs(), 0.90),
172
+ "sharp_freq_pm": sharp_freq_pm,
173
+ "idle_frac": float(g["IDLE_RULE"].mean()),
174
+ "idle_epm": (len(np.flatnonzero(np.diff(np.r_[False, g['IDLE_RULE'].values, False])))//2)/mins,
175
+ "rpm_q90": q(g["RPM"], 0.90) if "RPM" in g else 0.0,
176
+ "maf_q90": q(g["MAF"], 0.90) if "MAF" in g else 0.0,
177
+ "load_q85": q(g["ENGINE_LOAD"], 0.85) if "ENGINE_LOAD" in g else 0.0,
178
+ "thr_q85": q(g["THROTTLE_POS"], 0.85) if "THROTTLE_POS" in g else 0.0,
179
+ "frac_rpm90": frac_rpm90,
180
+ "frac_maf90": frac_maf90,
181
+ "fuel_intensity": (q(g["RPM"], 0.90)*q(g["MAF"], 0.90)) if (("RPM" in g) and ("MAF" in g)) else 0.0
182
+ }
183
+
184
+ def _align_to_schema(self, feats, art):
185
+ """Align features to model schema"""
186
+ x = pd.DataFrame([feats])
187
+ for c in art["feature_names"]:
188
+ if c not in x.columns:
189
+ x[c] = 0.0
190
+ x = x[art["feature_names"]]
191
+ if len(art["num_cols"]):
192
+ x.loc[:, art["num_cols"]] = art["scaler"].transform(x[art["num_cols"]])
193
+ return x
194
+
195
+ def _predict_drive(self, df_drive):
196
+ """Predict efficiency for a single drive"""
197
+ art = EfficiencyLabeler._model_artifacts
198
+ thr = art["thr"]
199
+
200
+ feats = self._agg_for_ml_drive(df_drive, thr)
201
+ x = self._align_to_schema(feats, art)
202
+
203
+ # Get model
204
+ mdl = art["rf"] if art.get("model_kind") == "rf" else art["gbm"]
205
+ raw = float(mdl.predict(x)[0])
206
+
207
+ # Apply quantile-mapping calibration
208
+ if art.get("calib", {}).get("type") == "qmap":
209
+ rq = np.array(art["calib"]["rq"])
210
+ yq = np.array(art["calib"]["yq"])
211
+
212
+ # Ensure strictly increasing rq for stable interpolation
213
+ for i in range(1, len(rq)):
214
+ if rq[i] <= rq[i-1]:
215
+ rq[i] = rq[i-1] + 1e-6
216
+
217
+ pred = float(np.clip(np.interp(raw, rq, yq), 0, 100))
218
+ else:
219
+ pred = float(np.clip(raw, 0, 100))
220
+
221
+ return pred, raw
222
+
223
+ def predict_df(self, df: pd.DataFrame) -> List[float]:
224
+ """
225
+ Predict fuel efficiency for a DataFrame containing OBD data.
226
+ Returns a single efficiency score (0-100%) for the entire drive.
227
+
228
+ Args:
229
+ df: DataFrame with OBD data including timestamp, SPEED, RPM, MAF, etc.
230
+
231
+ Returns:
232
+ List containing single efficiency score for the drive
233
+ """
234
+ try:
235
+ if EfficiencyLabeler._model_artifacts is None:
236
+ raise RuntimeError("Efficiency model not loaded")
237
+
238
+ if len(df) < 5:
239
+ logger.warning("⚠️ Drive too short for efficiency prediction")
240
+ return [0.0] # Return minimum efficiency for very short drives
241
+
242
+ # Ensure timestamp column exists
243
+ if "timestamp" not in df.columns:
244
+ logger.error("❌ No timestamp column found")
245
+ return [0.0]
246
+
247
+ # Predict efficiency for the entire drive
248
+ efficiency_score, raw_score = self._predict_drive(df)
249
+
250
+ logger.info(f"📊 Drive efficiency: {efficiency_score:.1f}% (raw: {raw_score:.3f})")
251
+ return [efficiency_score]
252
+
253
+ except Exception as e:
254
+ logger.error(f"❌ Error predicting efficiency: {e}")
255
+ return [0.0] # Return minimum efficiency on error
256
+
257
+ def get_model_info(self) -> Dict[str, Any]:
258
+ """Get information about the loaded model"""
259
+ if EfficiencyLabeler._model_artifacts is None:
260
+ return {"error": "Model not loaded"}
261
+
262
+ art = EfficiencyLabeler._model_artifacts
263
+ return {
264
+ "model_kind": art.get("model_kind", "unknown"),
265
+ "feature_count": len(art.get("feature_names", [])),
266
+ "features": art.get("feature_names", []),
267
+ "calibration_type": art.get("calib", {}).get("type", "none"),
268
+ "oof_stats": art.get("oof_stats", {}),
269
+ "metadata": EfficiencyLabeler._metadata
270
+ }
271
+
272
+ # Convenience function for backward compatibility
273
+ def predict_efficiency(df: pd.DataFrame) -> List[float]:
274
+ """Convenience function to predict efficiency"""
275
+ labeler = EfficiencyLabeler.get()
276
+ return labeler.predict_df(df)
277
+
278
+ if __name__ == "__main__":
279
+ # Test the efficiency labeler
280
+ try:
281
+ labeler = EfficiencyLabeler.get()
282
+ print("✅ Efficiency labeler initialized successfully")
283
+
284
+ # Print model info
285
+ info = labeler.get_model_info()
286
+ print(f"📊 Model info: {info}")
287
+
288
+ except Exception as e:
289
+ print(f"❌ Error initializing efficiency labeler: {e}")