LiamKhoaLe committed on
Commit
ff65bb4
·
0 Parent(s):

Initial commit: OBD Logger with RLHF training system


- FastAPI-based OBD-II data processing
- Real-time data ingestion and cleaning
- Firebase and MongoDB integration
- RLHF training pipeline with versioned models
- Docker deployment ready
- Security: No hardcoded tokens

.DS_Store ADDED
Binary file (6.15 kB).
 
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,11 @@
+ .env
+ token.json
+ service.json
+ firebase.json
+
+ # Security - prevent token leaks
+ *.token
+ *.key
+ *secret*
+ *credential*
+ *password*
Dockerfile ADDED
@@ -0,0 +1,43 @@
+ FROM python:3.11-slim
+
+ # ── Create and switch to non-root user ──
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # ── Set environment and working directory ──
+ ENV HOME=/home/user
+ WORKDIR $HOME/app
+
+ # ── Upgrade pip and install dependencies ──
+ COPY --chown=user requirements.txt .
+ RUN pip install --upgrade pip
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # ── Install latest versions for UL model inference ──
+ RUN pip install --no-cache-dir huggingface_hub xgboost joblib scikit-learn
+
+ # ── Pre-mount GDrive (no-op if creds not found) ──
+ COPY --chown=user utils/mount_drive.py .
+ RUN python mount_drive.py || true
+
+ # ── Copy application source ──
+ COPY --chown=user . .
+
+ # ── Create required folders ──
+ RUN mkdir -p $HOME/app/logs \
+     $HOME/app/cache \
+     $HOME/app/cache/obd_data \
+     $HOME/app/cache/obd_data/plots \
+     $HOME/app/models/ul
+
+ # ── Environment variables for HuggingFace model ──
+ ENV MODEL_DIR=$HOME/app/models/ul
+ ENV HF_MODEL_REPO=BinKhoaLe1812/Driver_Behavior_OBD
+
+ # ── Models will be downloaded at runtime when app starts ──
+
+ # ── Default port ──
+ EXPOSE 7860
+
+ # ── Start app ──
+ CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
GOOGLE_DRIVE_SETUP.md ADDED
@@ -0,0 +1,336 @@
+ # Google Drive Integration Setup Guide
+
+ This guide explains how to set up Google Drive integration for the OBD Logger application.
+
+ ## Prerequisites
+
+ 1. **Google Cloud Platform Account**: You need a Google Cloud Platform account
+ 2. **Google Drive API**: Enable the Google Drive API in your project
+ 3. **Service Account**: Create a service account with appropriate permissions
+ 4. **Python Dependencies**: Install the required packages
+
+ ## Installation
+
+ ### 1. Install Dependencies
+
+ The required packages are already included in `requirements.txt`:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ Required packages:
+ - `google-auth`
+ - `google-auth-httplib2`
+ - `google-auth-oauthlib`
+ - `google-api-python-client`
+
+ ### 2. Environment Variables
+
+ Create a `.env` file in your project root with the following variables:
+
+ ```bash
+ # Google Drive Configuration
+ GDRIVE_CREDENTIALS_JSON={"type":"service_account","project_id":"your-project","private_key_id":"...","private_key":"...","client_email":"...","client_id":"...","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_x509_cert_url":"..."}
+
+ # Optional: Custom Google Drive Folder ID
+ GDRIVE_FOLDER_ID=1r-wefqKbK9k9BeYDW1hXRbx4B-0Fvj5P
+ ```
+
+ ## Google Cloud Platform Setup
+
+ ### 1. Create a New Project
+
+ 1. Go to [Google Cloud Console](https://console.cloud.google.com/)
+ 2. Click "Select a project" → "New Project"
+ 3. Enter a project name (e.g., "OBD-Logger-Drive")
+ 4. Click "Create"
+
+ ### 2. Enable Google Drive API
+
+ 1. In your project, go to "APIs & Services" → "Library"
+ 2. Search for "Google Drive API"
+ 3. Click on "Google Drive API"
+ 4. Click "Enable"
+
+ ### 3. Create Service Account
+
+ 1. Go to "APIs & Services" → "Credentials"
+ 2. Click "Create Credentials" → "Service Account"
+ 3. Fill in the service account details:
+    - **Name**: `obd-logger-drive`
+    - **Description**: `Service account for OBD Logger Google Drive operations`
+ 4. Click "Create and Continue"
+ 5. For roles, select "Editor" (or create a custom role with minimal permissions)
+ 6. Click "Continue" → "Done"
+
+ ### 4. Generate Service Account Key
+
+ 1. In the service accounts list, click on your newly created service account
+ 2. Go to the "Keys" tab
+ 3. Click "Add Key" → "Create New Key"
+ 4. Choose "JSON" format
+ 5. Click "Create" - this will download a JSON file
+ 6. **Important**: Keep this file secure and never commit it to version control
+
+ ### 5. Share Google Drive Folder
+
+ 1. Go to [Google Drive](https://drive.google.com/)
+ 2. Create a new folder or use an existing one
+ 3. Right-click the folder → "Share"
+ 4. Add your service account email (found in the JSON file under `client_email`)
+ 5. Give it "Editor" permissions
+ 6. Copy the folder ID from the URL (the long string after `/folders/`; for example, in `https://drive.google.com/drive/folders/1r-wefqKbK9k9BeYDW1hXRbx4B-0Fvj5P` the ID is the final path segment)
+
+ ## Configuration
+
+ ### 1. Set Up Credentials
+
+ Copy the contents of your downloaded JSON file and set it as the `GDRIVE_CREDENTIALS_JSON` environment variable:
+
+ ```bash
+ export GDRIVE_CREDENTIALS_JSON='{"type":"service_account","project_id":"your-project",...}'
+ ```
+
+ Or add it to your `.env` file.
+
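+ For illustration, a minimal sketch of how these credentials can be turned into a
+ Drive client in Python (the actual `drive_saver.py` implementation is not shown
+ in this commit; the scope below is an assumption):
+
+ ```python
+ import json, os
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+
+ # Parse the service-account JSON straight from the environment variable
+ info = json.loads(os.environ["GDRIVE_CREDENTIALS_JSON"])
+ creds = service_account.Credentials.from_service_account_info(
+     info, scopes=["https://www.googleapis.com/auth/drive"])
+ service = build("drive", "v3", credentials=creds)
+ ```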
+ ### 2. Configure Folder ID
+
+ Set the `GDRIVE_FOLDER_ID` environment variable to your target folder ID:
+
+ ```bash
+ export GDRIVE_FOLDER_ID="your_folder_id_here"
+ ```
+
+ ## Usage
+
+ ### Automatic Saving
+
+ The application automatically uploads cleaned CSV files to Google Drive after processing.
+
+ ### Manual Operations
+
+ #### Initialize Drive Service
+
+ ```python
+ from drive_saver import DriveSaver
+
+ # Create instance
+ drive_saver = DriveSaver()
+
+ # Check if service is available
+ if drive_saver.is_service_available():
+     print("✅ Google Drive service ready")
+ else:
+     print("❌ Google Drive service not available")
+ ```
+
+ #### Upload CSV File
+
+ ```python
+ # Upload to default folder
+ success = drive_saver.upload_csv_to_drive("path/to/your/file.csv")
+
+ # Upload to specific folder
+ success = drive_saver.upload_csv_to_drive("path/to/your/file.csv", "custom_folder_id")
+ ```
+
+ #### Configuration Management
+
+ ```python
+ # Get current folder ID
+ current_folder = drive_saver.get_folder_id()
+
+ # Set new folder ID
+ drive_saver.set_folder_id("new_folder_id")
+ ```
+
+ ### Legacy Functions (Backward Compatibility)
+
+ The module maintains backward compatibility with existing code:
+
+ ```python
+ from drive_saver import get_drive_service, upload_to_folder
+
+ # Legacy usage
+ service = get_drive_service()
+ result = upload_to_folder(service, "file.csv", "folder_id")
+ ```
+
+ ## File Management
+
+ ### Supported File Types
+
+ - **CSV files**: Primary format for OBD data
+ - **Text files**: Other data formats
+ - **Binary files**: Limited support
+
+ ### File Naming
+
+ Files are uploaded with their original names. The system automatically:
+ - Preserves file extensions
+ - Maintains original timestamps
+ - Creates unique names if conflicts exist
+
+ ### Storage Organization
+
+ - **Default folder**: All files go to the configured default folder
+ - **Custom folders**: Specify different folders for different data types
+ - **Session-based**: Files are organized by processing sessions
+
+ ## Error Handling
+
+ ### Common Issues
+
+ 1. **Authentication Errors**
+    - Check service account credentials
+    - Verify API is enabled
+    - Ensure service account has proper permissions
+
+ 2. **Permission Errors**
+    - Verify folder sharing settings
+    - Check service account email is added to folder
+    - Ensure "Editor" or higher permissions
+
+ 3. **Quota Exceeded**
+    - Monitor Google Drive storage usage
+    - Check API quotas in Google Cloud Console
+    - Consider upgrading storage plan
+
+ ### Troubleshooting
+
+ #### Check Service Status
+
+ ```python
+ from drive_saver import DriveSaver
+
+ saver = DriveSaver()
+ print(f"Service available: {saver.is_service_available()}")
+ print(f"Current folder: {saver.get_folder_id()}")
+ ```
+
+ #### Test Connection
+
+ ```python
+ # Try uploading a small test file
+ test_success = drive_saver.upload_csv_to_drive("test.csv")
+ if test_success:
+     print("✅ Connection test successful")
+ else:
+     print("❌ Connection test failed")
+ ```
+
+ ## Security Best Practices
+
+ ### Credential Management
+
+ - **Never commit** service account JSON to version control
+ - **Use environment variables** for sensitive data
+ - **Rotate keys** regularly
+ - **Limit permissions** to the minimum required
+
+ ### Access Control
+
+ - **Restrict folder access** to necessary users only
+ - **Monitor access logs** in Google Drive
+ - **Use organization policies** for additional security
+ - **Consider VPC Service Controls** for production
+
+ ### Network Security
+
+ - **HTTPS only** for all API communications
+ - **Firewall rules** to restrict access if needed
+ - **Audit logs** for suspicious activity
+
+ ## Performance Optimization
+
+ ### Upload Strategies
+
+ - **Batch uploads** for multiple files
+ - **Compression** for large CSV files
+ - **Async processing** for non-blocking operations
+
+ ### Monitoring
+
+ - **Track upload success rates**
+ - **Monitor file sizes and upload times**
+ - **Set up alerts** for failures
+
+ ## Integration with OBD Logger
+
+ ### Automatic Uploads
+
+ The system automatically uploads files after:
+ 1. Data processing completion
+ 2. CSV cleaning and validation
+ 3. Feature engineering
+ 4. Quality checks
+
+ ### File Naming Convention
+
+ Uploaded files follow the pattern:
+ ```
+ cleaned_{timestamp}.csv
+ ```
+
+ Where `{timestamp}` is the normalized timestamp from the processing session.
+
+ ### Error Recovery
+
+ If uploads fail:
+ - Files remain in local storage
+ - Errors are logged for debugging
+ - Processing continues without interruption
+ - Manual retry options are available
+
+ ## Advanced Configuration
+
+ ### Custom Scopes
+
+ Modify the authentication scopes in `drive_saver.py`:
+
+ ```python
+ scopes = [
+     "https://www.googleapis.com/auth/drive",
+     "https://www.googleapis.com/auth/drive.file"  # More restrictive
+ ]
+ ```
+
+ ### Retry Logic
+
+ The system includes automatic retry logic (sketched after this list) for:
+ - Network timeouts
+ - Rate limiting
+ - Temporary service unavailability
+
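+ A minimal sketch of that retry pattern, assuming the `DriveSaver` API shown
+ above (the actual retry code in `drive_saver.py` is not reproduced here):
+
+ ```python
+ import time
+
+ def upload_with_retry(saver, path, attempts=3, backoff_s=2.0):
+     """Retry an upload with exponential backoff on transient failures."""
+     for attempt in range(attempts):
+         if saver.upload_csv_to_drive(path):
+             return True
+         time.sleep(backoff_s * (2 ** attempt))  # wait 2s, 4s, 8s, ...
+     return False
+ ```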
+ ### Logging
+
+ Comprehensive logging includes:
+ - Upload success/failure
+ - File details and metadata
+ - Performance metrics
+ - Error details for debugging
+
+ ## Support and Maintenance
+
+ ### Regular Tasks
+
+ 1. **Monitor storage usage** in Google Drive
+ 2. **Check API quotas** in Google Cloud Console
+ 3. **Review access logs** for security
+ 4. **Update service account keys** as needed
+
+ ### Troubleshooting Resources
+
+ - [Google Drive API Documentation](https://developers.google.com/drive/api)
+ - [Google Cloud Console](https://console.cloud.google.com/)
+ - [Google Drive Help](https://support.google.com/drive/)
+ - Application logs and error messages
+
+ ### Getting Help
+
+ For issues with the OBD Logger integration:
+ 1. Check application logs
+ 2. Verify environment variables
+ 3. Test with simple file uploads
+ 4. Review Google Cloud Console for errors
MONGODB_SETUP.md ADDED
@@ -0,0 +1,133 @@
+ # MongoDB Integration Setup Guide
+
+ This guide explains how to set up MongoDB integration for the OBD Logger application.
+
+ ## Prerequisites
+
+ 1. **MongoDB Atlas Account**: You need a MongoDB Atlas account (free tier available)
+ 2. **Python Dependencies**: Install the required packages
+
+ ## Installation
+
+ ### 1. Install Dependencies
+
+ ```bash
+ pip install pymongo
+ ```
+
+ Or update your `requirements.txt` and run:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ### 2. Environment Variables
+
+ Create a `.env` file in your project root with the following variables:
+
+ ```bash
+ # Google Drive Configuration
+ GDRIVE_CREDENTIALS_JSON={"type":"service_account","project_id":"your-project",...}
+
+ # MongoDB Atlas Connection String
+ MONGO_URI=mongodb+srv://username:password@cluster.mongodb.net/obd_logger?retryWrites=true&w=majority
+
+ # Optional: Custom Google Drive Folder ID
+ GDRIVE_FOLDER_ID=1r-wefqKbK9k9BeYDW1hXRbx4B-0Fvj5P
+ ```
+
+ ## MongoDB Atlas Setup
+
+ ### 1. Create Cluster
+ 1. Go to [MongoDB Atlas](https://cloud.mongodb.com/)
+ 2. Create a free cluster
+ 3. Choose your preferred cloud provider and region
+
+ ### 2. Database Access
+ 1. Go to "Database Access" in the left sidebar
+ 2. Click "Add New Database User"
+ 3. Choose "Password" authentication
+ 4. Set username and password (save these!)
+ 5. Set privileges to "Read and write to any database"
+
+ ### 3. Network Access
+ 1. Go to "Network Access" in the left sidebar
+ 2. Click "Add IP Address"
+ 3. For development: Click "Allow Access from Anywhere" (0.0.0.0/0)
+ 4. For production: Add your specific IP addresses
+
+ ### 4. Get Connection String
+ 1. Go to "Clusters" in the left sidebar
+ 2. Click "Connect" on your cluster
+ 3. Choose "Connect your application"
+ 4. Copy the connection string
+ 5. Replace `<username>`, `<password>`, and `<dbname>` with your values
+
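+ To confirm the URI works before running the app, a quick hedged check with
+ `pymongo` (the timeout value is an arbitrary choice):
+
+ ```python
+ import os
+ from pymongo import MongoClient
+
+ client = MongoClient(os.environ["MONGO_URI"], serverSelectionTimeoutMS=5000)
+ client.admin.command("ping")  # raises if the cluster is unreachable
+ print("MongoDB reachable; databases:", client.list_database_names())
+ ```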
+ ## Usage
+
+ ### Automatic Saving
+ The application automatically saves cleaned data to both Google Drive and MongoDB after processing.
+
+ ### Manual Operations
+
+ #### Check MongoDB Status
+ ```bash
+ GET /mongo/status
+ ```
+
+ #### Get Session Summary
+ ```bash
+ GET /mongo/sessions
+ ```
+
+ #### Query Data
+ ```bash
+ GET /mongo/query?session_id=session_20231201_120000&driving_style=aggressive&limit=100
+ ```
+
+ #### Save CSV Directly to MongoDB
+ ```bash
+ POST /mongo/save-csv
+ # Upload CSV file with optional session_id parameter
+ ```
+
+ ## Data Structure
+
+ Each document in MongoDB contains (see the example below):
+ - All OBD sensor data from the original CSV
+ - `session_id`: Unique identifier for the data session
+ - `imported_at`: Timestamp when the data was imported
+ - `record_index`: Original row index from the CSV
+ - `timestamp`: OBD data timestamp (converted to datetime)
+ - `driving_style`: Driving style classification
+
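+ An illustrative document (field values are invented for this example; sensor
+ columns follow the CSV headers):
+
+ ```json
+ {
+   "session_id": "session_20231201_120000",
+   "imported_at": "2023-12-01T12:00:05",
+   "record_index": 0,
+   "timestamp": "2023-12-01T11:58:42",
+   "driving_style": "aggressive",
+   "SPEED": 54.0,
+   "RPM": 2850.0,
+   "THROTTLE_POS": 42.5
+ }
+ ```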
+ ## Performance Features
+
+ - **Indexes**: Automatic creation of indexes on `timestamp`, `driving_style`, and `session_id` (equivalent `pymongo` calls are sketched below)
+ - **Connection Pooling**: Efficient connection management
+ - **Batch Operations**: Bulk insert for better performance
+ - **Error Handling**: Graceful fallback if MongoDB is unavailable
+
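+ For reference, the equivalent index creation in `pymongo` (the collection name
+ here is a placeholder; the application sets these indexes up automatically):
+
+ ```python
+ from pymongo import ASCENDING
+
+ records = client["obd_logger"]["records"]  # hypothetical collection name
+ records.create_index([("timestamp", ASCENDING)])
+ records.create_index([("driving_style", ASCENDING)])
+ records.create_index([("session_id", ASCENDING)])
+ ```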
+ ## Troubleshooting
+
+ ### Connection Issues
+ 1. Check your MongoDB URI format
+ 2. Verify network access settings in Atlas
+ 3. Check username/password credentials
+ 4. Ensure the cluster is running
+
+ ### Data Import Issues
+ 1. Check the CSV file format
+ 2. Verify data types in your CSV
+ 3. Check application logs for specific error messages
+
+ ### Performance Issues
+ 1. Monitor database indexes
+ 2. Check connection pool settings
+ 3. Consider data partitioning for large datasets
+
+ ## Security Notes
+
+ - Never commit your `.env` file to version control
+ - Use strong passwords for database users
+ - Restrict network access to necessary IP addresses only
+ - Consider using VPC peering for production deployments
OBD/obd_analyzer.py ADDED
@@ -0,0 +1,215 @@
+ import pandas as pd
+ import numpy as np
+ import argparse
+ import os
+
+
+ DRIVING_STYLE_PASSIVE = "Passive"
+ DRIVING_STYLE_MODERATE = "Moderate"
+ DRIVING_STYLE_AGGRESSIVE = "Aggressive"
+ DRIVING_STYLE_UNKNOWN = "UNKNOWN_STYLE"
+
+ ROAD_TYPE_LOCAL = "Local"
+ ROAD_TYPE_MAIN = "Main"
+ ROAD_TYPE_HIGHWAY = "Highway"
+ ROAD_TYPE_UNKNOWN = "UNKNOWN_ROAD"
+
+ TRAFFIC_CONDITION_LIGHT = "Light"
+ TRAFFIC_CONDITION_MODERATE = "Moderate"
+ TRAFFIC_CONDITION_HEAVY = "Heavy"
+ TRAFFIC_CONDITION_UNKNOWN = "UNKNOWN_TRAFFIC"
+
+
+ KPH_TO_MPS = 1 / 3.6
+ G_ACCELERATION = 9.80665
+ MIN_MOVING_SPEED_KPH = 2  # vehicle must be moving
+
+ AGGRESSIVE_RPM_ENTRY_THRESHOLD = 2700
+ AGGRESSIVE_THROTTLE_ENTRY_THRESHOLD = 40
+ AGGRESSIVE_RPM_HOLD_THRESHOLD = 2300
+ HARSH_BRAKING_THRESHOLD_G = -0.25
+
+ # Rate-of-change (RoC) thresholds
+ AGGRESSIVE_RPM_ROC_THRESHOLD = 500
+ AGGRESSIVE_THROTTLE_ROC_THRESHOLD = 45
+ POSITIVE_ACCEL_FOR_ROC_CHECK_G = 0.1
+
+ MODERATE_RPM_THRESHOLD = 2100
+ MODERATE_THROTTLE_THRESHOLD = 25
+
+ MIN_DATA_POINTS_FOR_ROC = 2
+
+ def load_and_preprocess_data(csv_filepath):
+     """Loads OBD data from CSV and preprocesses it."""
+     if not os.path.exists(csv_filepath):
+         print(f"Error: File not found at {csv_filepath}")
+         return None
+
+     try:
+         df = pd.read_csv(csv_filepath)
+     except Exception as e:
+         print(f"Error loading CSV {csv_filepath}: {e}")
+         return None
+
+     print(f"Successfully loaded {csv_filepath} with {len(df)} rows.")
+
+     if 'timestamp' not in df.columns:
+         print("Error: 'timestamp' column is missing from the CSV.")
+         return None
+
+     df['timestamp'] = pd.to_datetime(df['timestamp'])
+     df = df.sort_values(by='timestamp').reset_index(drop=True)
+
+     df['delta_time_s'] = df['timestamp'].diff().dt.total_seconds()
+     if not df.empty:
+         df.loc[0, 'delta_time_s'] = 0
+     else:
+         # Empty DataFrame: nothing to preprocess
+         return df
+
+     numeric_cols = ['SPEED', 'RPM', 'THROTTLE_POS']
+     for col in numeric_cols:
+         if col in df.columns:
+             df[col] = pd.to_numeric(df[col], errors='coerce')
+         else:
+             print(f"Warning: Column {col} not found. It will be filled with NaN.")
+             df[col] = np.nan
+
+     df[numeric_cols] = df[numeric_cols].ffill().fillna(0)
+
+     if 'SPEED' in df.columns:
+         df['SPEED_mps'] = df['SPEED'] * KPH_TO_MPS
+     else:
+         df['SPEED_mps'] = 0
+
+     if len(df) >= MIN_DATA_POINTS_FOR_ROC:
+         df['acceleration_mps2'] = df['SPEED_mps'].diff() / df['delta_time_s']
+         df['acceleration_mps2'] = df['acceleration_mps2'].replace([np.inf, -np.inf], 0).fillna(0)
+         if not df.empty: df.loc[0, 'acceleration_mps2'] = 0
+         df['acceleration_g'] = df['acceleration_mps2'] / G_ACCELERATION
+         if not df.empty: df.loc[0, 'acceleration_g'] = 0
+         df['acceleration_g'] = df['acceleration_g'].fillna(0)
+
+         if 'RPM' in df.columns:
+             df['RPM_roc'] = df['RPM'].diff() / df['delta_time_s']
+             df['RPM_roc'] = df['RPM_roc'].replace([np.inf, -np.inf], 0).fillna(0)
+             if not df.empty: df.loc[0, 'RPM_roc'] = 0
+         else:
+             df['RPM_roc'] = 0
+
+         if 'THROTTLE_POS' in df.columns:
+             df['THROTTLE_roc'] = df['THROTTLE_POS'].diff() / df['delta_time_s']
+             df['THROTTLE_roc'] = df['THROTTLE_roc'].replace([np.inf, -np.inf], 0).fillna(0)
+             if not df.empty: df.loc[0, 'THROTTLE_roc'] = 0
+         else:
+             df['THROTTLE_roc'] = 0
+     else:
+         # Not enough data for RoC calculations, fill with 0
+         df['acceleration_mps2'] = 0
+         df['acceleration_g'] = 0
+         df['RPM_roc'] = 0
+         df['THROTTLE_roc'] = 0
+         print("Warning: Not enough data points for full RoC calculations. Output might be limited.")
+
+     print("Preprocessing complete.")
+     return df
+
+ def classify_driving_style_stateful(df):
+     required_cols = ['RPM', 'THROTTLE_POS', 'SPEED', 'acceleration_g', 'RPM_roc', 'THROTTLE_roc']
+     if df.empty or not all(col in df.columns for col in required_cols):
+         print(f"Warning: Missing one or more required columns for stateful classification ({', '.join(required_cols)}).")
+         return pd.Series([DRIVING_STYLE_UNKNOWN] * len(df), index=df.index, dtype=str)
+
+     driving_styles = [DRIVING_STYLE_UNKNOWN] * len(df)
+     current_style = DRIVING_STYLE_PASSIVE
+
+     for i in range(len(df)):
+         rpm = df.loc[i, 'RPM']
+         throttle = df.loc[i, 'THROTTLE_POS']
+         speed_kph = df.loc[i, 'SPEED']
+         accel_g = df.loc[i, 'acceleration_g']
+         rpm_roc = df.loc[i, 'RPM_roc']
+         throttle_roc = df.loc[i, 'THROTTLE_roc']
+
+         row_style = DRIVING_STYLE_PASSIVE
+         is_moving = speed_kph > MIN_MOVING_SPEED_KPH
+
+         is_hard_braking_trigger = accel_g < HARSH_BRAKING_THRESHOLD_G and is_moving
+
+         is_high_abs_rpm_throttle_trigger = (rpm > AGGRESSIVE_RPM_ENTRY_THRESHOLD and
+                                             throttle > AGGRESSIVE_THROTTLE_ENTRY_THRESHOLD and
+                                             is_moving)
+
+         is_actively_accelerating = accel_g > POSITIVE_ACCEL_FOR_ROC_CHECK_G
+
+         is_high_roc_trigger = (is_moving and
+                                is_actively_accelerating and
+                                (rpm_roc > AGGRESSIVE_RPM_ROC_THRESHOLD or
+                                 throttle_roc > AGGRESSIVE_THROTTLE_ROC_THRESHOLD))
+
+         is_currently_aggressive_event = is_hard_braking_trigger or is_high_abs_rpm_throttle_trigger or is_high_roc_trigger
+
+         if current_style == DRIVING_STYLE_AGGRESSIVE:
+             if is_currently_aggressive_event:
+                 row_style = DRIVING_STYLE_AGGRESSIVE
+             elif rpm > AGGRESSIVE_RPM_HOLD_THRESHOLD and is_moving:
+                 row_style = DRIVING_STYLE_AGGRESSIVE
+             else:
+                 if (rpm > MODERATE_RPM_THRESHOLD or throttle > MODERATE_THROTTLE_THRESHOLD) and is_moving:
+                     row_style = DRIVING_STYLE_MODERATE
+                 else:
+                     row_style = DRIVING_STYLE_PASSIVE
+         else:
+             if is_currently_aggressive_event:
+                 row_style = DRIVING_STYLE_AGGRESSIVE
+             else:
+                 if (rpm > MODERATE_RPM_THRESHOLD or throttle > MODERATE_THROTTLE_THRESHOLD) and is_moving:
+                     row_style = DRIVING_STYLE_MODERATE
+                 else:
+                     row_style = DRIVING_STYLE_PASSIVE
+
+         driving_styles[i] = row_style
+         current_style = row_style
+
+     print("Stateful driving style classification complete.")
+     return pd.Series(driving_styles, index=df.index)
+
+ def main():
+     parser = argparse.ArgumentParser(description="Analyze OBD CSV log data for driving behavior (stateful).")
+     parser.add_argument("csv_filepath", help="Path to the OBD log CSV file.")
+     parser.add_argument("--output_csv", help="Path to save the analyzed data CSV file.", default=None)
+     args = parser.parse_args()
+
+     df = load_and_preprocess_data(args.csv_filepath)
+
+     if df is None or df.empty:
+         print("No data to process after loading or preprocessing.")
+         return
+
+     df['driving_style_analyzed'] = classify_driving_style_stateful(df)
+
+     print("\n--- Analysis Summary ---")
+     print("Driving Style Distribution (Analyzed):")
+     counts = df['driving_style_analyzed'].value_counts(dropna=False)
+     percentages = df['driving_style_analyzed'].value_counts(normalize=True, dropna=False) * 100
+     summary_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})
+     print(summary_df)
+
+     if args.output_csv:
+         try:
+             output_path = args.output_csv
+             output_dir = os.path.dirname(output_path)
+             if output_dir and not os.path.exists(output_dir):
+                 os.makedirs(output_dir)
+             df.to_csv(output_path, index=False)
+             print(f"\nAnalyzed data saved to {output_path}")
+         except Exception as e:
+             print(f"Error saving output CSV to {args.output_csv}: {e}")
+     else:
+         print("\n--- First 20 Rows of Analyzed Data (showing key fields) ---")
+         display_cols = ['timestamp', 'SPEED', 'RPM', 'THROTTLE_POS', 'acceleration_g', 'driving_style_analyzed']
+         display_cols = [col for col in display_cols if col in df.columns]
+         if display_cols: print(df[display_cols].head(20))
+         else: print("Key display columns not found in DataFrame.")
+
+ if __name__ == "__main__":
+     main()
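+
+ # Example standalone invocation (hypothetical paths, following the logger's
+ # directory layout):
+ #   python obd_analyzer.py logs/OriginalCSV/obd_data_log_20250101_120000.csv \
+ #       --output_csv logs/DuplicateCSV/obd_data_log_20250101_120000_final_analyzed.csv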
OBD/obd_logger.py ADDED
@@ -0,0 +1,374 @@
+ import obd
+ import time
+ import datetime
+ import csv
+ import os
+ from collections import deque
+ import numpy as np
+ import shutil
+ import subprocess
+
+ DRIVING_STYLE_PASSIVE = "Passive"
+ DRIVING_STYLE_MODERATE = "Moderate"
+ DRIVING_STYLE_AGGRESSIVE = "Aggressive"
+ DRIVING_STYLE_UNKNOWN = "UNKNOWN_STYLE"
+
+ ROAD_TYPE_LOCAL = "Local"
+ ROAD_TYPE_MAIN = "Main"
+ ROAD_TYPE_HIGHWAY = "Highway"
+ ROAD_TYPE_UNKNOWN = "UNKNOWN_ROAD"
+
+ TRAFFIC_CONDITION_LIGHT = "Light"
+ TRAFFIC_CONDITION_MODERATE = "Moderate"
+ TRAFFIC_CONDITION_HEAVY = "Heavy"
+ TRAFFIC_CONDITION_UNKNOWN = "UNKNOWN_TRAFFIC"
+
+ # Rolling Average Configuration
+ ROLLING_WINDOW_SIZE = 20  # ≈6 seconds at the 0.3 s base log interval
+ MIN_SAMPLES_FOR_CLASSIFICATION = 10
+
+ # Rate-of-change (RoC) thresholds — these still need tuning
+ SHORT_ROC_WINDOW_SIZE = 3
+ MIN_SAMPLES_FOR_ROC_CHECK = SHORT_ROC_WINDOW_SIZE
+ ROC_THROTTLE_AGGRESSIVE_THRESHOLD = 25.0
+ ROC_RPM_AGGRESSIVE_THRESHOLD = 700.0
+ ROC_SPEED_AGGRESSIVE_THRESHOLD = 8.0
+ MIN_RPM_FOR_AGGRESSIVE_TRIGGER = 1000.0
+ AGGRESSIVE_EVENT_COOLDOWN_SAMPLES = 15
+
+ HIGH_FREQUENCY_PIDS = [
+     obd.commands.RPM,
+     obd.commands.THROTTLE_POS,
+     obd.commands.SPEED,
+ ]
+
+ LOW_FREQUENCY_PIDS_POOL = [
+     obd.commands.FUEL_PRESSURE,
+     obd.commands.ENGINE_LOAD,
+     obd.commands.COOLANT_TEMP,
+     obd.commands.INTAKE_TEMP,
+     obd.commands.TIMING_ADVANCE,
+     obd.commands.MAF,
+     obd.commands.INTAKE_PRESSURE,
+     obd.commands.SHORT_FUEL_TRIM_1,
+     obd.commands.LONG_FUEL_TRIM_1,
+     obd.commands.SHORT_FUEL_TRIM_2,
+     obd.commands.LONG_FUEL_TRIM_2,
+     obd.commands.COMMANDED_EQUIV_RATIO,
+     obd.commands.O2_B1S2,
+     obd.commands.O2_B2S2,
+     obd.commands.O2_S1_WR_VOLTAGE,
+     obd.commands.COMMANDED_EGR,
+ ]
+
+ ALL_PIDS_TO_LOG = HIGH_FREQUENCY_PIDS + LOW_FREQUENCY_PIDS_POOL
+
+ CSV_FILENAME_BASE = "obd_data_log"
+ # Structured log directories: two levels up from OBD/ to the base dir, then into logs/
+ LOGS_BASE_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "logs")
+ ORIGINAL_CSV_DIR = os.path.join(LOGS_BASE_DIR, "OriginalCSV")
+ DUPLICATE_CSV_DIR = os.path.join(LOGS_BASE_DIR, "DuplicateCSV")
+
+ WIFI_ADAPTER_HOST = "192.168.0.10"
+ WIFI_ADAPTER_PORT = 35000
+
+ WIFI_PROTOCOL = "6"
+ USE_WIFI_SETTINGS = False  # when False, use a socat PTY to mimic a serial connection
+
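+ # One way to create that PTY bridge with socat (illustrative; the allocated
+ # device name, e.g. /dev/ttys086, varies per run and must match the port
+ # opened below):
+ #   socat -d -d pty,raw,echo=0 tcp:192.168.0.10:35000
+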
+ def get_pid_value(connection, pid_command):
+     """Queries a PID and returns its value, or None if not available or error."""
+     try:
+         response = connection.query(pid_command, force=True)
+         if response.is_null() or response.value is None:
+             return None
+         if hasattr(response.value, 'magnitude'):
+             return response.value.magnitude
+         return response.value
+     except Exception as e:
+         print(f"Error querying {pid_command.name}: {e}")
+         return None
+
+ def perform_logging_session():
+     connection = None
+     print("Starting OBD-II Data Logger...")
+     print("Classifications (Style, Road, Traffic) will be determined automatically.")
+
+     initial_driving_style = ""
+     initial_road_type = ""
+     initial_traffic_condition = ""
+
+     BASE_LOG_INTERVAL = 0.3  # seconds, for high-frequency data
+     LOW_FREQUENCY_GROUP_POLL_INTERVAL = 90.0  # Interval in seconds to poll one group of LF PIDs
+     NUM_LOW_FREQUENCY_GROUPS = 3
+
+     # Prepare Low-Frequency PID groups
+     low_frequency_pid_groups = []
+     if LOW_FREQUENCY_PIDS_POOL:
+         chunk_size = (len(LOW_FREQUENCY_PIDS_POOL) + NUM_LOW_FREQUENCY_GROUPS - 1) // NUM_LOW_FREQUENCY_GROUPS
+         for i in range(0, len(LOW_FREQUENCY_PIDS_POOL), chunk_size):
+             low_frequency_pid_groups.append(LOW_FREQUENCY_PIDS_POOL[i:i + chunk_size])
+
+     if not low_frequency_pid_groups:  # Handle case with no LF PIDs
+         low_frequency_pid_groups.append([])
+         NUM_LOW_FREQUENCY_GROUPS = 1
+
+     last_low_frequency_group_poll_time = time.monotonic()
+     current_low_frequency_group_index = 0
+
+     current_pid_values = {pid.name: '' for pid in ALL_PIDS_TO_LOG}
+
+     # Create log directories
+     for dir_path in [ORIGINAL_CSV_DIR, DUPLICATE_CSV_DIR]:
+         try:
+             os.makedirs(dir_path, exist_ok=True)
+             print(f"Ensured directory exists: {dir_path}")
+         except OSError as e:
+             print(f"Error creating directory {dir_path}: {e}. Attempting to use current directory.")
+             # Fallback logic may be needed if creation fails critically
+             if dir_path == ORIGINAL_CSV_DIR:  # Critical for saving original log
+                 print("Cannot create original log directory. Exiting.")
+                 return None
+
+     current_session_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+     csv_file_name_only = f"{CSV_FILENAME_BASE}_{current_session_timestamp}.csv"
+     original_csv_filepath = os.path.join(ORIGINAL_CSV_DIR, csv_file_name_only)
+
+     try:
+         if USE_WIFI_SETTINGS:
+             print(f"Attempting to connect to WiFi adapter at {WIFI_ADAPTER_HOST}:{WIFI_ADAPTER_PORT} using protocol {WIFI_PROTOCOL}...")
+             connection = obd.OBD(protocol=WIFI_PROTOCOL,
+                                  host=WIFI_ADAPTER_HOST,
+                                  port=WIFI_ADAPTER_PORT,
+                                  fast=False,
+                                  timeout=30)
+         else:
+             print("Attempting to connect via socat PTY /dev/ttys086...")
+             connection = obd.OBD("/dev/ttys086", fast=True, timeout=30)  # socat-created PTY; device name varies per run
+
+         if not connection.is_connected():
+             print("Failed to connect to OBD-II adapter.")
+             print(f"Connection status: {connection.status()}")
+             return None
+
+         print(f"Successfully connected to OBD-II adapter: {connection.port_name()}")
+         print(f"Adapter status: {connection.status()}")
+         print("Supported PIDs (sample):")
+         supported_commands = connection.supported_commands
+         for i, cmd in enumerate(supported_commands):
+             print(f" - {cmd.name}")
+         if not supported_commands:
+             print("No commands")
+
+         # Take an initial full PID sample so rows are fully populated from the beginning
+         print("\nPerforming initial full PID sample...")
+         initial_log_entry = {
+             'timestamp': datetime.datetime.now().isoformat(),
+             'driving_style': initial_driving_style,
+             'road_type': initial_road_type,
+             'traffic_condition': initial_traffic_condition
+         }
+
+         print("Polling initial High-Frequency PIDs...")
+         for pid_command in HIGH_FREQUENCY_PIDS:
+             value = get_pid_value(connection, pid_command)
+             current_pid_values[pid_command.name] = value if value is not None else ''
+             initial_log_entry[pid_command.name] = current_pid_values[pid_command.name]
+
+         print("Polling initial Low-Frequency PIDs (all groups)...")
+         if low_frequency_pid_groups and low_frequency_pid_groups[0]:  # Check if there are any LF PIDs
+             for group in low_frequency_pid_groups:
+                 for pid_command in group:
+                     value = get_pid_value(connection, pid_command)
+                     current_pid_values[pid_command.name] = value if value is not None else ''
+                     initial_log_entry[pid_command.name] = current_pid_values[pid_command.name]
+         else:
+             print("No Low-Frequency PIDs to poll for initial sample.")
+
+         for pid_obj in ALL_PIDS_TO_LOG:
+             if pid_obj.name not in initial_log_entry:
+                 initial_log_entry[pid_obj.name] = ''  # Default to empty if somehow missed
+
+     except Exception as e:
+         print(f"An error occurred during connection or initial PID sample: {e}")
+         if connection and connection.is_connected():
+             connection.close()
+         return None
+
+     file_exists = os.path.isfile(original_csv_filepath)
+     try:
+         with open(original_csv_filepath, 'a', newline='') as csvfile:
+             # Add new columns for analyzer output; they will be empty initially from the logger
+             header_names = ['timestamp',
+                             'driving_style', 'road_type', 'traffic_condition',  # Original placeholder columns
+                             'driving_style_analyzed', 'road_type_analyzed', 'traffic_condition_analyzed'  # For analyzer
+                             ] + [pid.name for pid in ALL_PIDS_TO_LOG]
+
+             # Remove duplicates if any PID name is already in the first part
+             processed_headers = []
+             for item in header_names:
+                 if item not in processed_headers:
+                     processed_headers.append(item)
+             header_names = processed_headers
+
+             writer = csv.DictWriter(csvfile, fieldnames=header_names)
+
+             if not file_exists or os.path.getsize(original_csv_filepath) == 0:
+                 writer.writeheader()
+                 print(f"Created new CSV file: {original_csv_filepath} with headers: {header_names}")
+
+             if initial_log_entry:
+                 # Add placeholder columns for analyzer to the initial entry
+                 initial_log_entry['driving_style_analyzed'] = ''
+                 initial_log_entry['road_type_analyzed'] = ''
+                 initial_log_entry['traffic_condition_analyzed'] = ''
+                 writer.writerow(initial_log_entry)
+                 csvfile.flush()
+                 print(f"Logged initial full sample. Style: {initial_driving_style}, Road: {initial_road_type}, Traffic: {initial_traffic_condition}.")
+
+             last_low_frequency_group_poll_time = time.monotonic()
+             current_low_frequency_group_index = 0
+
+             print(f"\nLogging high-frequency data every {BASE_LOG_INTERVAL} second(s).")
+             print(f"Polling one group of low-frequency PIDs every {LOW_FREQUENCY_GROUP_POLL_INTERVAL} second(s).")
+             print(f"Low-frequency PIDs divided into {len(low_frequency_pid_groups)} groups.")
+
+             log_count = 0
+             while True:
+                 loop_start_time = time.monotonic()
+                 current_datetime = datetime.datetime.now()
+                 timestamp_iso = current_datetime.isoformat()
+
+                 hf_reads = 0
+                 for pid_command in HIGH_FREQUENCY_PIDS:
+                     value = get_pid_value(connection, pid_command)
+                     current_pid_values[pid_command.name] = value if value is not None else ''
+                     if value is not None:
+                         hf_reads += 1
+
+                 lf_reads_this_cycle = 0
+                 lf_group_size = 0
+                 lf_group_polled_this_cycle = "None"
+                 if low_frequency_pid_groups and (time.monotonic() - last_low_frequency_group_poll_time) >= LOW_FREQUENCY_GROUP_POLL_INTERVAL:
+                     group_to_poll = low_frequency_pid_groups[current_low_frequency_group_index]
+                     lf_group_size = len(group_to_poll)
+                     lf_group_polled_this_cycle = f"Group {current_low_frequency_group_index + 1}/{len(low_frequency_pid_groups)}"
+
+                     for pid_command in group_to_poll:
+                         value = get_pid_value(connection, pid_command)
+                         current_pid_values[pid_command.name] = value if value is not None else ''
+                         if value is not None:
+                             lf_reads_this_cycle += 1
+                         else:
+                             print(f"Warning: Could not read LF PID {pid_command.name}")
+
+                     last_low_frequency_group_poll_time = time.monotonic()
+                     current_low_frequency_group_index = (current_low_frequency_group_index + 1) % len(low_frequency_pid_groups)
+
+                 final_log_entry = {
+                     'timestamp': timestamp_iso,
+                     'driving_style': initial_driving_style,
+                     'road_type': initial_road_type,
+                     'traffic_condition': initial_traffic_condition,
+                     'driving_style_analyzed': '',
+                     'road_type_analyzed': '',
+                     'traffic_condition_analyzed': ''
+                 }
+                 # Add all PID values for this cycle from current_pid_values
+                 for pid_obj in ALL_PIDS_TO_LOG:
+                     final_log_entry[pid_obj.name] = current_pid_values.get(pid_obj.name, '')
+
+                 writer.writerow(final_log_entry)
+                 csvfile.flush()
+
+                 log_count += 1
+                 if log_count % 10 == 0:
+                     status_msg = f"Logged entry {log_count} - HF PIDs Read: {hf_reads}/{len(HIGH_FREQUENCY_PIDS)}"
+                     if lf_group_polled_this_cycle != "None":
+                         status_msg += f" - LF PIDs ({lf_group_polled_this_cycle}) Read: {lf_reads_this_cycle}/{lf_group_size}"
+                     print(status_msg)
+
+                 elapsed_time_in_loop = time.monotonic() - loop_start_time
+                 sleep_duration = max(0, BASE_LOG_INTERVAL - elapsed_time_in_loop)
+                 time.sleep(sleep_duration)
+
+     except KeyboardInterrupt:
+         print("\nStopping data logging due to user interruption (Ctrl+C).")
+     except Exception as e:
+         print(f"An error occurred during logging: {e}")
+     finally:
+         if connection and connection.is_connected():
+             print("Closing OBD-II connection.")
+             connection.close()
+         print(f"Data logging stopped. Original CSV file '{original_csv_filepath}' saved.")
+
+     return original_csv_filepath
+
+ def duplicate_csv(original_filepath):
+     if not original_filepath or not os.path.exists(original_filepath):
+         print(f"Error: Original CSV not found for duplication: {original_filepath}")
+         return None
+
+     # Ensure DUPLICATE_CSV_DIR exists (it should have been created by perform_logging_session)
+     os.makedirs(DUPLICATE_CSV_DIR, exist_ok=True)
+
+     # Get just the filename from the original path
+     original_filename = os.path.basename(original_filepath)
+     base, ext = os.path.splitext(original_filename)
+
+     # Construct new filename for the duplicate
+     duplicate_filename = f"{base}_to_analyze{ext}"  # Suffix to distinguish
+     duplicate_filepath = os.path.join(DUPLICATE_CSV_DIR, duplicate_filename)
+
+     try:
+         shutil.copy2(original_filepath, duplicate_filepath)
+         print(f"Successfully duplicated CSV to: {duplicate_filepath}")
+         return duplicate_filepath
+     except Exception as e:
+         print(f"Error duplicating CSV {original_filepath} to {duplicate_filepath}: {e}")
+         return None
+
+ def run_analyzer_on_csv(csv_to_analyze_path):
+     if not csv_to_analyze_path or not os.path.exists(csv_to_analyze_path):
+         print(f"Error: Analyzer input CSV not found: {csv_to_analyze_path}")
+         return
+
+     # Analyzer script is in the same directory as this logger script
+     analyzer_script_path = os.path.join(os.path.dirname(__file__), "obd_analyzer.py")
+
+     if not os.path.exists(analyzer_script_path):
+         print(f"CRITICAL Error: Analyzer script not found at {analyzer_script_path}")
+         return
+
+     analyzed_file_basename = os.path.basename(csv_to_analyze_path).replace("_to_analyze.csv", "_final_analyzed.csv")
+     final_output_path = os.path.join(DUPLICATE_CSV_DIR, analyzed_file_basename)
+
+     command = [
+         "python",
+         analyzer_script_path,
+         csv_to_analyze_path,
+         "--output_csv",
+         final_output_path
+     ]
+
+     print(f"Running analyzer: {' '.join(command)}")
+     try:
+         process = subprocess.run(command, check=True, capture_output=True, text=True, cwd=os.path.dirname(__file__))
+         print("Analyzer Output:\n", process.stdout)
+         if process.stderr: print("Analyzer Errors:\n", process.stderr)
+         print(f"Analyzer finished. Output saved to {final_output_path}")
+     except subprocess.CalledProcessError as e:
+         print(f"Error running analyzer: {e}\nStdout: {e.stdout}\nStderr: {e.stderr}")
+     except FileNotFoundError:
+         print(f"Error: 'python' or analyzer script not found ({analyzer_script_path}).")
+
+ if __name__ == "__main__":
+     original_log_file = perform_logging_session()
+
+     if original_log_file and os.path.exists(original_log_file):
+         duplicated_log_file = duplicate_csv(original_log_file)
+
+         if duplicated_log_file:
+             run_analyzer_on_csv(duplicated_log_file)
+             print(f"Process complete. Original log: {original_log_file}, Analyzed log copy: {duplicated_log_file}")
+     else:
+         print("OBD logging did not produce a valid CSV file. Skipping analysis.")
README.md ADDED
@@ -0,0 +1,199 @@
+ ---
+ title: OBD Logger
+ emoji: 🚗
+ colorFrom: gray
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: apache-2.0
+ short_description: OBD-logging FastAPI server with data processing pipelines
+ ---
+
+ # OBD Logger
+
+ A comprehensive OBD-II data logging and processing system built with FastAPI, featuring advanced data cleaning, Google Drive integration, MongoDB storage capabilities, and **Reinforcement Learning from Human Feedback (RLHF)** for driver behavior classification.
+
+ ## Features
+
+ - **Real-time OBD-II Data Ingestion**: Stream and process OBD sensor data in real time
+ - **Advanced Data Cleaning**: Intelligent gap detection, KNN imputation, and outlier handling
+ - **Multi-Storage Architecture**:
+   - Google Drive integration for CSV storage
+   - Firebase for structured data storage and querying
+   - MongoDB Atlas for structured data storage and querying
+ - **Driver Behavior Classification**: XGBoost-based ML model for driving style prediction
+ - **RLHF Training System**: Continuous model improvement through human feedback
+ - **Data Visualization**: Automatic generation of correlation heatmaps and trend plots
+ - **RESTful API**: Comprehensive endpoints for data management and retrieval
+ - **Web Dashboard**: User-friendly interface for monitoring and control
+ - **Model Versioning**: Semantic versioning (1.0, 1.1, 1.2, etc.) with Hugging Face integration
+
+ ## Architecture
+
+ The application is structured into modular components:
+
+ - **`app.py`**: Main FastAPI application with data processing pipeline and RLHF endpoints
+ - **`data/`**: Storage and persistence modules
+   - **`drive_saver.py`**: Google Drive operations and file management
+   - **`mongo_saver.py`**: MongoDB operations and data persistence
+   - **`firebase_saver.py`**: Firebase operations and data persistence
+ - **`train/`**: RLHF training system
+   - **`loader.py`**: Load labeled data from Firebase storage with original dataset tracking
+   - **`saver.py`**: Save trained models to Hugging Face Hub with semantic versioning
+   - **`rlhf.py`**: Main RLHF training pipeline for continuous model improvement
+ - **`OBD/`**: OBD-specific modules for data analysis and logging
+ - **`utils/`**: Utility modules for model management and data processing
+
+ ## Quick Start
+
+ 1. **Install Dependencies**:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 2. **Set Environment Variables**:
+ - `GDRIVE_CREDENTIALS_JSON`: Google Service Account credentials
+ - `FIREBASE_SERVICE_ACCOUNT_JSON`: Firebase service account credentials
+ - `FIREBASE_ADMIN_JSON`: Firebase Admin SDK credentials
+ - `HF_TOKEN`: Hugging Face authentication token
+ - `HF_MODEL_REPO`: Hugging Face model repository (default: `BinKhoaLe1812/Driver_Behavior_OBD`)
+ - `MODEL_DIR`: Local model directory (default: `/app/models/ul`)
+
+ 3. **Run the Application**:
+ ```bash
+ uvicorn app:app --reload
+ ```
+
+ 4. **Access the Dashboard**:
+ - Web UI: `http://localhost:8000/ui`
+ - API Docs: `http://localhost:8000/docs`
+
+ ## Data Processing Pipeline
+
+ 1. **Ingestion**: Real-time streaming or bulk CSV upload
+ 2. **Cleaning**: Automatic gap detection and KNN imputation (see the sketch after this list)
+ 3. **Feature Engineering**: Derived metrics and sensor combinations
+ 4. **Storage**: Simultaneous save to Google Drive, Firebase, and MongoDB
+ 5. **Driver Behavior Classification**: XGBoost model prediction on processed data
+ 6. **RLHF Training**: Continuous model improvement through human feedback
+ 7. **Visualization**: Correlation analysis and trend plots
+
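+ The cleaning stage pairs gap detection with KNN-based imputation. A rough sketch
+ of the idea (column names are illustrative; the full pipeline lives in `app.py`,
+ which imports `KNNImputer` from scikit-learn):
+
+ ```python
+ import pandas as pd
+ from sklearn.impute import KNNImputer
+
+ def impute_gaps(df: pd.DataFrame, cols=("SPEED", "RPM", "THROTTLE_POS")) -> pd.DataFrame:
+     """Fill missing sensor readings from each row's nearest neighbours."""
+     imputer = KNNImputer(n_neighbors=5)
+     df[list(cols)] = imputer.fit_transform(df[list(cols)])
+     return df
+ ```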
+ ## API Endpoints
+
+ ### Data Ingestion
+ - `POST /ingest`: Stream OBD data
+ - `POST /upload-csv/`: Bulk CSV upload
+
+ ### Data Retrieval
+ - `GET /download/{filename}`: Download cleaned CSV
+ - `GET /events`: Get processing status
+
+ ### MongoDB Operations
+ - `GET /mongo/status`: Check MongoDB connection
+ - `GET /mongo/sessions`: Get data session summaries
+ - `GET /mongo/query`: Query data with filters
+ - `POST /mongo/save-csv`: Direct CSV to MongoDB
+
+ ### RLHF Training System
+ - `POST /rlhf/train`: Trigger RLHF training session
+ - `GET /rlhf/status`: Get RLHF system status and available labeled data
+ - `GET /rlhf/trained-datasets`: List datasets already used for training
+
+ ### Firebase Storage
+ - Structured data storage with automatic versioning
+ - **`skyledge/raw/`**: Original OBD data files
+ - **`skyledge/processed/`**: Cleaned and processed data
+ - **`skyledge/labeled/`**: Human-labeled data for RLHF training
+ - **`skyledge/labeled/trained.txt`**: Tracks processed datasets to avoid retraining
+
+ ### Hugging Face Hub
+ - **Model Repository**: `BinKhoaLe1812/Driver_Behavior_OBD`
+ - **Semantic Versioning**: v1.0, v1.1, v1.2, ..., v2.0, etc.
+ - **Model Components**: XGBoost model, label encoder, scaler
+ - **Metadata**: Training logs, performance metrics, dataset information
+
+ ## RLHF Training System
+
+ ### Overview
+ The Reinforcement Learning from Human Feedback (RLHF) system enables continuous improvement of the driver behavior classification model through human-labeled data.
+
+ ### Key Features
+ - **Original Dataset Tracking**: Automatically links labeled data to original datasets
+ - **Preference Learning**: Learns from differences between model predictions and human labels
+ - **Semantic Versioning**: Automatic model versioning (1.0 → 1.1 → 1.2 → 2.0)
+ - **Hugging Face Integration**: Saves models to HF Hub with metadata
+ - **Training Tracking**: Prevents retraining on the same datasets
+
+ ### Usage Examples
+
+ #### Trigger RLHF Training
+ ```bash
+ curl -X POST "http://localhost:8000/rlhf/train" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "max_datasets": 5,
+     "force_retrain": false
+   }'
+ ```
+
+ #### Check Training Status
+ ```bash
+ curl -X GET "http://localhost:8000/rlhf/status"
+ ```
+
+ #### List Trained Datasets
+ ```bash
+ curl -X GET "http://localhost:8000/rlhf/trained-datasets"
+ ```
+
+ ### Data Flow
+ 1. **Human Labeling**: Data labeled and stored in `skyledge/labeled/`
+ 2. **Filename Convention**: `001_raw-002_2025-09-19-labelled.csv`
+ 3. **Original Dataset**: Automatically loads `skyledge/raw/002_2025-09-19-raw.csv` (a parsing sketch follows this list)
+ 4. **RLHF Training**: Compares model predictions vs human labels
+ 5. **Model Update**: Trains new model with preference learning
+ 6. **Versioning**: Saves as v1.0, v1.1, etc. to Hugging Face Hub
+
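+ A hypothetical helper showing how a labeled filename maps back to its raw
+ dataset under this convention (the regex and helper name are illustrative, not
+ the actual `train/loader.py` code):
+
+ ```python
+ import re
+
+ def original_dataset_path(labeled_name: str) -> str:
+     """'001_raw-002_2025-09-19-labelled.csv' -> 'skyledge/raw/002_2025-09-19-raw.csv'"""
+     m = re.match(r"\d+_raw-(.+)-labelled\.csv$", labeled_name)
+     if not m:
+         raise ValueError(f"Unexpected labeled filename: {labeled_name}")
+     return f"skyledge/raw/{m.group(1)}-raw.csv"
+ ```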
+ ## Documentation
+
+ - **MongoDB Setup**: See `MONGODB_SETUP.md` for detailed configuration
+ - **Google Drive Setup**: See `GOOGLE_DRIVE_SETUP.md` for configuration
+ - **RLHF Training**: See `train/README.md` for detailed RLHF documentation
+ - **API Reference**: Interactive docs at `/docs` endpoint
+ - **Code Structure**: Modular design for easy maintenance
+
+ ## Development
+
+ The codebase follows clean architecture principles:
+ - **Separation of concerns**: Between storage, processing, API, and ML layers
+ - **Comprehensive error handling**: Graceful fallbacks for service unavailability
+ - **Type hints and documentation**: Full type annotations and docstrings
+ - **Modular design**: Easy to extend and maintain
+ - **RLHF Integration**: Seamless integration of machine learning with data processing
+ - **Version control**: Semantic versioning for model artifacts
+ - **Testing**: Comprehensive test coverage for all components
+
+ ## Model Management
+
+ ### Driver Behavior Classification
+ - **Model Type**: XGBoost Classifier
+ - **Labels**: Aggressive, Normal, Conservative
+ - **Features**: OBD sensor data (speed, RPM, throttle, etc.)
+ - **Training**: RLHF with human feedback integration
+
+ ### Model Artifacts
+ - **XGBoost Model**: `xgb_drivestyle_ul.pkl`
+ - **Label Encoder**: `label_encoder_ul.pkl`
+ - **Feature Scaler**: `scaler_ul.pkl`
+ - **Metadata**: Training logs and performance metrics
+
+ ### Versioning Strategy
+ - **Semantic Versioning**: 1.0 → 1.1 → 1.2 → 2.0 (sketched below)
+ - **Automatic Detection**: Checks existing versions in HF repo
+ - **Fallback**: Timestamp-based versioning if HF unavailable
+ - **Local Backup**: Saves to local `/app/models/ul/v{version}/`
+
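+ A hedged sketch of the version bump (the real logic in `train/saver.py` is not
+ shown here; rolling a minor version of 9 over to a new major is purely an
+ assumption of this example):
+
+ ```python
+ def next_version(existing: list[str]) -> str:
+     """['1.0', '1.1'] -> '1.2'; empty list -> '1.0'."""
+     if not existing:
+         return "1.0"
+     major, minor = max(tuple(map(int, v.split("."))) for v in existing)
+     return f"{major}.{minor + 1}" if minor < 9 else f"{major + 1}.0"
+ ```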
+ ## License
+
+ Apache 2.0 License
app.py ADDED
@@ -0,0 +1,802 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Access: https://binkhoale1812-obd-logger.hf.space/ui
2
+
3
+
4
+ # ───────────── Installation ─────────────
5
+ # Router
6
+ from fastapi import FastAPI, UploadFile, File, BackgroundTasks, HTTPException
7
+ from fastapi.responses import FileResponse, HTMLResponse
8
+ from fastapi.staticfiles import StaticFiles
9
+ from fastapi.templating import Jinja2Templates
10
+ from fastapi.requests import Request
11
+ from pydantic import BaseModel
12
+ # ML/DL
13
+ import pandas as pd
14
+ import numpy as np
15
+ import matplotlib.pyplot as plt
16
+ import seaborn as sns
17
+ from sklearn.preprocessing import MinMaxScaler
18
+ from sklearn.impute import KNNImputer
19
+ # Utils
20
+ import os, datetime, json, logging, re
21
+ from datetime import timedelta
22
+ import pathlib
23
+
24
+ # Drive
25
+ from data.drive_saver import DriveSaver, get_drive_service, upload_to_folder
26
+
27
+ # Database
28
+ from data.mongo_saver import MongoSaver, save_csv_to_mongo, save_dataframe_to_mongo, MONGODB_AVAILABLE
29
+ from data.firebase_saver import FirebaseSaver, save_csv_increment, save_dataframe_increment
30
+
31
+ # UL Model
32
+ from utils.ul_label import ULLabeler
33
+
34
+ # RLHF Training
35
+ from train import RLHFTrainer
36
+
37
+ # ───────────── Logging Setup ─────────────
38
+ logger = logging.getLogger("obd-logger")
39
+ logger.setLevel(logging.INFO)
40
+ fmt = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")
41
+ handler = logging.StreamHandler()
42
+ handler.setFormatter(fmt)
43
+ logger.addHandler(handler)
44
+
45
+
46
+ # ───────────── FastAPI Init ─────────────
47
+ app = FastAPI(title="OBD-II Logging & Processing API")
48
+
49
+
50
+ # ───────────── Directory Paths ─────────────
51
+ APP_ROOT = pathlib.Path(__file__).parent.resolve() # Absolute base dir
52
+ BASE_DIR = os.path.join(APP_ROOT, "cache", "obd_data")
53
+ CLEANED_DIR = os.path.join(BASE_DIR, "cleaned")
54
+ PLOT_DIR = os.path.join(BASE_DIR, "plots")
55
+ RAW_CSV = os.path.join(BASE_DIR, "raw_logs.csv")
56
+ os.makedirs(BASE_DIR, exist_ok=True)
57
+ os.makedirs(CLEANED_DIR, exist_ok=True)
58
+ os.makedirs(PLOT_DIR, exist_ok=True)
59
+
60
+ DRIVE_STYLE = [] # latest UL predictions (string labels) — overwritten each run
61
+
62
+ # Init temp empty file
63
+ if not os.path.exists(RAW_CSV):
64
+ pd.DataFrame(columns=["timestamp", "driving_style"]).to_csv(RAW_CSV, index=False)
65
+
66
+ PIPELINE_EVENTS = {}
67
+
68
+
69
+ # ───────────── Drive & Database Services ─────────────
70
+ # Initialize services
71
+ drive_saver = DriveSaver()
72
+ mongo_saver = MongoSaver()
73
+ firebase_saver = FirebaseSaver()
74
+
75
+ # ───────────── Model Download on Startup ─────────────
76
+ @app.on_event("startup")
77
+ async def startup_event():
78
+ """Download models on app startup"""
79
+ try:
80
+ logger.info("🚀 Starting model download...")
81
+ from utils.download import download_latest_models
82
+
83
+ # Load .env file if it exists
84
+ env_path = pathlib.Path(".env")
85
+ if env_path.exists():
86
+ logger.info("📄 Loading .env file...")
87
+ with open(env_path, 'r') as f:
88
+ for line in f:
89
+ line = line.strip()
90
+ if line and not line.startswith('#') and '=' in line:
91
+ key, value = line.split('=', 1)
92
+ os.environ[key] = value
93
+
94
+ # Download models
95
+ success = download_latest_models()
96
+ if success:
97
+ logger.info("✅ Models downloaded successfully on startup")
98
+ else:
99
+ logger.warning("⚠️ Model download failed on startup - some features may not work")
100
+
101
+ except Exception as e:
102
+ logger.error(f"❌ Startup model download failed: {e}")
103
+ logger.warning("⚠️ Continuing without models - some features may not work")
104
+
105
+ # ───────────── Render Dashboard UI ──────────────
106
+ app.mount("/static", StaticFiles(directory="static"), name="static")
107
+ app.mount("/plots", StaticFiles(directory=str(PLOT_DIR)), name="plots")
108
+ templates = Jinja2Templates(directory="static")
109
+ # Endpoint
110
+ @app.get("/ui", response_class=HTMLResponse)
111
+ def dashboard(request: Request):
112
+ return templates.TemplateResponse("index.html", {"request": request})
113
+
114
+
115
+ # ───────────── Streamed Entry Ingest ─────────────
116
+ class OBDEntry(BaseModel):
117
+ timestamp: str
118
+ driving_style: str
119
+ data: dict
120
+ status: str | None = None # Optional control signal ("start"/"end" of streaming)
121
+
122
+ # Normalize timestamps to a single filesystem-safe format
123
+ def normalize_timestamp(ts):
124
+ return ts.replace(":", "-").replace(".", "-").replace(" ", "T").replace("/", "-")
125
+
126
+ # Real time endpoint
127
+ @app.post("/ingest")
128
+ def ingest(entry: OBDEntry, background_tasks: BackgroundTasks):
129
+ norm_ts = normalize_timestamp(entry.timestamp)
130
+ logger.info(f"Ingest received: {norm_ts} | Status: {entry.status}")
131
+ # Start logging
132
+ if entry.status == "start":
133
+ PIPELINE_EVENTS[norm_ts] = {"status": "started", "time": norm_ts}
134
+ return {"status": "started"}
135
+ # End logging, start processing
136
+ if entry.status == "end":
137
+ background_tasks.add_task(process_data, norm_ts)
138
+ return {"status": "processed"}
139
+ # Normal row append
140
+ try:
141
+ df = pd.read_csv(RAW_CSV)
142
+ row = {"timestamp": norm_ts, "driving_style": entry.driving_style}
143
+ row.update(entry.data)
144
+ df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
145
+ df.to_csv(RAW_CSV, index=False)
146
+ return {"status": "row appended"}
147
+ except Exception as e:
148
+ logger.error(f"Streaming ingest failed: {e}")
149
+ raise HTTPException(status_code=500, detail="Ingest error")
150
+
151
+
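A client-side sketch of the streamed-ingest control flow (start → rows → end). The payload shape follows `OBDEntry` above; the base URL and the `requests` dependency are assumptions:

```python
import requests

BASE = "http://localhost:8000"  # assumed local deployment

requests.post(f"{BASE}/ingest", json={"timestamp": "2025-05-15T10:00:00",
                                      "driving_style": "", "data": {}, "status": "start"})
requests.post(f"{BASE}/ingest", json={"timestamp": "2025-05-15T10:00:01",
                                      "driving_style": "normal", "data": {"RPM": 2100, "SPEED": 62}})
requests.post(f"{BASE}/ingest", json={"timestamp": "2025-05-15T10:00:02",
                                      "driving_style": "", "data": {}, "status": "end"})
```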
152
+ # ───────────── Bulk CSV Upload ───────────────────
153
+ @app.post("/upload-csv/")
154
+ async def upload_csv(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
155
+ ts = datetime.datetime.now().isoformat()
156
+ norm_ts = normalize_timestamp(ts)
157
+ path = os.path.join(BASE_DIR, file.filename)
158
+ PIPELINE_EVENTS[norm_ts] = {"status": "started", "time": norm_ts}
159
+ with open(path, "wb") as f:
160
+ f.write(await file.read())
161
+ logger.info(f"CSV uploaded: {path}")
162
+ background_tasks.add_task(process_uploaded_csv, path, norm_ts)
163
+ return {"status": "processing started", "file": file.filename}
164
+
165
+
166
+ # ───────────── Data Processing ──────────────────
167
+ # Bulk CSV
168
+ def process_uploaded_csv(path, norm_ts):
169
+ try:
170
+ df = pd.read_csv(path, parse_dates=["timestamp"])
171
+ PIPELINE_EVENTS[norm_ts] = {
172
+ "status": "processed",
173
+ "time": norm_ts
174
+ }
175
+ _process_and_save(df, norm_ts)
176
+ except Exception as e:
177
+ logger.error(f"CSV processing failed: {e}")
178
+
179
+ # Process streaming
180
+ def process_data(norm_ts):
181
+ try:
182
+ df = pd.read_csv(RAW_CSV, parse_dates=["timestamp"])
183
+ PIPELINE_EVENTS[norm_ts] = {
184
+ "status": "processed",
185
+ "time": norm_ts
186
+ }
187
+ _process_and_save(df, norm_ts)
188
+ except Exception as e:
189
+ logger.error(f"Streamed data processing failed: {e}")
190
+
191
+
192
+ # All processing pipeline
193
+ def _process_and_save(df, norm_ts):
194
+ """
195
+ Gap-aware, multi-sensor backfill for OBD-II streams with unknown cadence.
196
+ - Infers sampling interval from data (robust).
197
+ - Inserts placeholder rows for gaps using the inferred interval.
198
+ - Flags only corrupted values (NaN/inf/sentinels); does NOT trim 'extreme but plausible' outliers.
199
+ - Backfills ALL numeric sensors with KNNImputer (+ time as a feature).
200
+ - Keeps your plotting, Drive upload, and PIPELINE_EVENTS wiring intact.
201
+ """
202
+ logger.info("🔧 Cleaning started (auto-interval, KNN for all sensors)")
203
+
204
+ # ----------------------- helpers (scoped locally) -----------------------
205
+ protected_cols = {"timestamp", "driving_style"}
206
+ SENTINELS = {-22, -40, 255}
207
+
208
+ def _to_dt(_df: pd.DataFrame) -> pd.DataFrame:
209
+ _df = _df.copy()
210
+ _df["timestamp"] = pd.to_datetime(_df["timestamp"], errors="coerce", utc=True)
211
+ _df = _df.dropna(subset=["timestamp"]).sort_values("timestamp").reset_index(drop=True)
212
+ # drop exact duplicate timestamps (keep first)
213
+ _df = _df[~_df["timestamp"].duplicated(keep="first")].reset_index(drop=True)
214
+ return _df
215
+
216
+ def _drop_dead_weight(_df: pd.DataFrame) -> pd.DataFrame:
217
+ _df = _df.copy()
218
+ # drop all-NaN or constant columns (except protected)
219
+ drop_cols = [c for c in _df.columns
220
+ if c not in protected_cols and (_df[c].nunique(dropna=True) <= 1 or _df[c].isna().all())]
221
+ if drop_cols:
222
+ _df.drop(columns=drop_cols, inplace=True, errors="ignore")
223
+ # drop duplicate columns
224
+ _df = _df.loc[:, ~_df.T.duplicated()]
225
+ # drop duplicate rows
226
+ _df.drop_duplicates(inplace=True)
227
+ return _df
228
+
229
+ def _normalize_corruption(_df: pd.DataFrame) -> pd.DataFrame:
230
+ _df = _df.copy()
231
+ # normalize obvious corruptions: NaN/inf/sentinels → NaN
232
+ _df.replace(list(SENTINELS), np.nan, inplace=True)
233
+ num_cols = _df.select_dtypes(include=[np.number]).columns
234
+ for c in num_cols:
235
+ s = _df[c]
236
+ s = s.astype(float)
237
+ s[~np.isfinite(s)] = np.nan
238
+ _df[c] = s
239
+ return _df
240
+
241
+ def _light_row_col_filters(_df: pd.DataFrame) -> pd.DataFrame:
242
+ _df = _df.copy()
243
+ # keep rows with <=80% NaN (excluding timestamp)
244
+ if "timestamp" in _df.columns and _df.shape[1] > 1:
245
+ keep = _df.drop(columns=["timestamp"]).isna().mean(axis=1) <= 0.8
246
+ _df = _df[keep]
247
+ # prune columns with >80% NaN (except protected)
248
+ na_frac = _df.isna().mean(numeric_only=False)
249
+ high_na = [c for c in na_frac.index if na_frac[c] > 0.8 and c not in protected_cols]
250
+ if high_na:
251
+ _df.drop(columns=high_na, inplace=True, errors="ignore")
252
+ # keep rows that have >1 observed value across non-timestamp columns
253
+ if "timestamp" in _df.columns and _df.shape[1] > 1:
254
+ valid = _df.drop(columns=["timestamp"]).notna().sum(axis=1) > 1
255
+ _df = _df[valid]
256
+ return _df
257
+
258
+ def _infer_base_interval_seconds(ts: pd.Series) -> float:
259
+ """
260
+ Robustly infer base cadence from timestamp diffs.
261
+ Strategy:
262
+ - take positive diffs
263
+ - winsorize to 5–95% to reduce impact of long gaps
264
+ - compute a 'rounded mode' on 10ms grid; fall back to median if needed
265
+ """
266
+ if ts.size < 2:
267
+ return 1.0 # fallback
268
+ diffs = ts.sort_values().diff().dropna().dt.total_seconds()
269
+ diffs = diffs[diffs > 0]
270
+ if diffs.empty:
271
+ return 1.0
272
+ q05, q95 = diffs.quantile([0.05, 0.95])
273
+ core = diffs[(diffs >= q05) & (diffs <= q95)]
274
+ if core.empty:
275
+ core = diffs
276
+ # round to 10ms and take the most frequent bin
277
+ rounded = (core / 0.01).round() * 0.01
278
+ mode = rounded.mode()
279
+ if not mode.empty:
280
+ est = float(mode.iloc[0])
281
+ else:
282
+ est = float(core.median())
283
+ # guardrails
284
+ if est <= 0:
285
+ est = float(core.median())
286
+ logger.info(f"⏱️ Inferred base interval ≈ {est:.3f}s")
287
+ return est
288
+
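For intuition, a condensed standalone run of the mode-on-10ms-grid step with toy timestamps (the real helper also winsorizes the diffs to 5–95% first):

```python
import pandas as pd

ts = pd.Series(pd.to_datetime([
    "2025-05-15 10:00:00.0", "2025-05-15 10:00:01.0",
    "2025-05-15 10:00:02.0", "2025-05-15 10:00:07.5",  # 5.5 s dropout
    "2025-05-15 10:00:08.5",
]))
diffs = ts.diff().dropna().dt.total_seconds()  # [1.0, 1.0, 5.5, 1.0]
rounded = (diffs / 0.01).round() * 0.01        # snap to 10 ms grid
print(rounded.mode().iloc[0])                  # 1.0; the long gap does not skew the estimate
```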
289
+ def _insert_time_gaps(_df: pd.DataFrame, base_sec: float) -> pd.DataFrame:
290
+ """
291
+ Insert placeholder rows at multiples of inferred base_sec when gaps exceed ~1.5× base.
292
+ All numeric columns are NaN in inserted rows; non-numeric are forward-filled (except protected).
293
+ """
294
+ if _df.empty:
295
+ return _df
296
+ _df = _df.copy()
297
+ _df = _to_dt(_df)
298
+ expected = timedelta(seconds=base_sec)
299
+ # tolerance ~ half interval to avoid jittery inserts
300
+ tol = timedelta(seconds=0.5 * base_sec)
301
+ # Normalize data
302
+ num_cols = _df.select_dtypes(include=[np.number]).columns.tolist()
303
+ non_num_cols = [c for c in _df.columns if c not in num_cols]
304
+ # Missing detection on interval expectation
305
+ rows = [_df.iloc[0].copy()]
306
+ for i in range(1, len(_df)):
307
+ prev = _df.iloc[i - 1]
308
+ curr = _df.iloc[i]
309
+ dt = curr["timestamp"] - prev["timestamp"]
310
+ if dt > expected * 1.5 + tol:
311
+ n_missing = int(round(dt / expected)) - 1
312
+ if n_missing > 0:
313
+ for j in range(1, n_missing + 1):
314
+ gap = prev.copy()
315
+ gap["timestamp"] = prev["timestamp"] + j * expected
316
+ # numeric sensors left as NaN to be imputed
317
+ for c in num_cols:
318
+ if c not in protected_cols:
319
+ gap[c] = np.nan
320
+ # for non-numeric, keep last known (except protected)
321
+ for c in non_num_cols:
322
+ if c not in protected_cols:
323
+ gap[c] = prev[c]
324
+ rows.append(gap)
325
+ rows.append(curr.copy())
326
+ # Sorting
327
+ out = pd.DataFrame(rows).sort_values("timestamp").reset_index(drop=True)
328
+ return out
329
+
330
+ def _knn_impute_all(_df: pd.DataFrame) -> pd.DataFrame:
331
+ """
332
+ Backfill ALL numeric sensors jointly with KNN, using time (ts_sec) as an additional feature.
333
+ """
334
+ _df = _df.copy()
335
+ _df["ts_sec"] = (_df["timestamp"] - _df["timestamp"].min()).dt.total_seconds()
336
+ # Normalize data
337
+ num_cols = _df.select_dtypes(include=[np.number]).columns.tolist()
338
+ # ensure ts_sec included
339
+ if "ts_sec" not in num_cols:
340
+ num_cols = num_cols + ["ts_sec"]
341
+ # Build imputation frame and remember order
342
+ X = _df[num_cols].copy()
343
+ non_missing_rows = X.dropna().shape[0]
344
+ k = min(5, max(1, non_missing_rows))
345
+ logger.info(f"🤝 KNNImputer n_neighbors={k} on {len(num_cols)} features")
346
+ # Impute and backfill data using KNN
347
+ imputer = KNNImputer(n_neighbors=k)
348
+ X_imp = imputer.fit_transform(X)
349
+ X_imp = pd.DataFrame(X_imp, columns=num_cols, index=_df.index)
350
+ # Write back (excluding ts_sec)
351
+ for c in num_cols:
352
+ if c == "ts_sec":
353
+ continue
354
+ _df[c] = X_imp[c]
355
+
356
+ _df.drop(columns=["ts_sec"], inplace=True)
357
+ return _df
358
+
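A toy standalone run of the same idea, KNN filling a gap row with time as a feature (a simplified stand-in for the pipeline's frame):

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

frame = pd.DataFrame({
    "ts_sec": [0.0, 1.0, 2.0, 3.0],
    "RPM":    [2000.0, np.nan, 2200.0, 2300.0],  # NaN row inserted by the gap filler
})
filled = KNNImputer(n_neighbors=2).fit_transform(frame)
print(filled[1, 1])  # 2100.0: mean of the two nearest rows in (ts_sec, RPM) space
```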
359
+ # Copy data from selective sensor types for Feature Engineering
360
+ def _feature_engineering(_df: pd.DataFrame) -> pd.DataFrame:
361
+ _df = _df.copy()
362
+ if {"ENGINE_LOAD", "ABSOLUTE_LOAD"}.issubset(_df.columns):
363
+ _df["AVG_ENGINE_LOAD"] = _df[["ENGINE_LOAD", "ABSOLUTE_LOAD"]].mean(axis=1)
364
+ if {"INTAKE_TEMP", "OIL_TEMP", "COOLANT_TEMP"}.issubset(_df.columns):
365
+ _df["TEMP_MEAN"] = _df[["INTAKE_TEMP", "OIL_TEMP", "COOLANT_TEMP"]].mean(axis=1)
366
+ if {"MAF", "RPM"}.issubset(_df.columns):
367
+ _df["AIRFLOW_PER_RPM"] = _df["MAF"] / _df["RPM"].replace(0, np.nan)
368
+ return _df
369
+
370
+ # Apply MinMaxScaler to fit data frame
371
+ def _scale_numeric(_df: pd.DataFrame) -> pd.DataFrame:
372
+ _df = _df.copy()
373
+ num_cols = _df.select_dtypes(include=[np.number]).columns.tolist()
374
+ for c in list(protected_cols):
375
+ if c in num_cols:
376
+ num_cols.remove(c)
377
+ if num_cols:
378
+ scaler = MinMaxScaler()
379
+ _df[num_cols] = scaler.fit_transform(_df[num_cols])
380
+ return _df
381
+
382
+ # Correlation heatmap plotter
383
+ def _plot_corr(_df: pd.DataFrame, _id: str):
384
+ try:
385
+ num = _df.select_dtypes(include=[np.number])
386
+ if num.shape[1] < 2:
387
+ return
388
+ plt.figure(figsize=(12, 10))
389
+ sns.heatmap(num.corr(), annot=True, fmt=".2f", cmap="coolwarm")
390
+ plt.title("Correlation Between Numeric OBD-II Variables")
391
+ plt.tight_layout()
392
+ plt.savefig(os.path.join(PLOT_DIR, f"heatmap_{_id}.png"))
393
+ plt.close()
394
+ except Exception as e:
395
+ logger.error(f"Heatmap generation failed: {e}")
396
+
397
+ # Sensor trend plotter
398
+ def _plot_trend(_df: pd.DataFrame, _id: str):
399
+ try:
400
+ plt.figure(figsize=(15, 6))
401
+ for col in ['RPM', 'ENGINE_LOAD', 'ABSOLUTE_LOAD', 'COOLANT_TEMP',
402
+ 'INTAKE_TEMP', 'OIL_TEMP', 'INTAKE_PRESSURE', 'BAROMETRIC_PRESSURE',
403
+ 'CONTROL_MODULE_VOLTAGE']:
404
+ if col in _df.columns:
405
+ plt.plot(_df.index, _df[col], label=col)
406
+ plt.title("Sensor Trends (Index-Based, No Time Gaps)")
407
+ plt.xlabel("Sample Index")
408
+ plt.ylabel("Sensor Value")
409
+ plt.legend()
410
+ plt.grid(True)
411
+ plt.tight_layout()
412
+ plt.savefig(os.path.join(PLOT_DIR, f"trend_{_id}.png"))
413
+ plt.close()
414
+ except Exception as e:
415
+ logger.error(f"Trend plot failed: {e}")
416
+
417
+ # ----------------------- pipeline -----------------------
418
+ df = df.copy()
419
+ # 0) Basic tidy
420
+ df = _drop_dead_weight(df)
421
+ df = _to_dt(df)
422
+ # 1) Corruption-only normalization (no outlier trimming)
423
+ df = _normalize_corruption(df)
424
+ # 2) Light row/column filtering for extreme sparsity
425
+ df = _light_row_col_filters(df)
426
+ # 3) Auto infer base interval & insert gap rows
427
+ base_sec = _infer_base_interval_seconds(df["timestamp"])
428
+ df = _insert_time_gaps(df, base_sec)
429
+ # 4) KNN backfill all numeric sensors (time-aware)
430
+ df = _knn_impute_all(df)
431
+ # 5) Feature engineering AFTER imputation
432
+ df = _feature_engineering(df)
433
+ # 6) Final sort / index
434
+ df.sort_values("timestamp", inplace=True)
435
+ df.reset_index(drop=True, inplace=True)
436
+ # 7) Scaling after impute (kept from original)
437
+ if not df.select_dtypes(include=["number"]).empty:
438
+ df = _scale_numeric(df)
439
+ # 8) Save
440
+ out_path = os.path.join(CLEANED_DIR, f"cleaned_{norm_ts}.csv")
441
+ df.to_csv(out_path, index=False)
442
+ logger.info(f"✅ Cleaned saved: {out_path}")
443
+ # 9) UL drivestyle predictions
444
+ df_for_persist = df
445
+ labeled_path = None
446
+ try:
447
+ ul = ULLabeler.get()
448
+ preds = ul.predict_df(df)
449
+ # update global DRIVE_STYLE (overwrite if already exists)
450
+ global DRIVE_STYLE
451
+ DRIVE_STYLE = [str(p) for p in preds]
452
+ # write labeled CSV (driving_style column)
453
+ df_labeled = df.copy()
454
+ df_labeled["driving_style"] = DRIVE_STYLE
455
+ labeled_path = os.path.join(CLEANED_DIR, f"cleaned_{norm_ts}_labeled.csv")
456
+ df_labeled.to_csv(labeled_path, index=False)
457
+ df_for_persist = df_labeled
458
+ # Log labeling outcome (DRIVE_STYLE was updated above)
459
+ logger.info(f"✅ UL labels generated ({len(DRIVE_STYLE)}) → {labeled_path}")
460
+ except Exception as e:
461
+ logger.error(f"❌ UL labeling failed: {e}")
462
+ # 10) Plots
463
+ _plot_corr(df, norm_ts)
464
+ _plot_trend(df, norm_ts)
465
+ # 11) Update event
466
+ try:
467
+ PIPELINE_EVENTS[norm_ts]["status"] = "done"
468
+ except Exception:
469
+ pass
470
+ # 12) Upload to Drive
471
+ try:
472
+ if drive_saver.is_service_available():
473
+ if labeled_path and os.path.exists(labeled_path):
474
+ drive_saver.upload_csv_to_drive(labeled_path)
475
+ logger.info("✅ Uploaded labeled to Google Drive")
476
+ else:
477
+ drive_saver.upload_csv_to_drive(out_path)
478
+ logger.info("✅ Uploaded default to Google Drive")
479
+ else:
480
+ logger.warning("⚠️ Google Drive service not available")
481
+ except Exception as e:
482
+ logger.error(f"❌ Drive upload error: {e}")
483
+ # 13) Save to MongoDB
484
+ try:
485
+ if mongo_saver.is_connected():
486
+ # Save the cleaned DataFrame directly to MongoDB
487
+ session_id = f"session_{norm_ts}"
488
+ if mongo_saver.save_dataframe_to_mongo(df_for_persist, session_id):
489
+ logger.info("✅ Saved to MongoDB")
490
+ else:
491
+ logger.warning("⚠️ MongoDB save failed")
492
+ else:
493
+ logger.warning("⚠️ MongoDB not connected")
494
+ except Exception as e:
495
+ logger.error(f"❌ MongoDB save error: {e}")
496
+ # 14) Save to Firebase Storage (incremented NNN_YYYY-MM-DD_processed.csv at fixed path)
497
+ try:
498
+ if firebase_saver and firebase_saver.is_available():
499
+ # Choose the final artifact to persist
500
+ if labeled_path and os.path.exists(labeled_path):
501
+ target_path = labeled_path
502
+ else:
503
+ target_path = out_path
504
+ # Optional: use the acquisition date if norm_ts starts with YYYY-MM-DD, else let saver use AUS/Melbourne "today"
505
+ date_str = None
506
+ try:
507
+ date_str = str(norm_ts)[:10] if norm_ts and len(str(norm_ts)) >= 10 else None
508
+ except Exception:
509
+ date_str = None
510
+ # Upload with auto-incremented name: NNN_YYYY-MM-DD_processed.csv under skyledge/processed
511
+ gs_url = firebase_saver.upload_file_with_increment(target_path, date_str=date_str)
512
+ # Save to Firebase Storage (incremented NNN_YYYY-MM-DD_processed.csv at fixed path)
513
+ if gs_url:
514
+ logger.info(f"✅ Saved to Firebase Storage: {gs_url}")
515
+ else:
516
+ logger.warning("⚠️ Firebase Storage upload returned empty URL")
517
+ else:
518
+ logger.warning("⚠️ Firebase Storage not available")
519
+ except Exception as e:
520
+ logger.error(f"❌ Firebase Storage save error: {e}")
521
+
522
+
523
+
524
+ # ───────────── Health Check ──────────────────────
525
+ @app.get("/health")
526
+ def health():
527
+ return {"status": "ok"}
528
+
529
+ @app.get("/models/status")
530
+ def models_status():
531
+ """Check if models are loaded and available"""
532
+ try:
533
+ model_dir = pathlib.Path(os.getenv("MODEL_DIR", "/app/models/ul"))
534
+ required_files = ["label_encoder_ul.pkl", "scaler_ul.pkl", "xgb_drivestyle_ul.pkl"]
535
+
536
+ available_files = []
537
+ missing_files = []
538
+
539
+ for file in required_files:
540
+ file_path = model_dir / file
541
+ if file_path.exists():
542
+ available_files.append(file)
543
+ else:
544
+ missing_files.append(file)
545
+
546
+ status = "ready" if len(available_files) == len(required_files) else "loading"
547
+
548
+ return {
549
+ "status": status,
550
+ "model_directory": str(model_dir),
551
+ "available_files": available_files,
552
+ "missing_files": missing_files,
553
+ "total_files": len(required_files),
554
+ "loaded_files": len(available_files)
555
+ }
556
+ except Exception as e:
557
+ return {
558
+ "status": "error",
559
+ "error": str(e),
560
+ "timestamp": datetime.now().isoformat()
561
+ }
562
+
563
+
564
+ # ─────── Send status to frontend ─────────────────
565
+ @app.get("/events")
566
+ def get_events():
567
+ return PIPELINE_EVENTS
568
+
569
+
570
+ # ────── Delete event from dashboard ──────────────
571
+ @app.delete("/events/remove/{timestamp}")
572
+ def remove_event(timestamp: str):
573
+ if timestamp not in PIPELINE_EVENTS:
574
+ raise HTTPException(status_code=404, detail="Event not found")
575
+ del PIPELINE_EVENTS[timestamp]
+ return {"status": "deleted"}
576
+
577
+
578
+ # ───────────── Download Cleaned ──────────────────
579
+ @app.get("/download/{filename}")
580
+ def download_file(filename: str):
581
+ path = os.path.join(CLEANED_DIR, filename)
582
+ if not os.path.exists(path):
583
+ raise HTTPException(status_code=404, detail="Not found")
584
+ return FileResponse(path, media_type='text/csv', filename=filename)
585
+
586
+
587
+ # ───────────── MongoDB Operations ──────────────────
588
+ @app.get("/mongo/status")
589
+ def mongo_status():
590
+ """Check MongoDB connection status"""
591
+ return {
592
+ "connected": mongo_saver.is_connected(),
593
+ "available": MONGODB_AVAILABLE if 'MONGODB_AVAILABLE' in globals() else False
594
+ }
595
+
596
+
597
+ @app.get("/mongo/sessions")
598
+ def get_mongo_sessions():
599
+ """Get summary of all MongoDB sessions"""
600
+ if not mongo_saver.is_connected():
601
+ raise HTTPException(status_code=503, detail="MongoDB not connected")
602
+
603
+ sessions = mongo_saver.get_session_summary()
604
+ return {"sessions": sessions}
605
+
606
+
607
+ @app.get("/mongo/query")
608
+ def query_mongo_data(
609
+ session_id: str = None,
610
+ driving_style: str = None,
611
+ start_time: str = None,
612
+ end_time: str = None,
613
+ limit: int = 1000
614
+ ):
615
+ """Query data from MongoDB with filters"""
616
+ if not mongo_saver.is_connected():
617
+ raise HTTPException(status_code=503, detail="MongoDB not connected")
618
+
619
+ # Parse datetime strings if provided
620
+ start_dt = None
621
+ end_dt = None
622
+
623
+ if start_time:
624
+ try:
625
+ start_dt = pd.to_datetime(start_time)
626
+ except Exception:
627
+ raise HTTPException(status_code=400, detail="Invalid start_time format")
628
+
629
+ if end_time:
630
+ try:
631
+ end_dt = pd.to_datetime(end_time)
632
+ except Exception:
633
+ raise HTTPException(status_code=400, detail="Invalid end_time format")
634
+
635
+ results = mongo_saver.query_data(
636
+ session_id=session_id,
637
+ driving_style=driving_style,
638
+ start_time=start_dt,
639
+ end_time=end_dt,
640
+ limit=limit
641
+ )
642
+
643
+ return {"results": results, "count": len(results)}
644
+
645
+
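Querying from a client might look like this (the base URL and the `requests` dependency are assumptions):

```python
import requests

resp = requests.get("http://localhost:8000/mongo/query", params={
    "driving_style": "aggressive",
    "start_time": "2025-05-15T00:00:00",
    "limit": 100,
})
print(resp.json()["count"])
```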
646
+ @app.post("/mongo/save-csv")
647
+ async def save_csv_to_mongo_endpoint(
648
+ file: UploadFile = File(...),
649
+ session_id: str = None
650
+ ):
651
+ """Save uploaded CSV directly to MongoDB"""
652
+ if not mongo_saver.is_connected():
653
+ raise HTTPException(status_code=503, detail="MongoDB not connected")
654
+
655
+ try:
656
+ # Save uploaded file temporarily
657
+ temp_path = os.path.join(BASE_DIR, f"temp_{file.filename}")
658
+ with open(temp_path, "wb") as f:
659
+ f.write(await file.read())
660
+
661
+ # Save to MongoDB
662
+ success = mongo_saver.save_csv_to_mongo(temp_path, session_id)
663
+
664
+ # Clean up temp file
665
+ if os.path.exists(temp_path):
666
+ os.remove(temp_path)
667
+
668
+ if success:
669
+ return {"status": "success", "message": "CSV saved to MongoDB"}
670
+ else:
671
+ raise HTTPException(status_code=500, detail="Failed to save to MongoDB")
672
+
673
+ except Exception as e:
674
+ logger.error(f"CSV to MongoDB save failed: {e}")
675
+ raise HTTPException(status_code=500, detail=f"Save failed: {str(e)}")
676
+
677
+
678
+ # ───────────── RLHF Training Endpoints ─────────────
679
+
680
+ class RLHFTrainingRequest(BaseModel):
681
+ max_datasets: int = 10
682
+ force_retrain: bool = False
683
+
684
+ class RLHFTrainingResponse(BaseModel):
685
+ status: str
686
+ model_version: str | None = None
687
+ datasets_processed: int = 0
688
+ samples_processed: int = 0
689
+ performance_metrics: dict | None = None
690
+ error: str | None = None
691
+ timestamp: str | None = None
692
+
693
+ @app.post("/rlhf/train", response_model=RLHFTrainingResponse)
694
+ async def trigger_rlhf_training(
695
+ request: RLHFTrainingRequest,
696
+ background_tasks: BackgroundTasks
697
+ ):
698
+ """
699
+ Trigger RLHF (Reinforcement Learning from Human Feedback) training session.
700
+
701
+ This endpoint:
702
+ 1. Loads human-labeled data from Firebase storage (skyledge/labeled)
703
+ 2. Combines it with existing model predictions for RLHF
704
+ 3. Retrains the XGBoost model with the combined dataset
705
+ 4. Saves the new model to Hugging Face Hub
706
+ """
707
+ try:
708
+ logger.info(f"🚀 RLHF training requested with max_datasets={request.max_datasets}")
709
+
710
+ # Initialize trainer
711
+ trainer = RLHFTrainer()
712
+
713
+ # Run training
714
+ result = trainer.train(max_datasets=request.max_datasets)
715
+
716
+ if result["status"] == "success":
717
+ logger.info(f"✅ RLHF training completed: v{result['model_version']}")
718
+ return RLHFTrainingResponse(
719
+ status="success",
720
+ model_version=result["model_version"],
721
+ datasets_processed=result["datasets_processed"],
722
+ samples_processed=result["samples_processed"],
723
+ performance_metrics=result["performance_metrics"],
724
+ timestamp=datetime.datetime.now().isoformat()
725
+ )
726
+ elif result["status"] == "no_data":
727
+ logger.info("ℹ️ No new data available for RLHF training")
728
+ return RLHFTrainingResponse(
729
+ status="no_data",
730
+ timestamp=datetime.datetime.now().isoformat()
731
+ )
732
+ else:
733
+ logger.error(f"❌ RLHF training failed: {result.get('error', 'Unknown error')}")
734
+ return RLHFTrainingResponse(
735
+ status="error",
736
+ error=result.get("error", "Unknown error"),
737
+ timestamp=datetime.datetime.now().isoformat()
738
+ )
739
+
740
+ except Exception as e:
741
+ logger.error(f"❌ RLHF training endpoint failed: {e}")
742
+ raise HTTPException(
743
+ status_code=500,
744
+ detail=f"RLHF training failed: {str(e)}"
745
+ )
746
+
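Triggering a run from a client (URL assumed; the payload mirrors `RLHFTrainingRequest`):

```python
import requests

resp = requests.post(
    "http://localhost:8000/rlhf/train",
    json={"max_datasets": 5, "force_retrain": False},
)
print(resp.json()["status"])  # "success", "no_data", or "error"
```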
747
+ @app.get("/rlhf/status")
748
+ async def get_rlhf_status():
749
+ """
750
+ Get status of RLHF training system and available labeled data.
751
+ """
752
+ try:
753
+ from train import LabeledDataLoader
754
+
755
+ loader = LabeledDataLoader()
756
+ datasets = loader.list_labeled_datasets()
757
+
758
+ return {
759
+ "status": "available",
760
+ "labeled_datasets_count": len(datasets),
761
+ "datasets": [
762
+ {
763
+ "name": d["name"],
764
+ "size": d["size"],
765
+ "created": d["created"]
766
+ } for d in datasets[:10] # Limit to first 10 for response size
767
+ ],
768
+ "firebase_bucket": "skyledge-36b56.firebasestorage.app",
769
+ "labeled_path": "skyledge/labeled",
770
+ "timestamp": datetime.now().isoformat()
771
+ }
772
+
773
+ except Exception as e:
774
+ logger.error(f"❌ RLHF status check failed: {e}")
775
+ raise HTTPException(
776
+ status_code=500,
777
+ detail=f"Status check failed: {str(e)}"
778
+ )
779
+
780
+ @app.get("/rlhf/trained-datasets")
781
+ async def get_trained_datasets():
782
+ """
783
+ Get list of datasets that have already been used for training.
784
+ """
785
+ try:
786
+ from train import LabeledDataLoader
787
+
788
+ loader = LabeledDataLoader()
789
+ trained_datasets = loader._get_trained_datasets()
790
+
791
+ return {
792
+ "trained_datasets_count": len(trained_datasets),
793
+ "trained_datasets": trained_datasets,
794
+ "timestamp": datetime.now().isoformat()
795
+ }
796
+
797
+ except Exception as e:
798
+ logger.error(f"❌ Failed to get trained datasets: {e}")
799
+ raise HTTPException(
800
+ status_code=500,
801
+ detail=f"Failed to get trained datasets: {str(e)}"
802
+ )
data.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "timestamp": "2025-05-15T10:00:00",
3
+ "driving_style": "aggressive",
4
+ "data": {
5
+ "RPM": 3200,
6
+ "THROTTLE_POS": 75,
7
+ "SPEED": 110,
8
+ "FUEL_PRESSURE": 290,
9
+ "ENGINE_LOAD": 45,
10
+ "COOLANT_TEMP": 85,
11
+ "INTAKE_TEMP": 30,
12
+ "TIMING_ADVANCE": 10,
13
+ "MAF": 12.5,
14
+ "INTAKE_PRESSURE": 28,
15
+ "SHORT_FUEL_TRIM_1": 3.1,
16
+ "LONG_FUEL_TRIM_1": 6.2,
17
+ "SHORT_FUEL_TRIM_2": 2.5,
18
+ "LONG_FUEL_TRIM_2": 5.0,
19
+ "COMMANDED_EQUIV_RATIO": 1.0,
20
+ "O2_B1S2": 0.74,
21
+ "O2_B2S2": 0.68,
22
+ "O2_S1_WR_VOLTAGE": 0.85,
23
+ "COMMANDED_EGR": 10
24
+ }
25
+ }
26
+
data/drive_saver.py ADDED
@@ -0,0 +1,110 @@
1
+ # Google Drive Operations for OBD Logger
2
+ # Handles authentication and file uploads to Google Drive
3
+
4
+ import os
5
+ import json
6
+ import logging
7
+ from google.oauth2 import service_account
8
+ from googleapiclient.discovery import build
9
+ from googleapiclient.http import MediaFileUpload
10
+
11
+ # ───────────── Logging Setup ─────────────
12
+ logger = logging.getLogger("drive-saver")
13
+ logger.setLevel(logging.INFO)
14
+ fmt = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")
15
+ handler = logging.StreamHandler()
16
+ handler.setFormatter(fmt)
17
+ logger.addHandler(handler)
18
+
19
+
20
+ class DriveSaver:
21
+ """Handles Google Drive operations for saving OBD data"""
22
+
23
+ def __init__(self):
24
+ self.service = None
25
+ self.folder_id = "1r-wefqKbK9k9BeYDW1hXRbx4B-0Fvj5P" # Default folder ID
26
+ self._initialize_service()
27
+
28
+ def _initialize_service(self):
29
+ """Initialize Google Drive service with credentials"""
30
+ try:
31
+ raw = os.getenv("GDRIVE_CREDENTIALS_JSON")
+ if not raw:
+ raise RuntimeError("GDRIVE_CREDENTIALS_JSON not set")
+ creds_dict = json.loads(raw)
32
+ creds = service_account.Credentials.from_service_account_info(
33
+ creds_dict,
34
+ scopes=["https://www.googleapis.com/auth/drive"]
35
+ )
36
+ self.service = build("drive", "v3", credentials=creds)
37
+ logger.info("✅ Google Drive service initialized successfully")
38
+ except Exception as e:
39
+ logger.error(f"❌ Drive initialization failed: {e}")
40
+ self.service = None
41
+
42
+ def upload_csv_to_drive(self, file_path: str, folder_id: str = None) -> bool:
43
+ """
44
+ Upload a CSV file to Google Drive
45
+
46
+ Args:
47
+ file_path (str): Path to the CSV file to upload
48
+ folder_id (str, optional): Target folder ID. Uses default if not specified.
49
+
50
+ Returns:
51
+ bool: True if upload successful, False otherwise
52
+ """
53
+ if not self.service:
54
+ logger.error("❌ Drive service not initialized")
55
+ return False
56
+
57
+ target_folder = folder_id or self.folder_id
58
+
59
+ try:
60
+ file_name = os.path.basename(file_path)
61
+ media = MediaFileUpload(file_path, mimetype='text/csv')
62
+ metadata = {"name": file_name, "parents": [target_folder]}
63
+
64
+ result = self.service.files().create(
65
+ body=metadata,
66
+ media_body=media,
67
+ fields="id"
68
+ ).execute()
69
+
70
+ logger.info(f"✅ File uploaded to Drive successfully: {file_name} (ID: {result.get('id')})")
71
+ return True
72
+
73
+ except Exception as e:
74
+ logger.error(f"❌ Drive upload failed: {e}")
75
+ return False
76
+
77
+ def is_service_available(self) -> bool:
78
+ """Check if Drive service is available"""
79
+ return self.service is not None
80
+
81
+ def get_folder_id(self) -> str:
82
+ """Get the default folder ID"""
83
+ return self.folder_id
84
+
85
+ def set_folder_id(self, folder_id: str):
86
+ """Set a new default folder ID"""
87
+ self.folder_id = folder_id
88
+ logger.info(f"📁 Default folder ID updated to: {folder_id}")
89
+
90
+
91
+ # Convenience function for backward compatibility
92
+ def get_drive_service():
93
+ """Legacy function - returns DriveSaver instance"""
94
+ return DriveSaver()
95
+
96
+
97
+ def upload_to_folder(service, file_path, folder_id):
98
+ """Legacy function - uploads file to specified folder"""
99
+ if isinstance(service, DriveSaver):
100
+ return service.upload_csv_to_drive(file_path, folder_id)
101
+ else:
102
+ # Handle legacy service object
103
+ try:
104
+ file_name = os.path.basename(file_path)
105
+ media = MediaFileUpload(file_path, mimetype='text/csv')
106
+ metadata = {"name": file_name, "parents": [folder_id]}
107
+ return service.files().create(body=metadata, media_body=media, fields="id").execute()
108
+ except Exception as e:
109
+ logger.error(f"❌ Legacy upload failed: {e}")
110
+ return None
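A minimal usage sketch for this module (the CSV path is hypothetical; `GDRIVE_CREDENTIALS_JSON` must hold the service-account JSON):

```python
from data.drive_saver import DriveSaver

saver = DriveSaver()
if saver.is_service_available():
    # Hypothetical artifact produced by the cleaning pipeline
    saver.upload_csv_to_drive("cache/obd_data/cleaned/cleaned_2025-05-15T10-00-00.csv")
```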
data/firebase_saver.py ADDED
@@ -0,0 +1,315 @@
1
+ # firebase_saver.py
2
+ import os
3
+ import io
4
+ import re
5
+ import json
6
+ import logging
7
+ from datetime import datetime
8
+ from typing import Optional, Tuple, List
9
+
10
+ import pandas as pd
11
+
12
+ logger = logging.getLogger("firebase-saver")
13
+ logger.setLevel(logging.INFO)
14
+ if not logger.handlers:
15
+ _h = logging.StreamHandler()
16
+ _h.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
17
+ logger.addHandler(_h)
18
+
19
+ # ---------- Constants (fixed as requested) ----------
20
+ FIXED_BUCKET = "skyledge-36b56.firebasestorage.app"
21
+ FIXED_PREFIX = "skyledge/processed" # no trailing slash
22
+
23
+ # Pattern: NNN_YYYY-MM-DD_processed.csv
24
+ FILENAME_RE = re.compile(r"^(?P<num>\d{3})_(?P<date>\d{4}-\d{2}-\d{2})_processed\.csv$")
25
+
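A quick sanity check of the pattern (illustrative):

```python
import re

FILENAME_RE = re.compile(r"^(?P<num>\d{3})_(?P<date>\d{4}-\d{2}-\d{2})_processed\.csv$")

m = FILENAME_RE.match("007_2025-05-15_processed.csv")
print(m.group("num"), m.group("date"))  # 007 2025-05-15
assert FILENAME_RE.match("7_2025-05-15_processed.csv") is None  # number must be zero-padded to 3
```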
26
+
27
+ def _parse_gs_uri(uri: Optional[str]):
28
+ if not uri or not uri.startswith("gs://"):
29
+ return None, None
30
+ path = uri[len("gs://"):]
31
+ parts = path.split("/", 1)
32
+ bucket = parts[0]
33
+ prefix = parts[1] if len(parts) > 1 else ""
34
+ return bucket, prefix
35
+
36
+
37
+ def _maybe_default_firebase_bucket(name: Optional[str]) -> Optional[str]:
38
+ # If user passed a project ID (no dot), convert to <project>.appspot.com
39
+ if name and "." not in name:
40
+ return f"{name}.appspot.com"
41
+ return name
42
+
43
+
44
+ # -------------------- Low-level clients --------------------
45
+
46
+ class _AdminClient:
47
+ """Firebase Admin SDK storage client."""
48
+ def __init__(self, bucket: str):
49
+ import firebase_admin
50
+ from firebase_admin import credentials, storage as fb_storage
51
+
52
+ raw = os.getenv("FIREBASE_ADMIN_JSON")
53
+ if not raw:
54
+ raise RuntimeError("FIREBASE_ADMIN_JSON not set")
55
+ info = json.loads(raw)
56
+ client_email = info.get("client_email")
57
+ cred = credentials.Certificate(info)
58
+
59
+ if not firebase_admin._apps:
60
+ firebase_admin.initialize_app(cred, {"storageBucket": bucket})
61
+
62
+ # fb_storage.bucket returns a google.cloud.storage.bucket.Bucket
63
+ self.bucket = fb_storage.bucket(bucket)
64
+ self._bucket_name = bucket
65
+ logger.info(f"✅ Firebase Admin initialized | bucket={bucket} as {client_email}")
66
+
67
+ # Uploads
68
+ def upload_from_filename(self, local_path: str, dest_path: str, content_type: str):
69
+ blob = self.bucket.blob(dest_path)
70
+ blob.cache_control = "no-store"
71
+ blob.upload_from_filename(local_path, content_type=content_type)
72
+
73
+ def upload_from_bytes(self, data: bytes, dest_path: str, content_type: str):
74
+ blob = self.bucket.blob(dest_path)
75
+ blob.cache_control = "no-store"
76
+ blob.upload_from_string(data, content_type=content_type)
77
+
78
+ # Listing (needs storage.objects.list permission)
79
+ def list_names(self, prefix: str) -> List[str]:
80
+ # Bucket.list_blobs works via the underlying GCS client
81
+ blobs = self.bucket.list_blobs(prefix=prefix)
82
+ return [b.name for b in blobs]
83
+
84
+ # Existence check (for collision-safe retry)
85
+ def blob_exists(self, path: str) -> bool:
86
+ blob = self.bucket.blob(path)
87
+ return blob.exists()
88
+
89
+
90
+ class _GCSClient:
91
+ """google-cloud-storage client."""
92
+ def __init__(self, bucket: str):
93
+ from google.cloud import storage
94
+ from google.oauth2 import service_account
95
+
96
+ raw = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
97
+ if not raw:
98
+ raise RuntimeError("FIREBASE_SERVICE_ACCOUNT_JSON not set")
99
+ info = json.loads(raw)
100
+ client_email = info.get("client_email")
101
+ creds = service_account.Credentials.from_service_account_info(info)
102
+ project_id = info.get("project_id")
103
+
104
+ self.client = storage.Client(credentials=creds, project=project_id)
105
+ self.bucket = self.client.bucket(bucket)
106
+ self._bucket_name = bucket
107
+ logger.info(f"✅ GCS client initialized | bucket={bucket} as {client_email}")
108
+
109
+ def upload_from_filename(self, local_path: str, dest_path: str, content_type: str):
110
+ blob = self.bucket.blob(dest_path)
111
+ blob.cache_control = "no-store"
112
+ blob.upload_from_filename(local_path, content_type=content_type)
113
+
114
+ def upload_from_bytes(self, data: bytes, dest_path: str, content_type: str):
115
+ blob = self.bucket.blob(dest_path)
116
+ blob.cache_control = "no-store"
117
+ blob.upload_from_string(data, content_type=content_type)
118
+
119
+ def list_names(self, prefix: str) -> List[str]:
120
+ blobs = self.client.list_blobs(self._bucket_name, prefix=prefix)
121
+ return [b.name for b in blobs]
122
+
123
+ def blob_exists(self, path: str) -> bool:
124
+ blob = self.bucket.blob(path)
125
+ return blob.exists(self.client)
126
+
127
+
128
+ # -------------------- Saver (high level) --------------------
129
+
130
+ class FirebaseSaver:
131
+ """
132
+ Fixed target:
133
+ Bucket: skyledge-36b56.firebasestorage.app
134
+ Prefix: skyledge/processed
135
+ Filename convention: NNN_YYYY-MM-DD_processed.csv (NNN is 001-based, zero-padded).
136
+ Auto-increments by listing current objects and picking max+1.
137
+ """
138
+
139
+ def __init__(self):
140
+ # Force fixed location regardless of env (as requested)
141
+ bucket_name = FIXED_BUCKET
142
+ self.prefix = FIXED_PREFIX
143
+
144
+ # Try Admin SDK first; fallback to GCS client
145
+ self.client = None
146
+ self.mode = None
147
+ try:
148
+ if os.getenv("FIREBASE_ADMIN_JSON"):
149
+ self.client = _AdminClient(bucket_name)
150
+ self.mode = "admin"
151
+ except Exception as e:
152
+ logger.warning(f"⚠️ Admin SDK init failed: {e}")
153
+
154
+ if self.client is None:
155
+ try:
156
+ self.client = _GCSClient(bucket_name)
157
+ self.mode = "gcs"
158
+ except Exception as e:
159
+ logger.error(f"❌ GCS client init failed: {e}")
160
+ raise
161
+
162
+ logger.info(f"📦 FirebaseSaver ready | mode={self.mode} bucket={bucket_name} prefix={self.prefix}")
163
+
164
+ def is_available(self) -> bool:
165
+ return self.client is not None
166
+
167
+ # ---------- Incremental naming helpers ----------
168
+
169
+ def _list_existing_filenames(self) -> List[str]:
170
+ """List object names under the fixed prefix, return just basenames under that folder."""
171
+ names = self.client.list_names(prefix=self.prefix + "/")
172
+ # keep only items immediately under prefix (not subfolders) & matching our filename pattern
173
+ base_names = []
174
+ for full in names:
175
+ # full looks like 'skyledge/processed/NNN_YYYY-MM-DD_processed.csv'
176
+ if not full.startswith(self.prefix + "/"):
177
+ continue
178
+ base = full[len(self.prefix) + 1:] # strip 'prefix/'
179
+ if "/" in base:
180
+ # skip nested items (none expected)
181
+ continue
182
+ if FILENAME_RE.match(base):
183
+ base_names.append(base)
184
+ return base_names
185
+
186
+ def _max_existing_id(self) -> int:
187
+ """Return max NNN found under prefix, or 0 if none."""
188
+ try:
189
+ base_names = self._list_existing_filenames()
190
+ except Exception as e:
191
+ logger.warning(f"⚠️ Unable to list existing objects; defaulting max_id=0: {e}")
192
+ return 0
193
+
194
+ max_id = 0
195
+ for name in base_names:
196
+ m = FILENAME_RE.match(name)
197
+ if not m:
198
+ continue
199
+ try:
200
+ num = int(m.group("num"))
201
+ if num > max_id:
202
+ max_id = num
203
+ except ValueError:
204
+ continue
205
+ return max_id
206
+
207
+ @staticmethod
208
+ def _format_id(n: int) -> str:
209
+ return f"{n:03d}"
210
+
211
+ @staticmethod
212
+ def _today_au() -> str:
213
+ # Use Australia/Melbourne local date; if zoneinfo unavailable, fall back to UTC date.
214
+ try:
215
+ from zoneinfo import ZoneInfo
216
+ dt = datetime.now(ZoneInfo("Australia/Melbourne"))
217
+ except Exception:
218
+ dt = datetime.utcnow()
219
+ return dt.strftime("%Y-%m-%d")
220
+
221
+ def _build_filename(self, n_int: int, date_str: Optional[str] = None) -> str:
222
+ date_val = (date_str or self._today_au())
223
+ return f"{self._format_id(n_int)}_{date_val}_processed.csv"
224
+
225
+ def _dest_path(self, filename: str) -> str:
226
+ return f"{self.prefix}/{filename}"
227
+
228
+ def _next_available_name(self, date_str: Optional[str] = None, max_retries: int = 5) -> Tuple[str, str]:
229
+ """
230
+ Compute the next file name by listing existing ones and incrementing.
231
+ Includes a collision check (exists) and retries if necessary.
232
+ Returns: (filename, full_gcs_path)
233
+ """
234
+ start = self._max_existing_id() + 1
235
+ n = start
236
+ for _ in range(max_retries):
237
+ candidate = self._build_filename(n, date_str=date_str)
238
+ dest_path = self._dest_path(candidate)
239
+ # collision check
240
+ if not self.client.blob_exists(dest_path):
241
+ return candidate, dest_path
242
+ n += 1
243
+
244
+ # As a final fallback, return the last tried (very unlikely to collide repeatedly)
245
+ candidate = self._build_filename(n, date_str=date_str)
246
+ return candidate, self._dest_path(candidate)
247
+
248
+ # ---------- Public save methods (incremental) ----------
249
+
250
+ def upload_file_with_increment(
251
+ self,
252
+ local_path: str,
253
+ date_str: Optional[str] = None,
254
+ content_type: str = "text/csv",
255
+ ) -> str:
256
+ """
257
+ Upload a local file using the next incremental name.
258
+ Returns the gs:// URL of the uploaded object (string) or "" on failure.
259
+ """
260
+ if not self.is_available():
261
+ logger.warning("⚠️ Firebase saver unavailable")
262
+ return ""
263
+ try:
264
+ filename, dest_path = self._next_available_name(date_str=date_str)
265
+ self.client.upload_from_filename(local_path, dest_path, content_type)
266
+ logger.info(f"✅ Uploaded file to gs://{FIXED_BUCKET}/{dest_path}")
267
+ return f"gs://{FIXED_BUCKET}/{dest_path}"
268
+ except Exception as e:
269
+ logger.error(f"❌ Firebase upload failed: {e}")
270
+ return ""
271
+
272
+ def upload_dataframe_with_increment(
273
+ self,
274
+ df: pd.DataFrame,
275
+ date_str: Optional[str] = None,
276
+ content_type: str = "text/csv",
277
+ ) -> str:
278
+ """
279
+ Upload a DataFrame (as CSV) using the next incremental name.
280
+ Returns the gs:// URL of the uploaded object (string) or "" on failure.
281
+ """
282
+ if not self.is_available():
283
+ logger.warning("⚠️ Firebase saver unavailable")
284
+ return ""
285
+ try:
286
+ buf = io.StringIO()
287
+ df.to_csv(buf, index=False)
288
+ data = buf.getvalue().encode("utf-8")
289
+
290
+ filename, dest_path = self._next_available_name(date_str=date_str)
291
+ self.client.upload_from_bytes(data, dest_path, content_type)
292
+ logger.info(f"✅ Uploaded DataFrame to gs://{FIXED_BUCKET}/{dest_path}")
293
+ return f"gs://{FIXED_BUCKET}/{dest_path}"
294
+ except Exception as e:
295
+ logger.error(f"❌ Firebase DF upload failed: {e}")
296
+ return ""
297
+
298
+
299
+ # ---------- Convenience free functions ----------
300
+
301
+ def save_csv_increment(csv_path: str, date_str: Optional[str] = None) -> str:
302
+ """
303
+ Upload local CSV with auto-incremented name 'NNN_YYYY-MM-DD_processed.csv'.
304
+ Returns gs:// URL or "".
305
+ """
306
+ saver = FirebaseSaver()
307
+ return saver.upload_file_with_increment(csv_path, date_str=date_str)
308
+
309
+ def save_dataframe_increment(df: pd.DataFrame, date_str: Optional[str] = None) -> str:
310
+ """
311
+ Upload DataFrame with auto-incremented name 'NNN_YYYY-MM-DD_processed.csv'.
312
+ Returns gs:// URL or "".
313
+ """
314
+ saver = FirebaseSaver()
315
+ return saver.upload_dataframe_with_increment(df, date_str=date_str)
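Example use of the helpers above (credentials via `FIREBASE_ADMIN_JSON` or `FIREBASE_SERVICE_ACCOUNT_JSON` are assumed to be configured):

```python
import pandas as pd
from data.firebase_saver import save_dataframe_increment

df = pd.DataFrame({"timestamp": ["2025-05-15T10:00:00"], "RPM": [2100]})
url = save_dataframe_increment(df, date_str="2025-05-15")
print(url)  # e.g. gs://skyledge-36b56.firebasestorage.app/skyledge/processed/004_2025-05-15_processed.csv
```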
data/mongo_saver.py ADDED
@@ -0,0 +1,362 @@
1
+ # MongoDB Operations for OBD Logger
2
+ # Handles data restructuring and saving to MongoDB Atlas
3
+
4
+ import os
5
+ import json
6
+ import logging
7
+ from datetime import datetime
8
+ from typing import Dict, List, Any, Optional
9
+ import pandas as pd
10
+ import numpy as np
11
+
12
+ # MongoDB dependencies
13
+ try:
14
+ from pymongo import MongoClient
15
+ from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
16
+ MONGODB_AVAILABLE = True
17
+ except ImportError:
18
+ MONGODB_AVAILABLE = False
19
+ print("⚠️ PyMongo not available. Install with: pip install pymongo")
20
+
21
+ # ───────────── Logging Setup ─────────────
22
+ logger = logging.getLogger("mongo-saver")
23
+ logger.setLevel(logging.INFO)
24
+ fmt = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")
25
+ handler = logging.StreamHandler()
26
+ handler.setFormatter(fmt)
27
+ logger.addHandler(handler)
28
+
29
+
30
+ class MongoSaver:
31
+ """Handles MongoDB operations for saving OBD data"""
32
+
33
+ def __init__(self, mongo_uri: str = None):
34
+ self.client = None
35
+ self.db = None
36
+ self.collection = None
37
+ self.mongo_uri = mongo_uri or os.getenv("MONGO_URI")
38
+ self._initialize_connection()
39
+
40
+ def _initialize_connection(self):
41
+ """Initialize MongoDB connection"""
42
+ if not MONGODB_AVAILABLE:
43
+ logger.error("❌ PyMongo not available. Cannot connect to MongoDB")
44
+ return
45
+
46
+ if not self.mongo_uri:
47
+ logger.error("❌ MongoDB URI not provided. Set MONGO_URI environment variable")
48
+ return
49
+
50
+ try:
51
+ # Connect with timeout and retry settings
52
+ self.client = MongoClient(
53
+ self.mongo_uri,
54
+ serverSelectionTimeoutMS=5000, # 5 second timeout
55
+ connectTimeoutMS=10000, # 10 second connection timeout
56
+ socketTimeoutMS=10000 # 10 second socket timeout
57
+ )
58
+
59
+ # Test connection
60
+ self.client.admin.command('ping')
61
+
62
+ # Set up database and collection
63
+ self.db = self.client.obd_logger
64
+ self.collection = self.db.obd_data
65
+
66
+ # Create indexes for better performance
67
+ self._create_indexes()
68
+
69
+ logger.info("✅ MongoDB connection established successfully")
70
+
71
+ except (ConnectionFailure, ServerSelectionTimeoutError) as e:
72
+ logger.error(f"❌ MongoDB connection failed: {e}")
73
+ self.client = None
74
+ self.db = None
75
+ self.collection = None
76
+ except Exception as e:
77
+ logger.error(f"❌ MongoDB initialization error: {e}")
78
+ self.client = None
79
+ self.db = None
80
+ self.collection = None
81
+
82
+ def _create_indexes(self):
83
+ """Create database indexes for better query performance"""
84
+ try:
85
+ # Index on timestamp for time-based queries
86
+ self.collection.create_index("timestamp")
87
+
88
+ # Index on driving_style for filtering
89
+ self.collection.create_index("driving_style")
90
+
91
+ # Compound index for common queries
92
+ self.collection.create_index([("timestamp", -1), ("driving_style", 1)])
93
+
94
+ # Index on session_id for session-based queries
95
+ self.collection.create_index("session_id")
96
+
97
+ logger.info("✅ Database indexes created successfully")
98
+
99
+ except Exception as e:
100
+ logger.warning(f"⚠️ Index creation failed: {e}")
101
+
102
+ def is_connected(self) -> bool:
103
+ """Check if MongoDB connection is active"""
104
+ if not self.client:
105
+ return False
106
+
107
+ try:
108
+ # Ping the database
109
+ self.client.admin.command('ping')
110
+ return True
111
+ except Exception:
112
+ return False
113
+
114
+ def save_csv_to_mongo(self, csv_file_path: str, session_id: str = None) -> bool:
115
+ """
116
+ Read CSV file and save data to MongoDB
117
+
118
+ Args:
119
+ csv_file_path (str): Path to the CSV file
120
+ session_id (str, optional): Unique identifier for this data session
121
+
122
+ Returns:
123
+ bool: True if save successful, False otherwise
124
+ """
125
+ if not self.is_connected():
126
+ logger.error("❌ MongoDB not connected")
127
+ return False
128
+
129
+ try:
130
+ # Read CSV file
131
+ df = pd.read_csv(csv_file_path)
132
+
133
+ if df.empty:
134
+ logger.warning("⚠️ CSV file is empty")
135
+ return False
136
+
137
+ # Generate session ID if not provided
138
+ if not session_id:
139
+ session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
140
+
141
+ # Convert DataFrame to MongoDB documents
142
+ documents = self._dataframe_to_documents(df, session_id)
143
+
144
+ # Insert documents into MongoDB
145
+ if documents:
146
+ result = self.collection.insert_many(documents)
147
+ logger.info(f"✅ Saved {len(result.inserted_ids)} records to MongoDB (Session: {session_id})")
148
+ return True
149
+ else:
150
+ logger.warning("⚠️ No valid documents to save")
151
+ return False
152
+
153
+ except Exception as e:
154
+ logger.error(f"❌ Failed to save CSV to MongoDB: {e}")
155
+ return False
156
+
157
+ def save_dataframe_to_mongo(self, df: pd.DataFrame, session_id: str = None) -> bool:
158
+ """
159
+ Save pandas DataFrame directly to MongoDB
160
+
161
+ Args:
162
+ df (pd.DataFrame): DataFrame to save
163
+ session_id (str, optional): Unique identifier for this data session
164
+
165
+ Returns:
166
+ bool: True if save successful, False otherwise
167
+ """
168
+ if not self.is_connected():
169
+ logger.error("❌ MongoDB not connected")
170
+ return False
171
+
172
+ try:
173
+ if df.empty:
174
+ logger.warning("⚠️ DataFrame is empty")
175
+ return False
176
+
177
+ # Generate session ID if not provided
178
+ if not session_id:
179
+ session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
180
+
181
+ # Convert DataFrame to MongoDB documents
182
+ documents = self._dataframe_to_documents(df, session_id)
183
+
184
+ # Insert documents into MongoDB
185
+ if documents:
186
+ result = self.collection.insert_many(documents)
187
+ logger.info(f"✅ Saved {len(result.inserted_ids)} records to MongoDB (Session: {session_id})")
188
+ return True
189
+ else:
190
+ logger.warning("⚠️ No valid documents to save")
191
+ return False
192
+
193
+ except Exception as e:
194
+ logger.error(f"❌ Failed to save DataFrame to MongoDB: {e}")
195
+ return False
196
+
197
+ def _dataframe_to_documents(self, df: pd.DataFrame, session_id: str) -> List[Dict[str, Any]]:
198
+ """
199
+ Convert pandas DataFrame to MongoDB documents
200
+
201
+ Args:
202
+ df (pd.DataFrame): Input DataFrame
203
+ session_id (str): Session identifier
204
+
205
+ Returns:
206
+ List[Dict[str, Any]]: List of MongoDB documents
207
+ """
208
+ documents = []
209
+
210
+ for index, row in df.iterrows():
211
+ try:
212
+ # Convert row to dictionary
213
+ doc = row.to_dict()
214
+
215
+ # Add metadata
216
+ doc['session_id'] = session_id
217
+ doc['imported_at'] = datetime.utcnow()
218
+ doc['record_index'] = index
219
+
220
+ # Handle timestamp conversion
221
+ if 'timestamp' in doc and pd.notna(doc['timestamp']):
222
+ try:
223
+ # Try to parse timestamp
224
+ if isinstance(doc['timestamp'], str):
225
+ doc['timestamp'] = pd.to_datetime(doc['timestamp'])
226
+ # Convert to datetime object
227
+ doc['timestamp'] = doc['timestamp'].to_pydatetime()
228
+ except Exception:
229
+ # Keep as string if parsing fails
230
+ pass
231
+
232
+ # Convert numeric types and handle NaN values
233
+ for key, value in doc.items():
234
+ if pd.isna(value):
235
+ doc[key] = None
236
+ elif isinstance(value, (np.integer, np.floating)):
237
+ doc[key] = value.item()
238
+ elif isinstance(value, np.bool_):
239
+ doc[key] = bool(value)
240
+
241
+ documents.append(doc)
242
+
243
+ except Exception as e:
244
+ logger.warning(f"⚠️ Failed to process row {index}: {e}")
245
+ continue
246
+
247
+ return documents
248
+
249
+ def query_data(self,
250
+ session_id: str = None,
251
+ driving_style: str = None,
252
+ start_time: datetime = None,
253
+ end_time: datetime = None,
254
+ limit: int = 1000) -> List[Dict[str, Any]]:
255
+ """
256
+ Query data from MongoDB
257
+
258
+ Args:
259
+ session_id (str, optional): Filter by session ID
260
+ driving_style (str, optional): Filter by driving style
261
+ start_time (datetime, optional): Start time filter
262
+ end_time (datetime, optional): End time filter
263
+ limit (int): Maximum number of records to return
264
+
265
+ Returns:
266
+ List[Dict[str, Any]]: Query results
267
+ """
268
+ if not self.is_connected():
269
+ logger.error("❌ MongoDB not connected")
270
+ return []
271
+
272
+ try:
273
+ # Build query filter
274
+ query_filter = {}
275
+
276
+ if session_id:
277
+ query_filter['session_id'] = session_id
278
+
279
+ if driving_style:
280
+ query_filter['driving_style'] = driving_style
281
+
282
+ if start_time or end_time:
283
+ time_filter = {}
284
+ if start_time:
285
+ time_filter['$gte'] = start_time
286
+ if end_time:
287
+ time_filter['$lte'] = end_time
288
+ query_filter['timestamp'] = time_filter
289
+
290
+ # Execute query
291
+ # Exclude Mongo's ObjectId so results stay JSON-serializable for the API layer
+ cursor = self.collection.find(query_filter, {"_id": 0}).limit(limit)
292
+ results = list(cursor)
293
+
294
+ logger.info(f"✅ Query returned {len(results)} records")
295
+ return results
296
+
297
+ except Exception as e:
298
+ logger.error(f"❌ Query failed: {e}")
299
+ return []
300
+
301
+ def get_session_summary(self) -> List[Dict[str, Any]]:
302
+ """
303
+ Get summary of all data sessions
304
+
305
+ Returns:
306
+ List[Dict[str, Any]]: Session summaries
307
+ """
308
+ if not self.is_connected():
309
+ logger.error("❌ MongoDB not connected")
310
+ return []
311
+
312
+ try:
313
+ pipeline = [
314
+ {
315
+ '$group': {
316
+ '_id': '$session_id',
317
+ 'count': {'$sum': 1},
318
+ 'driving_styles': {'$addToSet': '$driving_style'},
319
+ 'first_record': {'$min': '$timestamp'},
320
+ 'last_record': {'$max': '$timestamp'},
321
+ 'imported_at': {'$first': '$imported_at'}
322
+ }
323
+ },
324
+ {
325
+ '$sort': {'imported_at': -1}
326
+ }
327
+ ]
328
+
329
+ results = list(self.collection.aggregate(pipeline))
330
+ logger.info(f"✅ Retrieved summary for {len(results)} sessions")
331
+ return results
332
+
333
+ except Exception as e:
334
+ logger.error(f"❌ Session summary failed: {e}")
335
+ return []
336
+
337
+ def close_connection(self):
338
+ """Close MongoDB connection"""
339
+ if self.client:
340
+ self.client.close()
341
+ logger.info("✅ MongoDB connection closed")
342
+
343
+ def __enter__(self):
344
+ """Context manager entry"""
345
+ return self
346
+
347
+ def __exit__(self, exc_type, exc_val, exc_tb):
348
+ """Context manager exit"""
349
+ self.close_connection()
350
+
351
+
352
+ # Convenience functions
353
+ def save_csv_to_mongo(csv_file_path: str, session_id: str = None) -> bool:
354
+ """Convenience function to save CSV to MongoDB"""
355
+ with MongoSaver() as saver:
356
+ return saver.save_csv_to_mongo(csv_file_path, session_id)
357
+
358
+
359
+ def save_dataframe_to_mongo(df: pd.DataFrame, session_id: str = None) -> bool:
360
+ """Convenience function to save DataFrame to MongoDB"""
361
+ with MongoSaver() as saver:
362
+ return saver.save_dataframe_to_mongo(df, session_id)
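A usage sketch for the convenience helpers, assuming `MONGO_URI` points at a reachable Atlas cluster:

```python
import pandas as pd
from data.mongo_saver import save_dataframe_to_mongo

df = pd.DataFrame({"timestamp": ["2025-05-15T10:00:00"],
                   "driving_style": ["normal"], "RPM": [2100]})
ok = save_dataframe_to_mongo(df, session_id="session_demo")
print("saved" if ok else "failed")
```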
organization.py ADDED
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to reorganize existing models in HF repo to versioned structure.
4
+ This will move the current 3 .pkl files from root to v1.0 folder.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import tempfile
10
+ import json
11
+ from pathlib import Path
12
+
13
+ # Load environment variables from .env file
14
+ def load_env():
15
+ """Load environment variables from .env file"""
16
+ env_path = Path(__file__).parent / '.env'
17
+ if env_path.exists():
18
+ with open(env_path, 'r') as f:
19
+ for line in f:
20
+ line = line.strip()
21
+ if line and not line.startswith('#') and '=' in line:
22
+ key, value = line.split('=', 1)
23
+ os.environ[key] = value
24
+ print(f"✅ Loaded environment variables from {env_path}")
25
+ else:
26
+ print("⚠️ No .env file found")
27
+
28
+ # Load environment variables
29
+ load_env()
30
+
31
+ # Add train directory to path
32
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'train'))
33
+
34
+ def main():
35
+ """Main function to reorganize models"""
36
+ print("🔄 Reorganizing models in Hugging Face repository...")
37
+ print("=" * 60)
38
+
39
+ # Check if HF_TOKEN is set
40
+ if not os.getenv("HF_TOKEN"):
41
+ print("❌ Error: HF_TOKEN environment variable not set")
42
+ print("Please set your Hugging Face token:")
43
+ print("export HF_TOKEN=your_token_here")
44
+ return 1
45
+
46
+ # Check if we're in the right directory
47
+ if not os.path.exists("train/rlhf.py"):
48
+ print("❌ Error: Please run this script from the OBD_Logger root directory")
49
+ return 1
50
+
51
+ try:
52
+ # Import and run the reorganization
53
+ from train.move_models_to_v1 import move_models_to_v1
54
+
55
+ print("📥 Starting model reorganization...")
56
+ move_models_to_v1()
57
+
58
+ print("\n✅ Model reorganization completed successfully!")
59
+ print("📁 Your models are now organized in the v1.0 folder")
60
+ print("🔄 Future RLHF training will create v1.1, v1.2, etc.")
61
+ print("\nNext steps:")
62
+ print("1. Verify the models are in the v1.0 folder on Hugging Face")
63
+ print("2. Test the RLHF training with: curl -X POST 'http://localhost:8000/rlhf/train'")
64
+
65
+ return 0
66
+
67
+ except Exception as e:
68
+ print(f"❌ Reorganization failed: {e}")
69
+ print("\nTroubleshooting:")
70
+ print("1. Make sure HF_TOKEN is set correctly")
71
+ print("2. Check that you have write access to the repository")
72
+ print("3. Verify the repository name is correct")
73
+ return 1
74
+
75
+ if __name__ == "__main__":
76
+ sys.exit(main())
organze.py ADDED
@@ -0,0 +1,184 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple script to reorganize existing models in HF repo to versioned structure.
4
+ This will move the current 3 .pkl files from root to v1.0 folder.
5
+ """
6
+
7
+ import os
8
+ import tempfile
9
+ import json
10
+ from pathlib import Path
11
+ from huggingface_hub import HfApi, hf_hub_download, upload_file
12
+
13
+ def load_env():
14
+ """Load environment variables from .env file"""
15
+ env_path = Path(__file__).parent / '.env'
16
+ if env_path.exists():
17
+ with open(env_path, 'r') as f:
18
+ for line in f:
19
+ line = line.strip()
20
+ if line and not line.startswith('#') and '=' in line:
21
+ key, value = line.split('=', 1)
22
+ os.environ[key] = value
23
+ print(f"✅ Loaded environment variables from {env_path}")
24
+ else:
25
+ print("⚠️ No .env file found")
26
+
27
+ def main():
28
+ """Main function to reorganize models"""
29
+ print("🔄 Reorganizing models in Hugging Face repository...")
30
+ print("=" * 60)
31
+
32
+ # Load environment variables
33
+ load_env()
34
+
35
+ # Check if HF_TOKEN is set
36
+ hf_token = os.getenv("HF_TOKEN")
37
+ if not hf_token:
38
+ print("❌ Error: HF_TOKEN not found in environment")
39
+ return 1
40
+
41
+ print(f"✅ HF_TOKEN loaded: {hf_token[:10]}...")
42
+
43
+ # Configuration
44
+ repo_id = "BinKhoaLe1812/Driver_Behavior_OBD"
45
+ model_files = ["label_encoder_ul.pkl", "scaler_ul.pkl", "xgb_drivestyle_ul.pkl"]
46
+
47
+ print(f"📦 Target repository: {repo_id}")
48
+ print(f"📁 Model files to move: {model_files}")
49
+
50
+ # Initialize HF API
51
+ hf_api = HfApi(token=hf_token)
52
+
53
+ try:
54
+ # Create temporary directory
55
+ with tempfile.TemporaryDirectory() as temp_dir:
56
+ temp_path = Path(temp_dir)
57
+ print(f"📁 Using temporary directory: {temp_path}")
58
+
59
+ # Download existing model files
60
+ downloaded_files = []
61
+ for file in model_files:
62
+ try:
63
+ print(f"📥 Downloading {file}...")
64
+ local_path = hf_hub_download(
65
+ repo_id=repo_id,
66
+ filename=file,
67
+ repo_type="model",
68
+ token=hf_token
69
+ )
70
+ downloaded_files.append((file, local_path))
71
+ print(f"✅ Downloaded {file}")
72
+ except Exception as e:
73
+ print(f"⚠️ Could not download {file}: {e}")
74
+
75
+ if not downloaded_files:
76
+ print("⚠️ No model files found to move")
77
+ return 1
78
+
79
+ # Create v1.0 directory structure
80
+ v1_dir = temp_path / "v1.0"
81
+ v1_dir.mkdir(exist_ok=True)
82
+ print(f"📁 Created v1.0 directory: {v1_dir}")
83
+
84
+ # Copy files to v1.0 directory
85
+ import shutil
86
+ for filename, local_path in downloaded_files:
87
+ dest_path = v1_dir / filename
88
+ shutil.copy2(local_path, dest_path)
89
+ print(f"📦 Prepared {filename} for v1.0/")
90
+
91
+ # Create metadata.json for v1.0
92
+ metadata = {
93
+ "version": "1.0",
94
+ "model_type": "xgboost_classifier",
95
+ "created_at": "2024-12-01T00:00:00",
96
+ "description": "Initial model version - moved from root directory",
97
+ "framework": "xgboost",
98
+ "task": "driver_behavior_classification",
99
+ "labels": ["aggressive", "normal", "conservative"],
100
+ "features": "obd_sensor_data",
101
+ "files": [f[0] for f in downloaded_files]
102
+ }
103
+
104
+ metadata_path = v1_dir / "metadata.json"
105
+ with open(metadata_path, 'w') as f:
106
+ json.dump(metadata, f, indent=2)
107
+ print("📝 Created metadata.json for v1.0")
108
+
109
+ # Create README.md for v1.0
110
+ readme_content = """---
111
+ license: mit
112
+ tags:
113
+ - driver-behavior
114
+ - obd-data
115
+ - xgboost
116
+ - version-1.0
117
+ ---
118
+
119
+ # Driver Behavior Classification Model v1.0
120
+
121
+ Initial version of the driver behavior classification model.
122
+
123
+ ## Files
124
+
125
+ - `xgb_drivestyle_ul.pkl`: Main XGBoost model
126
+ - `label_encoder_ul.pkl`: Label encoder for behavior categories
127
+ - `scaler_ul.pkl`: Feature scaler
128
+ - `metadata.json`: Model metadata
129
+
130
+ ## Usage
131
+
132
+ ```python
133
+ import joblib
134
+
135
+ # Load the model
136
+ model = joblib.load('xgb_drivestyle_ul.pkl')
137
+ label_encoder = joblib.load('label_encoder_ul.pkl')
138
+ scaler = joblib.load('scaler_ul.pkl')
139
+
140
+ # Make predictions
141
+ predictions = model.predict(scaled_data)
142
+ behavior_labels = label_encoder.inverse_transform(predictions)
143
+ ```
144
+ """
145
+
146
+ readme_path = v1_dir / "README.md"
147
+ with open(readme_path, 'w') as f:
148
+ f.write(readme_content)
149
+ print("📖 Created README.md for v1.0")
150
+
151
+ # Upload files to v1.0 directory in HF repo
152
+ print("🚀 Uploading files to Hugging Face Hub...")
153
+ for file_path in v1_dir.iterdir():
154
+ if file_path.is_file():
155
+ hf_filename = f"v1.0/{file_path.name}"
156
+ print(f"📤 Uploading {file_path.name} to {hf_filename}...")
157
+ upload_file(
158
+ path_or_fileobj=str(file_path),
159
+ path_in_repo=hf_filename,
160
+ repo_id=repo_id,
161
+ repo_type="model",
162
+ token=hf_token,
163
+ commit_message=f"Add {file_path.name} to v1.0 directory"
164
+ )
165
+ print(f"✅ Uploaded {file_path.name} to v1.0/")
166
+
167
+ print("\n✅ Successfully moved models to v1.0 structure!")
168
+ print(f"📁 Models now located at: {repo_id}/v1.0/")
169
+ print("\nNext steps:")
170
+ print("1. Verify the models are in the v1.0 folder on Hugging Face")
171
+ print("2. Test the RLHF training with: curl -X POST 'http://localhost:8000/rlhf/train'")
172
+
173
+ return 0
174
+
175
+ except Exception as e:
176
+ print(f"❌ Reorganization failed: {e}")
177
+ print("\nTroubleshooting:")
178
+ print("1. Make sure HF_TOKEN is set correctly")
179
+ print("2. Check that you have write access to the repository")
180
+ print("3. Verify the repository name is correct")
181
+ return 1
182
+
183
+ if __name__ == "__main__":
184
+ exit(main())
requirements.txt ADDED
@@ -0,0 +1,37 @@
1
+ # Server
2
+ fastapi
3
+ uvicorn[standard]
4
+ python-multipart
5
+ jinja2
6
+
7
+ # Data
8
+ pandas
9
+ numpy
10
+ scikit-learn
11
+
12
+ # ML Models
13
+ xgboost
14
+ joblib
15
+
16
+ # Drive
17
+ gspread
18
+ oauth2client
19
+ google-auth
20
+ google-auth-httplib2
21
+ google-auth-oauthlib
22
+ google-api-python-client
23
+
24
+ # Database
25
+ pymongo
26
+ google-cloud-storage
27
+ firebase-admin
28
+
29
+ # Visualize
30
+ matplotlib
31
+ seaborn
32
+
33
+ # HuggingFace
34
+ huggingface_hub==0.25.2
35
+
36
+ # Additional dependencies for RLHF training
37
+ pyarrow # For parquet file support
static/check.png ADDED
static/edit.png ADDED
static/icon.png ADDED
static/index.html ADDED
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>OBD-II Data Dashboard</title>
7
+ <link rel="icon" type="image/png" href="/static/icon.png">
8
+ <link rel="stylesheet" href="/static/styles.css">
9
+ </head>
10
+ <body>
11
+ <h1>OBD-II Data Pipeline Monitor</h1>
12
+ <div id="log-container"></div>
13
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
14
+ <script src="/static/script.js?v=2"></script>
15
+ </body>
16
+ </html>
static/script.js ADDED
@@ -0,0 +1,230 @@
1
+ const expandedItems = JSON.parse(localStorage.getItem("expandedItems") || "{}");
2
+ const renamedLabels = JSON.parse(localStorage.getItem("renamedLabels") || "{}"); // Allow card to change their name (original identified by ts)
3
+ let previousKeys = [];
4
+ let previousEvents = {}; // Track event status to avoid redundant updates
5
+
6
+ // ─────────────────────────────────────────
7
+ // Refresh event per interval
8
+ // ─────────────────────────────────────────
9
+ async function fetchEvents() {
10
+ const res = await fetch('/events');
11
+ const data = await res.json();
12
+ renderEvents(data);
13
+ }
14
+
15
+ // ─────────────────────────────────────────
16
+ // Update or Create new card
17
+ // ─────────────────────────────────────────
18
+ function renderEvents(events) {
19
+ const container = document.getElementById('log-container');
20
+ const currentKeys = Object.keys(events).sort();
21
+ const newlyAdded = currentKeys.find(k => !previousKeys.includes(k));
22
+ previousKeys = currentKeys;
23
+
24
+ currentKeys.forEach(key => {
25
+ const event = events[key];
26
+ const existing = document.getElementById(`card-${key}`);
27
+ const prevStatus = previousEvents[key]?.status;
28
+
29
+ if (!existing) {
30
+ const card = createCard(key, event);
31
+ container.appendChild(card);
32
+ if (key === newlyAdded && event.status === 'done') {
33
+ setTimeout(() => card.scrollIntoView({ behavior: 'smooth', block: 'center' }), 300);
34
+ }
35
+ } else if (event.status !== prevStatus) {
36
+ updateCard(key, event); // Only update if status changed
37
+ }
38
+
39
+ previousEvents[key] = { status: event.status }; // Cache latest status
40
+ });
41
+ }
42
+
43
+ // ─────────────────────────────────────────
44
+ // Create new card on unmatched key
45
+ // ─────────────────────────────────────────
46
+ function createCard(key, event) {
47
+ const readable = renamedLabels[key] || formatTimestamp(key);
48
+ const safeKey = key.replace(/[:.]/g, "-");
49
+ const card = document.createElement('div');
50
+ card.id = `card-${key}`;
51
+ card.className = 'card';
52
+
53
+ const removeBtn = document.createElement('button');
54
+ removeBtn.className = 'btn-remove';
55
+ removeBtn.textContent = 'X';
56
+ removeBtn.onclick = () => removeItem(key);
57
+
58
+ const tsDiv = document.createElement('div');
59
+ tsDiv.className = 'timestamp';
60
+ tsDiv.innerHTML = `<span class="label-text">${readable}</span>`;
61
+
62
+ const editIcon = document.createElement('img');
63
+ editIcon.src = '/static/edit.png';
64
+ editIcon.className = 'icon-edit';
65
+ editIcon.onclick = () => toggleEditMode(tsDiv, key);
66
+ tsDiv.appendChild(editIcon);
67
+
68
+
69
+ const statusDiv = document.createElement('div');
70
+ statusDiv.className = 'status';
71
+
72
+ const actionDiv = document.createElement('div');
73
+ actionDiv.className = 'actions';
74
+
75
+ card.appendChild(removeBtn);
76
+ card.appendChild(tsDiv);
77
+ card.appendChild(statusDiv);
78
+ card.appendChild(actionDiv);
79
+
80
+ updateCardContent(card, key, event);
81
+ return card;
82
+ }
83
+
84
+ // ─────────────────────────────────────────
85
+ // Validate existing card
86
+ // ─────────────────────────────────────────
87
+ function updateCard(key, event) {
88
+ const card = document.getElementById(`card-${key}`);
89
+ if (card) {
90
+ updateCardContent(card, key, event);
91
+ }
92
+ }
93
+
94
+ // ─────────────────────────────────────────
95
+ // Update existing card content
96
+ // ─────────────────────────────────────────
97
+ function updateCardContent(card, key, event) {
98
+ const statusDiv = card.querySelector('.status');
99
+ const actionDiv = card.querySelector('.actions');
100
+ const safeKey = key.replace(/[:.]/g, "-");
101
+
102
+ actionDiv.innerHTML = '';
103
+ if (event.status === 'started') {
104
+ statusDiv.textContent = "Received signal. Data logging started.";
105
+ card.style.backgroundColor = '#780606';
106
+ } else if (event.status === 'processed') {
107
+ statusDiv.textContent = "Data logging finished. Start cleaning process.";
108
+ card.style.backgroundColor = '#2e6930';
109
+ } else if (event.status === 'done') {
110
+ statusDiv.textContent = "Cleaned data saved. Insights are ready.";
111
+ card.style.backgroundColor = '#8a00c2';
112
+
113
+ const expandBtn = document.createElement('button');
114
+ expandBtn.className = 'btn-expand';
115
+ expandBtn.textContent = expandedItems[key] ? 'Collapse' : 'Expand';
116
+ expandBtn.onclick = () => toggleExpand(key, expandBtn);
117
+
118
+ const expandDiv = document.createElement('div');
119
+ expandDiv.id = `expand-${key}`;
120
+ expandDiv.className = 'expanded-content';
121
+ if (expandedItems[key]) expandDiv.classList.add('show');
122
+
123
+ expandDiv.innerHTML = `
124
+ <img src="/plots/heatmap_${safeKey}.png" width="100%">
125
+ <img src="/plots/trend_${safeKey}.png" width="100%">
126
+ `;
127
+
128
+ actionDiv.appendChild(expandBtn);
129
+ actionDiv.appendChild(expandDiv);
130
+ }
131
+ }
132
+
133
+ // ─────────────────────────────────────────
134
+ // Toggle card expansion
135
+ // ─────────────────────────────────────────
136
+ function toggleExpand(key, btn) {
137
+ const el = document.getElementById(`expand-${key}`);
138
+ const showing = el.classList.contains('show');
139
+ if (showing) {
140
+ el.classList.remove('show');
141
+ expandedItems[key] = false;
142
+ btn.textContent = 'Expand';
143
+ } else {
144
+ el.classList.add('show');
145
+ expandedItems[key] = true;
146
+ btn.textContent = 'Collapse';
147
+ }
148
+ localStorage.setItem("expandedItems", JSON.stringify(expandedItems));
149
+ }
150
+
151
+ // ─────────────────────────────────────────
152
+ // Toggle card edit-view mode
153
+ // ─────────────────────────────────────────
154
+ function toggleEditMode(container, key) {
155
+ const icon = container.querySelector('.icon-edit');
156
+ if (!container.classList.contains('editing')) {
157
+ const span = container.querySelector('.label-text');
158
+ if (!span) return;
159
+ const input = document.createElement('input');
160
+ input.type = 'text';
161
+ input.value = span.textContent;
162
+ input.className = 'label-input';
163
+ span.replaceWith(input);
164
+ icon.src = '/static/check.png';
165
+ container.classList.add('editing');
166
+ } else {
167
+ const input = container.querySelector('.label-input');
168
+ if (!input) return;
169
+ const newLabel = input.value.trim() || formatTimestamp(key);
170
+ renamedLabels[key] = newLabel;
171
+ localStorage.setItem("renamedLabels", JSON.stringify(renamedLabels));
172
+ const newSpan = document.createElement('span');
173
+ newSpan.className = 'label-text';
174
+ newSpan.textContent = newLabel;
175
+ input.replaceWith(newSpan);
176
+ icon.src = '/static/edit.png';
177
+ container.classList.remove('editing');
178
+ }
179
+ }
180
+
181
+ // ─────────────────────────────────────────
182
+ // Remove a card item
183
+ // ─────────────────────────────────────────
184
+ function removeItem(key) {
185
+ const card = document.getElementById(`card-${key}`);
186
+ if (card) card.remove();
187
+ delete expandedItems[key];
188
+ delete previousEvents[key];
189
+ localStorage.setItem("expandedItems", JSON.stringify(expandedItems));
190
+ fetch(`/events/remove/${key}`, { method: 'DELETE' });
191
+ }
192
+
193
+ // ─────────────────────────────────────────
194
+ // Format timestamp as hh:mm dd/mm/yyyy
195
+ // ─────────────────────────────────────────
196
+ function formatTimestamp(norm_ts) {
197
+ try {
198
+ const parts = norm_ts.split("T");
199
+ if (parts.length !== 2) throw new Error("Invalid format");
200
+ // Extract date and time parts
201
+ const datePart = parts[0]; // e.g., "2025-05-21"
202
+ const timeParts = parts[1].split("-"); // ["hh", "mm", "ss"]
203
+ if (timeParts.length < 3) throw new Error("Incomplete time");
204
+ // Reformat
205
+ const [year, month, day] = datePart.split("-").map(Number);
206
+ let [hour, minute, second] = timeParts.map(Number);
207
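+ // Shifts the parsed hour back by 2, apparently to offset the timestamp's source timezone (assumption; verify against how filenames are stamped)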
+ hour = (hour - 2 + 24) % 24;
208
+ // Create Date in local time (note: month is 0-based)
209
+ const dt = new Date(year, month - 1, day, hour, minute, second);
210
+ // Write string
211
+ const timeStr = dt.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' });
212
+ const dateStr = dt.toLocaleDateString('en-AU');
213
+ return `${timeStr} ${dateStr}`;
214
+ } catch (err) {
215
+ console.warn("formatTimestamp fallback:", err.message);
216
+ return norm_ts;
217
+ }
218
+ }
219
+
220
+
221
+ // ─────────────────────────────────────────
222
+ // Sanitize filenames from timestamp
223
+ // ─────────────────────────────────────────
224
+ function sanitizeFilename(ts) {
225
+ return ts.replace(/:/g, '-').replace(/ /g, 'T').replace(/\//g, '-');
226
+ }
227
+
228
+ // ─────────────────────────────────────────
229
+ fetchEvents();
230
+ setInterval(fetchEvents, 1000);
static/styles.css ADDED
@@ -0,0 +1,135 @@
1
+ body {
2
+ font-family: 'Segoe UI', sans-serif;
3
+ background: linear-gradient(to bottom right, #eef1f7, #f9fafe);
4
+ margin: 0;
5
+ padding: 2rem;
6
+ color: #333;
7
+ }
8
+
9
+ h1 {
10
+ text-align: center;
11
+ margin-bottom: 2rem;
12
+ font-size: 2rem;
13
+ color: #2c3e50;
14
+ }
15
+
16
+ #log-container {
17
+ display: flex;
18
+ flex-direction: column;
19
+ gap: 1.5rem;
20
+ max-width: 960px;
21
+ margin: auto;
22
+ }
23
+
24
+ /* Card display */
25
+ .card {
26
+ border-radius: 10px;
27
+ padding: 1.2rem 1.5rem;
28
+ color: white;
29
+ position: relative;
30
+ box-shadow: 0 4px 10px rgba(0, 0, 0, 0.08);
31
+ transition: transform 0.3s ease, background-color 0.3s ease;
32
+ overflow: hidden;
33
+ }
34
+ .card:hover {
35
+ transform: translateY(-3px);
36
+ }
37
+ .status {
38
+ font-weight: 600;
39
+ font-size: 1.1rem;
40
+ }
41
+ .timestamp {
42
+ font-size: 0.95rem;
43
+ opacity: 0.9;
44
+ margin-top: 4px;
45
+ display: flex;
46
+ align-items: center;
47
+ gap: 8px;
48
+ }
49
+
50
+ .icon-edit {
51
+ width: 18px;
52
+ height: 18px;
53
+ cursor: pointer;
54
+ margin-left: 4px;
55
+ }
56
+ .label-input {
57
+ font-size: 1rem;
58
+ padding: 2px 6px;
59
+ border-radius: 4px;
60
+ border: 1px solid #ccc;
61
+ width: 160px;
62
+ }
63
+
64
+
65
+ /* All buttons */
66
+ .btn-expand,
67
+ .btn-remove {
68
+ margin-top: 1rem;
69
+ padding: 0.4rem 1.2rem;
70
+ cursor: pointer;
71
+ font-size: 0.9rem;
72
+ border: none;
73
+ border-radius: 4px;
74
+ transition: background-color 0.2s ease;
75
+ }
76
+ .btn-expand {
77
+ background-color: rgba(255, 255, 255, 0.25);
78
+ color: white;
79
+ }
80
+ .btn-expand:hover {
81
+ background-color: rgba(255, 255, 255, 0.4);
82
+ }
83
+ .btn-remove {
84
+ position: absolute;
85
+ top: 10px;
86
+ right: 14px;
87
+ background: rgba(255, 255, 255, 0.15);
88
+ color: white;
89
+ }
90
+ .btn-remove:hover {
91
+ background: rgba(255, 255, 255, 0.3);
92
+ }
93
+
94
+ /* Expanded content */
95
+ .expanded-content {
96
+ margin-top: 1.2rem;
97
+ animation: fadeIn 0.3s ease-in-out;
98
+ max-height: 0; /* You can adjust this limit */
99
+ overflow-y: auto; /* Allow vertical scroll */
100
+ transition: max-height 0.4s ease-in-out, opacity 0.3s ease;
101
+ opacity: 0;
102
+ padding-right: 5px; /* Optional: give room for scrollbar */
103
+ }
104
+ .expanded-content.show {
105
+ max-height: 1000px;
106
+ opacity: 1;
107
+ }
108
+ .expanded-content img {
109
+ margin-top: 1rem;
110
+ border-radius: 6px;
111
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
112
+ }
113
+
114
+ /* Colors */
115
+ .card.red {
116
+ background-color: #e74c3c;
117
+ }
118
+ .card.green {
119
+ background-color: #27ae60;
120
+ }
121
+ .card.purple {
122
+ background-color: #8e44ad;
123
+ }
124
+
125
+ /* Animation */
126
+ @keyframes fadeIn {
127
+ from {
128
+ opacity: 0;
129
+ transform: translateY(10px);
130
+ }
131
+ to {
132
+ opacity: 1;
133
+ transform: translateY(0);
134
+ }
135
+ }
train/README.md ADDED
@@ -0,0 +1,140 @@
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ pipeline_tag: tabular-classification
6
+ ---
7
+
8
+ # RLHF Training System
9
+
10
+ This directory contains the Reinforcement Learning from Human Feedback (RLHF) training pipeline for the driver behavior classification model.
11
+
12
+ ## Overview
13
+
14
+ The RLHF system enables continuous improvement of the driver behavior model through the steps below (a minimal trigger sketch follows the list):
15
+ 1. Loading human-labeled data from Firebase storage (`skyledge/labeled`)
16
+ 2. Combining it with existing model predictions for reinforcement learning
17
+ 3. Retraining the XGBoost model with the enhanced dataset
18
+ 4. Saving new model checkpoints to Hugging Face Hub
19
+
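+ A minimal programmatic trigger, mirroring what `train/rlhf.py` runs in its own `main()` (assumes execution from the repository root with `HF_TOKEN` and Firebase credentials set):
+
+ ```python
+ from train.rlhf import RLHFTrainer
+
+ trainer = RLHFTrainer()
+ result = trainer.train(max_datasets=5) # dict with "status", metrics, and a version on success
+ print(result["status"], result.get("model_version"))
+ ```
+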
20
+ ## Files
21
+
22
+ ### `loader.py`
23
+ - **Purpose**: Load labeled data from Firebase storage
24
+ - **Key Features**:
25
+ - Lists available labeled datasets from `skyledge/labeled` path
26
+ - Tracks already processed datasets in `trained.txt`
27
+ - Downloads and loads datasets into pandas DataFrames
28
+ - Prevents retraining on the same data
29
+
30
+ ### `saver.py`
31
+ - **Purpose**: Save trained models to Hugging Face Hub and local storage
32
+ - **Key Features**:
33
+ - Saves model components (XGBoost model, label encoder, scaler)
34
+ - Creates model metadata and README files
35
+ - Uploads to Hugging Face Hub with versioning
36
+ - Maintains local model directory structure
37
+
38
+ ### `rlhf.py`
39
+ - **Purpose**: Main RLHF training pipeline
40
+ - **Key Features**:
41
+ - Loads new labeled datasets
42
+ - Creates RLHF dataset by combining labeled data with model predictions
43
+ - Trains XGBoost model with enhanced dataset
44
+ - Evaluates model performance
45
+ - Coordinates with loader and saver modules
46
+
47
+ ## API Endpoints
48
+
49
+ The RLHF training system is integrated into the main FastAPI application with the following endpoints:
50
+
51
+ ### `POST /rlhf/train`
52
+ Trigger RLHF training session.
53
+
54
+ **Request Body:**
55
+ ```json
56
+ {
57
+ "max_datasets": 10,
58
+ "force_retrain": false
59
+ }
60
+ ```
61
+
62
+ **Response:**
63
+ ```json
64
+ {
65
+ "status": "success",
66
+ "model_version": "1.1",
67
+ "datasets_processed": 5,
68
+ "samples_processed": 1250,
69
+ "performance_metrics": {
70
+ "accuracy": 0.892,
71
+ "cv_mean": 0.885,
72
+ "cv_std": 0.012
73
+ },
74
+ "timestamp": "2024-12-01T14:30:22"
75
+ }
76
+ ```
77
+
78
+ ### `GET /rlhf/status`
79
+ Get status of RLHF training system and available labeled data.
80
+
81
+ ### `GET /rlhf/trained-datasets`
82
+ Get list of datasets that have already been used for training.
83
+
84
+ ## Configuration
85
+
86
+ ### Environment Variables
87
+ - `HF_TOKEN`: Hugging Face authentication token
88
+ - `HF_MODEL_REPO`: Hugging Face model repository (default: `BinKhoaLe1812/Driver_Behavior_OBD`)
89
+ - `MODEL_DIR`: Local model directory (default: `/app/models/ul`)
90
+ - `FIREBASE_ADMIN_JSON`: Firebase Admin SDK credentials
91
+ - `FIREBASE_SERVICE_ACCOUNT_JSON`: Firebase service account credentials
92
+
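+ A minimal `.env` covering the variables above (placeholder values only; `load_env()` in `organization.py` parses plain `KEY=VALUE` lines):
+
+ ```bash
+ HF_TOKEN=hf_xxxxxxxxxxxxxxxx
+ HF_MODEL_REPO=BinKhoaLe1812/Driver_Behavior_OBD
+ MODEL_DIR=/app/models/ul
+ ```
+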
93
+ ### Firebase Storage Structure
94
+ ```
95
+ skyledge-36b56.firebasestorage.app/
96
+ ├── skyledge/
97
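+ │ ├── raw/ # Original raw data (referenced by labeled filenames)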
+ │ ├── processed/ # Original processed data
98
+ │ ├── labeled/ # Human-labeled data for RLHF
99
+ │ │ ├── dataset1.csv
100
+ │ │ ├── dataset2.csv
101
+ │ │ └── trained.txt # Tracks processed datasets
102
+ │ └── logs/ # Training logs (future)
103
+ ```
104
+
105
+ ## Usage
106
+
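+ To trigger a training run from a shell (the same call `organization.py` suggests; assumes the API is served locally on port 8000):
+
+ ```bash
+ curl -X POST 'http://localhost:8000/rlhf/train' \
+ -H 'Content-Type: application/json' \
+ -d '{"max_datasets": 10, "force_retrain": false}'
+ ```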
107
+ ## Model Versioning
108
+
109
+ Each training session creates a new semantic version (1.0 → 1.1 → ... → 1.9 → 2.0), assigned by `ModelSaver._get_next_version()`
110
+
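+ A sketch of the increment rule above (hypothetical helper; the real logic lives in `ModelSaver._get_next_version()`):
+
+ ```python
+ def next_version(latest: str) -> str:
+ # "1.0" -> "1.1", ..., "1.9" -> "2.0" (minor digit rolls over at 10)
+ major, minor = (int(x) for x in latest.split("."))
+ return f"{major + 1}.0" if minor == 9 else f"{major}.{minor + 1}"
+ ```
+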
111
+ Models are saved to:
112
+ - **Local**: `/app/models/ul/v{version}/`
113
+ - **Hugging Face**: `BinKhoaLe1812/Driver_Behavior_OBD`
114
+
115
+ ## Data Flow
116
+
117
+ 1. **Data Collection**: Human-labeled data stored in `skyledge/labeled/`
118
+ 2. **Training Trigger**: API endpoint or manual trigger
119
+ 3. **Data Loading**: Load new labeled datasets (skip already processed)
120
+ 4. **RLHF Dataset**: Combine labeled data with model predictions
121
+ 5. **Model Training**: Train XGBoost with enhanced dataset
122
+ 6. **Evaluation**: Calculate performance metrics
123
+ 7. **Model Saving**: Save to local storage and Hugging Face Hub
124
+ 8. **Tracking**: Update `trained.txt` with processed datasets
125
+
126
+ ## Performance Monitoring
127
+
128
+ The system tracks:
129
+ - Number of datasets processed
130
+ - Total samples processed
131
+ - Model accuracy and cross-validation scores
132
+ - Training timestamps and metadata
133
+
134
+ ## Error Handling
135
+
136
+ - Graceful handling of missing datasets
137
+ - Firebase connection failures
138
+ - Model loading/saving errors
139
+ - XGBoost compatibility issues
140
+ - Comprehensive logging throughout the pipeline
train/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # train package
2
+ # RLHF Training System for Driver Behavior Classification
3
+
4
+ from .rlhf import RLHFTrainer
5
+ from .loader import LabeledDataLoader
6
+ from .saver import ModelSaver
7
+
8
+ __all__ = ['RLHFTrainer', 'LabeledDataLoader', 'ModelSaver']
train/loader.py ADDED
@@ -0,0 +1,370 @@
1
+ # loader.py
2
+ # Load labeled data from Firebase storage for RLHF training
3
+ import io
+ import os
4
+ import json
5
+ import logging
6
+ import pandas as pd
7
+ from datetime import datetime
8
+ from typing import List, Dict, Optional, Tuple, Any
9
+ from pathlib import Path
10
+
11
+ # Import Firebase client from the existing firebase_saver
12
+ import sys
13
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
14
+ from data.firebase_saver import _AdminClient, _GCSClient
15
+
16
+ logger = logging.getLogger("rlhf-loader")
17
+ logger.setLevel(logging.INFO)
18
+ if not logger.handlers:
19
+ _h = logging.StreamHandler()
20
+ _h.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
21
+ logger.addHandler(_h)
22
+
23
+ # Firebase configuration
24
+ FIREBASE_BUCKET = "skyledge-36b56.firebasestorage.app"
25
+ LABELED_PREFIX = "skyledge/labeled"
26
+ RAW_PREFIX = "skyledge/raw"
27
+ PROCESSED_PREFIX = "skyledge/processed"
28
+ TRAINED_FILE = "trained.txt"
29
+
30
+ class LabeledDataLoader:
31
+ """
32
+ Load labeled data from Firebase storage for RLHF training.
33
+ Tracks already processed datasets to avoid retraining on the same data.
34
+ """
35
+
36
+ def __init__(self):
37
+ self.bucket_name = FIREBASE_BUCKET
38
+ self.prefix = LABELED_PREFIX
39
+ self.trained_file = TRAINED_FILE
40
+
41
+ # Initialize Firebase client
42
+ self.client = None
43
+ self.mode = None
44
+ try:
45
+ if os.getenv("FIREBASE_ADMIN_JSON"):
46
+ self.client = _AdminClient(self.bucket_name)
47
+ self.mode = "admin"
48
+ except Exception as e:
49
+ logger.warning(f"⚠️ Admin SDK init failed: {e}")
50
+
51
+ if self.client is None:
52
+ try:
53
+ self.client = _GCSClient(self.bucket_name)
54
+ self.mode = "gcs"
55
+ except Exception as e:
56
+ logger.error(f"❌ GCS client init failed: {e}")
57
+ raise
58
+
59
+ logger.info(f"📦 LabeledDataLoader ready | mode={self.mode} bucket={self.bucket_name} prefix={self.prefix}")
60
+
61
+ def _get_trained_datasets(self) -> List[str]:
62
+ """Load list of already trained datasets from trained.txt"""
63
+ try:
64
+ # Check if trained.txt exists in Firebase storage
65
+ trained_path = f"{self.prefix}/{self.trained_file}"
66
+ if self.client.blob_exists(trained_path):
67
+ # Download and read the file
68
+ blob = self.client.bucket.blob(trained_path)
69
+ content = blob.download_as_text()
70
+ trained_datasets = [line.strip() for line in content.split('\n') if line.strip()]
71
+ logger.info(f"📋 Loaded {len(trained_datasets)} already trained datasets")
72
+ return trained_datasets
73
+ else:
74
+ logger.info("📋 No trained.txt found, starting fresh")
75
+ return []
76
+ except Exception as e:
77
+ logger.warning(f"⚠️ Failed to load trained datasets: {e}")
78
+ return []
79
+
80
+ def _update_trained_datasets(self, new_datasets: List[str]):
81
+ """Update trained.txt with new dataset names"""
82
+ try:
83
+ # Get existing trained datasets
84
+ existing = self._get_trained_datasets()
85
+
86
+ # Add new datasets with timestamp
87
+ timestamp = datetime.now().isoformat()
88
+ new_entries = [f"{timestamp}:{dataset}" for dataset in new_datasets]
89
+ all_entries = existing + new_entries
90
+
91
+ # Upload updated file
92
+ trained_path = f"{self.prefix}/{self.trained_file}"
93
+ content = '\n'.join(all_entries)
94
+ self.client.upload_from_bytes(
95
+ content.encode('utf-8'),
96
+ trained_path,
97
+ "text/plain"
98
+ )
99
+ logger.info(f"✅ Updated trained.txt with {len(new_datasets)} new datasets")
100
+ except Exception as e:
101
+ logger.error(f"❌ Failed to update trained datasets: {e}")
102
+
103
+ def list_labeled_datasets(self) -> List[Dict[str, str]]:
104
+ """List all available labeled datasets in Firebase storage"""
105
+ try:
106
+ # List all blobs under the labeled prefix
107
+ blobs = self.client.bucket.list_blobs(prefix=f"{self.prefix}/")
108
+
109
+ datasets = []
110
+ trained_datasets = self._get_trained_datasets()
111
+
112
+ for blob in blobs:
113
+ # Skip the trained.txt file itself
114
+ if blob.name.endswith(f"/{self.trained_file}"):
115
+ continue
116
+
117
+ # Extract dataset name (relative to skyledge root)
118
+ dataset_name = blob.name.replace("skyledge/", "")
119
+
120
+ # Skip if already trained
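+ # (trained.txt entries are "timestamp:name", so a substring test matches the name part)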
121
+ if any(dataset_name in entry for entry in trained_datasets):
122
+ continue
123
+
124
+ # Get blob metadata
125
+ blob.reload()
126
+ datasets.append({
127
+ 'name': dataset_name,
128
+ 'path': blob.name,
129
+ 'size': blob.size,
130
+ 'created': blob.time_created.isoformat() if blob.time_created else None,
131
+ 'updated': blob.updated.isoformat() if blob.updated else None,
132
+ 'content_type': blob.content_type
133
+ })
134
+
135
+ logger.info(f"📊 Found {len(datasets)} new labeled datasets")
136
+ return datasets
137
+
138
+ except Exception as e:
139
+ logger.error(f"❌ Failed to list labeled datasets: {e}")
140
+ return []
141
+
142
+ def download_dataset(self, dataset_path: str, local_path: str) -> bool:
143
+ """Download a dataset from Firebase storage to local path"""
144
+ try:
145
+ blob = self.client.bucket.blob(dataset_path)
146
+ blob.download_to_filename(local_path)
147
+ logger.info(f"✅ Downloaded {dataset_path} to {local_path}")
148
+ return True
149
+ except Exception as e:
150
+ logger.error(f"❌ Failed to download {dataset_path}: {e}")
151
+ return False
152
+
153
+ def load_dataset(self, dataset_path: str) -> Optional[pd.DataFrame]:
154
+ """Load a dataset directly into a pandas DataFrame"""
155
+ try:
156
+ blob = self.client.bucket.blob(dataset_path)
157
+ content = blob.download_as_text()
158
+
159
+ # Try to determine file type and load accordingly
160
+ if dataset_path.endswith('.csv'):
161
+ df = pd.read_csv(io.StringIO(content))
162
+ elif dataset_path.endswith('.json'):
163
+ df = pd.read_json(io.StringIO(content))
164
+ elif dataset_path.endswith('.parquet'):
165
+ # For parquet, we need to download as bytes
166
+ blob_bytes = blob.download_as_bytes()
167
+ df = pd.read_parquet(io.BytesIO(blob_bytes))
168
+ else:
169
+ # Default to CSV
170
+ df = pd.read_csv(io.StringIO(content))
171
+
172
+ logger.info(f"✅ Loaded dataset {dataset_path} with shape {df.shape}")
173
+ return df
174
+
175
+ except Exception as e:
176
+ logger.error(f"❌ Failed to load dataset {dataset_path}: {e}")
177
+ return None
178
+
179
+ def get_new_datasets_for_training(self) -> List[Dict[str, str]]:
180
+ """Get list of new datasets that haven't been used for training yet"""
181
+ return self.list_labeled_datasets()
182
+
183
+ def mark_datasets_as_trained(self, dataset_names: List[str]):
184
+ """Mark datasets as trained to avoid retraining"""
185
+ self._update_trained_datasets(dataset_names)
186
+
187
+ def _parse_labeled_filename(self, filename: str) -> Dict[str, str]:
188
+ """
189
+ Parse labeled filename to extract original dataset information.
190
+ Format: {id}_{source}-{original_id}_{date}-labelled.csv
191
+ Example: 001_raw-002_2025-09-19-labelled.csv
192
+ """
193
+ try:
194
+ # Strip the "-labelled" suffix and the .csv extension
195
+ name = filename.replace('-labelled.csv', '').replace('.csv', '')
196
+
197
+ # Split by underscore: e.g. ["001", "raw-002", "2025-09-19"]
198
+ parts = name.split('_')
199
+ if len(parts) < 3:
200
+ return {"error": f"Invalid filename format: {filename}"}
201
+
202
+ # Extract components
203
+ labeled_id = parts[0] # 001
204
+ source_and_original = parts[1] # raw-002 or processed-002
205
+ date = parts[2] # 2025-09-19
206
+
207
+ # Parse source and original ID
208
+ if '-' in source_and_original:
209
+ source, original_id = source_and_original.split('-', 1)
210
+ else:
211
+ source = source_and_original
212
+ original_id = "unknown"
213
+
214
+ return {
215
+ "labeled_id": labeled_id,
216
+ "source": source, # raw or processed
217
+ "original_id": original_id,
218
+ "date": date,
219
+ "original_filename": f"{original_id}_{date}-{source}.csv" if source != "unknown" else None
220
+ }
221
+ except Exception as e:
222
+ logger.warning(f"⚠️ Failed to parse filename {filename}: {e}")
223
+ return {"error": str(e)}
224
+
225
+ def _find_original_dataset(self, labeled_info: Dict[str, str]) -> Optional[str]:
226
+ """Find the original dataset path based on labeled file info"""
227
+ if labeled_info.get("error") or not labeled_info.get("original_filename"):
228
+ return None
229
+
230
+ source = labeled_info["source"]
231
+ original_filename = labeled_info["original_filename"]
232
+
233
+ if source == "raw":
234
+ return f"{RAW_PREFIX}/{original_filename}" # module-level constant, not an instance attribute
235
+ elif source == "processed":
236
+ return f"{PROCESSED_PREFIX}/{original_filename}"
237
+ else:
238
+ return None
239
+
240
+ def load_labeled_with_original(self, labeled_path: str) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], Dict[str, str]]:
241
+ """
242
+ Load labeled dataset along with its original dataset for RLHF comparison.
243
+ Returns: (labeled_df, original_df, metadata)
244
+ """
245
+ try:
246
+ # Load labeled dataset
247
+ labeled_df = self.load_dataset(labeled_path)
248
+ if labeled_df is None:
249
+ return None, None, {"error": "Failed to load labeled dataset"}
250
+
251
+ # Parse filename to get original dataset info
252
+ filename = labeled_path.split('/')[-1]
253
+ labeled_info = self._parse_labeled_filename(filename)
254
+
255
+ if labeled_info.get("error"):
256
+ logger.warning(f"⚠️ Could not parse labeled filename: {labeled_info['error']}")
257
+ return labeled_df, None, labeled_info
258
+
259
+ # Find and load original dataset
260
+ original_path = self._find_original_dataset(labeled_info)
261
+ original_df = None
262
+
263
+ if original_path and self.client.blob_exists(original_path):
264
+ original_df = self.load_dataset(original_path)
265
+ if original_df is not None:
266
+ logger.info(f"✅ Loaded original dataset: {original_path}")
267
+ else:
268
+ logger.warning(f"⚠️ Failed to load original dataset: {original_path}")
269
+ else:
270
+ logger.warning(f"⚠️ Original dataset not found: {original_path}")
271
+
272
+ return labeled_df, original_df, labeled_info
273
+
274
+ except Exception as e:
275
+ logger.error(f"❌ Failed to load labeled with original: {e}")
276
+ return None, None, {"error": str(e)}
277
+
278
+ def create_training_batch(self, max_datasets: int = 10) -> Tuple[List[pd.DataFrame], List[str]]:
279
+ """
280
+ Create a training batch by loading new datasets.
281
+ Returns tuple of (dataframes, dataset_names)
282
+ """
283
+ datasets = self.get_new_datasets_for_training()
284
+
285
+ if not datasets:
286
+ logger.info("📭 No new datasets available for training")
287
+ return [], []
288
+
289
+ # Limit the number of datasets
290
+ datasets = datasets[:max_datasets]
291
+
292
+ dataframes = []
293
+ dataset_names = []
294
+
295
+ for dataset in datasets:
296
+ df = self.load_dataset(dataset['path'])
297
+ if df is not None:
298
+ dataframes.append(df)
299
+ dataset_names.append(dataset['name'])
300
+ else:
301
+ logger.warning(f"⚠️ Skipping dataset {dataset['name']} due to load failure")
302
+
303
+ if dataframes:
304
+ logger.info(f"📦 Created training batch with {len(dataframes)} datasets")
305
+ # Mark these datasets as trained
306
+ self.mark_datasets_as_trained(dataset_names)
307
+
308
+ return dataframes, dataset_names
309
+
310
+ def create_rlhf_training_batch(self, max_datasets: int = 10) -> Tuple[List[Dict[str, Any]], List[str]]:
311
+ """
312
+ Create RLHF training batch with both labeled and original datasets.
313
+ Returns tuple of (training_data, dataset_names)
314
+ Each training_data item contains: {'labeled_df', 'original_df', 'metadata'}
315
+ """
316
+ datasets = self.get_new_datasets_for_training()
317
+
318
+ if not datasets:
319
+ logger.info("📭 No new datasets available for RLHF training")
320
+ return [], []
321
+
322
+ # Limit the number of datasets
323
+ datasets = datasets[:max_datasets]
324
+
325
+ training_data = []
326
+ dataset_names = []
327
+
328
+ for dataset in datasets:
329
+ labeled_df, original_df, metadata = self.load_labeled_with_original(dataset['path'])
330
+
331
+ if labeled_df is not None:
332
+ training_item = {
333
+ 'labeled_df': labeled_df,
334
+ 'original_df': original_df,
335
+ 'metadata': metadata,
336
+ 'dataset_name': dataset['name']
337
+ }
338
+ training_data.append(training_item)
339
+ dataset_names.append(dataset['name'])
340
+ logger.info(f"✅ Loaded RLHF dataset: {dataset['name']} (original: {metadata.get('original_filename', 'N/A')})")
341
+ else:
342
+ logger.warning(f"⚠️ Skipping dataset {dataset['name']} due to load failure")
343
+
344
+ if training_data:
345
+ logger.info(f"📦 Created RLHF training batch with {len(training_data)} datasets")
346
+ # Mark these datasets as trained
347
+ self.mark_datasets_as_trained(dataset_names)
348
+
349
+ return training_data, dataset_names
350
+
351
+
352
+ def main():
353
+ """Test the loader functionality"""
354
+ loader = LabeledDataLoader()
355
+
356
+ # List available datasets
357
+ datasets = loader.list_labeled_datasets()
358
+ print(f"Available datasets: {len(datasets)}")
359
+ for dataset in datasets:
360
+ print(f" - {dataset['name']} ({dataset['size']} bytes)")
361
+
362
+ # Create a training batch
363
+ dataframes, names = loader.create_training_batch(max_datasets=5)
364
+ print(f"Training batch: {len(dataframes)} datasets")
365
+ for name in names:
366
+ print(f" - {name}")
367
+
368
+
369
+ if __name__ == "__main__":
370
+ main()
train/rlhf.py ADDED
@@ -0,0 +1,420 @@
1
+ # rlhf.py
2
+ # Reinforcement Learning from Human Feedback training pipeline
3
+ import os
4
+ import json
5
+ import logging
6
+ import pickle
7
+ import joblib
8
+ from datetime import datetime
9
+ from typing import List, Dict, Any, Optional, Tuple
10
+ import warnings
11
+
12
+ import pandas as pd
13
+ import numpy as np
14
+ from sklearn.model_selection import train_test_split, cross_val_score
15
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
16
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
17
+ import xgboost as xgb
18
+
19
+ # Import our custom modules
20
+ from .loader import LabeledDataLoader
21
+ from .saver import ModelSaver
22
+
23
+ # Suppress warnings for cleaner output
24
+ warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
25
+ warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
26
+ warnings.filterwarnings("ignore", category=FutureWarning)
27
+
28
+ logger = logging.getLogger("rlhf-trainer")
29
+ logger.setLevel(logging.INFO)
30
+ if not logger.handlers:
31
+ _h = logging.StreamHandler()
32
+ _h.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
33
+ logger.addHandler(_h)
34
+
35
+ class RLHFTrainer:
36
+ """
37
+ Reinforcement Learning from Human Feedback trainer for driver behavior classification.
38
+
39
+ This trainer:
40
+ 1. Loads human-labeled data from Firebase storage
41
+ 2. Combines it with existing model predictions for RLHF
42
+ 3. Retrains the XGBoost model with the combined dataset
43
+ 4. Evaluates performance and saves the new model
44
+ """
45
+
46
+ def __init__(self):
47
+ self.loader = LabeledDataLoader()
48
+ self.saver = ModelSaver()
49
+
50
+ # Model parameters
51
+ self.model_params = {
52
+ 'n_estimators': 100,
53
+ 'max_depth': 6,
54
+ 'learning_rate': 0.1,
55
+ 'subsample': 0.8,
56
+ 'colsample_bytree': 0.8,
57
+ 'random_state': 42,
58
+ # ('use_label_encoder' was removed from XGBClassifier in XGBoost 2.0)
59
+ 'eval_metric': 'mlogloss'
60
+ }
61
+
62
+ # Feature columns to drop (non-predictive)
63
+ self.safe_drop = {
64
+ "timestamp", "driving_style", "ul_drivestyle", "gt_drivestyle",
65
+ "session_id", "imported_at", "record_index"
66
+ }
67
+
68
+ logger.info("🤖 RLHFTrainer initialized")
69
+
70
+ def _prepare_features(self, df: pd.DataFrame, expected_features: Optional[List[str]] = None) -> Tuple[np.ndarray, List[str]]:
71
+ """Prepare features for training"""
72
+ # Select numeric columns and drop non-feature columns
73
+ feature_cols = [c for c in df.columns
74
+ if c not in self.safe_drop and pd.api.types.is_numeric_dtype(df[c])]
75
+
76
+ X = df[feature_cols].copy()
77
+
78
+ # Ensure required features are present
79
+ if expected_features:
80
+ for col in expected_features:
81
+ if col not in X.columns:
82
+ X[col] = 0.0
83
+ X = X[expected_features] # Align order
84
+
85
+ # Handle missing values
86
+ X = X.fillna(0)
87
+
88
+ return X.values, list(X.columns) # report the post-alignment column order
89
+
90
+ def _prepare_labels(self, df: pd.DataFrame, label_column: str = "driving_style") -> np.ndarray:
91
+ """Prepare labels for training"""
92
+ if label_column not in df.columns:
93
+ raise ValueError(f"Label column '{label_column}' not found in data")
94
+
95
+ return df[label_column].values
96
+
97
+ def _load_existing_model(self) -> Tuple[Any, Any, Any, List[str]]:
98
+ """Load existing model components, downloading latest version if needed"""
99
+ try:
100
+ # First, try to download the latest model
101
+ logger.info("🔄 Checking for latest model version...")
102
+ try:
103
+ from utils.download import download_latest_models
104
+ download_latest_models()
105
+ except Exception as e:
106
+ logger.warning(f"⚠️ Failed to download latest models: {e}")
107
+
108
+ model_dir = os.getenv("MODEL_DIR", "/app/models/ul")
109
+
110
+ model_path = os.path.join(model_dir, "xgb_drivestyle_ul.pkl")
111
+ le_path = os.path.join(model_dir, "label_encoder_ul.pkl")
112
+ scaler_path = os.path.join(model_dir, "scaler_ul.pkl")
113
+
114
+ # Load with compatibility fixes
115
+ model = self._load_model_with_compatibility(model_path)
116
+ label_encoder = joblib.load(le_path)
117
+ scaler = joblib.load(scaler_path)
118
+
119
+ # Get expected features
120
+ expected_features = None
121
+ if hasattr(scaler, "feature_names_in_"):
122
+ expected_features = list(scaler.feature_names_in_)
123
+ elif hasattr(model, "feature_names_in_"):
124
+ expected_features = list(model.feature_names_in_)
125
+
126
+ logger.info(f"✅ Loaded existing model with {len(expected_features) if expected_features else 'unknown'} features")
127
+ return model, label_encoder, scaler, expected_features
128
+
129
+ except Exception as e:
130
+ logger.warning(f"⚠️ Failed to load existing model: {e}")
131
+ return None, None, None, None
132
+
133
+ def _load_model_with_compatibility(self, model_path: str) -> Any:
134
+ """Load model with XGBoost compatibility fixes"""
135
+ try:
136
+ model = joblib.load(model_path)
137
+
138
+ # Fix XGBoost compatibility issues
139
+ if hasattr(model, 'get_booster'): # This is an XGBoost model
140
+ # Remove deprecated attributes
141
+ deprecated_attrs = [
142
+ 'use_label_encoder', '_le', '_label_encoder',
143
+ 'use_label_encoder_', '_le_', '_label_encoder_'
144
+ ]
145
+ for attr in deprecated_attrs:
146
+ if hasattr(model, attr):
147
+ try:
148
+ delattr(model, attr)
149
+ except (AttributeError, TypeError):
150
+ pass
151
+
152
+ # Set use_label_encoder to False
153
+ if hasattr(model, 'set_params'):
154
+ try:
155
+ model.set_params(use_label_encoder=False)
156
+ except Exception:
157
+ pass
158
+
159
+ return model
160
+
161
+ except Exception as e:
162
+ logger.error(f"❌ Failed to load model: {e}")
163
+ raise
164
+
165
+ def _create_rlhf_dataset(self, training_data: List[Dict[str, Any]]) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
166
+ """Create RLHF dataset by combining labeled data with original data and model predictions"""
167
+ try:
168
+ # Load existing model for generating predictions
169
+ existing_model, label_encoder, scaler, expected_features = self._load_existing_model()
170
+
171
+ if existing_model is None:
172
+ logger.warning("⚠️ No existing model found, using only labeled data")
173
+ return self._prepare_rlhf_from_labeled_only(training_data)
174
+
175
+ # Combine all labeled datasets
176
+ labeled_dfs = [item['labeled_df'] for item in training_data if item['labeled_df'] is not None]
177
+ original_dfs = [item['original_df'] for item in training_data if item['original_df'] is not None]
178
+
179
+ combined_labeled_df = pd.concat(labeled_dfs, ignore_index=True)
180
+
181
+ # Prepare features and labels from labeled data
182
+ X_labeled, feature_cols = self._prepare_features(combined_labeled_df, expected_features)
183
+ y_labeled = self._prepare_labels(combined_labeled_df)
184
+
185
+ # Scale features
186
+ X_labeled_scaled = scaler.transform(X_labeled)
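+ # NOTE: _train_model fits a fresh StandardScaler on this already-scaled matrix; evaluation applies the same two-step transform, but inference code must mirror it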
187
+
188
+ # Generate model predictions on original data for comparison
189
+ model_predictions = []
190
+ prediction_confidence = []
191
+
192
+ if original_dfs:
193
+ combined_original_df = pd.concat(original_dfs, ignore_index=True)
194
+ X_original, _ = self._prepare_features(combined_original_df, expected_features)
195
+ X_original_scaled = scaler.transform(X_original)
196
+
197
+ # Get model predictions on original data
198
+ original_predictions = existing_model.predict(X_original_scaled)
199
+ model_predictions.extend(original_predictions)
200
+
201
+ # Get prediction probabilities for confidence
202
+ if hasattr(existing_model, 'predict_proba'):
203
+ proba = existing_model.predict_proba(X_original_scaled)
204
+ confidence = np.max(proba, axis=1)
205
+ prediction_confidence.extend(confidence)
206
+
207
+ # Create RLHF dataset with preference learning
208
+ # The labeled data represents the "correct" behavior (human preference)
209
+ # The model predictions on original data represent what the model thought was correct
210
+
211
+ # For RLHF, we want to learn from the difference between model predictions and human labels
212
+ rlhf_metadata = {
213
+ "labeled_samples": len(X_labeled),
214
+ "original_samples": len(model_predictions) if model_predictions else 0,
215
+ "model_confidence": np.mean(prediction_confidence) if prediction_confidence else 0.0,
216
+ "datasets_processed": len(training_data)
217
+ }
218
+
219
+ logger.info(f"📊 Created RLHF dataset: {len(X_labeled)} labeled samples, {len(model_predictions)} original samples")
220
+ logger.info(f"📊 Model confidence on original data: {rlhf_metadata['model_confidence']:.3f}")
221
+
222
+ return X_labeled_scaled, y_labeled, rlhf_metadata
223
+
224
+ except Exception as e:
225
+ logger.error(f"❌ Failed to create RLHF dataset: {e}")
226
+ raise
227
+
228
+ def _prepare_rlhf_from_labeled_only(self, training_data: List[Dict[str, Any]]) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
229
+ """Prepare RLHF dataset from labeled data only (when no existing model)"""
230
+ labeled_dfs = [item['labeled_df'] for item in training_data if item['labeled_df'] is not None]
231
+ combined_df = pd.concat(labeled_dfs, ignore_index=True)
232
+
233
+ # Prepare features
234
+ X, feature_cols = self._prepare_features(combined_df)
235
+ y = self._prepare_labels(combined_df)
236
+
237
+ # Create and fit scaler
238
+ scaler = StandardScaler()
239
+ X_scaled = scaler.fit_transform(X)
240
+
241
+ rlhf_metadata = {
242
+ "labeled_samples": len(X),
243
+ "original_samples": 0,
244
+ "model_confidence": 0.0,
245
+ "datasets_processed": len(training_data)
246
+ }
247
+
248
+ return X_scaled, y, rlhf_metadata
249
+
250
+
251
+ def _train_model(self, X: np.ndarray, y: np.ndarray,
252
+ existing_model: Optional[Any] = None) -> Tuple[Any, Any, Any]:
253
+ """Train the XGBoost model"""
254
+ try:
255
+ # Create label encoder
256
+ label_encoder = LabelEncoder()
257
+ y_encoded = label_encoder.fit_transform(y)
258
+
259
+ # Create scaler
260
+ scaler = StandardScaler()
261
+ X_scaled = scaler.fit_transform(X)
262
+
263
+ # Split data
264
+ X_train, X_test, y_train, y_test = train_test_split(
265
+ X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
266
+ )
267
+
268
+ # Create and train model
269
+ model = xgb.XGBClassifier(**self.model_params)
270
+
271
+ # If we have an existing model, we can use it for warm start or transfer learning
272
+ if existing_model is not None:
273
+ logger.info("🔄 Using existing model for warm start")
274
+ # For XGBoost, we can't directly warm start, but we can use similar parameters
275
+ # and potentially use the existing model's predictions as additional features
276
+
277
+ # Train the model
278
+ model.fit(X_train, y_train,
279
+ eval_set=[(X_test, y_test)],
280
+ # (fit-time early_stopping_rounds was removed in XGBoost 2.0; eval_set is kept for monitoring)
281
+ verbose=False)
282
+
283
+ # Evaluate
284
+ y_pred = model.predict(X_test)
285
+ accuracy = accuracy_score(y_test, y_pred)
286
+
287
+ logger.info(f"✅ Model trained with accuracy: {accuracy:.4f}")
288
+
289
+ return model, label_encoder, scaler
290
+
291
+ except Exception as e:
292
+ logger.error(f"❌ Model training failed: {e}")
293
+ raise
294
+
295
+ def _evaluate_model(self, model: Any, label_encoder: Any, scaler: Any,
296
+ X: np.ndarray, y: np.ndarray) -> Dict[str, float]:
297
+ """Evaluate model performance"""
298
+ try:
299
+ # Prepare test data
300
+ X_scaled = scaler.transform(X)
301
+ y_encoded = label_encoder.transform(y)
302
+
303
+ # Make predictions
304
+ y_pred = model.predict(X_scaled)
305
+
306
+ # Calculate metrics
307
+ accuracy = accuracy_score(y_encoded, y_pred)
308
+
309
+ # Cross-validation score
310
+ cv_scores = cross_val_score(model, X_scaled, y_encoded, cv=5)
311
+ cv_mean = cv_scores.mean()
312
+ cv_std = cv_scores.std()
313
+
314
+ metrics = {
315
+ "accuracy": accuracy,
316
+ "cv_mean": cv_mean,
317
+ "cv_std": cv_std,
318
+ "cv_scores": cv_scores.tolist()
319
+ }
320
+
321
+ logger.info(f"📊 Model evaluation: accuracy={accuracy:.4f}, cv_mean={cv_mean:.4f}±{cv_std:.4f}")
322
+ return metrics
323
+
324
+ except Exception as e:
325
+ logger.error(f"❌ Model evaluation failed: {e}")
326
+ return {"accuracy": 0.0, "cv_mean": 0.0, "cv_std": 0.0}
327
+
328
+ def train(self, max_datasets: int = 10) -> Dict[str, Any]:
329
+ """Main training pipeline"""
330
+ try:
331
+ logger.info("🚀 Starting RLHF training pipeline")
332
+
333
+ # Load new labeled datasets with original data for RLHF
334
+ training_data, dataset_names = self.loader.create_rlhf_training_batch(max_datasets=max_datasets)
335
+
336
+ if not training_data:
337
+ logger.warning("⚠️ No new datasets available for RLHF training")
338
+ return {"status": "no_data", "message": "No new datasets available"}
339
+
340
+ logger.info(f"📦 Loaded {len(training_data)} datasets for RLHF training")
341
+
342
+ # Create RLHF dataset
343
+ X, y, rlhf_metadata = self._create_rlhf_dataset(training_data)
344
+
345
+ # Load existing model for comparison
346
+ existing_model, existing_le, existing_scaler, expected_features = self._load_existing_model()
347
+
348
+ # Train new model
349
+ model, label_encoder, scaler = self._train_model(X, y, existing_model)
350
+
351
+ # Evaluate model
352
+ metrics = self._evaluate_model(model, label_encoder, scaler, X, y)
353
+
354
+ # Generate model version using semantic versioning
355
+ model_version = self.saver._get_next_version()
356
+
357
+ # Prepare training data info
358
+ training_data_info = {
359
+ "datasets": dataset_names,
360
+ "total_samples": len(X),
361
+ "training_date": datetime.now().isoformat(),
362
+ "features_count": X.shape[1]
363
+ }
364
+
365
+ # Prepare training log
366
+ training_log = {
367
+ "datasets_used": dataset_names,
368
+ "samples_processed": len(X),
369
+ "model_parameters": self.model_params,
370
+ "performance_metrics": metrics,
371
+ "training_duration": "N/A", # Could be tracked if needed
372
+ "existing_model_used": existing_model is not None
373
+ }
374
+
375
+ # Save model
376
+ save_result = self.saver.save_complete_model(
377
+ model=model,
378
+ label_encoder=label_encoder,
379
+ scaler=scaler,
380
+ model_version=model_version,
381
+ training_data_info=training_data_info,
382
+ performance_metrics=metrics,
383
+ training_log=training_log,
384
+ rlhf_metadata=rlhf_metadata
385
+ )
386
+
387
+ result = {
388
+ "status": "success",
389
+ "model_version": model_version,
390
+ "datasets_processed": len(dataset_names),
391
+ "samples_processed": len(X),
392
+ "performance_metrics": metrics,
393
+ "save_result": save_result,
394
+ "training_log": training_log
395
+ }
396
+
397
+ logger.info(f"✅ RLHF training completed successfully: v{model_version}")
398
+ return result
399
+
400
+ except Exception as e:
401
+ logger.error(f"❌ RLHF training failed: {e}")
402
+ return {
403
+ "status": "error",
404
+ "error": str(e),
405
+ "timestamp": datetime.now().isoformat()
406
+ }
407
+
408
+
409
+ def main():
410
+ """Test the RLHF trainer"""
411
+ try:
412
+ trainer = RLHFTrainer()
413
+ result = trainer.train(max_datasets=5)
414
+ print(f"Training result: {result}")
415
+ except Exception as e:
416
+ print(f"Training failed: {e}")
417
+
418
+
419
+ if __name__ == "__main__":
420
+ main()
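`train()` reports outcomes through its status dict rather than raising, so a scheduler can branch on `status`. Below is a minimal caller sketch; the `train.trainer` import path is an assumption, since only the file body is shown in this commit.

```python
# Hypothetical caller for the RLHF pipeline; the import path is an assumption.
from train.trainer import RLHFTrainer

def run_scheduled_training() -> None:
    trainer = RLHFTrainer()
    result = trainer.train(max_datasets=10)
    if result["status"] == "success":
        metrics = result["performance_metrics"]
        print(f"Trained v{result['model_version']} (accuracy={metrics['accuracy']:.4f})")
    elif result["status"] == "no_data":
        print("No new labeled datasets yet; skipping this run.")
    else:  # "error"
        print(f"Training failed: {result.get('error')}")

if __name__ == "__main__":
    run_scheduled_training()
```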
train/saver.py ADDED
@@ -0,0 +1,381 @@
1
+ # saver.py
2
+ # Model saving functions for RLHF training
3
+ import os
4
+ import json
5
+ import logging
6
+ import pickle
7
+ import joblib
8
+ from datetime import datetime
9
+ from typing import Dict, Any, Optional
10
+ from pathlib import Path
11
+
12
+ from huggingface_hub import HfApi  # Repository import removed: deprecated upstream and unused here
13
+ import pandas as pd
14
+ import numpy as np
15
+
16
+ logger = logging.getLogger("rlhf-saver")
17
+ logger.setLevel(logging.INFO)
18
+ if not logger.handlers:
19
+ _h = logging.StreamHandler()
20
+ _h.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s"))
21
+ logger.addHandler(_h)
22
+
23
+ class ModelSaver:
24
+ """
25
+ Save trained models to Hugging Face Hub and local storage.
26
+ Handles model artifacts, metadata, and versioning.
27
+ """
28
+
29
+ def __init__(self):
30
+ self.hf_token = os.getenv("HF_TOKEN")
31
+ if not self.hf_token:
32
+ raise RuntimeError("HF_TOKEN environment variable not set")
33
+
34
+ self.hf_api = HfApi(token=self.hf_token)
35
+ self.repo_id = os.getenv("HF_MODEL_REPO", "BinKhoaLe1812/Driver_Behavior_OBD")
36
+
37
+ # Local model directory
38
+ self.local_model_dir = Path(os.getenv("MODEL_DIR", "/app/models/ul"))
39
+ self.local_model_dir.mkdir(parents=True, exist_ok=True)
40
+
41
+ logger.info(f"📦 ModelSaver ready | repo={self.repo_id}")
42
+
43
+ def _get_next_version(self) -> str:
44
+ """Get the next version number (1.0, 1.1, 1.2, ..., 1.9, 2.0, etc.)"""
45
+ try:
46
+ # List existing versions in HF repo
47
+ repo_files = self.hf_api.list_repo_files(
48
+ repo_id=self.repo_id,
49
+ repo_type="model"
50
+ )
51
+
52
+ # Find version directories (v1.0, v1.1, etc.); list_repo_files returns file paths, not directories, so derive them from "vX.Y/..." prefixes
53
+ version_dirs = sorted({f.split('/')[0] for f in repo_files if f.startswith('v') and '/' in f})
54
+ versions = []
55
+
56
+ for v_dir in version_dirs:
57
+ try:
58
+ version_str = v_dir[1:] # Remove 'v' prefix
59
+ if '.' in version_str:
60
+ major, minor = version_str.split('.')
61
+ versions.append((int(major), int(minor)))
62
+ except (ValueError, IndexError):
63
+ continue
64
+
65
+ if not versions:
66
+ return "1.0"
67
+
68
+ # Sort versions and get the latest
69
+ versions.sort()
70
+ latest_major, latest_minor = versions[-1]
71
+
72
+ # Increment version
73
+ if latest_minor < 9:
74
+ return f"{latest_major}.{latest_minor + 1}"
75
+ else:
76
+ return f"{latest_major + 1}.0"
77
+
78
+ except Exception as e:
79
+ logger.warning(f"⚠️ Failed to get next version from HF repo: {e}")
80
+ # Fallback to timestamp-based version
81
+ return datetime.now().strftime("%Y%m%d_%H%M%S")
82
+
83
+ def _create_model_metadata(self,
84
+ model_type: str,
85
+ training_data_info: Dict[str, Any],
86
+ performance_metrics: Dict[str, float],
87
+ model_version: str,
88
+ rlhf_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
89
+ """Create metadata for the trained model"""
90
+ metadata = {
91
+ "model_type": model_type,
92
+ "version": model_version,
93
+ "created_at": datetime.now().isoformat(),
94
+ "training_data": training_data_info,
95
+ "performance_metrics": performance_metrics,
96
+ "framework": "xgboost",
97
+ "task": "driver_behavior_classification",
98
+ "labels": ["aggressive", "normal", "conservative"], # Based on ul_label.py
99
+ "features": "obd_sensor_data",
100
+ "rlhf_metadata": rlhf_metadata or {}
101
+ }
102
+ return metadata
103
+
104
+ def save_model_locally(self,
105
+ model: Any,
106
+ label_encoder: Any,
107
+ scaler: Any,
108
+ model_version: str,
109
+ metadata: Dict[str, Any]) -> Dict[str, str]:
110
+ """Save model components locally with versioning"""
111
+ try:
112
+ # Create versioned directory
113
+ version_dir = self.local_model_dir / f"v{model_version}"
114
+ version_dir.mkdir(exist_ok=True)
115
+
116
+ # Save model components
117
+ model_path = version_dir / "xgb_drivestyle_ul.pkl"
118
+ le_path = version_dir / "label_encoder_ul.pkl"
119
+ scaler_path = version_dir / "scaler_ul.pkl"
120
+ metadata_path = version_dir / "metadata.json"
121
+
122
+ # Save using joblib for better compatibility
123
+ joblib.dump(model, model_path)
124
+ joblib.dump(label_encoder, le_path)
125
+ joblib.dump(scaler, scaler_path)
126
+
127
+ # Save metadata
128
+ with open(metadata_path, 'w') as f:
129
+ json.dump(metadata, f, indent=2)
130
+
131
+ # Also save to the main model directory (for current usage)
132
+ joblib.dump(model, self.local_model_dir / "xgb_drivestyle_ul.pkl")
133
+ joblib.dump(label_encoder, self.local_model_dir / "label_encoder_ul.pkl")
134
+ joblib.dump(scaler, self.local_model_dir / "scaler_ul.pkl")
135
+
136
+ logger.info(f"✅ Model saved locally to {version_dir}")
137
+
138
+ return {
139
+ "model_path": str(model_path),
140
+ "label_encoder_path": str(le_path),
141
+ "scaler_path": str(scaler_path),
142
+ "metadata_path": str(metadata_path)
143
+ }
144
+
145
+ except Exception as e:
146
+ logger.error(f"❌ Failed to save model locally: {e}")
147
+ raise
148
+
149
+ def save_model_to_hf(self,
150
+ model: Any,
151
+ label_encoder: Any,
152
+ scaler: Any,
153
+ model_version: str,
154
+ metadata: Dict[str, Any],
155
+ training_data_info: Dict[str, Any]) -> str:
156
+ """Save model to Hugging Face Hub"""
157
+ try:
158
+ # Create temporary directory for upload
159
+ temp_dir = Path(f"/tmp/hf_upload_{model_version}")
160
+ temp_dir.mkdir(exist_ok=True)
161
+
162
+ # Save model components
163
+ model_path = temp_dir / "xgb_drivestyle_ul.pkl"
164
+ le_path = temp_dir / "label_encoder_ul.pkl"
165
+ scaler_path = temp_dir / "scaler_ul.pkl"
166
+ metadata_path = temp_dir / "metadata.json"
167
+ readme_path = temp_dir / "README.md"
168
+
169
+ # Save using joblib
170
+ joblib.dump(model, model_path)
171
+ joblib.dump(label_encoder, le_path)
172
+ joblib.dump(scaler, scaler_path)
173
+
174
+ # Save metadata
175
+ with open(metadata_path, 'w') as f:
176
+ json.dump(metadata, f, indent=2)
177
+
178
+ # Create README
179
+ readme_content = self._create_readme(metadata, training_data_info)
180
+ with open(readme_path, 'w') as f:
181
+ f.write(readme_content)
182
+
183
+ # Upload to Hugging Face Hub
184
+ self.hf_api.upload_folder(
185
+ folder_path=str(temp_dir),
186
+ repo_id=self.repo_id,
187
+ repo_type="model",
188
+ commit_message=f"RLHF training update v{model_version}",
189
+ ignore_patterns=["*.tmp", "*.log"]
190
+ )
191
+
192
+ # Clean up temp directory
193
+ import shutil
194
+ shutil.rmtree(temp_dir)
195
+
196
+ logger.info(f"✅ Model uploaded to Hugging Face Hub: {self.repo_id}")
197
+ return f"https://huggingface.co/{self.repo_id}"
198
+
199
+ except Exception as e:
200
+ logger.error(f"❌ Failed to save model to HF: {e}")
201
+ raise
202
+
203
+ def _create_readme(self, metadata: Dict[str, Any], training_data_info: Dict[str, Any]) -> str:
204
+ """Create README content for the model"""
205
+ readme = f"""---
206
+ license: mit
207
+ tags:
208
+ - driver-behavior
209
+ - obd-data
210
+ - xgboost
211
+ - rlhf
212
+ - reinforcement-learning
213
+ ---
214
+
215
+ # Driver Behavior Classification Model (RLHF v{metadata['version']})
216
+
217
+ This model classifies driver behavior based on OBD (On-Board Diagnostics) sensor data using XGBoost.
218
+
219
+ ## Model Information
220
+
221
+ - **Model Type**: {metadata['model_type']}
222
+ - **Version**: {metadata['version']}
223
+ - **Created**: {metadata['created_at']}
224
+ - **Framework**: {metadata['framework']}
225
+ - **Task**: {metadata['task']}
226
+
227
+ ## Performance Metrics
228
+
229
+ """
230
+
231
+ for metric, value in metadata['performance_metrics'].items():
232
+ if isinstance(value, (int, float)): # skip list-valued entries such as cv_scores, which cannot be formatted with :.4f
+ readme += f"- **{metric}**: {value:.4f}\n"
233
+
234
+ readme += f"""
235
+ ## Training Data
236
+
237
+ - **Datasets Used**: {len(training_data_info.get('datasets', []))}
238
+ - **Total Samples**: {training_data_info.get('total_samples', 'N/A')}
239
+ - **Training Date**: {training_data_info.get('training_date', 'N/A')}
240
+
241
+ ## Labels
242
+
243
+ The model predicts one of the following driver behavior categories:
244
+ """
245
+
246
+ for label in metadata['labels']:
247
+ readme += f"- {label}\n"
248
+
249
+ readme += """
250
+ ## Usage
251
+
252
+ ```python
253
+ import joblib
254
+ import pandas as pd
255
+
256
+ # Load the model
257
+ model = joblib.load('xgb_drivestyle_ul.pkl')
258
+ label_encoder = joblib.load('label_encoder_ul.pkl')
259
+ scaler = joblib.load('scaler_ul.pkl')
260
+
261
+ # Prepare your OBD data (feature columns must match the training format)
262
+ obd_data = pd.read_csv('your_obd_data.csv')
+ scaled_data = scaler.transform(obd_data)
263
+
264
+ # Make predictions
265
+ predictions = model.predict(scaled_data)
266
+ behavior_labels = label_encoder.inverse_transform(predictions)
267
+ ```
268
+
269
+ ## Files
270
+
271
+ - `xgb_drivestyle_ul.pkl`: Main XGBoost model
272
+ - `label_encoder_ul.pkl`: Label encoder for behavior categories
273
+ - `scaler_ul.pkl`: Feature scaler
274
+ - `metadata.json`: Model metadata and performance metrics
275
+
276
+ ## RLHF Training
277
+
278
+ This model was trained using Reinforcement Learning from Human Feedback (RLHF) to improve performance based on human-labeled data and feedback.
279
+ """
280
+
281
+ return readme
282
+
283
+ def save_training_log(self,
284
+ training_log: Dict[str, Any],
285
+ model_version: str) -> str:
286
+ """Save training log to Firebase storage"""
287
+ try:
288
+ # Import Firebase client
289
+ import sys
290
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
291
+ from data.firebase_saver import FirebaseSaver
292
+
293
+ # Create log entry
294
+ log_entry = {
295
+ "version": model_version,
296
+ "timestamp": datetime.now().isoformat(),
297
+ "log": training_log
298
+ }
299
+
300
+ # Save to Firebase
301
+ saver = FirebaseSaver()
302
+ # Note: We'll need to modify FirebaseSaver to support different prefixes
303
+ # For now, we'll save to a logs subdirectory
304
+ log_filename = f"training_log_v{model_version}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
305
+
306
+ # Create temporary file
307
+ temp_path = f"/tmp/{log_filename}"
308
+ with open(temp_path, 'w') as f:
309
+ json.dump(log_entry, f, indent=2)
310
+
311
+ # Upload to Firebase (we'll need to extend FirebaseSaver for this)
312
+ # For now, just log locally
313
+ logger.info(f"📝 Training log saved: {log_entry}")
314
+
315
+ return temp_path
316
+
317
+ except Exception as e:
318
+ logger.error(f"❌ Failed to save training log: {e}")
319
+ return ""
320
+
321
+ def save_complete_model(self,
322
+ model: Any,
323
+ label_encoder: Any,
324
+ scaler: Any,
325
+ model_version: str,
326
+ training_data_info: Dict[str, Any],
327
+ performance_metrics: Dict[str, float],
328
+ training_log: Dict[str, Any],
329
+ rlhf_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
330
+ """Complete model saving workflow"""
331
+ try:
332
+ # Create metadata
333
+ metadata = self._create_model_metadata(
334
+ model_type="xgboost_classifier",
335
+ training_data_info=training_data_info,
336
+ performance_metrics=performance_metrics,
337
+ model_version=model_version,
338
+ rlhf_metadata=rlhf_metadata
339
+ )
340
+
341
+ # Save locally
342
+ local_paths = self.save_model_locally(
343
+ model, label_encoder, scaler, model_version, metadata
344
+ )
345
+
346
+ # Save to Hugging Face Hub
347
+ hf_url = self.save_model_to_hf(
348
+ model, label_encoder, scaler, model_version, metadata, training_data_info
349
+ )
350
+
351
+ # Save training log
352
+ log_path = self.save_training_log(training_log, model_version)
353
+
354
+ result = {
355
+ "local_paths": local_paths,
356
+ "hf_url": hf_url,
357
+ "log_path": log_path,
358
+ "version": model_version,
359
+ "metadata": metadata
360
+ }
361
+
362
+ logger.info(f"✅ Complete model save successful: v{model_version}")
363
+ return result
364
+
365
+ except Exception as e:
366
+ logger.error(f"❌ Complete model save failed: {e}")
367
+ raise
368
+
369
+
370
+ def main():
371
+ """Test the saver functionality"""
372
+ try:
373
+ saver = ModelSaver()
374
+ print(f"ModelSaver initialized for repo: {saver.repo_id}")
375
+ print(f"Local model directory: {saver.local_model_dir}")
376
+ except Exception as e:
377
+ print(f"Failed to initialize ModelSaver: {e}")
378
+
379
+
380
+ if __name__ == "__main__":
381
+ main()
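For reference, the version-increment rule implemented in `_get_next_version` (minor bumps up to .9, then a major roll-over) can be restated as a small pure function. This is an illustrative sketch, not code from the repository:

```python
# Illustrative restatement of ModelSaver._get_next_version's increment rule.
def next_version(existing_dirs: list[str]) -> str:
    versions = []
    for name in existing_dirs:  # entries look like "v1.0", "v1.1", ...
        try:
            major, minor = name.lstrip("v").split(".")
            versions.append((int(major), int(minor)))
        except ValueError:
            continue
    if not versions:
        return "1.0"
    major, minor = max(versions)
    return f"{major}.{minor + 1}" if minor < 9 else f"{major + 1}.0"

assert next_version([]) == "1.0"
assert next_version(["v1.0", "v1.1"]) == "1.2"
assert next_version(["v1.9"]) == "2.0"  # minor rolls the major at .9
```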
utils/download.py ADDED
@@ -0,0 +1,154 @@
1
+ # download.py
2
+ # Download latest models from Hugging Face
3
+ import os, shutil, pathlib, sys
4
+ import json
5
+ from huggingface_hub import hf_hub_download, HfApi
6
+
7
+ def load_env_file():
8
+ """Load environment variables from .env file if it exists"""
9
+ env_path = pathlib.Path(__file__).parent.parent / ".env"
10
+ if env_path.exists():
11
+ with open(env_path, 'r') as f:
12
+ for line in f:
13
+ line = line.strip()
14
+ if line and not line.startswith('#') and '=' in line:
15
+ key, value = line.split('=', 1)
16
+ os.environ[key.strip()] = value.strip().strip('"').strip("'")  # tolerate whitespace and quoted values
17
+ return True
18
+ return False
19
+
20
+ # Load .env file first before setting any environment variables
21
+ load_env_file()
22
+
23
+ REPO_ID = os.getenv("HF_MODEL_REPO", "BinKhoaLe1812/Driver_Behavior_OBD")
24
+ MODEL_DIR = pathlib.Path(os.getenv("MODEL_DIR", "/app/models/ul")).resolve()
25
+ FILES = ["label_encoder_ul.pkl", "scaler_ul.pkl", "xgb_drivestyle_ul.pkl"]
26
+
27
+ MODEL_DIR.mkdir(parents=True, exist_ok=True)
28
+
29
+ def get_latest_version():
30
+ """Get the latest model version from Hugging Face repo"""
31
+ try:
32
+ hf_token = os.getenv("HF_TOKEN")
33
+ if not hf_token:
34
+ print("⚠️ HF_TOKEN not set, using default model files")
35
+ return None
36
+
37
+ api = HfApi(token=hf_token)
38
+ repo_files = api.list_repo_files(
39
+ repo_id=REPO_ID,
40
+ repo_type="model"
41
+ )
42
+
43
+ print("🔍 Checking repository files...")
44
+ print(f"📁 Found {len(repo_files)} files in repository")
45
+
46
+ # Find version directories (v1.0, v1.1, etc.)
47
+ version_dirs = [f for f in repo_files if f.startswith('v') and '/' not in f]
48
+ print(f"📦 Found version directories: {version_dirs}")
49
+
50
+ # Also check for version directories with files inside
51
+ version_dirs_with_files = []
52
+ for f in repo_files:
53
+ if f.startswith('v') and '/' in f:
54
+ version_dir = f.split('/')[0]
55
+ if version_dir not in version_dirs_with_files:
56
+ version_dirs_with_files.append(version_dir)
57
+
58
+ if version_dirs_with_files:
59
+ print(f"📦 Found version directories with files: {version_dirs_with_files}")
60
+ version_dirs.extend(version_dirs_with_files)
61
+
62
+ versions = []
63
+
64
+ for v_dir in version_dirs:
65
+ try:
66
+ version_str = v_dir[1:] # Remove 'v' prefix
67
+ if '.' in version_str:
68
+ major, minor = version_str.split('.')
69
+ versions.append((int(major), int(minor), v_dir))
70
+ print(f"✅ Found version: {v_dir} (major={major}, minor={minor})")
71
+ except (ValueError, IndexError):
72
+ print(f"⚠️ Could not parse version: {v_dir}")
73
+ continue
74
+
75
+ if not versions:
76
+ print("📦 No versioned models found, checking for root files...")
77
+ # Check if files exist in root
78
+ root_files = [f for f in repo_files if f in FILES]
79
+ if root_files:
80
+ print(f"📁 Found root files: {root_files}")
81
+ return None # Use root files
82
+ else:
83
+ print("❌ No model files found in repository")
84
+ print("💡 Available files in repository:")
85
+ for f in sorted(repo_files):
86
+ print(f" - {f}")
87
+ return None
88
+
89
+ # Sort versions and get the latest
90
+ versions.sort()
91
+ latest_version = versions[-1][2] # Get the directory name
92
+ print(f"📦 Latest model version: {latest_version}")
93
+ return latest_version
94
+
95
+ except Exception as e:
96
+ print(f"⚠️ Failed to get latest version: {e}")
97
+ return None
98
+
99
+ def fetch_latest(fname: str, version_dir: str = None):
100
+ """Download the latest version of a model file"""
101
+ try:
102
+ if version_dir:
103
+ # Download from versioned directory
104
+ versioned_path = f"{version_dir}/{fname}"
105
+ print(f"📥 Downloading {fname} from {versioned_path}...")
106
+ src = hf_hub_download(repo_id=REPO_ID, filename=versioned_path, repo_type="model")
107
+ else:
108
+ # Download from root directory (fallback)
109
+ print(f"📥 Downloading {fname} from root directory...")
110
+ src = hf_hub_download(repo_id=REPO_ID, filename=fname, repo_type="model")
111
+
112
+ dst = MODEL_DIR / fname
113
+ shutil.copy2(src, dst)
114
+ print(f"✅ Downloaded {fname} → {dst}")
115
+ return True
116
+ except Exception as e:
117
+ print(f"❌ Failed to fetch {fname}: {e}")
118
+ if version_dir:
119
+ print(f" Tried path: {version_dir}/{fname}")
120
+ else:
121
+ print(f" Tried path: {fname}")
122
+ return False
123
+
124
+ def download_latest_models():
125
+ """Download the latest version of all model files"""
126
+ print("🔄 Checking for latest model version...")
127
+ latest_version = get_latest_version()
128
+
129
+ success_count = 0
130
+ for f in FILES:
131
+ if fetch_latest(f, latest_version):
132
+ success_count += 1
133
+
134
+ if success_count == len(FILES):
135
+ print(f"✅ Successfully downloaded all {len(FILES)} model files")
136
+ if latest_version:
137
+ print(f"📦 Using version: {latest_version}")
138
+ return True
139
+ else:
140
+ print(f"⚠️ Only {success_count}/{len(FILES)} files downloaded successfully")
141
+ return False
142
+
143
+ def fetch(fname: str):
144
+ """Legacy function for backward compatibility"""
145
+ return fetch_latest(fname)
146
+
147
+ def main():
148
+ """Download latest models"""
149
+ success = download_latest_models()
150
+ if not success:
151
+ sys.exit(1)
152
+
153
+ if __name__ == "__main__":
154
+ main()
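A minimal startup sketch showing how a service might wire this module in before serving, assuming `utils/` is importable as a package; the env values shown mirror the defaults above, and `HF_TOKEN` must come from the environment or `.env`:

```python
# Startup sketch: fetch the newest model files before serving.
# Assumes utils/ is on the import path; env values mirror the defaults above.
import os

os.environ.setdefault("HF_MODEL_REPO", "BinKhoaLe1812/Driver_Behavior_OBD")
os.environ.setdefault("MODEL_DIR", "/app/models/ul")
# HF_TOKEN is read from the environment (or .env); never hardcode it.

from utils.download import download_latest_models

if not download_latest_models():
    raise SystemExit("Model download failed; refusing to start with stale models")
```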
utils/mount_drive.py ADDED
@@ -0,0 +1,44 @@
1
+ # mount_drive.py
+ # Upload cleaned OBD CSVs to Google Drive (via gspread) using service-account credentials
+ import os
2
+ import json
3
+ import gspread
4
+ import logging
5
+ from oauth2client.service_account import ServiceAccountCredentials
6
+
7
+ # Setup logging
8
+ logger = logging.getLogger("upload")
9
+ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(asctime)s - %(message)s")
10
+
11
+ # Authenticate with GDrive using secret
12
+ logger.info("Authenticating to Google Drive...")
13
+ creds_json = os.getenv("GDRIVE_CREDENTIALS_JSON")
14
+ if not creds_json:
15
+ logger.error("GDRIVE_CREDENTIALS_JSON not found!")
16
+ exit(1)
17
+
18
+ try:
19
+ creds_dict = json.loads(creds_json)
20
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
21
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
22
+ client = gspread.authorize(creds)
23
+ logger.info("Authenticated with Google Drive")
24
+ except Exception as e:
25
+ logger.error(f"Failed to authenticate: {e}")
26
+ exit(1)
27
+
28
+ # Folder and files
29
+ upload_dir = "./cache/obd_data/cleaned"
30
+ if not os.path.exists(upload_dir):
31
+ logger.warning(f"Directory {upload_dir} does not exist.")
32
+ exit(0)
33
+
34
+ # Upload all .csv files
35
+ for file in os.listdir(upload_dir):
36
+ if file.endswith(".csv"):
37
+ try:
38
+ path = os.path.join(upload_dir, file)
39
+ logger.info(f"Uploading {file}...")
40
+ with open(path, "rb") as f:
41
+ client.import_csv(client.create(file).id, f.read())
42
+ logger.info(f"Uploaded {file}")
43
+ except Exception as e:
44
+ logger.error(f"Failed to upload {file}: {e}")
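The script reads the whole service-account key from `GDRIVE_CREDENTIALS_JSON`. Below is a sketch of exporting a key of the expected shape; every field value is a placeholder, and the real key comes from the Google Cloud console:

```python
# Sketch: export a service-account key for mount_drive.py (placeholder values only).
import json
import os

service_account_key = {
    "type": "service_account",
    "project_id": "your-project-id",
    "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
    "client_email": "obd-uploader@your-project-id.iam.gserviceaccount.com",
    "token_uri": "https://oauth2.googleapis.com/token",
}
os.environ["GDRIVE_CREDENTIALS_JSON"] = json.dumps(service_account_key)
```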
utils/ul_label.py ADDED
@@ -0,0 +1,206 @@
1
+ # ul_label.py
2
+ # Load UL models and predict driving style
3
+ import os, logging, pickle
4
+ import warnings
5
+ import joblib
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ # Import download functionality
10
+ import sys
11
+ sys.path.append(os.path.dirname(__file__))
12
+ from download import download_latest_models
13
+
14
+ log = logging.getLogger("ul-labeler")
15
+ log.setLevel(logging.INFO)
16
+
17
+ # Suppress version compatibility warnings in production
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.base")
19
+ warnings.filterwarnings("ignore", category=UserWarning, module="xgboost.core")
20
+
21
+ MODEL_DIR = os.getenv("MODEL_DIR", "/app/models/ul")
22
+ LE_PATH = os.path.join(MODEL_DIR, "label_encoder_ul.pkl")
23
+ SC_PATH = os.path.join(MODEL_DIR, "scaler_ul.pkl")
24
+ XGB_PATH = os.path.join(MODEL_DIR, "xgb_drivestyle_ul.pkl")
25
+
26
+ SAFE_DROP = {
27
+ "timestamp","driving_style","ul_drivestyle","gt_drivestyle",
28
+ "session_id","imported_at","record_index"
29
+ }
30
+
31
+ def _load_any(path):
32
+ # Suppress version compatibility warnings for production
33
+ with warnings.catch_warnings():
34
+ warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
35
+ warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
36
+ warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")
37
+ warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")
38
+ try:
39
+ model = joblib.load(path)
40
+ except Exception:
41
+ with open(path, "rb") as f:
42
+ model = pickle.load(f)
43
+
44
+ # Fix XGBoost compatibility issues for older trained models
45
+ if hasattr(model, 'get_booster'): # This is an XGBoost model
46
+ # Remove deprecated use_label_encoder attribute that causes issues in newer XGBoost versions
47
+ if hasattr(model, '__dict__'):
48
+ # Remove all deprecated attributes that cause issues
49
+ deprecated_attrs = [
50
+ 'use_label_encoder', '_le', '_label_encoder',
51
+ 'use_label_encoder_', '_le_', '_label_encoder_'
52
+ ]
53
+ for attr in deprecated_attrs:
54
+ model.__dict__.pop(attr, None)
55
+
56
+ # Set use_label_encoder to False for newer XGBoost versions
57
+ if hasattr(model, 'set_params'):
58
+ try:
59
+ model.set_params(use_label_encoder=False)
60
+ except Exception:
61
+ pass
62
+
63
+ return model
64
+
65
+ class ULLabeler:
66
+ _instance = None
67
+
68
+ def __init__(self, auto_download: bool = True):
69
+ # Auto-download latest models if enabled
70
+ if auto_download:
71
+ log.info("🔄 Checking for latest model version...")
72
+ try:
73
+ download_latest_models()
74
+ except Exception as e:
75
+ log.warning(f"⚠️ Failed to download latest models: {e}")
76
+
77
+ if not (os.path.exists(LE_PATH) and os.path.exists(SC_PATH) and os.path.exists(XGB_PATH)):
78
+ raise FileNotFoundError("Model files not found. Ensure download.py ran successfully.")
79
+ self.le = _load_any(LE_PATH)
80
+ self.scal = _load_any(SC_PATH)
81
+ self.clf = _load_any(XGB_PATH)
82
+
83
+ # Additional XGBoost compatibility fixes
84
+ self._fix_xgb_compatibility()
85
+
86
+ # Try to discover expected feature names from scaler or model
87
+ self.expected = None
88
+ if hasattr(self.scal, "feature_names_in_"):
89
+ self.expected = list(self.scal.feature_names_in_)
90
+ elif hasattr(self.clf, "feature_names_in_"):
91
+ self.expected = list(self.clf.feature_names_in_)
92
+
93
+ log.info(f"ULLabeler ready | expected_features={len(self.expected) if self.expected else 'unknown'}")
94
+
95
+ def _fix_xgb_compatibility(self):
96
+ """Fix XGBoost compatibility issues with older trained models."""
97
+ try:
98
+ # Check if this is an XGBoost classifier
99
+ if hasattr(self.clf, 'get_booster'):
100
+ # Remove deprecated attributes that cause issues in newer XGBoost versions
101
+ deprecated_attrs = [
102
+ 'use_label_encoder', '_le', '_label_encoder',
103
+ 'use_label_encoder_', '_le_', '_label_encoder_'
104
+ ]
105
+ for attr in deprecated_attrs:
106
+ if hasattr(self.clf, attr):
107
+ try:
108
+ delattr(self.clf, attr)
109
+ except (AttributeError, TypeError):
110
+ pass
111
+
112
+ # Set use_label_encoder to False for newer XGBoost versions
113
+ if hasattr(self.clf, 'set_params'):
114
+ try:
115
+ self.clf.set_params(use_label_encoder=False)
116
+ except Exception:
117
+ pass
118
+
119
+ # Ensure the model is properly configured for prediction
120
+ if hasattr(self.clf, 'n_classes_') and self.clf.n_classes_ is None:
121
+ # Try to infer number of classes from the label encoder
122
+ if hasattr(self.le, 'classes_'):
123
+ self.clf.n_classes_ = len(self.le.classes_)
124
+
125
+ # Nothing else needs re-initializing here; the deprecated label-encoder attributes were removed above
128
+
129
+ log.info("XGBoost compatibility fixes applied successfully")
130
+ except Exception as e:
131
+ log.warning(f"XGBoost compatibility fix failed: {e}")
132
+
133
+ @classmethod
134
+ def get(cls, auto_download: bool = True):
135
+ if cls._instance is None:
136
+ cls._instance = ULLabeler(auto_download=auto_download)
137
+ return cls._instance
138
+
139
+ def _prepare(self, df: pd.DataFrame):
140
+ # numeric only + drop non-feature columns
141
+ cols = [c for c in df.columns if c not in SAFE_DROP and pd.api.types.is_numeric_dtype(df[c])]
142
+ X = df[cols].copy()
143
+
144
+ # ensure required features
145
+ if self.expected:
146
+ for c in self.expected:
147
+ if c not in X.columns:
148
+ X[c] = 0.0
149
+ X = X[self.expected] # align order
150
+ X = X.fillna(0)
151
+
152
+ # scale
153
+ try:
154
+ Xs = self.scal.transform(X if hasattr(self.scal, "feature_names_in_") else X.values)
155
+ except Exception as e:
156
+ log.warning(f"Scaler transform failed ({e}); using raw features.")
157
+ Xs = X.values
158
+ return Xs
159
+
160
+ def predict_df(self, df: pd.DataFrame) -> np.ndarray:
161
+ Xs = self._prepare(df)
162
+ try:
163
+ yhat = self.clf.predict(Xs)
164
+ except (AttributeError, TypeError) as e:
165
+ if 'use_label_encoder' in str(e) or 'label_encoder' in str(e):
166
+ # Last resort: try to fix the model and retry
167
+ log.warning("XGBoost compatibility issue detected, attempting fix...")
168
+ try:
169
+ # Remove all problematic attributes
170
+ deprecated_attrs = [
171
+ 'use_label_encoder', '_le', '_label_encoder',
172
+ 'use_label_encoder_', '_le_', '_label_encoder_'
173
+ ]
174
+ for attr in deprecated_attrs:
175
+ if hasattr(self.clf, attr):
176
+ try:
177
+ delattr(self.clf, attr)
178
+ except (AttributeError, TypeError):
179
+ pass
180
+
181
+ # Set use_label_encoder to False
182
+ if hasattr(self.clf, 'set_params'):
183
+ try:
184
+ self.clf.set_params(use_label_encoder=False)
185
+ except Exception:
186
+ pass
187
+
188
+ # Retry prediction
189
+ yhat = self.clf.predict(Xs)
190
+ except Exception as retry_e:
191
+ log.error(f"Failed to fix XGBoost compatibility: {retry_e}")
192
+ raise e
193
+ else:
194
+ raise e
195
+
196
+ try:
197
+ return self.le.inverse_transform(yhat)
198
+ except Exception:
199
+ return yhat
200
+
201
+ def predict_csv(self, csv_path: str) -> pd.DataFrame:
202
+ df = pd.read_csv(csv_path)
203
+ y = self.predict_df(df)
204
+ out = df.copy()
205
+ out["driving_style"] = y
206
+ return out
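End-to-end, the labeler is typically used through its singleton. A usage sketch follows; the import path and CSV location are assumptions based on the cache layout created in the Dockerfile:

```python
# Usage sketch for ULLabeler; paths are illustrative.
from utils.ul_label import ULLabeler

labeler = ULLabeler.get()  # downloads the newest model version on first use
labeled = labeler.predict_csv("cache/obd_data/cleaned/session_001.csv")
print(labeled["driving_style"].value_counts())
```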