initial commit

Browse files

Files changed (5) hide show

app.py +446 -0
config.json +15 -0
ev_classifier_model.pth +3 -0
inference.py +32 -0
model.py +25 -0

app.py ADDED Viewed

	@@ -0,0 +1,446 @@

+#df = pd.read_csv("data\Electric_Vehicle_Population_Data_fixed.csv", nrows=10)
+import pandas as pd
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader, TensorDataset
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+import matplotlib.pyplot as plt
+import seaborn as sns
+import warnings
+warnings.filterwarnings('ignore')
+# Define a simple TabularModel class
class TabularModel(nn.Module):
    """Feed-forward classifier for fixed-width tabular feature vectors.

    Each entry of ``hidden_sizes`` adds one
    Linear -> BatchNorm1d -> ReLU -> Dropout stage. A final Linear layer maps
    the last width to ``output_size`` raw scores (no softmax is applied here).
    """

    def __init__(self, input_size, hidden_sizes, output_size, dropout_rate=0.2):
        super().__init__()
        # widths[i] -> widths[i + 1] describes the i-th hidden stage.
        widths = [input_size, *hidden_sizes]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]
        # Output head: last hidden width (or the input width when there are
        # no hidden layers) to one score per class.
        stages.append(nn.Linear(widths[-1], output_size))
        # The attribute name fixes the state_dict key prefix ("model.<idx>.*").
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw scores."""
        return self.model(x)
+# Data preprocessing function
def preprocess_data(df, target_column, test_size=0.2):
    """Encode, scale and split a DataFrame for neural-network training.

    Args:
        df: Input frame holding the feature columns and ``target_column``.
            It is not modified; work happens on the copy returned by ``drop``.
        target_column: Name of the label column.
        test_size: Fraction of rows placed in the test split.

    Returns:
        ``(X_train, X_test, y_train, y_test, scaler, label_encoders,
        target_encoder)``. The X/y values are NumPy arrays,
        ``label_encoders`` maps each object-dtype column to its fitted
        ``LabelEncoder``, and ``target_encoder`` is ``None`` unless the target
        had object dtype.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Column roles are decided on the raw dtypes, before any encoding, so the
    # integer codes produced for categorical columns are never standardised.
    categorical_columns = X.select_dtypes(include=['object']).columns
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

    # NOTE(review): the scaler and encoders are fitted on all rows before the
    # split, so test-set statistics leak into training. Left as-is because the
    # saved artefacts depend on it; fit on the training rows only if retraining.
    scaler = StandardScaler()
    if len(numerical_columns) > 0:
        # StandardScaler rejects a zero-column input, so skip it when the
        # frame has no numeric features.
        X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

    target_encoder = None
    if y.dtype == 'object':
        target_encoder = LabelEncoder()
        y = target_encoder.fit_transform(y)
    # ``fit_transform`` returns an ndarray while the untouched target is a
    # Series; normalise both so the split below never relies on ``.values``.
    y = np.asarray(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X.values, y, test_size=test_size, random_state=42, stratify=y
    )
    return (X_train, X_test, y_train, y_test, scaler, label_encoders, target_encoder)
+# Training function
def train_model(model, train_loader, val_loader, epochs=100, lr=0.001):
    """Fit ``model`` with Adam and cross-entropy, tracking per-epoch losses.

    The model is moved to CUDA when available, otherwise it stays on the CPU.
    After each epoch the mean validation loss drives a ``ReduceLROnPlateau``
    scheduler, and a progress line is printed every 20th epoch.

    Returns:
        ``(train_losses, val_losses)``: two lists with one entry per epoch,
        each the mean of the per-batch losses.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    plateau = optim.lr_scheduler.ReduceLROnPlateau(adam, patience=10)

    train_losses, val_losses = [], []
    for epoch_no in range(1, epochs + 1):
        # Optimisation pass over the training batches.
        model.train()
        running = 0.0
        for features, labels in train_loader:
            features = features.to(device)
            labels = labels.to(device)
            adam.zero_grad()
            batch_loss = loss_fn(model(features), labels)
            batch_loss.backward()
            adam.step()
            running += batch_loss.item()

        # Validation pass: no gradients, layers in eval mode.
        model.eval()
        held_out = 0.0
        with torch.no_grad():
            for features, labels in val_loader:
                scores = model(features.to(device))
                held_out += loss_fn(scores, labels.to(device)).item()

        epoch_train = running / len(train_loader)
        epoch_val = held_out / len(val_loader)
        train_losses.append(epoch_train)
        val_losses.append(epoch_val)
        plateau.step(epoch_val)

        if epoch_no % 20 == 0:
            print(f'Epoch [{epoch_no}/{epochs}], Train Loss: {epoch_train:.4f}, Val Loss: {epoch_val:.4f}')
    return train_losses, val_losses
+# Evaluation function
def evaluate_model(model, test_loader, target_encoder=None):
    """Score ``model`` on ``test_loader`` and summarise the results.

    Args:
        model: Trained classifier producing one score per class.
        test_loader: Iterable of ``(features, labels)`` batches.
        target_encoder: Optional fitted label encoder; when given, predictions
            and targets are mapped back to the original label values.

    Returns:
        ``(accuracy, report, all_predictions, all_targets)`` where ``report``
        is the text produced by ``classification_report``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The batches are moved to ``device`` below, so the model must live there
    # too; a freshly loaded model would otherwise stay on the CPU.
    model.to(device)
    model.eval()

    all_predictions = []
    all_targets = []
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data.to(device))
            predictions = torch.argmax(output, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
            all_targets.extend(target.cpu().numpy())

    # Convert back to the original labels if the target was encoded.
    if target_encoder is not None:
        all_predictions = target_encoder.inverse_transform(all_predictions)
        all_targets = target_encoder.inverse_transform(all_targets)

    accuracy = accuracy_score(all_targets, all_predictions)
    report = classification_report(all_targets, all_predictions)
    return accuracy, report, all_predictions, all_targets
+# Plotting function for training history
def plot_training_history(train_losses, val_losses):
    """Show the loss curves side by side on a linear and a log-scaled axis.

    Args:
        train_losses: Per-epoch training losses.
        val_losses: Per-epoch validation losses.
    """
    plt.figure(figsize=(12, 5))

    # (subplot position, y-axis label, title, log-scaled y axis?)
    panels = (
        (1, 'Loss', 'Training and Validation Loss', False),
        (2, 'Loss (Log Scale)', 'Training and Validation Loss (Log Scale)', True),
    )
    for position, y_label, heading, log_axis in panels:
        plt.subplot(1, 2, position)
        plt.plot(train_losses, label='Training Loss', color='blue')
        plt.plot(val_losses, label='Validation Loss', color='red')
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(heading)
        if log_axis:
            plt.yscale('log')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()
+# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels=None):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Optional display names for the tick marks, in the row/column
            order of the matrix. They are used for the axes only and are not
            passed to ``confusion_matrix``. When omitted, seaborn picks the
            tick labels itself.
    """
    cm = confusion_matrix(y_true, y_pred)
    # seaborn documents "auto", a bool, an int or a list for the tick-label
    # arguments; ``None`` is not among them, so map the default to "auto".
    tick_labels = "auto" if labels is None else labels
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
+# Function to save model
def save_model(model, filepath, scaler, label_encoders, target_encoder=None):
    """Write the model weights and the fitted preprocessing objects to one file.

    The checkpoint is a dict with the keys ``model_state_dict``, ``scaler``,
    ``label_encoders`` and ``target_encoder``, serialised with ``torch.save``.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'scaler': scaler,
        'label_encoders': label_encoders,
        'target_encoder': target_encoder,
    }
    torch.save(checkpoint, filepath)
    print(f"Model saved to {filepath}")
+# Function to load model
def load_model(filepath, input_size, hidden_sizes, output_size, dropout_rate=0.2):
    """Rebuild a ``TabularModel`` and its preprocessing objects from a checkpoint.

    Args:
        filepath: Checkpoint written by ``save_model``.
        input_size, hidden_sizes, output_size, dropout_rate: Architecture of
            the saved network; they must match the values used for training or
            ``load_state_dict`` will reject the weights.

    Returns:
        ``(model, scaler, label_encoders, target_encoder)``.
    """
    # map_location: lets a checkpoint saved on a GPU load on a CPU-only host.
    # weights_only=False: the file also holds pickled scikit-learn objects,
    # which the restricted loader refuses.
    # SECURITY: this unpickles arbitrary objects; only load trusted files.
    checkpoint = torch.load(filepath, map_location='cpu', weights_only=False)
    model = TabularModel(input_size, hidden_sizes, output_size, dropout_rate)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model, checkpoint['scaler'], checkpoint['label_encoders'], checkpoint['target_encoder']
+# Main training pipeline
def main(csv_path="Electric_Vehicle_Population.csv", model_filepath='ev_classifier_model.pth'):
    """Train, evaluate and save the BEV-vs-PHEV classifier.

    Steps: load the CSV, clean and select nine features, collapse rare
    make/model/county values into ``'OTHER'``, preprocess and split, train a
    ``TabularModel``, plot the loss history and confusion matrix, then write
    the checkpoint.

    Args:
        csv_path: Electric-vehicle population CSV to train on.
        model_filepath: Destination of the saved checkpoint.

    Returns:
        ``(model, scaler, label_encoders, target_encoder)``.
    """
    df = pd.read_csv(csv_path)
    print(f"Original dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Rows missing any of these fields cannot be labelled or featurised.
    df_clean = df.dropna(subset=['Make', 'Model', 'Electric Vehicle Type', 'Model Year']).copy()

    # Coerce numeric columns; unparseable entries become NaN.
    for column in ('Model Year', 'Electric Range', 'Base MSRP', 'Legislative District'):
        df_clean[column] = pd.to_numeric(df_clean[column], errors='coerce')
    for column in ('Electric Range', 'Base MSRP'):
        df_clean[column] = df_clean[column].fillna(df_clean[column].median())
    df_clean['Legislative District'] = df_clean['Legislative District'].fillna(0)

    # Binary target: 1 = BEV, 0 = anything else (PHEV).
    df_clean['target'] = (df_clean['Electric Vehicle Type'] == 'Battery Electric Vehicle (BEV)').astype(int)

    # Source column -> short feature name, in the order the model sees them.
    column_names = {
        'Model Year': 'model_year',
        'Make': 'make',
        'Model': 'model',
        'Electric Range': 'electric_range',
        'Base MSRP': 'base_msrp',
        'Legislative District': 'legislative_district',
        'County': 'county',
        'State': 'state',
        'Clean Alternative Fuel Vehicle (CAFV) Eligibility': 'cafv_eligibility',
    }
    df_final = df_clean[list(column_names) + ['target']].rename(columns=column_names)

    # Cap the cardinality of the widest categorical columns.
    for column, top_n in (('make', 10), ('model', 15), ('county', 20)):
        df_final[column] = _collapse_rare_categories(df_final[column], top_n)

    # Drop rows that still have a missing feature (e.g. an unparseable year).
    df = df_final.dropna()
    print(f"Processed dataset shape: {df.shape}")
    print(f"Target distribution:")
    print(f"BEV (1): {(df['target'] == 1).sum()}")
    print(f"PHEV (0): {(df['target'] == 0).sum()}")

    X_train, X_test, y_train, y_test, scaler, label_encoders, target_encoder = preprocess_data(
        df, 'target'
    )

    # Carve the validation set out of the training rows while they are still
    # NumPy arrays, then convert each split to tensors.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    def to_dataset(features, labels):
        return TensorDataset(
            torch.as_tensor(features, dtype=torch.float32),
            torch.as_tensor(labels, dtype=torch.long),
        )

    batch_size = 64
    train_dataset = to_dataset(X_fit, y_fit)
    val_dataset = to_dataset(X_val, y_val)
    test_dataset = to_dataset(X_test, y_test)
    # BatchNorm1d cannot run in training mode on a batch of one sample, so
    # drop the final batch only when it would be a singleton.
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        drop_last=(len(train_dataset) % batch_size == 1),
    )
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    input_size = X_train.shape[1]
    hidden_sizes = [128, 64, 32]
    output_size = len(np.unique(y_train))
    model = TabularModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes,
        output_size=output_size,
        dropout_rate=0.3
    )
    print(f"\nModel architecture:")
    print(f"Input size: {input_size}")
    print(f"Hidden layers: {hidden_sizes}")
    print(f"Output size: {output_size}")
    print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")

    print("\nStarting training...")
    train_losses, val_losses = train_model(
        model, train_loader, val_loader, epochs=100, lr=0.001
    )
    plot_training_history(train_losses, val_losses)

    print("\nEvaluating model on test set...")
    accuracy, report, predictions, targets = evaluate_model(model, test_loader, target_encoder)
    print(f"Test Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    # Axis names: fixed for the integer target, otherwise the encoder's own
    # class names (already in the sorted order the matrix uses).
    if target_encoder is None:
        labels = ['PHEV', 'BEV']
    else:
        labels = list(target_encoder.classes_)
    plot_confusion_matrix(targets, predictions, labels)

    save_model(model, model_filepath, scaler, label_encoders, target_encoder)
    print(f"\nTraining completed successfully!")
    print(f"Final test accuracy: {accuracy:.4f}")
    return model, scaler, label_encoders, target_encoder


def _collapse_rare_categories(series, top_n, other_label='OTHER'):
    """Keep the ``top_n`` most frequent values of ``series``; relabel the rest."""
    frequent = series.value_counts().head(top_n).index
    # Missing values are not counted by value_counts, so they are relabelled too.
    return series.where(series.isin(frequent), other_label)
+# Function to make predictions on new data
def predict_new_data(model, new_data, scaler, label_encoders, target_encoder=None):
    """Predict classes and class probabilities for unseen rows.

    Args:
        model: Trained classifier.
        new_data: DataFrame whose columns are the training features, in the
            training order. It is not modified.
        scaler: Scaler fitted on the numeric training columns.
        label_encoders: Mapping of categorical column name to fitted encoder.
        target_encoder: Optional encoder used to map predicted indices back to
            the original labels.

    Returns:
        ``(predictions, probabilities)`` as NumPy arrays; ``probabilities`` has
        one row per input row and one column per class.

    Raises:
        ValueError: If a categorical column holds a value the encoder has not
            seen and the encoder has no ``'OTHER'`` class to fall back on.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    features = new_data.copy()

    # Decide which columns get scaled BEFORE label-encoding. Afterwards the
    # encoded columns are integers too and would be handed to a scaler that
    # was fitted on the numeric columns only.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None:
        numerical_columns = list(fitted_names)
    else:
        numerical_columns = [
            col
            for col in features.select_dtypes(include=['int64', 'float64']).columns
            if col not in label_encoders
        ]

    for col, encoder in label_encoders.items():
        if col not in features.columns:
            continue
        # Training encoded the string form of each value; do the same here.
        values = features[col].astype(str)
        unseen = ~values.isin(encoder.classes_)
        if unseen.any():
            if 'OTHER' not in {str(name) for name in encoder.classes_}:
                raise ValueError(
                    f"Column '{col}' contains values not seen during training: "
                    f"{sorted(set(values[unseen]))}"
                )
            values = values.where(~unseen, 'OTHER')
        features[col] = encoder.transform(values)

    if numerical_columns:
        features[numerical_columns] = scaler.transform(features[numerical_columns])

    X_new = torch.as_tensor(features.to_numpy(dtype='float32')).to(device)

    with torch.no_grad():
        outputs = model(X_new)
        probabilities = torch.softmax(outputs, dim=1)
        predictions = torch.argmax(outputs, dim=1)

    predictions = predictions.cpu().numpy()
    # Convert back to the original labels if the target was encoded.
    if target_encoder is not None:
        predictions = target_encoder.inverse_transform(predictions)
    return predictions, probabilities.cpu().numpy()
if __name__ == "__main__":
    # Script entry point: run the whole training pipeline and keep the fitted
    # artefacts bound to module-level names for follow-up scoring.
    model, scaler, label_encoders, target_encoder = main()

    # Scoring example (disabled). Build a frame with the nine training
    # features and pass it to predict_new_data together with the artefacts:
    #
    #     new_data = pd.DataFrame({
    #         'model_year': [2020, 2021, 2019],
    #         'make': ['TESLA', 'NISSAN', 'CHEVROLET'],
    #         'model': ['MODEL S', 'LEAF', 'BOLT EV'],
    #         'electric_range': [370, 150, 259],
    #         'base_msrp': [80000, 32000, 32000],
    #         'legislative_district': [43, 11, 36],
    #         'county': ['King', 'Snohomish', 'Pierce'],
    #         'state': ['WA', 'WA', 'WA'],
    #         'cafv_eligibility': ['Clean Alternative Fuel Vehicle Eligible'] * 3,
    #     })
    #     predictions, probabilities = predict_new_data(
    #         model, new_data, scaler, label_encoders, target_encoder)
    #     print(f"Predictions: {predictions}")
    #     print(f"Probabilities: {probabilities}")

config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "model_type": "tabular_classifier",
+    "task": "binary_classification",
+    "input_size": 9,
+    "hidden_sizes": [128, 64, 32],
+    "output_size": 2,
+    "dropout_rate": 0.3,
+    "features": [
+        "model_year", "make", "model", "electric_range",
+        "base_msrp", "legislative_district", "county",
+        "state", "cafv_eligibility"
+    ],
+    "target": "Electric Vehicle Type (BEV vs PHEV)",
+    "classes": ["PHEV", "BEV"]
+}

ev_classifier_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:519c34c4eec9511725b2922fa1f8556f66aecca7dff4458bc581310cc55bb96d
+size 61754

inference.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# Inference script (inference.py): load the saved EV classifier and predict on new rows.
+import torch
+import pandas as pd
+from model import TabularModel
def load_model_and_predict(data):
    """Load the saved EV classifier and predict the class of each row of ``data``.

    Args:
        data: DataFrame containing the nine training features: model_year,
            make, model, electric_range, base_msrp, legislative_district,
            county, state and cafv_eligibility. Extra columns are ignored.

    Returns:
        NumPy array of predictions, one per row. These are class indices
        (0 = PHEV, 1 = BEV per config.json) unless the checkpoint carries a
        target encoder, in which case they are the decoded labels.

    Raises:
        KeyError: If a required feature column is missing.
        ValueError: If a categorical value was not seen during training and
            the encoder has no ``'OTHER'`` class to fall back on.
    """
    # Column order the network was trained on (matches config.json).
    feature_order = [
        'model_year', 'make', 'model', 'electric_range', 'base_msrp',
        'legislative_district', 'county', 'state', 'cafv_eligibility',
    ]

    # map_location: lets a GPU-saved file load on a CPU-only host.
    # weights_only=False: the checkpoint also stores pickled scikit-learn
    # objects, which the restricted loader refuses.
    # SECURITY: this unpickles arbitrary objects; only load trusted files.
    checkpoint = torch.load('ev_classifier_model.pth', map_location='cpu', weights_only=False)
    model = TabularModel(input_size=9, hidden_sizes=[128, 64, 32], output_size=2)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    scaler = checkpoint['scaler']
    label_encoders = checkpoint['label_encoders']
    target_encoder = checkpoint.get('target_encoder')

    features = data[feature_order].copy()
    # Every feature without a label encoder was standardised during training.
    numeric_columns = [col for col in feature_order if col not in label_encoders]

    for col, encoder in label_encoders.items():
        # Training encoded the string form of each value; do the same here.
        values = features[col].astype(str)
        unseen = ~values.isin(encoder.classes_)
        if unseen.any():
            if 'OTHER' not in {str(name) for name in encoder.classes_}:
                raise ValueError(
                    f"Column '{col}' contains values not seen during training: "
                    f"{sorted(set(values[unseen]))}"
                )
            values = values.where(~unseen, 'OTHER')
        features[col] = encoder.transform(values)

    if numeric_columns:
        features[numeric_columns] = scaler.transform(features[numeric_columns])

    with torch.no_grad():
        scores = model(torch.as_tensor(features.to_numpy(dtype='float32')))
    predictions = scores.argmax(dim=1).numpy()
    if target_encoder is not None:
        predictions = target_encoder.inverse_transform(predictions)
    return predictions
+# Example usage
if __name__ == "__main__":
    # One illustrative row. All nine training features must be present,
    # otherwise the model input would not have the expected width.
    sample_data = pd.DataFrame({
        'model_year': [2021],
        'make': ['TESLA'],
        'model': ['MODEL 3'],
        'electric_range': [370],
        'base_msrp': [80000],
        'legislative_district': [43],
        'county': ['King'],
        'state': ['WA'],
        'cafv_eligibility': ['Clean Alternative Fuel Vehicle Eligible'],
    })
    prediction = load_model_and_predict(sample_data)
    print(f"Prediction: {prediction}")

model.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Standalone module containing only the TabularModel class, so inference code can import it without the training script.
+import torch
+import torch.nn as nn
class TabularModel(nn.Module):
    """Multilayer perceptron for tabular inputs.

    One Linear/BatchNorm1d/ReLU/Dropout stage is created per entry of
    ``hidden_sizes``, followed by a Linear output layer that yields
    ``output_size`` raw scores.
    """

    def __init__(self, input_size, hidden_sizes, output_size, dropout_rate=0.2):
        super().__init__()
        modules = []
        in_features = input_size
        for out_features in hidden_sizes:
            modules.extend(self._hidden_stage(in_features, out_features, dropout_rate))
            in_features = out_features
        modules.append(nn.Linear(in_features, output_size))
        # Keep the attribute name: checkpoint keys are prefixed with "model.".
        self.model = nn.Sequential(*modules)

    @staticmethod
    def _hidden_stage(n_in, n_out, p_drop):
        """Return the four layers of one hidden stage, in application order."""
        return (
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.ReLU(),
            nn.Dropout(p_drop),
        )

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the raw scores."""
        return self.model(x)