import os
import sqlite3
import warnings

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             mean_absolute_error, mean_squared_error, r2_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

print("Training ML Models\n")

# ==================== LOAD DATA ====================
print("=" * 70)
print("Loading Data from Database")
print("=" * 70)

conn = sqlite3.connect('resource_optimization.db')

# Load all tables
services = pd.read_sql_query("SELECT * FROM services", conn)
latency = pd.read_sql_query("SELECT * FROM regional_latency", conn)
traffic = pd.read_sql_query("SELECT * FROM traffic_patterns", conn)
placement = pd.read_sql_query("SELECT * FROM service_placement", conn)

print(f"Loaded {len(services)} services")
print(f"Loaded {len(latency)} latency records")
print(f"Loaded {len(traffic)} traffic records")
print(f"Loaded {len(placement)} placement records\n")

# ==================== FEATURE ENGINEERING ====================
print("=" * 70)
print("Feature Engineering")
print("=" * 70)

# Parse timestamps so the tables can be aligned on time if needed
placement['timestamp'] = pd.to_datetime(placement['timestamp'])
traffic['timestamp'] = pd.to_datetime(traffic['timestamp'])

# Aggregate traffic by service and region
traffic_agg = traffic.groupby(['service_id', 'region']).agg({
    'requests': ['mean', 'std', 'max'],
    'hour': 'count',  # number of hours covered by the dataset
}).reset_index()
traffic_agg.columns = ['service_id', 'region', 'avg_requests', 'std_requests',
                       'max_requests', 'num_hours']
# Coefficient of variation; the +1 in the denominator avoids division by zero
traffic_agg['cv_requests'] = traffic_agg['std_requests'] / (traffic_agg['avg_requests'] + 1)

# Aggregate latency by region pair (kept for reference; the models below use
# only the per-region outbound average computed further down)
latency_agg = latency.groupby(['region1', 'region2']).agg({
    'latency_ms': ['mean', 'std']
}).reset_index()
latency_agg.columns = ['region1', 'region2', 'avg_latency', 'std_latency']

# Create training dataset for MODEL 1 (Latency Prediction)
print("\nBuilding training dataset for latency prediction...")

# Merge placement with service info and traffic
training_data = placement.merge(
    services[['service_id', 'memory_mb', 'cpu_cores', 'latency_critical', 'dependencies']],
    on='service_id', how='left'
)
training_data = training_data.merge(traffic_agg, on=['service_id', 'region'], how='left')

# Merge with latency info. For simplicity, use each region's average latency
# to all other regions as a single outbound-latency feature.
region_latency_avg = latency.groupby('region1')['latency_ms'].mean().reset_index()
region_latency_avg.columns = ['region', 'avg_outbound_latency']
training_data = training_data.merge(region_latency_avg, on='region', how='left')

# Fill missing values
training_data = training_data.fillna(0)

print(f"Created training dataset with {len(training_data)} rows and {training_data.shape[1]} columns")

# ==================== MODEL 1: LATENCY PREDICTION (XGBoost Regression) ====================
print("\n" + "=" * 70)
print("MODEL 1: LATENCY PREDICTION (XGBoost Regression)")
print("=" * 70)

# Features for latency prediction
feature_cols_latency = ['memory_mb', 'cpu_cores', 'dependencies', 'avg_requests',
                        'std_requests', 'max_requests', 'cv_requests',
                        'avg_outbound_latency', 'instances']

X_latency = training_data[feature_cols_latency].fillna(0)
y_latency = training_data['avg_latency_ms']  # target recorded per placement row

# Remove any rows with NaN or infinite values
mask = ~(X_latency.isna().any(axis=1) |
         np.isinf(X_latency.values).any(axis=1) |
         y_latency.isna())
X_latency = X_latency[mask]
y_latency = y_latency[mask]

X_train_lat, X_test_lat, y_train_lat, y_test_lat = train_test_split(
    X_latency, y_latency, test_size=0.2, random_state=42
)
print(f"Training set: {len(X_train_lat)}, Test set: {len(X_test_lat)}")

# Scale features. Tree ensembles are insensitive to feature scale, but the
# scaler is kept so the saved artifacts form a consistent pipeline.
scaler_latency = StandardScaler()
X_train_lat_scaled = scaler_latency.fit_transform(X_train_lat)
X_test_lat_scaled = scaler_latency.transform(X_test_lat)

# Train XGBoost
model_xgb = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    verbosity=0
)
model_xgb.fit(X_train_lat_scaled, y_train_lat)

# Evaluate
y_pred_lat = model_xgb.predict(X_test_lat_scaled)
mse = mean_squared_error(y_test_lat, y_pred_lat)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_lat, y_pred_lat)
r2 = r2_score(y_test_lat, y_pred_lat)

print("\nModel trained!")
print(f"  RMSE: {rmse:.4f} ms")
print(f"  MAE: {mae:.4f} ms")
print(f"  R²: {r2:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols_latency,
    'importance': model_xgb.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 5 Important Features:")
print(feature_importance.head())

# Save model (create the output directory first so joblib.dump cannot fail)
os.makedirs('models', exist_ok=True)
joblib.dump(model_xgb, 'models/xgboost_latency_model.pkl')
joblib.dump(scaler_latency, 'models/scaler_latency.pkl')
print("Saved to models/xgboost_latency_model.pkl")

# ==================== MODEL 2: PLACEMENT STRATEGY (Classification) ====================
print("\n" + "=" * 70)
print("MODEL 2: PLACEMENT STRATEGY (Classification)")
print("=" * 70)

# Create classification target: single-region (0) vs multi-region (1)
placement_counts = placement.groupby('service_id')['region'].nunique().reset_index()
placement_counts.columns = ['service_id', 'num_regions']
placement_counts['strategy'] = (placement_counts['num_regions'] > 1).astype(int)

# Merge with service features; services with no placement rows would get a NaN
# strategy after the left merge, so default them to single-region
classification_data = services.merge(placement_counts, on='service_id', how='left')
classification_data['strategy'] = classification_data['strategy'].fillna(0).astype(int)

X_class = classification_data[['memory_mb', 'cpu_cores', 'latency_critical',
                               'traffic_volume_rps', 'dependencies']].fillna(0)
y_class = classification_data['strategy']

print(f"Class distribution: {y_class.value_counts().to_dict()}")

# Check that both classes are present; otherwise fall back to a synthetic target
if y_class.nunique() > 1:
    target_names = ['Single-Region', 'Multi-Region']
    report_title = 'Classification Report:'
else:
    print("\nWARNING: Only one placement-strategy class found in the data")
    print("  Creating a synthetic binary target for demonstration...")
    # Synthetic target: split services at the median traffic volume.
    # NOTE: traffic_volume_rps is both a feature and the basis of this label,
    # so the resulting accuracy is optimistic; this branch exists only to keep
    # the pipeline runnable on degenerate data.
    threshold = X_class['traffic_volume_rps'].median()
    y_class = (X_class['traffic_volume_rps'] > threshold).astype(int)
    target_names = ['Low Traffic', 'High Traffic']
    report_title = 'Classification Report (High vs Low Traffic Services):'
    print(f"New class distribution (high vs low traffic): {y_class.value_counts().to_dict()}")

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)
print(f"Training set: {len(X_train_cls)}, Test set: {len(X_test_cls)}")

# Scale features
scaler_class = StandardScaler()
X_train_cls_scaled = scaler_class.fit_transform(X_train_cls)
X_test_cls_scaled = scaler_class.transform(X_test_cls)

# Train classifier
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
    class_weight='balanced'
)
model_rf.fit(X_train_cls_scaled, y_train_cls)

# Evaluate
y_pred_cls = model_rf.predict(X_test_cls_scaled)
accuracy = accuracy_score(y_test_cls, y_pred_cls)

print("\nModel trained!")
print(f"  Accuracy: {accuracy:.4f}")
print(f"\n{report_title}")
print(classification_report(y_test_cls, y_pred_cls, labels=[0, 1], target_names=target_names))

# Feature importance
feature_importance_cls = pd.DataFrame({
    'feature': X_class.columns,
    'importance': model_rf.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop Features for Placement Strategy:")
print(feature_importance_cls)

# Save model
joblib.dump(model_rf, 'models/random_forest_placement_model.pkl')
joblib.dump(scaler_class, 'models/scaler_classification.pkl')
print("Saved to models/random_forest_placement_model.pkl")

# ==================== SAVE FEATURE IMPORTANCE ====================
print("\n" + "=" * 70)
print("Saving Feature Importance")
print("=" * 70)

feature_importance.to_csv('models/feature_importance_latency.csv', index=False)
feature_importance_cls.to_csv('models/feature_importance_placement.csv', index=False)
print("Feature importance saved")

# ==================== SUMMARY ====================
print("\n" + "=" * 70)
print("MODEL TRAINING COMPLETE!")
print("=" * 70)
print("\nModels saved in 'models/' folder:")
print("  • xgboost_latency_model.pkl")
print("  • random_forest_placement_model.pkl")
print("  • scaler_latency.pkl")
print("  • scaler_classification.pkl")
print("  • feature_importance_latency.csv")
print("  • feature_importance_placement.csv")
print("\nModel Performance Summary:")
print("  XGBoost (Latency Prediction)")
print(f"    - RMSE: {rmse:.4f} ms")
print(f"    - R²: {r2:.4f}")
print("  Random Forest (Placement Strategy)")
print(f"    - Accuracy: {accuracy:.4f}")

conn.close()
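
# ==================== INFERENCE SKETCH ====================
# A minimal sketch of how the saved artifacts can be reloaded and queried,
# e.g. from a separate serving script. The feature values below are
# hypothetical placeholders, not real services from the database; they only
# illustrate the column order the latency model expects.
latency_model = joblib.load('models/xgboost_latency_model.pkl')
latency_scaler = joblib.load('models/scaler_latency.pkl')

example_service = pd.DataFrame([{
    'memory_mb': 512,            # hypothetical values for illustration only
    'cpu_cores': 1.0,
    'dependencies': 3,
    'avg_requests': 250.0,
    'std_requests': 60.0,
    'max_requests': 900.0,
    'cv_requests': 0.24,
    'avg_outbound_latency': 85.0,
    'instances': 2,
}], columns=feature_cols_latency)  # enforce the training column order

# Apply the same scaler that was fit during training, then predict
predicted_latency = latency_model.predict(latency_scaler.transform(example_service))[0]
print(f"\nExample latency prediction: {predicted_latency:.2f} ms (hypothetical inputs)")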