"""Predictive analytics helpers built on scikit-learn.

Trains a classification or regression model on a pandas DataFrame,
computes evaluation metrics, and writes diagnostic plots to PNG files
in the working directory.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib


class PredictiveAnalytics:
    """Train, evaluate, persist and reload simple sklearn models.

    Supported ``model_type`` values are the keys of ``self.models``;
    each maps to a classifier class and a regressor class. Note that
    'Logistic Regression' deliberately falls back to ``LinearRegression``
    for regression tasks.
    """

    def __init__(self):
        # Maps a user-facing model name to the estimator class used for
        # each task kind ('classifier' / 'regressor').
        self.models = {
            'Random Forest': {'classifier': RandomForestClassifier,
                              'regressor': RandomForestRegressor},
            'Logistic Regression': {'classifier': LogisticRegression,
                                    'regressor': LinearRegression},
            'SVM': {'classifier': SVC, 'regressor': SVR},
            'Neural Network': {'classifier': MLPClassifier,
                               'regressor': MLPRegressor},
        }
        self.trained_model = None          # last estimator fitted by train_model
        self.scaler = StandardScaler()     # fitted on the training split
        self.label_encoders = {}           # column name -> fitted LabelEncoder

    def _build_model(self, model_type, task_type):
        """Instantiate the estimator for ``model_type``/``task_type``.

        BUGFIX: the old code did ``task_type.replace('ion', '')`` which
        produced 'classificat' / 'regress' — neither is a key of
        ``self.models[model_type]`` — so every call raised KeyError.
        It also passed ``random_state`` to LinearRegression and SVR,
        which do not accept that keyword.
        """
        # Explicit task-name -> estimator-kind mapping.
        estimator_kind = 'classifier' if task_type == 'classification' else 'regressor'
        model_class = self.models[model_type][estimator_kind]

        if model_type == 'Neural Network':
            return model_class(hidden_layer_sizes=(100, 50), max_iter=500,
                               random_state=42)
        if model_type == 'SVM':
            # SVR has no random_state parameter; only SVC does.
            if task_type == 'classification':
                return model_class(kernel='rbf', random_state=42)
            return model_class(kernel='rbf')
        if model_type == 'Logistic Regression' and task_type == 'regression':
            # LinearRegression has no random_state parameter.
            return model_class()
        return model_class(random_state=42)

    def train_model(self, df, model_type, target_column=None):
        """Train a predictive model on ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data; an 'ID' column, if present, is dropped.
        model_type : str
            One of the keys of ``self.models``.
        target_column : str, optional
            Target column name. When None, a target is auto-detected
            (or synthesized) by ``_prepare_data``.

        Returns
        -------
        (dict, list)
            Metrics/metadata dict and a list of saved plot filenames.
            On failure returns ``({"error": ...}, [])``.
        """
        results = {}
        plots = []

        # Prepare data
        X, y, task_type = self._prepare_data(df, target_column)
        if X is None:
            return {"error": "Unable to prepare data for modeling"}, []

        # Split data; stratify only makes sense for discrete targets.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42,
            stratify=y if task_type == 'classification' else None
        )

        # Scale features (scaler fitted on training data only).
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Select, build and train the estimator.
        model = self._build_model(model_type, task_type)
        model.fit(X_train_scaled, y_train)
        self.trained_model = model

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics and diagnostic plots per task kind.
        if task_type == 'classification':
            results = self._calculate_classification_metrics(
                y_test, y_pred, model, X_test_scaled)
            plots = self._create_classification_plots(y_test, y_pred, X, y, model)
        else:
            results = self._calculate_regression_metrics(y_test, y_pred)
            plots = self._create_regression_plots(y_test, y_pred, X, y)

        # Add model info
        results['model_type'] = model_type
        results['task_type'] = task_type
        results['feature_names'] = list(X.columns)

        # Feature importance (tree-based estimators only).
        if hasattr(model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': X.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            results['feature_importance'] = importance_df.to_dict('records')

            # Create feature importance plot
            plt.figure(figsize=(10, 8))
            sns.barplot(data=importance_df.head(10), x='importance', y='feature')
            plt.title('Top 10 Feature Importance')
            plt.xlabel('Importance')
            plt.tight_layout()
            plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
            plots.append('feature_importance.png')
            plt.close()

        return results, plots

    def _prepare_data(self, df, target_column=None):
        """Prepare data for modeling.

        Drops an 'ID' column, auto-detects (or synthesizes) a target
        when none is given, label-encodes categorical features, and
        decides between classification and regression.

        Returns ``(X, y, task_type)`` or ``(None, None, None)`` when no
        usable target can be found.
        """
        # Remove ID column if it exists; errors='ignore' makes this a no-op
        # otherwise.
        df_clean = df.drop(columns=['ID'], errors='ignore')

        # Auto-detect target column if not provided
        if target_column is None:
            # Look for common target column naming patterns.
            potential_targets = [
                col for col in df_clean.columns
                if any(keyword in col.lower()
                       for keyword in ['target', 'label', 'class', 'outcome',
                                       'value_segment', 'age_group'])
            ]
            if potential_targets:
                target_column = potential_targets[0]
            else:
                # Fall back to a synthetic binary target: above/below the
                # median of the first numeric column.
                numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) > 0:
                    target_col = numeric_cols[0]
                    median_val = df_clean[target_col].median()
                    df_clean['Synthetic_Target'] = (
                        df_clean[target_col] > median_val).astype(int)
                    target_column = 'Synthetic_Target'
                else:
                    return None, None, None

        if target_column not in df_clean.columns:
            return None, None, None

        # Separate features and target
        X = df_clean.drop(columns=[target_column])
        y = df_clean[target_column]

        # Encode categorical variables; encoders are kept so the same
        # mapping can be reused (and persisted by save_model).
        for column in X.select_dtypes(include=['object', 'category']).columns:
            le = LabelEncoder()
            X[column] = le.fit_transform(X[column].astype(str))
            self.label_encoders[column] = le

        # Heuristic: string targets, or numeric targets with few distinct
        # values, are treated as classification.
        if y.dtype == 'object' or len(y.unique()) <= 10:
            task_type = 'classification'
            if y.dtype == 'object':
                le = LabelEncoder()
                y = le.fit_transform(y)
                self.label_encoders[target_column] = le
        else:
            task_type = 'regression'

        return X, y, task_type

    def _calculate_classification_metrics(self, y_test, y_pred, model, X_test):
        """Return accuracy, per-class report, confusion matrix and,
        when available, prediction-probability summaries."""
        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'classification_report': classification_report(
                y_test, y_pred, output_dict=True),
        }

        # Confusion matrix (list-of-lists so the dict is JSON-friendly).
        cm = confusion_matrix(y_test, y_pred)
        results['confusion_matrix'] = cm.tolist()

        # Probabilities if the estimator supports them (e.g. SVC without
        # probability=True does not).
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test)
            results['prediction_probabilities'] = {
                'mean_confidence': np.mean(np.max(y_proba, axis=1)),
                'class_distribution': np.bincount(y_pred).tolist(),
            }

        return results

    def _calculate_regression_metrics(self, y_test, y_pred):
        """Return MSE, RMSE, MAE and R² for a regression fit."""
        results = {
            'mse': mean_squared_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'mae': mean_absolute_error(y_test, y_pred),
            'r2_score': r2_score(y_test, y_pred),
        }
        return results

    def _create_classification_plots(self, y_test, y_pred, X, y, model):
        """Save confusion-matrix and predicted-class-distribution plots;
        return the list of written filenames."""
        plots = []

        # Confusion Matrix
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
        plots.append('confusion_matrix.png')
        plt.close()

        # Class distribution
        plt.figure(figsize=(10, 6))
        unique, counts = np.unique(y_pred, return_counts=True)
        plt.bar(unique, counts, alpha=0.7)
        plt.title('Predicted Class Distribution')
        plt.xlabel('Class')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
        plots.append('class_distribution.png')
        plt.close()

        return plots

    def _create_regression_plots(self, y_test, y_pred, X, y):
        """Save actual-vs-predicted and residual plots; return the list
        of written filenames."""
        plots = []

        # Actual vs Predicted, with the ideal y=x reference line.
        plt.figure(figsize=(10, 8))
        plt.scatter(y_test, y_pred, alpha=0.6)
        plt.plot([y_test.min(), y_test.max()],
                 [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.title('Actual vs Predicted Values')
        plt.tight_layout()
        plt.savefig('actual_vs_predicted.png', dpi=300, bbox_inches='tight')
        plots.append('actual_vs_predicted.png')
        plt.close()

        # Residuals plot
        residuals = y_test - y_pred
        plt.figure(figsize=(10, 6))
        plt.scatter(y_pred, residuals, alpha=0.6)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residuals Plot')
        plt.tight_layout()
        plt.savefig('residuals_plot.png', dpi=300, bbox_inches='tight')
        plots.append('residuals_plot.png')
        plt.close()

        return plots

    def save_model(self, filename):
        """Persist the trained model, scaler and label encoders via joblib.

        Returns a human-readable status string.
        """
        if self.trained_model is not None:
            joblib.dump({
                'model': self.trained_model,
                'scaler': self.scaler,
                'label_encoders': self.label_encoders,
            }, filename)
            # BUGFIX: old message was the literal "Model saved as (unknown)"
            # — the filename was never interpolated.
            return f"Model saved as {filename}"
        return "No trained model to save"

    def load_model(self, filename):
        """Load a model bundle previously written by ``save_model``.

        NOTE(review): joblib.load unpickles arbitrary objects — only load
        files from trusted sources.
        """
        try:
            loaded = joblib.load(filename)
            self.trained_model = loaded['model']
            self.scaler = loaded['scaler']
            self.label_encoders = loaded['label_encoders']
            return "Model loaded successfully"
        except Exception as e:
            return f"Error loading model: {str(e)}"