"""Predictive analytics utilities: model training, evaluation, plotting, and persistence."""
# Third-party dependencies (grouped per PEP 8; all were already imported here).
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC, SVR
class PredictiveAnalytics:
    """Train, evaluate, visualize, and persist scikit-learn predictive models.

    Supports four model families (Random Forest, Logistic/Linear Regression,
    SVM, Neural Network) for both classification and regression. Diagnostic
    plots are written as PNG files to the current working directory and their
    filenames are returned alongside a metrics dictionary.
    """

    def __init__(self):
        # Estimator classes per family; the inner key ('classifier' /
        # 'regressor') selects the variant for the detected task type.
        self.models = {
            'Random Forest': {'classifier': RandomForestClassifier,
                              'regressor': RandomForestRegressor},
            'Logistic Regression': {'classifier': LogisticRegression,
                                    'regressor': LinearRegression},
            'SVM': {'classifier': SVC, 'regressor': SVR},
            'Neural Network': {'classifier': MLPClassifier,
                               'regressor': MLPRegressor},
        }
        self.trained_model = None       # last estimator fit by train_model()
        self.scaler = StandardScaler()  # fit on the training split only
        self.label_encoders = {}        # column name -> fitted LabelEncoder

    def train_model(self, df, model_type, target_column=None):
        """Train a predictive model on *df* and return ``(results, plots)``.

        Args:
            df: input DataFrame; an 'ID' column, if present, is dropped.
            model_type: one of the keys of ``self.models``.
            target_column: name of the target column; auto-detected when None.

        Returns:
            Tuple ``(results, plots)`` where *results* is a metrics dict
            (or ``{"error": ...}`` on failure) and *plots* is a list of PNG
            filenames written to the working directory.
        """
        # Fail early with the same error convention used below rather than
        # raising a KeyError deep inside model construction.
        if model_type not in self.models:
            return {"error": f"Unknown model type: {model_type}"}, []

        X, y, task_type = self._prepare_data(df, target_column)
        if X is None:
            return {"error": "Unable to prepare data for modeling"}, []

        # Stratify only for classification so class ratios survive the split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42,
            stratify=y if task_type == 'classification' else None
        )

        # Scale with statistics from the training split only (no test leakage).
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        model = self._build_model(model_type, task_type)
        model.fit(X_train_scaled, y_train)
        self.trained_model = model

        y_pred = model.predict(X_test_scaled)

        if task_type == 'classification':
            results = self._calculate_classification_metrics(
                y_test, y_pred, model, X_test_scaled)
            plots = self._create_classification_plots(y_test, y_pred, X, y, model)
        else:
            results = self._calculate_regression_metrics(y_test, y_pred)
            plots = self._create_regression_plots(y_test, y_pred, X, y)

        results['model_type'] = model_type
        results['task_type'] = task_type
        results['feature_names'] = list(X.columns)

        # Tree-based estimators expose feature_importances_; report and plot.
        if hasattr(model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': X.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            results['feature_importance'] = importance_df.to_dict('records')

            plt.figure(figsize=(10, 8))
            sns.barplot(data=importance_df.head(10), x='importance', y='feature')
            plt.title('Top 10 Feature Importance')
            plt.xlabel('Importance')
            plt.tight_layout()
            plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
            plots.append('feature_importance.png')
            plt.close()

        return results, plots

    def _build_model(self, model_type, task_type):
        """Instantiate the estimator for *model_type* and *task_type*.

        Bug fixes relative to the original:
        - The role key is 'classifier'/'regressor'; it cannot be derived from
          the task name via ``str.replace`` ('classification'.replace('ion','')
          yields 'classificat'), so the mapping is explicit.
        - ``SVR`` and ``LinearRegression`` do not accept ``random_state``, so
          it is only passed to estimators that support it.
        """
        role = 'classifier' if task_type == 'classification' else 'regressor'
        model_class = self.models[model_type][role]

        if model_type == 'Neural Network':
            return model_class(hidden_layer_sizes=(100, 50), max_iter=500,
                               random_state=42)
        if model_type == 'SVM':
            # SVC accepts random_state (used for probability estimates);
            # SVR does not define the parameter at all.
            if task_type == 'classification':
                return model_class(kernel='rbf', random_state=42)
            return model_class(kernel='rbf')
        try:
            return model_class(random_state=42)
        except TypeError:
            # e.g. LinearRegression has no random_state parameter.
            return model_class()

    def _prepare_data(self, df, target_column=None):
        """Prepare features/target for modeling.

        Drops an 'ID' column if present, auto-detects (or synthesizes) a
        target column, label-encodes categorical features, and decides the
        task type. Returns ``(X, y, task_type)`` or ``(None, None, None)``
        when no usable target can be determined.
        """
        df_clean = df.drop(columns=['ID'], errors='ignore')

        if target_column is None:
            # Look for common target-column name patterns first.
            potential_targets = [
                col for col in df_clean.columns
                if any(keyword in col.lower() for keyword in
                       ['target', 'label', 'class', 'outcome',
                        'value_segment', 'age_group'])
            ]
            if potential_targets:
                target_column = potential_targets[0]
            else:
                # Fall back to a synthetic binary target: above/below the
                # median of the first numeric column.
                numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) > 0:
                    target_col = numeric_cols[0]
                    median_val = df_clean[target_col].median()
                    df_clean['Synthetic_Target'] = (
                        df_clean[target_col] > median_val).astype(int)
                    target_column = 'Synthetic_Target'
                else:
                    return None, None, None

        if target_column not in df_clean.columns:
            return None, None, None

        X = df_clean.drop(columns=[target_column])
        y = df_clean[target_column]

        # Label-encode every categorical feature; keep the encoders so
        # future data can be transformed consistently.
        for column in X.select_dtypes(include=['object', 'category']).columns:
            le = LabelEncoder()
            X[column] = le.fit_transform(X[column].astype(str))
            self.label_encoders[column] = le

        # Heuristic: string targets or few (<= 10) distinct values are
        # treated as classification; everything else as regression.
        if y.dtype == 'object' or len(y.unique()) <= 10:
            task_type = 'classification'
            if y.dtype == 'object':
                le = LabelEncoder()
                y = le.fit_transform(y)
                self.label_encoders[target_column] = le
        else:
            task_type = 'regression'

        return X, y, task_type

    def _calculate_classification_metrics(self, y_test, y_pred, model, X_test):
        """Return accuracy, per-class report, confusion matrix, and (when the
        estimator supports it) prediction-confidence statistics."""
        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'classification_report': classification_report(
                y_test, y_pred, output_dict=True),
        }

        cm = confusion_matrix(y_test, y_pred)
        results['confusion_matrix'] = cm.tolist()

        # Not all estimators expose predict_proba (e.g. SVC without
        # probability=True), hence the hasattr guard.
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test)
            results['prediction_probabilities'] = {
                'mean_confidence': np.mean(np.max(y_proba, axis=1)),
                'class_distribution': np.bincount(y_pred).tolist(),
            }
        return results

    def _calculate_regression_metrics(self, y_test, y_pred):
        """Return MSE, RMSE, MAE, and R^2 for a regression prediction."""
        mse = mean_squared_error(y_test, y_pred)
        return {
            'mse': mse,
            'rmse': np.sqrt(mse),
            'mae': mean_absolute_error(y_test, y_pred),
            'r2_score': r2_score(y_test, y_pred),
        }

    def _create_classification_plots(self, y_test, y_pred, X, y, model):
        """Write confusion-matrix and class-distribution PNGs; return paths."""
        plots = []

        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
        plots.append('confusion_matrix.png')
        plt.close()

        plt.figure(figsize=(10, 6))
        unique, counts = np.unique(y_pred, return_counts=True)
        plt.bar(unique, counts, alpha=0.7)
        plt.title('Predicted Class Distribution')
        plt.xlabel('Class')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
        plots.append('class_distribution.png')
        plt.close()

        return plots

    def _create_regression_plots(self, y_test, y_pred, X, y):
        """Write actual-vs-predicted and residual PNGs; return their paths."""
        plots = []

        plt.figure(figsize=(10, 8))
        plt.scatter(y_test, y_pred, alpha=0.6)
        # Identity line: perfect predictions would fall on it.
        plt.plot([y_test.min(), y_test.max()],
                 [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.title('Actual vs Predicted Values')
        plt.tight_layout()
        plt.savefig('actual_vs_predicted.png', dpi=300, bbox_inches='tight')
        plots.append('actual_vs_predicted.png')
        plt.close()

        residuals = y_test - y_pred
        plt.figure(figsize=(10, 6))
        plt.scatter(y_pred, residuals, alpha=0.6)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residuals Plot')
        plt.tight_layout()
        plt.savefig('residuals_plot.png', dpi=300, bbox_inches='tight')
        plots.append('residuals_plot.png')
        plt.close()

        return plots

    def save_model(self, filename):
        """Persist the trained model, scaler, and label encoders via joblib.

        Returns a status message; does nothing when no model is trained.
        """
        if self.trained_model is not None:
            joblib.dump({
                'model': self.trained_model,
                'scaler': self.scaler,
                'label_encoders': self.label_encoders,
            }, filename)
            # Fixed: original returned a literal "(unknown)" placeholder
            # instead of interpolating the filename.
            return f"Model saved as {filename}"
        return "No trained model to save"

    def load_model(self, filename):
        """Restore model, scaler, and encoders saved by :meth:`save_model`.

        Returns a status message rather than raising, matching save_model.
        NOTE(review): joblib/pickle deserialization executes arbitrary code —
        only load files from trusted sources.
        """
        try:
            loaded = joblib.load(filename)
        except Exception as e:
            return f"Error loading model: {str(e)}"
        self.trained_model = loaded['model']
        self.scaler = loaded['scaler']
        self.label_encoders = loaded['label_encoders']
        return "Model loaded successfully"