# BI_ANALYTICS / predictive_analytics.py
# Author: ratulsur
# Commit: "Upload 13 files" (98bc1c2, verified)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
class PredictiveAnalytics:
    """Train, evaluate, plot and persist a scikit-learn model for a DataFrame.

    The class keeps the most recently trained estimator together with the
    fitted feature scaler and the label encoders used during preparation, so
    that all three can be saved and restored as one bundle.
    """

    # Maps the task type reported by ``_prepare_data`` to the key used inside
    # each ``self.models`` entry.
    _ESTIMATOR_KEYS = {'classification': 'classifier', 'regression': 'regressor'}

    # Substrings that mark a column as a likely prediction target when the
    # caller does not name one.
    _TARGET_KEYWORDS = ('target', 'label', 'class', 'outcome', 'value_segment', 'age_group')

    def __init__(self):
        # Estimator classes, keyed by display name and then by estimator role.
        self.models = {
            'Random Forest': {'classifier': RandomForestClassifier, 'regressor': RandomForestRegressor},
            'Logistic Regression': {'classifier': LogisticRegression, 'regressor': LinearRegression},
            'SVM': {'classifier': SVC, 'regressor': SVR},
            'Neural Network': {'classifier': MLPClassifier, 'regressor': MLPRegressor},
        }
        self.trained_model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def train_model(self, df, model_type, target_column=None):
        """Train and evaluate a model of ``model_type`` on ``df``.

        Args:
            df: Input DataFrame holding the features and (optionally) the target.
            model_type: One of the keys of ``self.models``.
            target_column: Name of the target column; auto-detected when ``None``.

        Returns:
            A ``(results, plots)`` tuple. ``results`` is a dict of metrics plus
            ``model_type``, ``task_type`` and ``feature_names`` (and
            ``feature_importance`` when the estimator exposes it); ``plots`` is
            a list of the image file names that were written. On failure,
            ``results`` is ``{"error": <message>}`` and ``plots`` is empty.
        """
        if model_type not in self.models:
            return {"error": f"Unknown model type: {model_type}"}, []

        X, y, task_type = self._prepare_data(df, target_column)
        if X is None:
            return {"error": "Unable to prepare data for modeling"}, []

        X_train, X_test, y_train, y_test = self._split_data(X, y, task_type)

        # The scaler is fitted on the training split only and reused for the test split.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        model = self._build_model(model_type, task_type)
        model.fit(X_train_scaled, y_train)
        self.trained_model = model

        y_pred = model.predict(X_test_scaled)

        if task_type == 'classification':
            results = self._calculate_classification_metrics(y_test, y_pred, model, X_test_scaled)
            plots = self._create_classification_plots(y_test, y_pred, X, y, model)
        else:
            results = self._calculate_regression_metrics(y_test, y_pred)
            plots = self._create_regression_plots(y_test, y_pred, X, y)

        results['model_type'] = model_type
        results['task_type'] = task_type
        results['feature_names'] = list(X.columns)

        if hasattr(model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': X.columns,
                'importance': model.feature_importances_,
            }).sort_values('importance', ascending=False)
            results['feature_importance'] = importance_df.to_dict('records')
            plots.append(self._create_feature_importance_plot(importance_df))

        return results, plots

    def _split_data(self, X, y, task_type):
        """Return an 80/20 ``(X_train, X_test, y_train, y_test)`` split.

        Classification data is split with stratification when possible.
        """
        options = {'test_size': 0.2, 'random_state': 42}
        if task_type == 'classification':
            try:
                return train_test_split(X, y, stratify=y, **options)
            except ValueError:
                # Stratification is impossible (for example a class with a
                # single member, or a test split smaller than the number of
                # classes); an unstratified split is better than no model.
                return train_test_split(X, y, **options)
        return train_test_split(X, y, **options)

    def _build_model(self, model_type, task_type):
        """Instantiate the estimator registered for ``model_type`` and ``task_type``.

        Only keyword arguments that the estimator's constructor declares are
        passed, because not every registered estimator accepts every option
        (``random_state`` in particular).
        """
        import inspect

        model_class = self.models[model_type][self._ESTIMATOR_KEYS[task_type]]

        wanted = {'random_state': 42}
        if model_type == 'Neural Network':
            wanted.update(hidden_layer_sizes=(100, 50), max_iter=500)
        elif model_type == 'SVM':
            wanted['kernel'] = 'rbf'

        declared = inspect.signature(model_class).parameters
        takes_any_keyword = any(
            param.kind is inspect.Parameter.VAR_KEYWORD for param in declared.values()
        )
        if takes_any_keyword:
            return model_class(**wanted)
        return model_class(**{name: value for name, value in wanted.items() if name in declared})

    def _prepare_data(self, df, target_column=None):
        """Split ``df`` into encoded features, a target, and a task type.

        Args:
            df: Input DataFrame; it is not modified.
            target_column: Target column name. When ``None``, the first column
                whose name contains a known target keyword is used; failing
                that, a binary ``Synthetic_Target`` is derived from the first
                numeric column (above-median -> 1, otherwise 0).

        Returns:
            ``(X, y, task_type)`` where ``task_type`` is ``'classification'``
            or ``'regression'``, or ``(None, None, None)`` when no usable
            target or feature set can be built.
        """
        # ``drop`` returns a new frame, so the caller's DataFrame is left intact.
        df_clean = df.drop(columns=['ID'], errors='ignore')
        excluded_features = []

        if target_column is None:
            candidates = [
                col for col in df_clean.columns
                if any(keyword in str(col).lower() for keyword in self._TARGET_KEYWORDS)
            ]
            if candidates:
                target_column = candidates[0]
            else:
                numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) == 0:
                    return None, None, None
                source_col = numeric_cols[0]
                median_val = df_clean[source_col].median()
                df_clean['Synthetic_Target'] = (df_clean[source_col] > median_val).astype(int)
                target_column = 'Synthetic_Target'
                # The synthetic target is a pure function of this column;
                # keeping it as a feature would leak the answer to the model.
                excluded_features.append(source_col)

        if target_column not in df_clean.columns:
            return None, None, None

        X = df_clean.drop(columns=[target_column] + excluded_features)
        y = df_clean[target_column]
        if X.empty:
            # No rows or no feature columns left: nothing to train on.
            return None, None, None

        # Encoders are collected locally and published only on success, so a
        # failed preparation never leaves a half-updated encoder set behind and
        # encoders from an earlier training run do not linger.
        encoders = {}
        for column in X.select_dtypes(include=['object', 'category']).columns:
            encoder = LabelEncoder()
            X[column] = encoder.fit_transform(X[column].astype(str))
            encoders[column] = encoder

        # Non-numeric targets, and numeric targets with at most 10 distinct
        # values, are treated as class labels.
        non_numeric_target = not pd.api.types.is_numeric_dtype(y)
        if non_numeric_target or y.nunique(dropna=False) <= 10:
            task_type = 'classification'
            if non_numeric_target:
                encoder = LabelEncoder()
                y = encoder.fit_transform(y)
                encoders[target_column] = encoder
        else:
            task_type = 'regression'

        self.label_encoders = encoders
        return X, y, task_type

    def _calculate_classification_metrics(self, y_test, y_pred, model, X_test):
        """Return accuracy, classification report, confusion matrix and, when
        the model exposes ``predict_proba``, a confidence summary.
        """
        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
        }

        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test)
            predicted = np.asarray(y_pred)
            if predicted.dtype.kind in 'biu':
                # Integer-like labels: counts indexed by label value.
                class_counts = np.bincount(predicted)
            else:
                # ``np.bincount`` rejects non-integer labels (e.g. 0.0 / 1.0),
                # so count each distinct predicted label instead.
                _, class_counts = np.unique(predicted, return_counts=True)
            results['prediction_probabilities'] = {
                'mean_confidence': float(np.mean(np.max(y_proba, axis=1))),
                'class_distribution': class_counts.tolist(),
            }

        return results

    def _calculate_regression_metrics(self, y_test, y_pred):
        """Return MSE, RMSE, MAE and R^2 for the given predictions."""
        mse = mean_squared_error(y_test, y_pred)
        return {
            'mse': mse,
            'rmse': np.sqrt(mse),
            'mae': mean_absolute_error(y_test, y_pred),
            'r2_score': r2_score(y_test, y_pred),
        }

    @staticmethod
    def _render_plot(filename, figsize, draw):
        """Draw onto a fresh figure via ``draw(ax)``, save it and return ``filename``.

        The figure is always closed, even when drawing or saving fails.
        """
        fig, ax = plt.subplots(figsize=figsize)
        try:
            draw(ax)
            fig.tight_layout()
            fig.savefig(filename, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)
        return filename

    def _create_feature_importance_plot(self, importance_df):
        """Save a bar chart of the first 10 rows of ``importance_df``."""
        def draw(ax):
            sns.barplot(data=importance_df.head(10), x='importance', y='feature', ax=ax)
            ax.set_title('Top 10 Feature Importance')
            ax.set_xlabel('Importance')

        return self._render_plot('feature_importance.png', (10, 8), draw)

    def _create_classification_plots(self, y_test, y_pred, X, y, model):
        """Save the confusion-matrix and predicted-class-distribution plots.

        ``X``, ``y`` and ``model`` are accepted for interface compatibility and
        are not used. Returns the list of written file names.
        """
        cm = confusion_matrix(y_test, y_pred)
        labels, counts = np.unique(y_pred, return_counts=True)

        def draw_confusion(ax):
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title('Confusion Matrix')
            ax.set_ylabel('True Label')
            ax.set_xlabel('Predicted Label')

        def draw_distribution(ax):
            ax.bar(labels, counts, alpha=0.7)
            ax.set_title('Predicted Class Distribution')
            ax.set_xlabel('Class')
            ax.set_ylabel('Count')

        return [
            self._render_plot('confusion_matrix.png', (8, 6), draw_confusion),
            self._render_plot('class_distribution.png', (10, 6), draw_distribution),
        ]

    def _create_regression_plots(self, y_test, y_pred, X, y):
        """Save the actual-vs-predicted and residuals plots.

        ``X`` and ``y`` are accepted for interface compatibility and are not
        used. Returns the list of written file names.
        """
        lowest, highest = y_test.min(), y_test.max()
        residuals = y_test - y_pred

        def draw_actual_vs_predicted(ax):
            ax.scatter(y_test, y_pred, alpha=0.6)
            # Diagonal reference line: points on it are perfect predictions.
            ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
            ax.set_xlabel('Actual Values')
            ax.set_ylabel('Predicted Values')
            ax.set_title('Actual vs Predicted Values')

        def draw_residuals(ax):
            ax.scatter(y_pred, residuals, alpha=0.6)
            ax.axhline(y=0, color='r', linestyle='--')
            ax.set_xlabel('Predicted Values')
            ax.set_ylabel('Residuals')
            ax.set_title('Residuals Plot')

        return [
            self._render_plot('actual_vs_predicted.png', (10, 8), draw_actual_vs_predicted),
            self._render_plot('residuals_plot.png', (10, 6), draw_residuals),
        ]

    def save_model(self, filename):
        """Persist the trained model, scaler and label encoders to ``filename``.

        Returns a status message; nothing is written when no model is trained.
        """
        # Compare against None explicitly: estimator truthiness is not a
        # reliable "has a model" signal (some estimators define ``__len__``).
        if self.trained_model is None:
            return "No trained model to save"
        joblib.dump({
            'model': self.trained_model,
            'scaler': self.scaler,
            'label_encoders': self.label_encoders,
        }, filename)
        return f"Model saved as {filename}"

    def load_model(self, filename):
        """Restore a bundle written by ``save_model`` and return a status message.

        SECURITY: ``joblib.load`` unpickles the file and can execute arbitrary
        code; only load files from a trusted source.
        """
        try:
            loaded = joblib.load(filename)
            # Pull out every entry before touching ``self`` so that a missing
            # key cannot leave the instance half-updated.
            model = loaded['model']
            scaler = loaded['scaler']
            label_encoders = loaded['label_encoders']
        except Exception as e:
            # Deliberately broad: this method reports any load failure as a
            # message instead of raising.
            return f"Error loading model: {str(e)}"

        self.trained_model = model
        self.scaler = scaler
        self.label_encoders = label_encoders
        return "Model loaded successfully"