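# dataanalyst / ml_pipeline.py
# Hosting-page metadata captured with this file (not part of the program):
#   uploader: RamAi2026 ("Upload 13 files"), commit da8e446 (verified)
#   viewer links: raw / history / blame
#   file size: 50.7 kB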
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_curve, auc,
mean_squared_error, r2_score, mean_absolute_error)
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
GradientBoostingClassifier, GradientBoostingRegressor,
AdaBoostClassifier, AdaBoostRegressor,
VotingClassifier, VotingRegressor)
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time
import warnings
warnings.filterwarnings('ignore')
class MLPipelineError(Exception):
    """Raised for failures that are specific to the ML pipeline."""
def validate_ml_data(df, target, features):
    """Validate data for machine learning.

    Args:
        df: pandas DataFrame holding the candidate dataset.
        target: name of the target column.
        features: iterable of feature column names.

    Returns:
        A list of human-readable issue strings; empty when nothing was
        found. An empty DataFrame short-circuits with a single issue.
    """
    issues = []
    if df.empty:
        issues.append("Dataset is empty")
        return issues

    target_present = target in df.columns
    if not target_present:
        issues.append(f"Target column '{target}' not found in dataset")

    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        issues.append(f"Features not found: {missing_features}")

    # Check for sufficient data
    if df.shape[0] < 10:
        issues.append("Dataset too small (minimum 10 rows required)")

    # Check for constant columns. Features that are absent were already
    # reported above; indexing them here would raise KeyError.
    for col in features:
        if col in df.columns and df[col].nunique() == 1:
            issues.append(f"Feature '{col}' is constant")

    # Check target for classification
    if target_present:
        dtype = df[target].dtype
        n_classes = df[target].nunique()
        # Use the pandas dtype helpers rather than comparing a dtype to the
        # strings 'object'/'category', which is fragile for NumPy dtypes.
        is_categorical = (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if is_categorical or n_classes <= 20:
            if n_classes == 1:
                issues.append("Target has only one class")
            elif n_classes > 50:
                issues.append(f"Target has {n_classes} classes, which may cause issues")
    return issues
def safe_ml_operation(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and turn any failure into a message.

    Returns:
        ``(result, None)`` when the call succeeds, otherwise
        ``(None, error_message)`` where the message depends on whether a
        ValueError, a MemoryError or any other Exception was raised.
    """
    try:
        return func(*args, **kwargs), None
    except ValueError as exc:
        return None, f"Value Error: {str(exc)}. Check your data types and values."
    except MemoryError:
        return None, "Memory Error: Dataset too large. Try reducing the number of features or using a sample."
    except Exception as exc:
        return None, f"ML Error: {str(exc)}"
def _is_categorical_dtype(dtype):
    """Return True when *dtype* is an object, string or pandas categorical dtype."""
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


def _is_classification_target(series):
    """Return True when *series* is categorical or has at most 20 distinct values."""
    return _is_categorical_dtype(series.dtype) or series.nunique() <= 20


def _render_configuration_tab(df):
    """Render the configuration widgets and store the choices in session state.

    Returns:
        True when a complete configuration was written to
        ``st.session_state['ml_config']``; False when the user still has to
        fix something (a message has already been displayed).
    """
    st.markdown('<div class="custom-card">', unsafe_allow_html=True)
    st.subheader("⚙️ Model Configuration")
    try:
        # Target selection with validation
        st.markdown("### 🎯 Target Variable")

        # Auto-detect potential target columns
        potential_targets = []
        target_types = {}
        for col in df.columns:
            try:
                dtype = df[col].dtype
                if pd.api.types.is_numeric_dtype(dtype):
                    low_cardinality = df[col].nunique() <= 20
                    potential_targets.append(col)
                    target_types[col] = (
                        "Classification (low cardinality)" if low_cardinality else "Regression"
                    )
                elif _is_categorical_dtype(dtype):
                    n_classes = df[col].nunique()
                    if n_classes <= 50:
                        potential_targets.append(col)
                        target_types[col] = f"Classification ({n_classes} classes)"
            except Exception as e:
                st.warning(f"⚠️ Couldn't analyze column {col}: {str(e)}")

        if not potential_targets:
            st.error("❌ No suitable target columns found. Need numeric or categorical columns with reasonable cardinality.")
            return False

        target = st.selectbox(
            "Select target column",
            potential_targets,
            help=f"Column types: {target_types}"
        )

        # Task type detection
        if _is_classification_target(df[target]):
            task_type = "Classification"
            unique_values = df[target].nunique()
            if unique_values == 2:
                st.success("✅ **Binary Classification** problem detected")
            elif unique_values <= 10:
                st.info(f"📊 **Multi-class Classification** with {unique_values} classes")
            else:
                st.warning(f"⚠️ **Multi-class Classification** with {unique_values} classes - may be challenging")

            # Check class balance
            class_dist = df[target].value_counts(normalize=True)
            if class_dist.min() < 0.1:
                st.warning("⚠️ Class imbalance detected. Consider using class weights or resampling.")
        else:
            task_type = "Regression"
            st.info("📈 **Regression** task detected")

            # Check target distribution
            target_skew = df[target].skew()
            if abs(target_skew) > 1:
                st.warning(f"⚠️ Target variable is highly skewed (skewness: {target_skew:.2f}). Consider log transformation.")

        # Feature selection
        st.markdown("### 🔍 Feature Selection")

        # Auto-select features (exclude target)
        all_features = [col for col in df.columns if col != target]

        # Remove problematic columns: constant, mostly missing, or unanalysable
        problematic_cols = []
        for col in all_features:
            try:
                if df[col].nunique() == 1:
                    problematic_cols.append(col)
                elif df[col].isnull().sum() > len(df) * 0.5:
                    problematic_cols.append(col)
            except Exception:
                problematic_cols.append(col)

        if problematic_cols:
            st.warning(f"⚠️ Problematic columns detected (will be excluded): {problematic_cols}")
            all_features = [f for f in all_features if f not in problematic_cols]

        if not all_features:
            st.error("❌ No valid features remaining after filtering.")
            return False

        # Select features
        selected_features = st.multiselect(
            "Choose features for modeling",
            all_features,
            default=all_features[:min(10, len(all_features))],
            help="Select the columns to use as features. Using too many features may cause overfitting."
        )
        if not selected_features:
            st.warning("⚠️ Please select at least one feature")
            return False

        # Validate selected features (issues are shown but do not block)
        for issue in validate_ml_data(df, target, selected_features):
            st.warning(f"⚠️ {issue}")

        # Data preprocessing options
        st.markdown("### 🛠️ Preprocessing Options")
        col1, col2 = st.columns(2)
        with col1:
            test_size = st.slider("Test set size (%)", 10, 40, 20, 5) / 100
            scaler_option = st.selectbox("Feature scaling", ["None", "StandardScaler", "MinMaxScaler"])
        with col2:
            cv_folds = st.slider("Cross-validation folds", 2, 10, 5)
            if task_type == "Classification":
                handle_imbalance = st.checkbox("Handle class imbalance", value=False,
                                               help="Use class weights or sampling techniques")
            else:
                handle_imbalance = False

        # Model selection based on task type
        st.markdown("### 🤖 Model Selection")
        if task_type == "Classification":
            models = {
                "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
                "K-Nearest Neighbors": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(random_state=42),
                "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
                "Gradient Boosting": GradientBoostingClassifier(random_state=42),
                "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
                "LightGBM": LGBMClassifier(verbose=-1, random_state=42),
                "AdaBoost": AdaBoostClassifier(random_state=42),
                "SVM": SVC(probability=True, random_state=42)
            }
            # Default models for quick selection
            default_models = ["Logistic Regression", "Random Forest", "XGBoost"]
        else:  # Regression
            models = {
                "Linear Regression": LinearRegression(),
                "Ridge Regression": Ridge(random_state=42),
                "Lasso Regression": Lasso(random_state=42),
                "Decision Tree": DecisionTreeRegressor(random_state=42),
                "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
                "Gradient Boosting": GradientBoostingRegressor(random_state=42),
                "XGBoost": XGBRegressor(random_state=42),
                "LightGBM": LGBMRegressor(verbose=-1, random_state=42),
                "AdaBoost": AdaBoostRegressor(random_state=42),
                "SVR": SVR()
            }
            default_models = ["Linear Regression", "Random Forest", "XGBoost"]

        selected_models = st.multiselect(
            "Choose models to train",
            list(models.keys()),
            default=default_models,
            help="Select multiple models to compare performance"
        )
        if not selected_models:
            st.warning("⚠️ Please select at least one model")
            return False

        # Advanced options
        with st.expander("⚡ Advanced Options"):
            do_tuning = st.checkbox("Perform hyperparameter tuning", value=False,
                                    help="Grid search for best parameters (may be slow)")
            if do_tuning:
                # NOTE: these values are collected for the UI only; no grid
                # search is wired into the training step yet.
                st.slider("Tuning CV folds", 2, 5, 3)
                st.slider("Max tuning iterations per model", 5, 50, 20)

            use_sampling = st.checkbox("Use data sampling (for large datasets)", value=False,
                                       help="Use a sample for faster experimentation")
            sample_frac = 1.0
            if use_sampling:
                sample_frac = st.slider("Sample size (%)", 10, 100, 100, 10) / 100

            random_state = int(st.number_input("Random seed", value=42, min_value=0, max_value=999))

        st.markdown('</div>', unsafe_allow_html=True)

        # Store configuration in session state
        st.session_state['ml_config'] = {
            'target': target,
            'features': selected_features,
            'task_type': task_type,
            'test_size': test_size,
            'scaler': scaler_option,
            'cv_folds': cv_folds,
            'handle_imbalance': handle_imbalance,
            'models': {name: models[name] for name in selected_models},
            'do_tuning': do_tuning,
            'sample_frac': sample_frac,
            'random_state': random_state
        }
        return True
    except Exception as e:
        st.error(f"❌ Error in configuration: {str(e)}")
        st.info("💡 Tip: Check your data types and ensure all columns are valid")
        return False


def _prepare_training_data(df, config):
    """Build the train/test matrices described by *config*.

    Steps: optional row sampling, dropping rows without a target, imputing
    missing feature values, one-hot encoding categorical features, label
    encoding a classification target, splitting, and finally scaling. The
    scaler is fitted on the training partition only so that no statistics
    of the test partition leak into training.

    Returns:
        dict with keys ``X_train``, ``X_test``, ``y_train``, ``y_test``,
        ``X_train_raw`` (training features before scaling), ``scaler``,
        ``label_encoder`` and ``feature_names``.
    """
    is_classification = config['task_type'] == "Classification"
    seed = config['random_state']

    frame = df
    sample_frac = config.get('sample_frac', 1.0)
    if sample_frac < 1.0:
        frame = df.sample(frac=sample_frac, random_state=seed)
        st.info(f"🎲 Training on a {sample_frac:.0%} sample ({len(frame)} rows)")

    X = frame[config['features']].copy()
    y = frame[config['target']].copy()

    # Rows without a target value cannot be used for supervised training.
    has_target = y.notna()
    if not has_target.all():
        st.info(f"⚠️ Dropping {int((~has_target).sum())} rows with a missing target value")
        X, y = X[has_target], y[has_target]

    # Handle missing values: numeric columns get the mean, the rest the mode
    n_missing = int(X.isnull().sum().sum())
    if n_missing > 0:
        st.info(f"⚠️ Handling {n_missing} missing values in features...")
        X = X.fillna(X.mean(numeric_only=True)).fillna(X.mode().iloc[0])

    # Handle categorical features
    cat_features = X.select_dtypes(include=['object', 'category']).columns
    if len(cat_features) > 0:
        st.info(f"🔄 Encoding categorical features: {list(cat_features)}")
        X = pd.get_dummies(X, columns=cat_features)

    # Always encode classification targets: some estimators (e.g. XGBoost)
    # require labels 0..n_classes-1, and the ROC curve assumes 0/1 labels.
    le = None
    if is_classification:
        le = LabelEncoder()
        y = le.fit_transform(y)
        st.info(f"📊 Target classes: {dict(zip(le.classes_, le.transform(le.classes_)))}")

    # Report the class weights that 'balanced' weighting corresponds to
    if is_classification and config['handle_imbalance']:
        from sklearn.utils.class_weight import compute_class_weight
        class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
        st.info(f"⚖️ Using class weights: {dict(zip(np.unique(y), class_weights))}")

    # Split data first ...
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=config['test_size'],
        random_state=seed,
        stratify=y if is_classification else None
    )

    # ... then scale, fitting on the training partition only.
    scaler = None
    X_train, X_test = X_train_raw, X_test_raw
    if config['scaler'] != "None":
        scaler = StandardScaler() if config['scaler'] == "StandardScaler" else MinMaxScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train_raw),
                               columns=X.columns, index=X_train_raw.index)
        X_test = pd.DataFrame(scaler.transform(X_test_raw),
                              columns=X.columns, index=X_test_raw.index)

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'X_train_raw': X_train_raw,
        'scaler': scaler,
        'label_encoder': le,
        'feature_names': X.columns.tolist(),
    }


def _score_classifier(model_name, model, data, y_pred, cv_folds, training_time):
    """Return the results-table row for a fitted classifier."""
    y_test = data['y_test']
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    # Cross-validation on the training partition
    cv_scores = cross_val_score(model, data['X_train'], data['y_train'], cv=cv_folds)
    return {
        "Model": model_name,
        "Accuracy": f"{accuracy:.4f}",
        "Precision": f"{precision:.4f}",
        "Recall": f"{recall:.4f}",
        "F1 Score": f"{f1:.4f}",
        "CV Score": f"{cv_scores.mean():.4f} (±{cv_scores.std() * 2:.4f})",
        "Time (s)": f"{training_time:.2f}"
    }


def _score_regressor(model_name, model, data, y_pred, cv_folds, training_time):
    """Return the results-table row for a fitted regressor."""
    y_test = data['y_test']
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Cross-validation on the training partition
    cv_scores = cross_val_score(model, data['X_train'], data['y_train'], cv=cv_folds, scoring='r2')
    return {
        "Model": model_name,
        "R² Score": f"{r2:.4f}",
        "RMSE": f"{rmse:.4f}",
        "MAE": f"{mae:.4f}",
        "CV R²": f"{cv_scores.mean():.4f} (±{cv_scores.std() * 2:.4f})",
        "Time (s)": f"{training_time:.2f}"
    }


def _render_training_tab(df):
    """Train the configured models on button press and publish the results.

    On success the fitted models, the data partitions, the task type, the
    results table and the best model name are written to session state.
    """
    if 'ml_config' not in st.session_state:
        st.info("ℹ️ Please configure your model in the 'Configuration' tab first")
        return
    if not st.button("🚀 Start Training", use_container_width=True, type="primary"):
        return

    try:
        config = st.session_state['ml_config']
        is_classification = config['task_type'] == "Classification"
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)

        # Prepare data with error handling
        with st.spinner("📊 Preparing data..."):
            try:
                data = _prepare_training_data(df, config)
                st.success(f"✅ Data prepared: {data['X_train'].shape[0]} training samples, {data['X_test'].shape[0]} test samples")
            except Exception as e:
                st.error(f"❌ Error in data preparation: {str(e)}")
                return

        # Train models
        results = []
        trained_models = {}
        progress_bar = st.progress(0)
        status_text = st.empty()
        n_models = len(config['models'])
        score = _score_classifier if is_classification else _score_regressor

        for i, (model_name, model) in enumerate(config['models'].items()):
            status_text.text(f"🔄 Training {model_name}...")
            try:
                # Apply class weights if needed
                if is_classification and config['handle_imbalance'] and hasattr(model, 'class_weight'):
                    model.set_params(class_weight='balanced')

                # Train
                start_time = time.time()
                model.fit(data['X_train'], data['y_train'])
                training_time = time.time() - start_time

                # Store trained model together with what prediction needs
                trained_models[model_name] = {
                    'model': model,
                    'scaler': data['scaler'],
                    'label_encoder': data['label_encoder'],
                    'features': data['feature_names']
                }

                # Predict and calculate metrics; a metric failure keeps the
                # fitted model available but leaves it out of the table.
                y_pred = model.predict(data['X_test'])
                try:
                    results.append(
                        score(model_name, model, data, y_pred, config['cv_folds'], training_time)
                    )
                except Exception as e:
                    st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")
            except MemoryError:
                st.error(f"❌ Out of memory training {model_name}. Try using fewer features or a sample.")
            except Exception as e:
                st.warning(f"⚠️ Error training {model_name}: {str(e)}")
            progress_bar.progress((i + 1) / n_models)

        status_text.text("✅ Training complete!")
        if not results:
            st.error("❌ No models were successfully trained")
            return

        # Display results
        st.subheader("📊 Model Performance Comparison")
        results_df = pd.DataFrame(results)

        # Highlight best model
        ranking_metric = 'F1 Score' if is_classification else 'R² Score'
        best_idx = results_df[ranking_metric].astype(float).idxmax()

        def highlight_best(column):
            # Applied column-wise: colour the cell that sits in the best row.
            return ['background-color: #90EE90' if label == best_idx else ''
                    for label in column.index]

        st.dataframe(results_df.style.apply(highlight_best), use_container_width=True)

        # idxmax returns an index label, so look the row up with .loc
        best_model_name = results_df.loc[best_idx, 'Model']

        # Store results
        st.session_state['trained_models'] = trained_models
        st.session_state['X_train'] = data['X_train']
        st.session_state['X_test'] = data['X_test']
        st.session_state['y_train'] = data['y_train']
        st.session_state['y_test'] = data['y_test']
        # Unscaled training features: prediction inputs are expressed in
        # original units and scaled exactly once before reaching the model.
        st.session_state['X_train_raw'] = data['X_train_raw']
        st.session_state['task_type'] = config['task_type']
        st.session_state['results_df'] = results_df
        st.session_state['best_model_name'] = best_model_name

        # Best model info
        st.success(f"🏆 **Best Model:** {best_model_name}")
        st.markdown('</div>', unsafe_allow_html=True)
    except Exception as e:
        st.error(f"❌ Critical error in training: {str(e)}")
        st.info("💡 Try reducing the number of features or models")


def _show_classification_evaluation(model, model_name, le, X_test, y_test, y_pred):
    """Draw confusion matrix, classification report and (binary) ROC curve."""
    metric_kwargs = {}
    axis_kwargs = {}
    if le is not None:
        # Show the original class names instead of the encoded integers and
        # keep every class in the tables even if it is absent from y_test.
        class_names = [str(c) for c in le.classes_]
        metric_kwargs = {'labels': np.arange(len(class_names))}
        axis_kwargs = {'x': class_names, 'y': class_names}

    # Confusion Matrix
    st.markdown("### Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred, **metric_kwargs)
    fig = px.imshow(cm,
                    text_auto=True,
                    aspect="auto",
                    color_continuous_scale='Blues',
                    title=f"Confusion Matrix - {model_name}",
                    **axis_kwargs)
    fig.update_layout(xaxis_title="Predicted", yaxis_title="Actual")
    if axis_kwargs:
        # Keep numeric-looking class names from being treated as numbers.
        fig.update_xaxes(type='category')
        fig.update_yaxes(type='category')
    st.plotly_chart(fig, use_container_width=True)

    # Classification Report
    st.markdown("### Classification Report")
    if le is not None:
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0,
                                       target_names=class_names, **metric_kwargs)
    else:
        report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    st.dataframe(report_df.style.format("{:.4f}"), use_container_width=True)

    # ROC Curve (for binary classification)
    if len(np.unique(y_test)) == 2 and hasattr(model, "predict_proba"):
        st.markdown("### ROC Curve")
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=fpr, y=tpr,
                                 mode='lines',
                                 name=f'ROC (AUC = {roc_auc:.3f})',
                                 line=dict(color='blue', width=2)))
        fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                                 mode='lines',
                                 name='Random',
                                 line=dict(color='gray', dash='dash')))
        fig.update_layout(xaxis_title="False Positive Rate",
                          yaxis_title="True Positive Rate",
                          title=f"ROC Curve - {model_name}")
        st.plotly_chart(fig, use_container_width=True)


def _show_regression_evaluation(model_name, y_test, y_pred):
    """Draw actual-vs-predicted and residual diagnostics for a regressor."""
    # Actual vs Predicted plot
    st.markdown("### Actual vs Predicted")
    fig = px.scatter(x=y_test, y=y_pred,
                     labels={'x': 'Actual', 'y': 'Predicted'},
                     title=f"Actual vs Predicted - {model_name}",
                     trendline="ols")
    # Add perfect prediction line
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                             mode='lines', name='Perfect Prediction',
                             line=dict(color='red', dash='dash')))
    st.plotly_chart(fig, use_container_width=True)

    # Residuals plot
    st.markdown("### Residuals Analysis")
    residuals = y_test - y_pred
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=("Residuals vs Predicted", "Residuals Distribution"))
    fig.add_trace(go.Scatter(x=y_pred, y=residuals,
                             mode='markers',
                             name='Residuals',
                             marker=dict(color='blue', opacity=0.5)), row=1, col=1)
    fig.add_hline(y=0, line_dash="dash", line_color="red", row=1, col=1)
    fig.add_trace(go.Histogram(x=residuals, nbinsx=30,
                               name='Distribution',
                               marker_color='green'), row=1, col=2)
    fig.update_layout(title=f"Residual Analysis - {model_name}")
    st.plotly_chart(fig, use_container_width=True)

    # Residual statistics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Mean Residual", f"{residuals.mean():.4f}")
    with col2:
        st.metric("Std Residual", f"{residuals.std():.4f}")
    with col3:
        st.metric("Residual Range", f"{residuals.max() - residuals.min():.4f}")


def _render_evaluation_tab():
    """Render detailed evaluation plots for one of the trained models."""
    if 'trained_models' not in st.session_state:
        st.info("ℹ️ Train some models first in the 'Model Training' tab")
        return
    try:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Detailed Model Evaluation")

        # Model selection for detailed evaluation
        selected_eval_model = st.selectbox(
            "Select model for detailed evaluation",
            list(st.session_state['trained_models'].keys())
        )
        model_info = st.session_state['trained_models'][selected_eval_model]
        model = model_info['model']
        X_test = st.session_state['X_test']
        y_test = st.session_state['y_test']
        task_type = st.session_state['task_type']

        try:
            y_pred = model.predict(X_test)
            if task_type == "Classification":
                _show_classification_evaluation(
                    model, selected_eval_model, model_info.get('label_encoder'),
                    X_test, y_test, y_pred
                )
            else:  # Regression
                _show_regression_evaluation(selected_eval_model, y_test, y_pred)

            # Feature Importance (if available)
            if hasattr(model, 'feature_importances_'):
                st.markdown("### Feature Importance")
                feature_importance = pd.DataFrame({
                    'feature': X_test.columns,
                    'importance': model.feature_importances_
                }).sort_values('importance', ascending=True)
                fig = px.bar(feature_importance.tail(10),
                             x='importance', y='feature',
                             orientation='h',
                             title="Top 10 Feature Importances",
                             color='importance',
                             color_continuous_scale='Viridis')
                st.plotly_chart(fig, use_container_width=True)
        except Exception as e:
            st.error(f"❌ Error in evaluation: {str(e)}")

        st.markdown('</div>', unsafe_allow_html=True)
    except Exception as e:
        st.error(f"❌ Error loading evaluation: {str(e)}")


def _predict_frame(model, scaler, le, feature_names, features_df):
    """Scale *features_df* if a scaler was used in training, predict, decode labels."""
    model_input = features_df[feature_names]
    if scaler is not None:
        model_input = pd.DataFrame(scaler.transform(model_input), columns=feature_names)
    predictions = model.predict(model_input)
    if le is not None:
        predictions = le.inverse_transform(np.asarray(predictions).astype(int))
    return predictions


def _predict_manual(model, scaler, le, feature_names, reference):
    """Collect one row of feature values from widgets and predict it.

    *reference* holds the unscaled training features; it provides the
    range and default of each input widget.
    """
    import html

    st.markdown("### Enter feature values")
    input_data = {}
    cols = st.columns(3)
    for i, feature in enumerate(feature_names):
        with cols[i % 3]:
            try:
                if feature in reference.columns:
                    # Get feature range from training data
                    min_val = float(reference[feature].min())
                    max_val = float(reference[feature].max())
                    mean_val = float(reference[feature].mean())
                    if min_val < max_val:
                        input_data[feature] = st.slider(
                            f"{feature}",
                            min_val, max_val, mean_val,
                            format="%.4f",
                            key=f"manual_{feature}"
                        )
                    else:
                        # A slider needs a non-empty range.
                        input_data[feature] = st.number_input(
                            f"{feature}",
                            value=mean_val,
                            key=f"manual_{feature}"
                        )
                else:
                    input_data[feature] = st.number_input(
                        f"{feature}",
                        value=0.0,
                        key=f"manual_{feature}"
                    )
            except Exception as e:
                st.warning(f"⚠️ Error with {feature}: {str(e)}")
                input_data[feature] = 0.0

    if st.button("🔮 Predict", use_container_width=True):
        try:
            # Convert input to DataFrame and ensure all features are present
            input_df = pd.DataFrame([input_data])
            for col in feature_names:
                if col not in input_df.columns:
                    input_df[col] = 0

            prediction = _predict_frame(model, scaler, le, feature_names, input_df)[0]

            # Display prediction with styling. The value may originate from
            # the user's data, so escape it before embedding it in HTML.
            result_html = (
                "\n"
                "<div class=\"success-container\" style=\"text-align: center; padding: 2rem;\">\n"
                "<h3>🎯 Prediction Result</h3>\n"
                "<h1 style=\"font-size: 3rem;\">{}</h1>\n"
                "</div>\n"
            )
            st.markdown(result_html.format(html.escape(str(prediction))), unsafe_allow_html=True)
        except Exception as e:
            st.error(f"❌ Prediction error: {str(e)}")


def _predict_uploaded(model, scaler, le, feature_names):
    """Predict every row of an uploaded CSV/Excel file and offer a download."""
    pred_file = st.file_uploader("Upload data for predictions",
                                 type=["csv", "xlsx"],
                                 key="pred_file")
    if not pred_file:
        return
    try:
        if pred_file.name.endswith("csv"):
            pred_df = pd.read_csv(pred_file)
        else:
            pred_df = pd.read_excel(pred_file)

        st.subheader("📋 Uploaded Data Preview")
        st.dataframe(pred_df.head())

        if st.button("🔮 Predict for all rows", use_container_width=True):
            with st.spinner("Making predictions..."):
                try:
                    # Encode categorical columns the same way training did
                    pred_processed = pred_df.copy()
                    cat_cols = pred_processed.select_dtypes(include=['object', 'category']).columns
                    if len(cat_cols) > 0:
                        pred_processed = pd.get_dummies(pred_processed, columns=cat_cols)

                    # Align columns with training data
                    for col in feature_names:
                        if col not in pred_processed.columns:
                            pred_processed[col] = 0

                    predictions = _predict_frame(model, scaler, le, feature_names, pred_processed)

                    # Add predictions to dataframe
                    pred_df['Prediction'] = predictions
                    st.subheader("📊 Predictions Result")
                    st.dataframe(pred_df)

                    # Download predictions
                    csv = pred_df.to_csv(index=False)
                    st.download_button(
                        label="📥 Download Predictions",
                        data=csv,
                        file_name="predictions.csv",
                        mime="text/csv",
                        use_container_width=True
                    )
                except Exception as e:
                    st.error(f"❌ Prediction error: {str(e)}")
    except Exception as e:
        st.error(f"❌ Error reading file: {str(e)}")


def _predict_batch(model, scaler, le, feature_names, reference):
    """Predict random samples drawn from each training feature's normal fit.

    *reference* holds the unscaled training features whose per-column mean
    and standard deviation parameterise the random draws.
    """
    st.markdown("### Batch Prediction Settings")
    n_samples = st.number_input("Number of samples to generate",
                                min_value=1, max_value=1000, value=10)
    if not st.button("🎲 Generate Random Samples & Predict", use_container_width=True):
        return
    try:
        # Generate random samples based on training data distribution
        random_samples = {}
        for feature in feature_names:
            if feature in reference.columns:
                mean = reference[feature].mean()
                std = reference[feature].std()
                random_samples[feature] = np.random.normal(mean, std, n_samples)
            else:
                random_samples[feature] = np.zeros(n_samples)
        batch_df = pd.DataFrame(random_samples)

        predictions = _predict_frame(model, scaler, le, feature_names, batch_df)

        # Add predictions to the generated (unscaled) samples
        batch_df['Prediction'] = predictions
        st.subheader("📊 Batch Predictions")
        st.dataframe(batch_df)

        # Statistics
        if le is None:  # Numerical predictions
            st.subheader("📈 Prediction Statistics")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Mean", f"{predictions.mean():.4f}")
            with col2:
                st.metric("Std", f"{predictions.std():.4f}")
            with col3:
                st.metric("Range", f"{predictions.max() - predictions.min():.4f}")

        # Download predictions
        csv = batch_df.to_csv(index=False)
        st.download_button(
            label="📥 Download Batch Predictions",
            data=csv,
            file_name="batch_predictions.csv",
            mime="text/csv",
            use_container_width=True
        )
    except Exception as e:
        st.error(f"❌ Batch prediction error: {str(e)}")


def _render_prediction_tab():
    """Render the prediction tab (manual, uploaded-file and random-batch input)."""
    if 'trained_models' not in st.session_state:
        st.info("ℹ️ Train some models first in the 'Model Training' tab")
        return
    try:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🔮 Make Predictions")

        # Model selection for predictions
        selected_pred_model = st.selectbox(
            "Select model for predictions",
            list(st.session_state['trained_models'].keys()),
            key="pred_model"
        )
        model_info = st.session_state['trained_models'][selected_pred_model]
        model = model_info['model']
        scaler = model_info['scaler']
        le = model_info.get('label_encoder')
        feature_names = model_info['features']

        # Inputs are expressed in original units, so widget ranges must come
        # from the unscaled training features; the scaler is applied once,
        # right before predicting.
        if 'X_train_raw' in st.session_state:
            reference = st.session_state['X_train_raw']
        else:
            reference = st.session_state['X_train']

        # Input method
        input_method = st.radio(
            "Input method",
            ["Manual input", "Upload new data", "Batch prediction"],
            horizontal=True
        )
        if input_method == "Manual input":
            _predict_manual(model, scaler, le, feature_names, reference)
        elif input_method == "Upload new data":
            _predict_uploaded(model, scaler, le, feature_names)
        elif input_method == "Batch prediction":
            _predict_batch(model, scaler, le, feature_names, reference)

        st.markdown('</div>', unsafe_allow_html=True)
    except Exception as e:
        st.error(f"❌ Error in prediction: {str(e)}")


def _render_report_tab():
    """Render a Markdown summary of the last training run and offer a download."""
    if 'results_df' not in st.session_state:
        st.info("ℹ️ Train some models first in the 'Model Training' tab")
        return
    try:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📋 Machine Learning Report")

        results_df = st.session_state['results_df']
        config = st.session_state.get('ml_config', {})
        # Use the task type recorded at training time so that the metric
        # columns match the stored results table.
        task_type = st.session_state.get('task_type', config.get('task_type'))
        is_classification = task_type == 'Classification'

        # The best model is the row with the highest ranking metric, not
        # simply the first row of the table.
        ranking_metric = 'F1 Score' if is_classification else 'R² Score'
        best_row = results_df.loc[results_df[ranking_metric].astype(float).idxmax()]

        n_train = st.session_state.get('X_train', pd.DataFrame()).shape[0]
        n_test = st.session_state.get('X_test', pd.DataFrame()).shape[0]

        # Generate report
        report_lines = [
            "# Machine Learning Pipeline Report",
            "",
            "## Configuration Summary",
            f"- **Task Type:** {config.get('task_type', 'N/A')}",
            f"- **Target Variable:** {config.get('target', 'N/A')}",
            f"- **Number of Features:** {len(config.get('features', []))}",
            f"- **Test Size:** {config.get('test_size', 0.2)*100:.0f}%",
            f"- **Cross-Validation Folds:** {config.get('cv_folds', 5)}",
            f"- **Feature Scaling:** {config.get('scaler', 'None')}",
            "",
            "## Dataset Information",
            f"- **Total Samples:** {n_train + n_test}",
            f"- **Training Samples:** {n_train}",
            f"- **Test Samples:** {n_test}",
            "",
            "## Model Performance Summary",
            # Fenced so that Markdown keeps the column alignment.
            "```",
            results_df.to_string(),
            "```",
            "",
            "## Best Model",
            f"**{best_row['Model']}** performed best based on {ranking_metric}.",
            "",
            "## Recommendations",
            "",
        ]
        report = "\n".join(report_lines)

        # Add recommendations based on the best model's headline metric
        if is_classification:
            headline, excellent, good = float(best_row['Accuracy']), 0.9, 0.7
        else:
            headline, excellent, good = float(best_row['R² Score']), 0.8, 0.6
        if headline > excellent:
            report += "\n- ✓ Excellent model performance achieved"
        elif headline > good:
            report += "\n- ✓ Good model performance"
        else:
            report += "\n- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms"

        st.markdown(report)

        # Download report
        st.download_button(
            label="📥 Download ML Report",
            data=report,
            file_name="ml_report.txt",
            mime="text/plain",
            use_container_width=True
        )
        st.markdown('</div>', unsafe_allow_html=True)
    except Exception as e:
        st.error(f"❌ Error generating report: {str(e)}")


def run_ml_pipeline(df):
    """
    Enhanced machine learning pipeline with comprehensive error handling.

    Renders a five-tab Streamlit UI (configuration, training, evaluation,
    predictions, report) for the given DataFrame. State is shared between
    tabs and reruns through ``st.session_state``.

    Args:
        df: pandas DataFrame to model; at least 10 rows are required.

    Returns:
        None. All output is rendered through Streamlit.
    """
    header_html = (
        "\n"
        "<div style='text-align: center; margin-bottom: 2rem;'>\n"
        "<h2>🤖 Advanced Machine Learning Pipeline</h2>\n"
        "<p style='color: gray;'>Train, evaluate, and compare multiple ML models with automatic error handling</p>\n"
        "</div>\n"
    )
    st.markdown(header_html, unsafe_allow_html=True)

    try:
        # Check if dataset is suitable for ML
        if df.shape[0] < 10:
            st.error("❌ Dataset too small for machine learning (need at least 10 rows)")
            return

        # Create tabs for different ML stages
        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "⚙️ Configuration",
            "📊 Model Training",
            "📈 Model Evaluation",
            "🔮 Predictions",
            "📋 ML Report"
        ])

        with tab1:
            configured = _render_configuration_tab(df)
        if not configured:
            # Without a valid configuration the remaining tabs would work
            # from stale settings, so stop here.
            return

        # Each remaining tab renders independently so that a hint or error
        # in one of them does not leave the following tabs blank.
        with tab2:
            _render_training_tab(df)
        with tab3:
            _render_evaluation_tab()
        with tab4:
            _render_prediction_tab()
        with tab5:
            _render_report_tab()
    except Exception as e:
        st.error(f"❌ Critical error in ML pipeline: {str(e)}")
        st.info("💡 Please check your data and try again. If the problem persists, try with a smaller dataset.")