Spaces:

RamAi2026
/

dataanalyst

No application file

App Files Files Community

dataanalyst / ml_pipeline.py

RamAi2026

Upload 13 files

da8e446 verified about 1 month ago

raw

history blame

50.7 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
	from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
	from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
	confusion_matrix, classification_report, roc_curve, auc,
	mean_squared_error, r2_score, mean_absolute_error)
	from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
	from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
	from sklearn.svm import SVC, SVR
	from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
	from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
	GradientBoostingClassifier, GradientBoostingRegressor,
	AdaBoostClassifier, AdaBoostRegressor,
	VotingClassifier, VotingRegressor)
	from xgboost import XGBClassifier, XGBRegressor
	from lightgbm import LGBMClassifier, LGBMRegressor
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import time
	import warnings
	warnings.filterwarnings('ignore')

	class MLPipelineError(Exception):
	    """Base exception for errors raised by the ML pipeline helpers in this module."""

	def validate_ml_data(df, target, features):
	    """Collect human-readable problems that would hinder model training.

	    Args:
	        df: pandas DataFrame holding the candidate training data.
	        target: Name of the target column.
	        features: Iterable of feature column names.

	    Returns:
	        A list of issue strings; empty when no problem was found. An empty
	        DataFrame short-circuits with a single "Dataset is empty" entry.
	    """
	    issues = []

	    if df.empty:
	        issues.append("Dataset is empty")
	        return issues

	    if target not in df.columns:
	        issues.append(f"Target column '{target}' not found in dataset")

	    missing_features = [f for f in features if f not in df.columns]
	    if missing_features:
	        issues.append(f"Features not found: {missing_features}")

	    # Check for sufficient data
	    if df.shape[0] < 10:
	        issues.append("Dataset too small (minimum 10 rows required)")

	    # Check for constant columns. Missing features were already reported
	    # above, so skip them here instead of raising KeyError on df[col].
	    for col in features:
	        if col in df.columns and df[col].nunique() == 1:
	            issues.append(f"Feature '{col}' is constant")

	    # Check target cardinality when it looks like a classification target
	    if target in df.columns:
	        n_classes = df[target].nunique()
	        if df[target].dtype in ['object', 'category'] or n_classes <= 20:
	            if n_classes == 1:
	                issues.append("Target has only one class")
	            elif n_classes > 50:
	                issues.append(f"Target has {n_classes} classes, which may cause issues")

	    return issues

	def safe_ml_operation(func, *args, **kwargs):
	    """Call ``func(*args, **kwargs)`` and convert any exception into a message.

	    Args:
	        func: Callable to execute.
	        *args: Positional arguments forwarded to ``func``.
	        **kwargs: Keyword arguments forwarded to ``func``.

	    Returns:
	        ``(result, None)`` on success, or ``(None, error_message)`` when
	        ``func`` raised. ``ValueError`` and ``MemoryError`` get dedicated
	        messages; every other ``Exception`` is reported as "ML Error: ...".
	    """
	    try:
	        return func(*args, **kwargs), None
	    except ValueError as e:
	        return None, f"Value Error: {str(e)}. Check your data types and values."
	    except MemoryError:
	        return None, "Memory Error: Dataset too large. Try reducing the number of features or using a sample."
	    except Exception as e:
	        # Deliberate catch-all: this wrapper exists to report, not to raise.
	        return None, f"ML Error: {str(e)}"

	def run_ml_pipeline(df):
	    """Render the five-tab Streamlit machine-learning workflow for ``df``.

	    The tabs are Configuration, Model Training, Model Evaluation, Predictions
	    and ML Report. State is passed between tabs (and between Streamlit reruns)
	    through ``st.session_state``.

	    Args:
	        df: pandas DataFrame to model. At least 10 rows are required.

	    Returns:
	        None. All output is written to the Streamlit page; errors are shown
	        with ``st.error`` instead of being raised.
	    """
	    st.markdown("""
	        <div style='text-align: center; margin-bottom: 2rem;'>
	        <h2>🤖 Advanced Machine Learning Pipeline</h2>
	        <p style='color: gray;'>Train, evaluate, and compare multiple ML models with automatic error handling</p>
	        </div>
	        """, unsafe_allow_html=True)

	    try:
	        # Check if dataset is suitable for ML
	        if df.shape[0] < 10:
	            st.error("❌ Dataset too small for machine learning (need at least 10 rows)")
	            return

	        # Create tabs for different ML stages
	        tab1, tab2, tab3, tab4, tab5 = st.tabs([
	            "⚙️ Configuration",
	            "📊 Model Training",
	            "📈 Model Evaluation",
	            "🔮 Predictions",
	            "📋 ML Report"
	        ])

	        with tab1:
	            configured = _render_configuration_tab(df)
	        if not configured:
	            # Without a valid configuration the remaining tabs could only
	            # act on a stale one left in session state, so stop here.
	            return

	        # Each remaining tab guards on its own prerequisites, so a missing
	        # prerequisite in one tab no longer blanks out the tabs after it.
	        with tab2:
	            _render_training_tab(df)
	        with tab3:
	            _render_evaluation_tab()
	        with tab4:
	            _render_prediction_tab()
	        with tab5:
	            _render_report_tab()

	    except Exception as e:
	        st.error(f"❌ Critical error in ML pipeline: {str(e)}")
	        st.info("💡 Please check your data and try again. If the problem persists, try with a smaller dataset.")


	def _render_configuration_tab(df):
	    """Render the Configuration tab and store the choices in ``st.session_state['ml_config']``.

	    Returns:
	        True when a complete configuration was stored; False when the user
	        still has to pick something or the configuration could not be built.
	    """
	    st.markdown('<div class="custom-card">', unsafe_allow_html=True)
	    st.subheader("⚙️ Model Configuration")

	    try:
	        # Target selection with validation
	        st.markdown("### 🎯 Target Variable")

	        # Auto-detect potential target columns
	        potential_targets = []
	        target_types = {}

	        for col in df.columns:
	            try:
	                col_dtype = df[col].dtype
	                n_unique = df[col].nunique()
	                if col_dtype in ['int64', 'float64']:
	                    potential_targets.append(col)
	                    if n_unique <= 20:
	                        target_types[col] = "Classification (low cardinality)"
	                    else:
	                        target_types[col] = "Regression"
	                elif col_dtype in ['object', 'category'] and n_unique <= 50:
	                    potential_targets.append(col)
	                    target_types[col] = f"Classification ({n_unique} classes)"
	            except Exception as e:
	                st.warning(f"⚠️ Couldn't analyze column {col}: {str(e)}")

	        if not potential_targets:
	            st.error("❌ No suitable target columns found. Need numeric or categorical columns with reasonable cardinality.")
	            return False

	        target = st.selectbox(
	            "Select target column",
	            potential_targets,
	            help=f"Column types: {target_types}"
	        )

	        # Task type detection
	        target_series = df[target]
	        unique_values = target_series.nunique()
	        if target_series.dtype in ['object', 'category'] or unique_values <= 20:
	            task_type = "Classification"

	            if unique_values == 2:
	                st.success("✅ Binary Classification problem detected")
	            elif unique_values <= 10:
	                st.info(f"📊 Multi-class Classification with {unique_values} classes")
	            else:
	                st.warning(f"⚠️ Multi-class Classification with {unique_values} classes - may be challenging")

	            # Check class balance
	            class_dist = target_series.value_counts(normalize=True)
	            if class_dist.min() < 0.1:
	                st.warning("⚠️ Class imbalance detected. Consider using class weights or resampling.")
	        else:
	            task_type = "Regression"
	            st.info("📈 Regression task detected")

	            # Check target distribution
	            target_skew = target_series.skew()
	            if abs(target_skew) > 1:
	                st.warning(f"⚠️ Target variable is highly skewed (skewness: {target_skew:.2f}). Consider log transformation.")

	        # Feature selection
	        st.markdown("### 🔍 Feature Selection")

	        # Auto-select features (exclude target)
	        all_features = [col for col in df.columns if col != target]

	        # Remove problematic columns: constant, mostly missing, or unanalyzable
	        problematic_cols = []
	        for col in all_features:
	            try:
	                if df[col].nunique() == 1 or df[col].isnull().sum() > len(df) * 0.5:
	                    problematic_cols.append(col)
	            except Exception:
	                problematic_cols.append(col)

	        if problematic_cols:
	            st.warning(f"⚠️ Problematic columns detected (will be excluded): {problematic_cols}")
	            all_features = [f for f in all_features if f not in problematic_cols]

	        if not all_features:
	            st.error("❌ No valid features remaining after filtering.")
	            return False

	        # Select features
	        selected_features = st.multiselect(
	            "Choose features for modeling",
	            all_features,
	            default=all_features[:min(10, len(all_features))],
	            help="Select the columns to use as features. Using too many features may cause overfitting."
	        )

	        if not selected_features:
	            st.warning("⚠️ Please select at least one feature")
	            return False

	        # Validate selected features (warnings only; they do not block training)
	        for issue in validate_ml_data(df, target, selected_features):
	            st.warning(f"⚠️ {issue}")

	        # Data preprocessing options
	        st.markdown("### 🛠️ Preprocessing Options")

	        col1, col2 = st.columns(2)
	        with col1:
	            test_size = st.slider("Test set size (%)", 10, 40, 20, 5) / 100
	            scaler_option = st.selectbox("Feature scaling", ["None", "StandardScaler", "MinMaxScaler"])

	        with col2:
	            cv_folds = st.slider("Cross-validation folds", 2, 10, 5)
	            if task_type == "Classification":
	                handle_imbalance = st.checkbox("Handle class imbalance", value=False,
	                                               help="Use class weights or sampling techniques")
	            else:
	                handle_imbalance = False

	        # Model selection based on task type
	        st.markdown("### 🤖 Model Selection")

	        if task_type == "Classification":
	            models = {
	                "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
	                "K-Nearest Neighbors": KNeighborsClassifier(),
	                "Decision Tree": DecisionTreeClassifier(random_state=42),
	                "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
	                "Gradient Boosting": GradientBoostingClassifier(random_state=42),
	                "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
	                "LightGBM": LGBMClassifier(verbose=-1, random_state=42),
	                "AdaBoost": AdaBoostClassifier(random_state=42),
	                "SVM": SVC(probability=True, random_state=42)
	            }

	            # Default models for quick selection
	            default_models = ["Logistic Regression", "Random Forest", "XGBoost"]
	        else:  # Regression
	            models = {
	                "Linear Regression": LinearRegression(),
	                "Ridge Regression": Ridge(random_state=42),
	                "Lasso Regression": Lasso(random_state=42),
	                "Decision Tree": DecisionTreeRegressor(random_state=42),
	                "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
	                "Gradient Boosting": GradientBoostingRegressor(random_state=42),
	                "XGBoost": XGBRegressor(random_state=42),
	                "LightGBM": LGBMRegressor(verbose=-1, random_state=42),
	                "AdaBoost": AdaBoostRegressor(random_state=42),
	                "SVR": SVR()
	            }

	            default_models = ["Linear Regression", "Random Forest", "XGBoost"]

	        selected_models = st.multiselect(
	            "Choose models to train",
	            list(models.keys()),
	            default=default_models,
	            help="Select multiple models to compare performance"
	        )

	        if not selected_models:
	            st.warning("⚠️ Please select at least one model")
	            return False

	        # Advanced options
	        with st.expander("⚡ Advanced Options"):
	            do_tuning = st.checkbox("Perform hyperparameter tuning", value=False,
	                                    help="Grid search for best parameters (may be slow)")

	            # NOTE(review): the tuning options are stored in the config but
	            # nothing in the training tab applies them yet.
	            tuning_folds = None
	            max_tuning_iter = None
	            if do_tuning:
	                tuning_folds = st.slider("Tuning CV folds", 2, 5, 3)
	                max_tuning_iter = st.slider("Max tuning iterations per model", 5, 50, 20)

	            use_sampling = st.checkbox("Use data sampling (for large datasets)", value=False,
	                                       help="Use a sample for faster experimentation")

	            sample_size = 1.0
	            if use_sampling:
	                sample_size = st.slider("Sample size (%)", 10, 100, 100, 10) / 100

	            random_state = st.number_input("Random seed", value=42, min_value=0, max_value=999)

	        st.markdown('</div>', unsafe_allow_html=True)

	        # Store configuration in session state
	        st.session_state['ml_config'] = {
	            'target': target,
	            'features': selected_features,
	            'task_type': task_type,
	            'test_size': test_size,
	            'scaler': scaler_option,
	            'cv_folds': cv_folds,
	            'handle_imbalance': handle_imbalance,
	            'models': {name: models[name] for name in selected_models},
	            'do_tuning': do_tuning,
	            'tuning_folds': tuning_folds,
	            'max_tuning_iter': max_tuning_iter,
	            'sample_size': sample_size,
	            'random_state': int(random_state)
	        }
	        return True

	    except Exception as e:
	        st.error(f"❌ Error in configuration: {str(e)}")
	        st.info("💡 Tip: Check your data types and ensure all columns are valid")
	        return False


	def _prepare_training_data(df, config):
	    """Build the train/test matrices described by ``config``.

	    Steps: optional row sampling, dropping rows without a target, imputing
	    and one-hot encoding features, label-encoding non-numeric classification
	    targets, splitting, and finally fitting the scaler on the training split
	    only so no test-set statistics leak into training.

	    Returns:
	        dict with keys ``X_train``, ``X_test`` (model inputs, scaled when a
	        scaler is configured), ``X_train_raw`` (unscaled training features),
	        ``y_train``, ``y_test``, ``scaler`` and ``label_encoder`` (either may
	        be None).

	    Raises:
	        Any exception from pandas / scikit-learn; the caller reports it.
	    """
	    is_classification = config['task_type'] == "Classification"
	    data = df

	    # Apply the "Use data sampling" option from the configuration tab
	    sample_size = config.get('sample_size', 1.0)
	    if sample_size < 1.0:
	        data = data.sample(frac=sample_size, random_state=config['random_state'])
	        st.info(f"🎲 Using a {sample_size:.0%} sample of the data ({len(data)} rows)")

	    # Rows without a target cannot be used for fitting or scoring
	    missing_target = data[config['target']].isnull()
	    if missing_target.any():
	        st.info(f"⚠️ Dropping {int(missing_target.sum())} rows with a missing target value...")
	        data = data[~missing_target]

	    X = data[config['features']].copy()
	    y = data[config['target']].copy()

	    # Handle missing values: numeric columns get the mean, the rest the mode
	    n_missing = X.isnull().sum().sum()
	    if n_missing > 0:
	        st.info(f"⚠️ Handling {n_missing} missing values in features...")
	        X = X.fillna(X.mean(numeric_only=True)).fillna(X.mode().iloc[0])

	    # Handle categorical features
	    cat_features = X.select_dtypes(include=['object', 'category']).columns
	    if len(cat_features) > 0:
	        st.info(f"🔄 Encoding categorical features: {list(cat_features)}")
	        X = pd.get_dummies(X, columns=cat_features)

	    # Encode any non-numeric classification target (object *and* category)
	    le = None
	    if is_classification and not pd.api.types.is_numeric_dtype(y):
	        le = LabelEncoder()
	        y = le.fit_transform(y)
	        st.info(f"📊 Target classes: {dict(zip(le.classes_, le.transform(le.classes_)))}")

	    # Report the class weights that 'balanced' weighting corresponds to
	    if is_classification and config['handle_imbalance']:
	        from sklearn.utils.class_weight import compute_class_weight
	        class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
	        st.info(f"⚖️ Using class weights: {dict(zip(np.unique(y), class_weights))}")

	    # Stratify when possible; train_test_split rejects single-member classes
	    stratify = None
	    if is_classification:
	        if pd.Series(y).value_counts().min() >= 2:
	            stratify = y
	        else:
	            st.warning("⚠️ Some classes have fewer than 2 samples; splitting without stratification.")

	    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
	        X, y, test_size=config['test_size'],
	        random_state=config['random_state'],
	        stratify=stratify
	    )

	    # Scale features: fit on the training split, then apply to both splits
	    scaler = None
	    X_train, X_test = X_train_raw, X_test_raw
	    if config['scaler'] != "None":
	        scaler = StandardScaler() if config['scaler'] == "StandardScaler" else MinMaxScaler()
	        X_train = pd.DataFrame(scaler.fit_transform(X_train_raw),
	                               columns=X_train_raw.columns, index=X_train_raw.index)
	        X_test = pd.DataFrame(scaler.transform(X_test_raw),
	                              columns=X_test_raw.columns, index=X_test_raw.index)

	    return {
	        'X_train': X_train,
	        'X_test': X_test,
	        'X_train_raw': X_train_raw,
	        'y_train': y_train,
	        'y_test': y_test,
	        'scaler': scaler,
	        'label_encoder': le,
	    }


	def _render_training_tab(df):
	    """Render the Model Training tab: fit the configured models and store the results.

	    On success the fitted models, the data splits, the task type, the results
	    table and the best model name are written to ``st.session_state``.
	    """
	    if 'ml_config' not in st.session_state:
	        st.info("ℹ️ Please configure your model in the 'Configuration' tab first")
	        return

	    if not st.button("🚀 Start Training", use_container_width=True, type="primary"):
	        return

	    try:
	        config = st.session_state['ml_config']
	        is_classification = config['task_type'] == "Classification"

	        st.markdown('<div class="custom-card">', unsafe_allow_html=True)

	        # Prepare data with error handling
	        with st.spinner("📊 Preparing data..."):
	            try:
	                prepared = _prepare_training_data(df, config)
	            except Exception as e:
	                st.error(f"❌ Error in data preparation: {str(e)}")
	                return

	        X_train, X_test = prepared['X_train'], prepared['X_test']
	        y_train, y_test = prepared['y_train'], prepared['y_test']
	        st.success(f"✅ Data prepared: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")

	        # Train models
	        results = []
	        trained_models = {}
	        progress_bar = st.progress(0)
	        status_text = st.empty()
	        n_models = len(config['models'])

	        for i, (model_name, model) in enumerate(config['models'].items()):
	            status_text.text(f"🔄 Training {model_name}...")

	            try:
	                # Apply class weights if the estimator supports them
	                if is_classification and config['handle_imbalance'] and hasattr(model, 'class_weight'):
	                    model.set_params(class_weight='balanced')

	                # Train
	                start_time = time.perf_counter()
	                model.fit(X_train, y_train)
	                training_time = time.perf_counter() - start_time

	                # Store trained model together with what is needed to reuse it
	                trained_models[model_name] = {
	                    'model': model,
	                    'scaler': prepared['scaler'],
	                    'label_encoder': prepared['label_encoder'],
	                    'features': X_train.columns.tolist()
	                }

	                # Predict
	                y_pred = model.predict(X_test)

	                # Calculate metrics; a failure here keeps the fitted model
	                # but leaves it out of the comparison table
	                try:
	                    if is_classification:
	                        accuracy = accuracy_score(y_test, y_pred)
	                        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
	                        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
	                        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

	                        # Cross-validation
	                        cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'])

	                        results.append({
	                            "Model": model_name,
	                            "Accuracy": f"{accuracy:.4f}",
	                            "Precision": f"{precision:.4f}",
	                            "Recall": f"{recall:.4f}",
	                            "F1 Score": f"{f1:.4f}",
	                            "CV Score": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
	                            "Time (s)": f"{training_time:.2f}"
	                        })
	                    else:  # Regression
	                        mse = mean_squared_error(y_test, y_pred)
	                        rmse = np.sqrt(mse)
	                        mae = mean_absolute_error(y_test, y_pred)
	                        r2 = r2_score(y_test, y_pred)

	                        # Cross-validation
	                        cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'], scoring='r2')

	                        results.append({
	                            "Model": model_name,
	                            "R² Score": f"{r2:.4f}",
	                            "RMSE": f"{rmse:.4f}",
	                            "MAE": f"{mae:.4f}",
	                            "CV R²": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
	                            "Time (s)": f"{training_time:.2f}"
	                        })
	                except Exception as e:
	                    st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")

	            except MemoryError:
	                st.error(f"❌ Out of memory training {model_name}. Try using fewer features or a sample.")
	            except Exception as e:
	                st.warning(f"⚠️ Error training {model_name}: {str(e)}")

	            progress_bar.progress((i + 1) / n_models)

	        status_text.text("✅ Training complete!")

	        if not results:
	            st.error("❌ No models were successfully trained")
	            return

	        # Display results
	        st.subheader("📊 Model Performance Comparison")
	        results_df = pd.DataFrame(results)

	        # Highlight best model (metrics are stored as formatted strings)
	        ranking_metric = 'F1 Score' if is_classification else 'R² Score'
	        best_idx = results_df[ranking_metric].astype(float).idxmax()

	        def highlight_best(s):
	            is_best = s.index == best_idx
	            return ['background-color: #90EE90' if v else '' for v in is_best]

	        st.dataframe(results_df.style.apply(highlight_best), use_container_width=True)

	        best_model_name = results_df.loc[best_idx, 'Model']

	        # Store results
	        st.session_state['trained_models'] = trained_models
	        st.session_state['X_train'] = X_train
	        st.session_state['X_test'] = X_test
	        # Unscaled training features: reference for prediction inputs
	        st.session_state['X_train_raw'] = prepared['X_train_raw']
	        st.session_state['y_train'] = y_train
	        st.session_state['y_test'] = y_test
	        st.session_state['task_type'] = config['task_type']
	        st.session_state['results_df'] = results_df
	        st.session_state['best_model_name'] = best_model_name

	        # Best model info
	        st.success(f"🏆 Best Model: {best_model_name}")

	        st.markdown('</div>', unsafe_allow_html=True)

	    except Exception as e:
	        st.error(f"❌ Critical error in training: {str(e)}")
	        st.info("💡 Try reducing the number of features or models")


	def _render_evaluation_tab():
	    """Render the Model Evaluation tab for one of the trained models.

	    Classification shows a confusion matrix, a classification report and,
	    for binary problems with ``predict_proba``, a ROC curve. Regression shows
	    actual-vs-predicted and residual plots. Feature importances are shown
	    whenever the model exposes ``feature_importances_``.
	    """
	    if 'trained_models' not in st.session_state:
	        st.info("ℹ️ Train some models first in the 'Model Training' tab")
	        return

	    try:
	        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
	        st.subheader("📈 Detailed Model Evaluation")

	        # Model selection for detailed evaluation
	        selected_eval_model = st.selectbox(
	            "Select model for detailed evaluation",
	            list(st.session_state['trained_models'].keys())
	        )

	        model = st.session_state['trained_models'][selected_eval_model]['model']
	        X_test = st.session_state['X_test']
	        y_test = st.session_state['y_test']
	        task_type = st.session_state['task_type']

	        try:
	            y_pred = model.predict(X_test)

	            if task_type == "Classification":
	                # Confusion Matrix
	                st.markdown("### Confusion Matrix")
	                cm = confusion_matrix(y_test, y_pred)

	                fig = px.imshow(cm,
	                                text_auto=True,
	                                aspect="auto",
	                                color_continuous_scale='Blues',
	                                title=f"Confusion Matrix - {selected_eval_model}")

	                fig.update_layout(xaxis_title="Predicted", yaxis_title="Actual")
	                st.plotly_chart(fig, use_container_width=True)

	                # Classification Report
	                st.markdown("### Classification Report")
	                report = classification_report(y_test, y_pred, output_dict=True)
	                report_df = pd.DataFrame(report).transpose()
	                st.dataframe(report_df.style.format("{:.4f}"), use_container_width=True)

	                # ROC Curve (for binary classification)
	                if len(np.unique(y_test)) == 2 and hasattr(model, "predict_proba"):
	                    st.markdown("### ROC Curve")
	                    y_pred_proba = model.predict_proba(X_test)[:, 1]
	                    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
	                    roc_auc = auc(fpr, tpr)

	                    fig = go.Figure()
	                    fig.add_trace(go.Scatter(x=fpr, y=tpr,
	                                             mode='lines',
	                                             name=f'ROC (AUC = {roc_auc:.3f})',
	                                             line=dict(color='blue', width=2)))
	                    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
	                                             mode='lines',
	                                             name='Random',
	                                             line=dict(color='gray', dash='dash')))

	                    fig.update_layout(xaxis_title="False Positive Rate",
	                                      yaxis_title="True Positive Rate",
	                                      title=f"ROC Curve - {selected_eval_model}")

	                    st.plotly_chart(fig, use_container_width=True)

	            else:  # Regression
	                # Actual vs Predicted plot
	                st.markdown("### Actual vs Predicted")

	                fig = px.scatter(x=y_test, y=y_pred,
	                                 labels={'x': 'Actual', 'y': 'Predicted'},
	                                 title=f"Actual vs Predicted - {selected_eval_model}",
	                                 trendline="ols")

	                # Add perfect prediction line
	                min_val = min(y_test.min(), y_pred.min())
	                max_val = max(y_test.max(), y_pred.max())
	                fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
	                                         mode='lines', name='Perfect Prediction',
	                                         line=dict(color='red', dash='dash')))

	                st.plotly_chart(fig, use_container_width=True)

	                # Residuals plot
	                st.markdown("### Residuals Analysis")
	                residuals = y_test - y_pred

	                fig = make_subplots(rows=1, cols=2,
	                                    subplot_titles=("Residuals vs Predicted", "Residuals Distribution"))

	                fig.add_trace(go.Scatter(x=y_pred, y=residuals,
	                                         mode='markers',
	                                         name='Residuals',
	                                         marker=dict(color='blue', opacity=0.5)), row=1, col=1)

	                fig.add_hline(y=0, line_dash="dash", line_color="red", row=1, col=1)

	                fig.add_trace(go.Histogram(x=residuals, nbinsx=30,
	                                           name='Distribution',
	                                           marker_color='green'), row=1, col=2)

	                fig.update_layout(title=f"Residual Analysis - {selected_eval_model}")
	                st.plotly_chart(fig, use_container_width=True)

	                # Residual statistics
	                col1, col2, col3 = st.columns(3)
	                with col1:
	                    st.metric("Mean Residual", f"{residuals.mean():.4f}")
	                with col2:
	                    st.metric("Std Residual", f"{residuals.std():.4f}")
	                with col3:
	                    st.metric("Residual Range", f"{residuals.max() - residuals.min():.4f}")

	            # Feature Importance (if available)
	            if hasattr(model, 'feature_importances_'):
	                st.markdown("### Feature Importance")
	                feature_importance = pd.DataFrame({
	                    'feature': X_test.columns,
	                    'importance': model.feature_importances_
	                }).sort_values('importance', ascending=True)

	                fig = px.bar(feature_importance.tail(10),
	                             x='importance', y='feature',
	                             orientation='h',
	                             title="Top 10 Feature Importances",
	                             color='importance',
	                             color_continuous_scale='Viridis')
	                st.plotly_chart(fig, use_container_width=True)

	        except Exception as e:
	            st.error(f"❌ Error in evaluation: {str(e)}")

	        st.markdown('</div>', unsafe_allow_html=True)

	    except Exception as e:
	        st.error(f"❌ Error loading evaluation: {str(e)}")


	def _predict_with_model(model_info, raw_features):
	    """Predict for a frame of unscaled feature values.

	    Aligns the columns to the training features (missing ones become 0),
	    applies the stored scaler exactly once, predicts, and maps encoded class
	    indices back to the original labels when a label encoder was stored.
	    """
	    feature_names = model_info['features']
	    model_input = raw_features.reindex(columns=feature_names, fill_value=0)

	    scaler = model_info['scaler']
	    if scaler is not None:
	        model_input = pd.DataFrame(scaler.transform(model_input),
	                                   columns=feature_names, index=model_input.index)

	    predictions = model_info['model'].predict(model_input)

	    le = model_info.get('label_encoder')
	    if le is not None:
	        predictions = le.inverse_transform(np.asarray(predictions).astype(int))
	    return predictions


	def _render_prediction_tab():
	    """Render the Predictions tab: manual input, uploaded file, or random batch.

	    All three input methods work with unscaled feature values; scaling is
	    applied once inside ``_predict_with_model``.
	    """
	    if 'trained_models' not in st.session_state:
	        st.info("ℹ️ Train some models first in the 'Model Training' tab")
	        return

	    try:
	        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
	        st.subheader("🔮 Make Predictions")

	        # Model selection for predictions
	        selected_pred_model = st.selectbox(
	            "Select model for predictions",
	            list(st.session_state['trained_models'].keys()),
	            key="pred_model"
	        )

	        model_info = st.session_state['trained_models'][selected_pred_model]
	        le = model_info.get('label_encoder')
	        feature_names = model_info['features']

	        # Unscaled training features give the value ranges users can relate to
	        reference = st.session_state.get('X_train_raw', st.session_state['X_train'])

	        # Input method
	        input_method = st.radio(
	            "Input method",
	            ["Manual input", "Upload new data", "Batch prediction"],
	            horizontal=True
	        )

	        if input_method == "Manual input":
	            st.markdown("### Enter feature values")

	            input_data = {}
	            cols = st.columns(3)

	            for i, feature in enumerate(feature_names):
	                with cols[i % 3]:
	                    try:
	                        if feature in reference.columns:
	                            # Get feature range from training data
	                            min_val = float(reference[feature].min())
	                            max_val = float(reference[feature].max())
	                            mean_val = float(reference[feature].mean())

	                            if min_val < max_val:
	                                input_data[feature] = st.slider(
	                                    f"{feature}",
	                                    min_val, max_val, mean_val,
	                                    format="%.4f",
	                                    key=f"manual_{feature}"
	                                )
	                            else:
	                                # A slider needs min < max; fall back to free input
	                                input_data[feature] = st.number_input(
	                                    f"{feature}",
	                                    value=min_val,
	                                    key=f"manual_{feature}"
	                                )
	                        else:
	                            input_data[feature] = st.number_input(
	                                f"{feature}",
	                                value=0.0,
	                                key=f"manual_{feature}"
	                            )
	                    except Exception as e:
	                        st.warning(f"⚠️ Error with {feature}: {str(e)}")
	                        input_data[feature] = 0.0

	            if st.button("🔮 Predict", use_container_width=True):
	                try:
	                    import html

	                    prediction = _predict_with_model(model_info, pd.DataFrame([input_data]))[0]

	                    # Class labels come from user data, so escape them before
	                    # embedding in raw HTML
	                    st.markdown("""
	                        <div class="success-container" style="text-align: center; padding: 2rem;">
	                        <h3>🎯 Prediction Result</h3>
	                        <h1 style="font-size: 3rem;">{}</h1>
	                        </div>
	                        """.format(html.escape(str(prediction))), unsafe_allow_html=True)

	                except Exception as e:
	                    st.error(f"❌ Prediction error: {str(e)}")

	        elif input_method == "Upload new data":
	            pred_file = st.file_uploader("Upload data for predictions",
	                                         type=["csv", "xlsx"],
	                                         key="pred_file")

	            if pred_file:
	                try:
	                    if pred_file.name.endswith("csv"):
	                        pred_df = pd.read_csv(pred_file)
	                    else:
	                        pred_df = pd.read_excel(pred_file)

	                    st.subheader("📋 Uploaded Data Preview")
	                    st.dataframe(pred_df.head())

	                    if st.button("🔮 Predict for all rows", use_container_width=True):
	                        with st.spinner("Making predictions..."):
	                            try:
	                                # One-hot encode the same dtypes as in training
	                                pred_processed = pred_df.copy()
	                                cat_cols = pred_processed.select_dtypes(include=['object', 'category']).columns
	                                if len(cat_cols) > 0:
	                                    pred_processed = pd.get_dummies(pred_processed, columns=cat_cols)

	                                # Align columns with training data
	                                pred_processed = pred_processed.reindex(columns=feature_names, fill_value=0)

	                                # Impute gaps with training means so models that
	                                # reject NaN can still predict
	                                if pred_processed.isnull().values.any():
	                                    pred_processed = pred_processed.fillna(
	                                        reference.mean(numeric_only=True)
	                                    ).fillna(0)

	                                # Add predictions to dataframe
	                                pred_df['Prediction'] = _predict_with_model(model_info, pred_processed)

	                                st.subheader("📊 Predictions Result")
	                                st.dataframe(pred_df)

	                                # Download predictions
	                                csv = pred_df.to_csv(index=False)
	                                st.download_button(
	                                    label="📥 Download Predictions",
	                                    data=csv,
	                                    file_name="predictions.csv",
	                                    mime="text/csv",
	                                    use_container_width=True
	                                )

	                            except Exception as e:
	                                st.error(f"❌ Prediction error: {str(e)}")

	                except Exception as e:
	                    st.error(f"❌ Error reading file: {str(e)}")

	        elif input_method == "Batch prediction":
	            st.markdown("### Batch Prediction Settings")

	            n_samples = st.number_input("Number of samples to generate",
	                                        min_value=1, max_value=1000, value=10)

	            if st.button("🎲 Generate Random Samples & Predict", use_container_width=True):
	                try:
	                    # Generate random samples from a normal distribution with
	                    # each training feature's (unscaled) mean and std
	                    random_samples = {}
	                    for feature in feature_names:
	                        if feature in reference.columns:
	                            values = reference[feature].astype(float)
	                            random_samples[feature] = np.random.normal(
	                                values.mean(), values.std(), int(n_samples)
	                            )
	                        else:
	                            random_samples[feature] = np.zeros(int(n_samples))

	                    batch_df = pd.DataFrame(random_samples)

	                    # Make predictions (scaling/decoding handled by the helper)
	                    predictions = _predict_with_model(model_info, batch_df)

	                    # Add predictions to dataframe
	                    batch_df['Prediction'] = predictions

	                    st.subheader("📊 Batch Predictions")
	                    st.dataframe(batch_df)

	                    # Statistics
	                    if le is None:  # Numerical predictions
	                        st.subheader("📈 Prediction Statistics")
	                        col1, col2, col3 = st.columns(3)
	                        with col1:
	                            st.metric("Mean", f"{predictions.mean():.4f}")
	                        with col2:
	                            st.metric("Std", f"{predictions.std():.4f}")
	                        with col3:
	                            st.metric("Range", f"{predictions.max() - predictions.min():.4f}")

	                    # Download predictions
	                    csv = batch_df.to_csv(index=False)
	                    st.download_button(
	                        label="📥 Download Batch Predictions",
	                        data=csv,
	                        file_name="batch_predictions.csv",
	                        mime="text/csv",
	                        use_container_width=True
	                    )

	                except Exception as e:
	                    st.error(f"❌ Batch prediction error: {str(e)}")

	        st.markdown('</div>', unsafe_allow_html=True)

	    except Exception as e:
	        st.error(f"❌ Error in prediction: {str(e)}")


	def _render_report_tab():
	    """Render the ML Report tab: a Markdown summary plus a download button.

	    The best model is the row with the highest F1 Score (classification) or
	    R² Score (regression) in the stored results table.
	    """
	    if 'results_df' not in st.session_state:
	        st.info("ℹ️ Train some models first in the 'Model Training' tab")
	        return

	    try:
	        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
	        st.subheader("📋 Machine Learning Report")

	        results_df = st.session_state['results_df']
	        config = st.session_state.get('ml_config', {})

	        # Prefer the task type recorded at training time: the live config may
	        # have changed since, and the results columns depend on the task type
	        task_type = st.session_state.get('task_type', config.get('task_type', 'N/A'))
	        is_classification = task_type == 'Classification'

	        ranking_metric = 'F1 Score' if is_classification else 'R² Score'
	        best_row = results_df.loc[results_df[ranking_metric].astype(float).idxmax()]

	        n_train = st.session_state.get('X_train', pd.DataFrame()).shape[0]
	        n_test = st.session_state.get('X_test', pd.DataFrame()).shape[0]

	        # Assemble from lines so source-code indentation never ends up in
	        # the rendered Markdown or the downloaded text
	        report_lines = [
	            "# Machine Learning Pipeline Report",
	            "",
	            "## Configuration Summary",
	            f"- Task Type: {task_type}",
	            f"- Target Variable: {config.get('target', 'N/A')}",
	            f"- Number of Features: {len(config.get('features', []))}",
	            f"- Test Size: {config.get('test_size', 0.2)*100:.0f}%",
	            f"- Cross-Validation Folds: {config.get('cv_folds', 5)}",
	            f"- Feature Scaling: {config.get('scaler', 'None')}",
	            "",
	            "## Dataset Information",
	            f"- Total Samples: {n_train + n_test}",
	            f"- Training Samples: {n_train}",
	            f"- Test Samples: {n_test}",
	            "",
	            "## Model Performance Summary",
	            "",
	            "```",
	            results_df.to_string(),
	            "```",
	            "",
	            "## Best Model",
	            f"{best_row['Model']} performed best based on {ranking_metric}.",
	            "",
	            "## Recommendations",
	            "",
	        ]

	        # Add recommendations based on the best model's headline score
	        if is_classification:
	            headline_score = float(best_row['Accuracy'])
	            excellent, good = 0.9, 0.7
	        else:
	            headline_score = float(best_row['R² Score'])
	            excellent, good = 0.8, 0.6

	        if headline_score > excellent:
	            report_lines.append("- ✓ Excellent model performance achieved")
	        elif headline_score > good:
	            report_lines.append("- ✓ Good model performance")
	        else:
	            report_lines.append("- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms")

	        report = "\n".join(report_lines)
	        st.markdown(report)

	        # Download report
	        st.download_button(
	            label="📥 Download ML Report",
	            data=report,
	            file_name="ml_report.txt",
	            mime="text/plain",
	            use_container_width=True
	        )

	        st.markdown('</div>', unsafe_allow_html=True)

	    except Exception as e:
	        st.error(f"❌ Error generating report: {str(e)}")