|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder |
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.impute import SimpleImputer |
|
|
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor |
|
|
from sklearn.linear_model import LogisticRegression, LinearRegression |
|
|
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score |
|
|
import xgboost as xgb |
|
|
from catboost import CatBoostClassifier, CatBoostRegressor |
|
|
import lightgbm as lgb |
|
|
import io |
|
|
import base64 |
|
|
from PIL import Image |
|
|
import os |
|
|
import pickle |
|
|
import warnings

# Globally silence warnings: the ML libraries used below (sklearn, xgboost,
# lightgbm, catboost) emit many convergence/deprecation warnings that would
# otherwise clutter the console while the Gradio app is running.
warnings.filterwarnings('ignore')
|
|
|
|
|
def infer_problem_type(df, target_col):
    """Determine if it's a classification or regression problem.

    Heuristic: a non-numeric target, or one with fewer than 10 distinct
    values, is treated as classification; everything else as regression.
    """
    target = df[target_col]
    few_distinct_values = target.nunique() < 10
    non_numeric = target.dtype in ['object', 'category', 'bool']
    return "Classification" if (few_distinct_values or non_numeric) else "Regression"
|
|
|
|
|
def generate_eda_report(df):
    """Generate an EDA summary for the dataset.

    Returns:
        dict with keys: 'shape', 'dtypes', 'null_counts', 'desc_stats'
        (HTML table), 'categorical_cols', 'numerical_cols', and — only when
        the frame has numeric columns — 'corr_heatmap' (base64 PNG of the
        correlation matrix).
    """
    report = {}
    report['shape'] = df.shape
    report['dtypes'] = df.dtypes.astype(str).to_dict()
    report['null_counts'] = df.isnull().sum().to_dict()
    report['desc_stats'] = df.describe().to_html()

    numeric_df = df.select_dtypes(include=['number'])
    if not numeric_df.empty:
        # Allocate the figure and buffer only when there is numeric data:
        # the original created the figure (and BytesIO) unconditionally and
        # never closed them when the dataset had no numeric columns,
        # leaking a matplotlib figure per upload.
        buffer = io.BytesIO()
        fig = plt.figure(figsize=(10, 8))
        try:
            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
            plt.title('Correlation Matrix')
            plt.tight_layout()
            plt.savefig(buffer, format='png')
        finally:
            # Close even if seaborn/matplotlib raises, so figures never pile up.
            plt.close(fig)
        buffer.seek(0)
        report['corr_heatmap'] = base64.b64encode(buffer.getvalue()).decode('utf-8')
        buffer.close()

    report['categorical_cols'] = df.select_dtypes(include=['object', 'category']).columns.tolist()
    report['numerical_cols'] = df.select_dtypes(include=['number']).columns.tolist()

    return report
|
|
|
|
|
def clean_and_preprocess(df, problem_type, target_col):
    """Clean the dataset and build train/test feature matrices.

    Imputes missing values (median for numeric columns, mode otherwise),
    fits a ColumnTransformer (scaling + one-hot encoding) on the features,
    label-encodes the target for classification, and performs an 80/20 split.

    Returns:
        dict with the fitted 'preprocessor', 'X_train'/'X_test'/'y_train'/
        'y_test' splits, the feature column lists, and 'target_encoder'
        (a fitted LabelEncoder for classification, None for regression).
    """
    processed_df = df.copy()

    # Impute on the full frame so the target column is covered too
    # (the ColumnTransformer below only ever sees the feature matrix X).
    for col in processed_df.columns:
        if processed_df[col].dtype in ['int64', 'float64']:
            # Assignment instead of chained `fillna(..., inplace=True)`:
            # chained inplace operates on a temporary and is deprecated
            # under pandas copy-on-write.
            processed_df[col] = processed_df[col].fillna(processed_df[col].median())
        else:
            mode_values = processed_df[col].mode()
            # mode() returns an empty Series for an all-NaN column; the
            # original indexed [0] unconditionally and raised IndexError.
            if not mode_values.empty:
                processed_df[col] = processed_df[col].fillna(mode_values[0])

    X = processed_df.drop(columns=[target_col])
    y = processed_df[target_col]

    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Median-impute + scale numeric features; mode-impute + one-hot encode
    # categoricals (categories unseen at fit time are ignored at predict time).
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), numerical_cols),
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]), categorical_cols)
        ]
    )

    X_processed = preprocessor.fit_transform(X)

    # Encode string/categorical targets as integers for classification
    # (XGBoost in particular requires numeric class labels). `le` is bound
    # unconditionally so the returned dict never references an unbound name.
    le = None
    if problem_type == "Classification":
        le = LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y, test_size=0.2, random_state=42
    )

    return {
        'preprocessor': preprocessor,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'categorical_cols': categorical_cols,
        'numerical_cols': numerical_cols,
        'target_encoder': le
    }
|
|
|
|
|
def train_and_evaluate_models(preprocessing_info, problem_type):
    """Train the candidate models and pick the best one.

    Classification models are ranked by accuracy; regression models by R².
    The original duplicated the fit/predict/score loop (and the best-model
    selection) once per problem type; a single loop now computes the
    per-type metrics, with behavior unchanged.

    Args:
        preprocessing_info: dict from clean_and_preprocess with the splits.
        problem_type: "Classification" or "Regression".

    Returns:
        dict with 'results' (per-model metrics), 'best_model_name',
        'best_score', 'metric_name', 'models' (all fitted estimators)
        and 'best_model'.
    """
    X_train = preprocessing_info['X_train']
    X_test = preprocessing_info['X_test']
    y_train = preprocessing_info['y_train']
    y_test = preprocessing_info['y_test']

    is_classification = problem_type == "Classification"

    if is_classification:
        models_to_train = {
            'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
            'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
            'XGBoost': xgb.XGBClassifier(random_state=42),
            'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
            'LightGBM': lgb.LGBMClassifier(random_state=42)
        }
    else:
        models_to_train = {
            'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
            'LinearRegression': LinearRegression(),
            'XGBoost': xgb.XGBRegressor(random_state=42),
            'CatBoost': CatBoostRegressor(verbose=0, random_state=42),
            'LightGBM': lgb.LGBMRegressor(random_state=42)
        }

    results = {}
    models = {}
    for name, model in models_to_train.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if is_classification:
            results[name] = {
                'accuracy': accuracy_score(y_test, y_pred),
                'report': classification_report(y_test, y_pred, output_dict=True)
            }
        else:
            results[name] = {
                'mse': mean_squared_error(y_test, y_pred),
                'r2': r2_score(y_test, y_pred)
            }
        models[name] = model

    # Rank on the headline metric for the problem type (higher is better
    # for both accuracy and R²).
    ranking_key = 'accuracy' if is_classification else 'r2'
    best_model_name = max(results, key=lambda n: results[n][ranking_key])

    return {
        'results': results,
        'best_model_name': best_model_name,
        'best_score': results[best_model_name][ranking_key],
        'metric_name': 'accuracy' if is_classification else 'R²',
        'models': models,
        'best_model': models[best_model_name]
    }
|
|
|
|
|
def save_model(model, preprocessor, target_encoder=None, output_dir='models'):
    """Serialize the trained model and preprocessing artifacts to disk.

    Args:
        model: fitted estimator to persist.
        preprocessor: fitted feature transformer used before the model.
        target_encoder: optional fitted target LabelEncoder; only written
            when provided (i.e. for classification problems).
        output_dir: directory for the pickle files (created if missing).
            Defaults to 'models' to preserve the original behavior.

    Returns:
        Filesystem path of the saved model pickle.
    """
    os.makedirs(output_dir, exist_ok=True)

    model_path = os.path.join(output_dir, 'model.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    with open(os.path.join(output_dir, 'preprocessor.pkl'), 'wb') as f:
        pickle.dump(preprocessor, f)

    if target_encoder is not None:
        with open(os.path.join(output_dir, 'target_encoder.pkl'), 'wb') as f:
            pickle.dump(target_encoder, f)

    return model_path
|
|
|
|
|
def process_dataset(df, target_col):
    """Run the end-to-end AutoML workflow for a single dataset.

    Steps: infer the problem type, build the EDA report, preprocess and
    split the data, train/evaluate candidate models, and persist the best
    one. Returns a dict aggregating every intermediate artifact.
    """
    pipeline_output = {'problem_type': infer_problem_type(df, target_col)}

    pipeline_output['eda_report'] = generate_eda_report(df)

    prep = clean_and_preprocess(df, pipeline_output['problem_type'], target_col)
    pipeline_output['preprocessing_info'] = prep

    trained = train_and_evaluate_models(prep, pipeline_output['problem_type'])
    pipeline_output['model_results'] = trained

    pipeline_output['model_path'] = save_model(
        trained['best_model'],
        prep['preprocessor'],
        prep.get('target_encoder')
    )

    return pipeline_output
|
|
|
|
|
def format_results_html(results_data):
    """Render the AutoML pipeline output as an HTML report string.

    Expects the dict produced by process_dataset, i.e. the keys
    'problem_type', 'eda_report' and 'model_results'.
    """
    problem_type = results_data['problem_type']
    eda = results_data['eda_report']
    model_results = results_data['model_results']

    # Accumulate fragments and join once at the end instead of repeated
    # string concatenation.
    parts = [f"""
    <h2>AutoML Analysis Results</h2>
    <h3>Problem Type: {problem_type}</h3>

    <h3>Dataset Information</h3>
    <p><strong>Shape:</strong> {eda['shape'][0]} rows, {eda['shape'][1]} columns</p>
    <p><strong>Numerical Columns:</strong> {', '.join(eda['numerical_cols'])}</p>
    <p><strong>Categorical Columns:</strong> {', '.join(eda['categorical_cols'])}</p>

    <h3>Missing Values</h3>
    <ul>
    """]

    # Only columns with at least one missing value are listed.
    parts.extend(
        f"<li>{col}: {count} missing values</li>"
        for col, count in eda['null_counts'].items()
        if count > 0
    )
    parts.append("</ul>")

    if 'corr_heatmap' in eda:
        parts.append(f"""
    <h3>Correlation Heatmap</h3>
    <img src="data:image/png;base64,{eda['corr_heatmap']}" alt="Correlation Heatmap" width="600">
    """)

    parts.append(f"""
    <h3>Model Results</h3>
    <p><strong>Best Model:</strong> {model_results['best_model_name']}</p>
    <p><strong>Best {model_results['metric_name']}:</strong> {model_results['best_score']:.4f}</p>

    <h4>All Models Performance</h4>
    <table border="1" cellpadding="5">
    <tr>
    <th>Model</th>
    """)

    if problem_type == "Classification":
        parts.append("<th>Accuracy</th></tr>")
        for model_name, result in model_results['results'].items():
            parts.append(
                f"<tr><td>{model_name}</td>"
                f"<td>{result['accuracy']:.4f}</td></tr>"
            )
    else:
        parts.append("<th>MSE</th><th>R²</th></tr>")
        for model_name, result in model_results['results'].items():
            parts.append(
                f"<tr><td>{model_name}</td>"
                f"<td>{result['mse']:.4f}</td>"
                f"<td>{result['r2']:.4f}</td></tr>"
            )

    parts.append("</table>")

    if problem_type == "Classification":
        best_model = model_results['best_model_name']
        report = model_results['results'][best_model]['report']

        parts.append(f"""
    <h4>Classification Report for {best_model}</h4>
    <table border="1" cellpadding="5">
    <tr>
    <th>Class</th>
    <th>Precision</th>
    <th>Recall</th>
    <th>F1-Score</th>
    <th>Support</th>
    </tr>
    """)

        # Skip the aggregate rows sklearn's classification_report appends.
        for class_name, metrics in report.items():
            if class_name in ['accuracy', 'macro avg', 'weighted avg']:
                continue
            parts.append(
                f"<tr><td>{class_name}</td>"
                f"<td>{metrics['precision']:.4f}</td>"
                f"<td>{metrics['recall']:.4f}</td>"
                f"<td>{metrics['f1-score']:.4f}</td>"
                f"<td>{metrics['support']}</td></tr>"
            )

        parts.append("</table>")

    parts.append("""
    <h3>Model Download</h3>
    <p>Your model has been saved and is ready for download.</p>
    """)

    return "".join(parts)
|
|
|
|
|
def process_file(file, target_col):
    """Handle a Gradio upload: validate input, run the pipeline, return HTML.

    Args:
        file: the uploaded file — either an object exposing a `.name` path
            attribute (older Gradio File behavior) or a plain path string
            (newer Gradio versions pass the path directly).
        target_col: name of the target column entered by the user.

    Returns:
        HTML results string on success, or a plain error message.
    """
    if file is None:
        return "Please upload a CSV file."

    # Gradio versions differ in what gr.File hands the callback: a tempfile
    # wrapper with a .name attribute, or just the path. Accept both; the
    # original assumed `.name` and crashed on plain string paths.
    file_path = getattr(file, 'name', file)

    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"

    if target_col not in df.columns:
        return f"Target column '{target_col}' not found in the dataset. Available columns: {', '.join(df.columns)}"

    # Broad except is deliberate here: this is the UI boundary, and any
    # pipeline failure should surface as a readable message, not a traceback.
    try:
        results = process_dataset(df, target_col)
        return format_results_html(results)
    except Exception as e:
        return f"Error processing the dataset: {str(e)}"
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: two-column layout — upload/controls on the left, the rendered
# HTML report on the right. The single button drives the whole pipeline via
# process_file.
# ---------------------------------------------------------------------------
with gr.Blocks(title="AutoML for Structured Data") as demo:
    gr.Markdown("# AutoML for Structured Data")
    gr.Markdown("""
Upload a CSV file, specify the target column, and let AutoML do the rest! This app will:
1. Perform exploratory data analysis (EDA)
2. Determine if it's a regression or classification problem
3. Handle preprocessing (cleaning, encoding, etc.)
4. Train multiple models and select the best one
5. Display the results and allow you to download the model
""")

    with gr.Row():
        with gr.Column():
            # Input side: CSV upload, free-text target column name, trigger.
            file_input = gr.File(label="Upload CSV File")
            target_col = gr.Textbox(label="Target Column Name")
            submit_btn = gr.Button("Process Dataset")

        with gr.Column():
            # Output side: HTML produced by format_results_html (or an
            # error string from process_file).
            output = gr.HTML(label="Results")

    # Wire the button to the end-to-end handler.
    submit_btn.click(fn=process_file, inputs=[file_input, target_col], outputs=output)


# Start the Gradio server (blocking call).
demo.launch()