"""Gradio app for tabular data preprocessing and SHAP feature importance.

The app accepts a CSV, JSON/NDJSON or Excel upload and offers two actions:

* preprocess the data (fill gaps, encode categoricals, optional scaling and
  PCA) and preview the first rows;
* train an XGBoost classifier on the data, treating the last column as the
  target, and render a SHAP summary plot.
"""

import json
import os

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Maps the "Scaling Method" dropdown value to the scaler class to instantiate.
# Any other value (e.g. "None") means "do not scale".
_SCALERS = {
    "StandardScaler": StandardScaler,
    "MinMaxScaler": MinMaxScaler,
}


def _resolve_path(file):
    """Return the filesystem path for an uploaded file.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing the
    path through a ``.name`` attribute, which is what the original code
    expected from the upload component.

    Raises:
        gr.Error: If nothing was uploaded (``file`` is ``None``).
    """
    if file is None:
        raise gr.Error("Please upload a data file first.")
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def _load_dataframe(file, encoding="utf-8"):
    """Read an uploaded CSV, JSON, NDJSON or Excel file into a DataFrame.

    The format is chosen from the file extension, compared case-insensitively.
    ``encoding`` is only used for CSV input.

    Raises:
        gr.Error: If no file was supplied or the extension is not supported.
    """
    path = _resolve_path(file)
    suffix = os.path.splitext(path)[1].lower()
    if suffix == ".csv":
        return pd.read_csv(path, encoding=encoding or "utf-8")
    if suffix == ".ndjson":
        # Newline-delimited JSON has one object per line, so it needs
        # lines=True rather than a single records array.
        return pd.read_json(path, lines=True)
    if suffix == ".json":
        return pd.read_json(path, orient="records")
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(path)
    raise gr.Error("Unsupported file format!")


def _clean_dataframe(df):
    """Return a copy of ``df`` with gaps filled and object columns encoded.

    Missing values are forward-filled, then back-filled so leading gaps are
    covered too. Each object-dtype column is replaced by integer codes from a
    fresh ``LabelEncoder``.
    """
    # ffill()/bfill() replace the deprecated fillna(method=...) form and
    # return new frames, so the caller's DataFrame is left untouched.
    df = df.ffill().bfill()
    for col in df.select_dtypes(include=["object"]).columns:
        # Cast to str so columns mixing strings with other types can be
        # sorted by the encoder instead of raising TypeError.
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df


# Preprocessing Functions
def preprocess_data(file, encoding, scale_method, feature_selection):
    """Load, clean and optionally scale/reduce an uploaded dataset.

    Args:
        file: Uploaded file (path, or object whose ``.name`` is the path).
        encoding: Text encoding used when the file is a CSV.
        scale_method: ``"StandardScaler"`` or ``"MinMaxScaler"`` to scale every
            column; any other value skips scaling.
        feature_selection: When truthy, replace the columns with the PCA
            components that explain 95% of the variance.

    Returns:
        The first rows (``DataFrame.head()``) of the processed data.

    Raises:
        gr.Error: On any failure, so Gradio shows the message to the user
            instead of trying to render an error string as a table.
    """
    try:
        df = _clean_dataframe(_load_dataframe(file, encoding))

        # Feature Scaling
        scaler_cls = _SCALERS.get(scale_method)
        if scaler_cls is not None:
            scaled = scaler_cls().fit_transform(df)
            df = pd.DataFrame(scaled, columns=df.columns, index=df.index)

        # Feature Selection
        if feature_selection:
            df = pd.DataFrame(PCA(n_components=0.95).fit_transform(df))

        return df.head()
    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing data: {e}") from e


# SHAP Feature Importance Plot
def feature_importance_plot(file, encoding="utf-8"):
    """Train an XGBoost classifier and save a SHAP summary plot.

    The last column of the dataset is used as the target and all preceding
    columns as features.

    Args:
        file: Uploaded file (path, or object whose ``.name`` is the path).
        encoding: Text encoding used when the file is a CSV.

    Returns:
        Path of the saved plot image (``"shap_plot.png"``).

    Raises:
        gr.Error: On any failure, so Gradio shows the message to the user
            instead of trying to open an error string as an image file.
    """
    try:
        df = _clean_dataframe(_load_dataframe(file, encoding))
        if df.shape[1] < 2:
            raise ValueError(
                "the dataset needs at least one feature column and one target column"
            )

        # Assuming last column is the target variable
        X = df.iloc[:, :-1]
        # XGBClassifier expects class labels 0..n-1, so encode the target
        # even when it is already numeric (e.g. classes {1, 2, 3}).
        y = LabelEncoder().fit_transform(df.iloc[:, -1])

        # Imported lazily so the preprocessing path works without xgboost.
        import xgboost as xgb

        model = xgb.XGBClassifier()
        model.fit(X, y)
        explainer = shap.Explainer(model)
        shap_values = explainer(X)

        output_path = "shap_plot.png"
        plt.figure(figsize=(10, 6))
        try:
            # show=False keeps the figure alive; the default calls plt.show(),
            # after which savefig can write an empty image.
            shap.summary_plot(shap_values, X, show=False)
            plt.savefig(output_path, bbox_inches="tight")
        finally:
            # Close the figure so repeated clicks do not accumulate figures.
            plt.close()
        return output_path
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Error in feature importance plot: {e}") from e


# Gradio Interface
def gradio_app():
    """Build the Gradio Blocks UI and return it without launching."""
    with gr.Blocks() as demo:
        gr.Markdown(
            "# 🚀 Advanced Data Preprocessing & Feature Engineering App\n"
            "Upload a dataset to preprocess and extract features."
        )

        file = gr.File(label="Upload Data File")
        encoding = gr.Dropdown(
            ["utf-8", "ISO-8859-1"], label="Select Encoding", value="utf-8"
        )
        scale_method = gr.Dropdown(
            ["None", "StandardScaler", "MinMaxScaler"],
            label="Scaling Method",
            value="None",
        )
        feature_selection = gr.Checkbox(
            label="Apply PCA for Feature Selection", value=False
        )

        preprocess_button = gr.Button("Preprocess Data")
        output_data = gr.Dataframe()
        preprocess_button.click(
            preprocess_data,
            inputs=[file, encoding, scale_method, feature_selection],
            outputs=output_data,
        )

        feature_button = gr.Button("Feature Importance Plot")
        output_image = gr.Image()
        # Pass the chosen encoding too, so both actions read CSVs the same way.
        feature_button.click(
            feature_importance_plot,
            inputs=[file, encoding],
            outputs=output_image,
        )

    return demo


if __name__ == "__main__":
    app = gradio_app()
    app.launch()