|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import json |
|
|
import numpy as np |
|
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder |
|
|
from sklearn.decomposition import PCA |
|
|
import shap |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
|
|
|
|
|
|
def preprocess_data(file, encoding, scale_method, feature_selection):
    """Load an uploaded data file, clean it, and return a preview.

    Pipeline: read the file (CSV / JSON / Excel), fill missing values by
    forward- then backward-fill, label-encode object-dtype columns,
    optionally scale every column, optionally reduce dimensionality with
    PCA, and return the first five rows.

    Args:
        file: Uploaded file object exposing a ``.name`` path attribute
            (the shape Gradio's File component provides).
        encoding: Text encoding used when reading CSV files.
        scale_method: 'StandardScaler', 'MinMaxScaler', or anything else
            (e.g. 'None') for no scaling.
        feature_selection: When truthy, apply PCA keeping components that
            explain 95% of the variance.

    Returns:
        A pandas DataFrame preview (``df.head()``) on success, or an
        error-message string on failure (displayed by the Gradio output).
    """
    try:
        name = file.name
        if name.endswith('.csv'):
            df = pd.read_csv(name, encoding=encoding)
        elif name.endswith(('.json', '.ndjson')):
            df = pd.read_json(name, orient='records')
        elif name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(name)
        else:
            return "Unsupported file format!"

        # fillna(method=...) is deprecated and removed in pandas 3.0;
        # the dedicated ffill/bfill methods are the supported spelling.
        df = df.ffill().bfill()

        # Encode categorical (object-dtype) columns as integer codes.
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = LabelEncoder().fit_transform(df[col])

        if scale_method == 'StandardScaler':
            scaler = StandardScaler()
        elif scale_method == 'MinMaxScaler':
            scaler = MinMaxScaler()
        else:
            scaler = None

        if scaler is not None:
            df[df.columns] = scaler.fit_transform(df[df.columns])

        if feature_selection:
            # Keep enough principal components to explain 95% of variance.
            pca = PCA(n_components=0.95)
            df = pd.DataFrame(pca.fit_transform(df))

        return df.head()
    except Exception as e:
        return f"Error processing data: {str(e)}"
|
|
|
|
|
|
|
|
def feature_importance_plot(file):
    """Train an XGBoost classifier on the uploaded data and save a SHAP
    summary plot of feature importances.

    The last column of the dataset is treated as the target and all other
    columns as features. Missing values are forward/backward filled and
    object-dtype columns are label-encoded before training.

    Args:
        file: Uploaded file object exposing a ``.name`` path attribute.

    Returns:
        The path of the saved plot image ("shap_plot.png") on success,
        or an error-message string on failure.
    """
    try:
        name = file.name
        if name.endswith('.csv'):
            df = pd.read_csv(name)
        elif name.endswith(('.json', '.ndjson')):
            df = pd.read_json(name, orient='records')
        elif name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(name)
        else:
            return "Unsupported file format!"

        # fillna(method=...) is deprecated and removed in pandas 3.0.
        df = df.ffill().bfill()

        for col in df.select_dtypes(include=['object']).columns:
            df[col] = LabelEncoder().fit_transform(df[col])

        # Convention: last column is the target, the rest are features.
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        # Imported lazily so the rest of the app works without xgboost.
        import xgboost as xgb
        model = xgb.XGBClassifier()
        model.fit(X, y)

        explainer = shap.Explainer(model)
        shap_values = explainer(X)

        plt.figure(figsize=(10, 6))
        # show=False stops SHAP from calling plt.show(), so the figure is
        # still current (and populated) when we save it below.
        shap.summary_plot(shap_values, X, show=False)
        plt.savefig("shap_plot.png", bbox_inches="tight")
        plt.close()  # release the figure; avoids leaking memory per call
        return "shap_plot.png"
    except Exception as e:
        return f"Error in feature importance plot: {str(e)}"
|
|
|
|
|
|
|
|
def gradio_app():
    """Assemble and return the Gradio Blocks UI for the app.

    Wires the upload/option widgets to ``preprocess_data`` and
    ``feature_importance_plot`` and returns the (unlaunched) demo.
    """
    with gr.Blocks() as demo:
        gr.Markdown("""
        # 🚀 Advanced Data Preprocessing & Feature Engineering App
        Upload a dataset to preprocess and extract features.
        """)

        # Input widgets shared by both actions.
        upload = gr.File(label="Upload Data File")
        enc_choice = gr.Dropdown(
            ["utf-8", "ISO-8859-1"], label="Select Encoding", value="utf-8"
        )
        scaler_choice = gr.Dropdown(
            ["None", "StandardScaler", "MinMaxScaler"],
            label="Scaling Method",
            value="None",
        )
        pca_flag = gr.Checkbox(label="Apply PCA for Feature Selection", value=False)

        # Preprocessing action: DataFrame preview as output.
        run_preprocess = gr.Button("Preprocess Data")
        preview = gr.Dataframe()
        run_preprocess.click(
            preprocess_data,
            inputs=[upload, enc_choice, scaler_choice, pca_flag],
            outputs=preview,
        )

        # Feature-importance action: image output.
        run_importance = gr.Button("Feature Importance Plot")
        plot_out = gr.Image()
        run_importance.click(feature_importance_plot, inputs=[upload], outputs=plot_out)

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    gradio_app().launch()
|
|
|