# DataGenie / app.py
# kidwaiaun's picture
# Update app.py
# 8b74775 verified
import gradio as gr
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
import shap
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing Functions
def preprocess_data(file, encoding, scale_method, feature_selection):
    """Load an uploaded dataset, clean it, and return a preview of the result.

    The pipeline reads the file according to its extension, fills missing
    values (forward fill, then backward fill for leading gaps), label-encodes
    text columns, optionally scales every column, and optionally reduces the
    columns with PCA.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string.
        encoding: Text encoding used when reading CSV and JSON files.
        scale_method: ``"StandardScaler"`` or ``"MinMaxScaler"``; any other
            value (for example ``"None"``) disables scaling.
        feature_selection: When truthy, replace the columns with the PCA
            components selected by ``PCA(n_components=0.95)``.

    Returns:
        The first five rows of the processed ``DataFrame``. For an unknown
        extension the string ``"Unsupported file format!"`` is returned, and
        if any step raises, an ``"Error processing data: ..."`` string.
    """
    try:
        if file is None:
            return "Error processing data: no file was uploaded"

        # Accept both upload objects (path in ``.name``) and bare path strings.
        path = getattr(file, "name", file)
        lowered = str(path).lower()  # match extensions case-insensitively

        if lowered.endswith('.csv'):
            df = pd.read_csv(path, encoding=encoding)
        elif lowered.endswith('.ndjson'):
            # Newline-delimited JSON holds one record per line and must be
            # parsed with ``lines=True``; otherwise pandas rejects the file.
            df = pd.read_json(path, orient='records', lines=True, encoding=encoding)
        elif lowered.endswith('.json'):
            df = pd.read_json(path, orient='records', encoding=encoding)
        elif lowered.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return "Unsupported file format!"

        # Handling Missing Values: ``fillna(method=...)`` is deprecated, so use
        # the dedicated methods. The backward fill covers leading gaps.
        df = df.ffill().bfill()

        # Categorical Encoding
        for col in df.columns:
            column = df[col]
            if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                # Cast to str so columns mixing value types can still be
                # sorted and encoded by LabelEncoder.
                df[col] = LabelEncoder().fit_transform(column.astype(str))

        # Feature Scaling
        if scale_method == 'StandardScaler':
            scaler = StandardScaler()
        elif scale_method == 'MinMaxScaler':
            scaler = MinMaxScaler()
        else:
            scaler = None

        if scaler is not None:
            df = pd.DataFrame(
                scaler.fit_transform(df),
                columns=df.columns,
                index=df.index,
            )

        # Feature Selection
        if feature_selection:
            pca = PCA(n_components=0.95)
            df = pd.DataFrame(pca.fit_transform(df))

        return df.head()
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing data: {str(e)}"
# SHAP Feature Importance Plot
def feature_importance_plot(file):
    """Train an XGBoost classifier on the upload and save a SHAP summary plot.

    The last column of the dataset is treated as the target and all other
    columns as features. Missing values are filled (forward, then backward)
    and text columns are label-encoded before training.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string.

    Returns:
        ``"shap_plot.png"`` (the path of the saved image) on success. For an
        unknown extension the string ``"Unsupported file format!"`` is
        returned, and if any step raises, an
        ``"Error in feature importance plot: ..."`` string.
    """
    try:
        if file is None:
            return "Error in feature importance plot: no file was uploaded"

        # Accept both upload objects (path in ``.name``) and bare path strings.
        path = getattr(file, "name", file)
        lowered = str(path).lower()  # match extensions case-insensitively

        if lowered.endswith('.csv'):
            df = pd.read_csv(path)
        elif lowered.endswith('.ndjson'):
            # Newline-delimited JSON holds one record per line and must be
            # parsed with ``lines=True``; otherwise pandas rejects the file.
            df = pd.read_json(path, orient='records', lines=True)
        elif lowered.endswith('.json'):
            df = pd.read_json(path, orient='records')
        elif lowered.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return "Unsupported file format!"

        # ``fillna(method=...)`` is deprecated; the backward fill covers
        # leading gaps that a forward fill cannot reach.
        df = df.ffill().bfill()

        # Encoding categorical columns
        for col in df.columns:
            column = df[col]
            if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                # Cast to str so columns mixing value types can still be
                # sorted and encoded by LabelEncoder.
                df[col] = LabelEncoder().fit_transform(column.astype(str))

        # Assuming last column is the target variable
        X = df.iloc[:, :-1]
        # Re-encode the target as consecutive integers starting at 0 so that
        # label sets such as {1, 2, 3} are accepted by the classifier.
        y = LabelEncoder().fit_transform(df.iloc[:, -1])

        # Imported lazily so the rest of the app works without xgboost.
        import xgboost as xgb
        model = xgb.XGBClassifier()
        model.fit(X, y)

        explainer = shap.Explainer(model)
        shap_values = explainer(X)

        plt.figure(figsize=(10, 6))
        try:
            # show=False keeps the figure alive; with the default the plot is
            # displayed first and the saved image can come out blank.
            shap.summary_plot(shap_values, X, show=False)
            plt.savefig("shap_plot.png", bbox_inches="tight")
        finally:
            # Release the figure so repeated requests do not accumulate them.
            plt.close()
        return "shap_plot.png"
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error in feature importance plot: {str(e)}"
# Gradio Interface
def gradio_app():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout has a file upload, the preprocessing options with a button
    that feeds ``preprocess_data`` into a dataframe view, and a second button
    that feeds ``feature_importance_plot`` into an image view.
    """
    intro_markdown = (
        "\n"
        "# 🚀 Advanced Data Preprocessing & Feature Engineering App\n"
        "Upload a dataset to preprocess and extract features.\n"
    )

    with gr.Blocks() as demo:
        gr.Markdown(intro_markdown)

        # Inputs shared by both actions.
        upload = gr.File(label="Upload Data File")
        encoding_choice = gr.Dropdown(
            ["utf-8", "ISO-8859-1"],
            label="Select Encoding",
            value="utf-8",
        )
        scaling_choice = gr.Dropdown(
            ["None", "StandardScaler", "MinMaxScaler"],
            label="Scaling Method",
            value="None",
        )
        pca_toggle = gr.Checkbox(
            label="Apply PCA for Feature Selection",
            value=False,
        )

        # Preprocessing action and its preview table.
        run_preprocess = gr.Button("Preprocess Data")
        preview_table = gr.Dataframe()
        run_preprocess.click(
            preprocess_data,
            inputs=[upload, encoding_choice, scaling_choice, pca_toggle],
            outputs=preview_table,
        )

        # Feature-importance action and its image output.
        run_importance = gr.Button("Feature Importance Plot")
        importance_image = gr.Image()
        run_importance.click(
            feature_importance_plot,
            inputs=[upload],
            outputs=importance_image,
        )

    return demo
if __name__ == "__main__":
    # Script entry point: build the interface, then start the Gradio server.
    app = gradio_app()
    app.launch()