File size: 4,041 Bytes
40229c7
 
8b74775
 
 
 
 
 
 
40229c7
8b74775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40229c7
8b74775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40229c7
 
8b74775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40229c7
8b74775
40229c7
8b74775
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Functions
def preprocess_data(file, encoding, scale_method, feature_selection):
    """Load an uploaded dataset, clean it, and return a preview of the result.

    The pipeline is: read the file, fill missing values (forward then
    backward), label-encode object columns, optionally scale every column,
    and optionally project the data with PCA.

    Args:
        file: The uploaded file. Either an object exposing the on-disk path
            as ``.name`` or a plain path string.
        encoding: Text encoding passed to ``pd.read_csv``. It is not used
            for JSON or Excel input.
        scale_method: ``'StandardScaler'`` or ``'MinMaxScaler'``; any other
            value (e.g. ``'None'``) skips scaling.
        feature_selection: When truthy, replace the columns with the output
            of ``PCA(n_components=0.95)``.

    Returns:
        The first five rows of the processed ``DataFrame``. On an
        unsupported extension or any processing failure a message string
        is returned instead of raising.
    """
    try:
        if file is None:
            raise ValueError("no file was uploaded")

        # Accept both file-like wrappers (path in ``.name``) and raw paths.
        path = file if isinstance(file, str) else file.name
        # Compare extensions case-insensitively so ``DATA.CSV`` is accepted.
        suffix = str(path).lower()

        if suffix.endswith('.csv'):
            df = pd.read_csv(path, encoding=encoding)
        elif suffix.endswith(('.json', '.ndjson')):
            # NDJSON holds one JSON document per line and needs lines=True.
            df = pd.read_json(
                path, orient='records', lines=suffix.endswith('.ndjson')
            )
        elif suffix.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return "Unsupported file format!"

        # Handling Missing Values: forward fill first, then backward fill to
        # cover gaps at the top of a column. ``fillna(method=...)`` is
        # deprecated in pandas, so the dedicated methods are used.
        df = df.ffill().bfill()

        # Categorical Encoding: cast to str so columns mixing strings with
        # other values can still be sorted by the encoder.
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        # Feature Scaling
        if scale_method == 'StandardScaler':
            scaler = StandardScaler()
        elif scale_method == 'MinMaxScaler':
            scaler = MinMaxScaler()
        else:
            scaler = None

        if scaler is not None:
            df[df.columns] = scaler.fit_transform(df[df.columns])

        # Feature Selection: the PCA output replaces the original columns,
        # so the resulting frame has positional integer column labels.
        if feature_selection:
            pca = PCA(n_components=0.95)
            df = pd.DataFrame(pca.fit_transform(df))

        return df.head()
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error processing data: {str(e)}"

# SHAP Feature Importance Plot
def feature_importance_plot(file):
    """Train an XGBoost classifier on the upload and save a SHAP summary plot.

    The last column of the dataset is treated as the target and every other
    column as a feature.

    Args:
        file: The uploaded file. Either an object exposing the on-disk path
            as ``.name`` or a plain path string.

    Returns:
        The path of the saved image (``"shap_plot.png"`` in the current
        working directory). On an unsupported extension or any failure a
        message string is returned instead of raising.
    """
    try:
        if file is None:
            raise ValueError("no file was uploaded")

        # Accept both file-like wrappers (path in ``.name``) and raw paths.
        path = file if isinstance(file, str) else file.name
        # Compare extensions case-insensitively so ``DATA.CSV`` is accepted.
        suffix = str(path).lower()

        if suffix.endswith('.csv'):
            df = pd.read_csv(path)
        elif suffix.endswith(('.json', '.ndjson')):
            # NDJSON holds one JSON document per line and needs lines=True.
            df = pd.read_json(
                path, orient='records', lines=suffix.endswith('.ndjson')
            )
        elif suffix.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return "Unsupported file format!"

        # ``fillna(method=...)`` is deprecated in pandas; use the dedicated
        # forward/backward fill methods instead.
        df = df.ffill().bfill()

        # Encoding categorical columns; cast to str so columns mixing
        # strings with other values can still be sorted by the encoder.
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        if df.shape[1] < 2:
            raise ValueError(
                "dataset needs at least one feature column and a target column"
            )

        # Assuming last column is the target variable
        X = df.iloc[:, :-1]
        # XGBClassifier expects class labels 0..k-1, so numeric targets such
        # as 1/2/3 are re-encoded as well.
        y = LabelEncoder().fit_transform(df.iloc[:, -1])

        # Imported lazily so the rest of the app works without xgboost.
        import xgboost as xgb
        model = xgb.XGBClassifier()
        model.fit(X, y)

        explainer = shap.Explainer(model)
        shap_values = explainer(X)

        fig = plt.figure(figsize=(10, 6))
        try:
            # show=False keeps the figure alive so savefig writes the actual
            # plot rather than an empty canvas.
            shap.summary_plot(shap_values, X, show=False)
            plt.savefig("shap_plot.png", bbox_inches="tight")
        finally:
            # Release the figure; otherwise every call leaks one.
            plt.close(fig)
        return "shap_plot.png"
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error in feature importance plot: {str(e)}"

# Gradio Interface
def gradio_app():
    """Assemble the Blocks UI and return it without launching.

    The layout is a file upload, the preprocessing options (encoding,
    scaling method, PCA toggle), a button that sends those inputs to
    ``preprocess_data`` and shows the result in a dataframe, and a second
    button that sends the uploaded file to ``feature_importance_plot`` and
    shows the result in an image component.
    """
    intro_markdown = """
        # 🚀 Advanced Data Preprocessing & Feature Engineering App
        Upload a dataset to preprocess and extract features.
        """
    encoding_choices = ["utf-8", "ISO-8859-1"]
    scaler_choices = ["None", "StandardScaler", "MinMaxScaler"]

    with gr.Blocks() as ui:
        gr.Markdown(intro_markdown)

        # Shared inputs: the upload feeds both actions below.
        uploaded = gr.File(label="Upload Data File")
        encoding_picker = gr.Dropdown(
            choices=encoding_choices,
            label="Select Encoding",
            value="utf-8",
        )
        scaler_picker = gr.Dropdown(
            choices=scaler_choices,
            label="Scaling Method",
            value="None",
        )
        pca_toggle = gr.Checkbox(
            label="Apply PCA for Feature Selection",
            value=False,
        )

        # Action 1: preprocessing preview.
        run_preprocess = gr.Button("Preprocess Data")
        preview_table = gr.Dataframe()
        run_preprocess.click(
            fn=preprocess_data,
            inputs=[uploaded, encoding_picker, scaler_picker, pca_toggle],
            outputs=preview_table,
        )

        # Action 2: SHAP feature-importance image.
        run_importance = gr.Button("Feature Importance Plot")
        importance_image = gr.Image()
        run_importance.click(
            fn=feature_importance_plot,
            inputs=[uploaded],
            outputs=importance_image,
        )

    return ui

if __name__ == "__main__":
    # Script entry point: build the interface and start serving it.
    gradio_app().launch()