kidwaiaun committed on
Commit
8b74775
·
verified ·
1 Parent(s): e6618ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -40
app.py CHANGED
@@ -1,48 +1,116 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from sklearn.impute import SimpleImputer
4
- from sklearn.preprocessing import StandardScaler
5
- from sklearn.ensemble import IsolationForest
 
 
 
 
6
 
7
- def preprocess_data(file, impute, normalize, detect_outliers):
8
- df = pd.read_csv(file.name)
9
-
10
- if impute:
11
- imputer = SimpleImputer(strategy='mean')
12
- df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
13
-
14
- if normalize:
15
- scaler = StandardScaler()
16
- df[df.columns] = scaler.fit_transform(df)
17
-
18
- if detect_outliers:
19
- iso_forest = IsolationForest(contamination=0.1)
20
- outliers = iso_forest.fit_predict(df)
21
- df = df.iloc[outliers == 1] # Keeping only non-outlier rows
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- return df.head().to_html(), df.describe().to_html()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # Gradio Interface
26
- with gr.Blocks() as demo:
27
- gr.Markdown("# DataGenie")
28
- gr.Markdown("## Automated Data Preprocessing and Feature Engineering Pipeline")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- with gr.Row():
31
- with gr.Column():
32
- file_input = gr.File(label="Upload your dataset (CSV)")
33
- impute_check = gr.Checkbox(label="Impute Missing Values")
34
- normalize_check = gr.Checkbox(label="Normalize Data")
35
- outlier_check = gr.Checkbox(label="Detect and Remove Outliers")
36
- submit_btn = gr.Button("Process Data")
37
-
38
- with gr.Column():
39
- output_df = gr.HTML(label="Processed Data Preview")
40
- stats_output = gr.HTML(label="Data Statistics")
41
-
42
- submit_btn.click(
43
- preprocess_data,
44
- inputs=[file_input, impute_check, normalize_check, outlier_check],
45
- outputs=[output_df, stats_output]
46
- )
47
 
48
- demo.launch()
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import json
4
+ import numpy as np
5
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
6
+ from sklearn.decomposition import PCA
7
+ import shap
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
 
11
+ # Preprocessing Functions
12
def preprocess_data(file, encoding, scale_method, feature_selection):
    """Load an uploaded table and run the preprocessing pipeline on it.

    Steps, in order: read the file according to its extension, fill missing
    values (forward fill, then backward fill), label-encode every
    object-dtype column, optionally scale all columns, and optionally
    project the data with PCA.

    Args:
        file: Object whose ``name`` attribute is the path of the uploaded
            file. Supported extensions (case-insensitive): ``.csv``,
            ``.json``, ``.ndjson``, ``.xlsx``, ``.xls``.
        encoding: Text encoding used when reading CSV files.
        scale_method: ``'StandardScaler'`` or ``'MinMaxScaler'``; any other
            value (e.g. ``'None'``) skips scaling.
        feature_selection: When truthy, replace the columns with the PCA
            components that retain 95% of the variance.

    Returns:
        The first rows (``DataFrame.head()``) of the processed frame, or a
        message string when the format is unsupported or any step fails.
    """
    # NOTE(review): the string returns are kept for backward compatibility;
    # confirm the output component wired to this function can display them.
    try:
        path = file.name
        # Compare on a lowered copy so "DATA.CSV" is accepted like "data.csv".
        suffix = path.lower()
        if suffix.endswith('.csv'):
            df = pd.read_csv(path, encoding=encoding)
        elif suffix.endswith('.ndjson'):
            # Newline-delimited JSON holds one record per line and only
            # parses with lines=True.
            df = pd.read_json(path, orient='records', lines=True)
        elif suffix.endswith('.json'):
            df = pd.read_json(path, orient='records')
        elif suffix.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return "Unsupported file format!"

        # Handling Missing Values: forward fill first, then backward fill to
        # cover gaps at the top of a column. fillna(method=...) is
        # deprecated, so the dedicated methods are used instead.
        df = df.ffill().bfill()

        # Categorical Encoding
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = LabelEncoder().fit_transform(df[col])

        # Feature Scaling
        if scale_method == 'StandardScaler':
            scaler = StandardScaler()
        elif scale_method == 'MinMaxScaler':
            scaler = MinMaxScaler()
        else:
            scaler = None

        if scaler is not None:
            df[df.columns] = scaler.fit_transform(df[df.columns])

        # Feature Selection: a float n_components asks PCA to keep enough
        # components to explain that fraction of the variance.
        if feature_selection:
            pca = PCA(n_components=0.95)
            df = pd.DataFrame(pca.fit_transform(df))

        return df.head()
    except Exception as e:
        # Top-level UI boundary: report the failure as text instead of
        # raising into the caller.
        return f"Error processing data: {e}"
51
 
52
+ # SHAP Feature Importance Plot
53
def feature_importance_plot(file):
    """Fit an XGBoost classifier on the uploaded table and save a SHAP plot.

    The file is read according to its extension, missing values are filled
    (forward fill, then backward fill), object-dtype columns are
    label-encoded, and the last column is used as the target. A SHAP summary
    plot for the fitted model is written to a PNG file.

    Args:
        file: Object whose ``name`` attribute is the path of the uploaded
            file. Supported extensions (case-insensitive): ``.csv``,
            ``.json``, ``.ndjson``, ``.xlsx``, ``.xls``.

    Returns:
        Path of the PNG image on success, or a message string when the
        format is unsupported or any step fails.
    """
    # NOTE(review): the string returns are kept for backward compatibility;
    # confirm the output component wired to this function can display them.
    try:
        path = file.name
        # Compare on a lowered copy so "DATA.CSV" is accepted like "data.csv".
        suffix = path.lower()
        if suffix.endswith('.csv'):
            df = pd.read_csv(path)
        elif suffix.endswith('.ndjson'):
            # Newline-delimited JSON holds one record per line and only
            # parses with lines=True.
            df = pd.read_json(path, orient='records', lines=True)
        elif suffix.endswith('.json'):
            df = pd.read_json(path, orient='records')
        elif suffix.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return "Unsupported file format!"

        # fillna(method=...) is deprecated; use the dedicated methods.
        df = df.ffill().bfill()

        # Encoding categorical columns
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = LabelEncoder().fit_transform(df[col])

        # Assuming last column is the target variable
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        # NOTE(review): a numeric target is passed to the classifier as-is;
        # confirm its labels are in a form XGBClassifier accepts.
        import xgboost as xgb
        model = xgb.XGBClassifier()
        model.fit(X, y)

        explainer = shap.Explainer(model)
        shap_values = explainer(X)

        import os
        import tempfile

        # A unique file per call keeps concurrent requests from overwriting
        # each other's image.
        handle, image_path = tempfile.mkstemp(prefix="shap_plot_", suffix=".png")
        os.close(handle)

        fig = plt.figure(figsize=(10, 6))
        try:
            # show=False keeps the figure alive so savefig captures the plot
            # rather than an already-shown, possibly blank canvas.
            shap.summary_plot(shap_values, X, show=False)
            plt.savefig(image_path, bbox_inches="tight")
        finally:
            # Release the figure; otherwise each request leaks one.
            plt.close(fig)
        return image_path
    except Exception as e:
        # Top-level UI boundary: report the failure as text instead of
        # raising into the caller.
        return f"Error in feature importance plot: {e}"
88
 
89
  # Gradio Interface
90
def gradio_app():
    """Assemble the Blocks interface and return it without launching it.

    The layout holds a file upload, an encoding dropdown, a scaling-method
    dropdown and a PCA checkbox feeding ``preprocess_data``, plus a second
    button that sends the same uploaded file to ``feature_importance_plot``.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    intro_text = (
        "\n"
        "# 🚀 Advanced Data Preprocessing & Feature Engineering App\n"
        "Upload a dataset to preprocess and extract features.\n"
    )
    encoding_choices = ["utf-8", "ISO-8859-1"]
    scaling_choices = ["None", "StandardScaler", "MinMaxScaler"]

    with gr.Blocks() as demo:
        gr.Markdown(intro_text)

        # Input widgets shared by both actions.
        upload = gr.File(label="Upload Data File")
        encoding_picker = gr.Dropdown(
            encoding_choices, label="Select Encoding", value="utf-8"
        )
        scaling_picker = gr.Dropdown(
            scaling_choices, label="Scaling Method", value="None"
        )
        pca_toggle = gr.Checkbox(
            label="Apply PCA for Feature Selection", value=False
        )

        # Preprocessing action and its table output.
        run_preprocess = gr.Button("Preprocess Data")
        table_out = gr.Dataframe()
        run_preprocess.click(
            preprocess_data,
            inputs=[upload, encoding_picker, scaling_picker, pca_toggle],
            outputs=table_out,
        )

        # Feature-importance action and its image output.
        run_importance = gr.Button("Feature Importance Plot")
        image_out = gr.Image()
        run_importance.click(
            feature_importance_plot,
            inputs=[upload],
            outputs=image_out,
        )

    return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
# Script entry point: build the interface and start serving it.
if __name__ == "__main__":
    gradio_app().launch()