Spaces:

kidwaiaun
/

DataGenie

Sleeping

App Files Files Community

kidwaiaun committed on Feb 19, 2025

Commit

8b74775

verified ·

1 Parent(s): e6618ed

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -40

app.py CHANGED Viewed

@@ -1,48 +1,116 @@
 import gradio as gr
 import pandas as pd
-from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import StandardScaler
-from sklearn.ensemble import IsolationForest
-def preprocess_data(file, impute, normalize, detect_outliers):
-    """Apply the selected cleaning steps to an uploaded CSV and return HTML summaries.
-
-    Args:
-        file: Upload object; its ``name`` attribute is passed to ``pd.read_csv``.
-        impute: When truthy, fill missing values using ``SimpleImputer(strategy='mean')``.
-        normalize: When truthy, rescale every column with ``StandardScaler``.
-        detect_outliers: When truthy, fit an ``IsolationForest`` and keep only
-            the rows it labels ``1``.
-
-    Returns:
-        Two HTML strings: ``df.head()`` and ``df.describe()`` of the result.
-    """
-    # NOTE(review): every step below is applied to the whole frame, so this
-    # presumably expects all-numeric columns -- confirm against real uploads.
-    df = pd.read_csv(file.name)
-    if impute:
-        imputer = SimpleImputer(strategy='mean')
-        # fit_transform returns an array, so the frame is rebuilt with the original column names.
-        df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
-    if normalize:
-        scaler = StandardScaler()
-        df[df.columns] = scaler.fit_transform(df)
-    if detect_outliers:
-        iso_forest = IsolationForest(contamination=0.1)
-        # fit_predict yields one label per row; the mask below keeps rows labelled 1.
-        outliers = iso_forest.fit_predict(df)
-        df = df.iloc[outliers == 1]  # Keeping only non-outlier rows
-    return df.head().to_html(), df.describe().to_html()
 # Gradio Interface
-# Build the two-column page: inputs and the submit button in the first column,
-# the two HTML outputs in the second.
-with gr.Blocks() as demo:
-    gr.Markdown("# DataGenie")
-    gr.Markdown("## Automated Data Preprocessing and Feature Engineering Pipeline")
-    with gr.Row():
-        with gr.Column():
-            file_input = gr.File(label="Upload your dataset (CSV)")
-            impute_check = gr.Checkbox(label="Impute Missing Values")
-            normalize_check = gr.Checkbox(label="Normalize Data")
-            outlier_check = gr.Checkbox(label="Detect and Remove Outliers")
-            submit_btn = gr.Button("Process Data")
-        with gr.Column():
-            output_df = gr.HTML(label="Processed Data Preview")
-            stats_output = gr.HTML(label="Data Statistics")
-    # The inputs list matches preprocess_data's parameter order; the two
-    # outputs receive its two returned HTML strings.
-    submit_btn.click(
-        preprocess_data,
-        inputs=[file_input, impute_check, normalize_check, outlier_check],
-        outputs=[output_df, stats_output]
-    )
-# Launches at import time (this call is not under a __main__ guard).
-demo.launch()

 import gradio as gr
 import pandas as pd
+import json
+import numpy as np
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
+from sklearn.decomposition import PCA
+import shap
+import matplotlib.pyplot as plt
+import seaborn as sns
+# Preprocessing Functions
+def preprocess_data(file, encoding, scale_method, feature_selection):
+    """Load an uploaded table, clean it, and return a five-row preview.
+
+    Steps, in order: read the file by extension, forward- then back-fill
+    missing values, label-encode object columns, optionally scale every
+    column, and optionally project onto PCA components.
+
+    Args:
+        file: Upload object whose ``name`` attribute is the path on disk.
+        encoding: Text encoding passed to ``pd.read_csv`` (used for CSV only).
+        scale_method: ``"StandardScaler"`` or ``"MinMaxScaler"``; any other
+            value (e.g. ``"None"``) skips scaling.
+        feature_selection: When truthy, replace the columns with the PCA
+            components requested by ``n_components=0.95``.
+
+    Returns:
+        ``DataFrame.head()`` of the processed data.
+
+    Raises:
+        gr.Error: For an unsupported extension or any processing failure, so
+            the message is surfaced as an error instead of being handed to
+            the Dataframe output as a plain string.
+    """
+    path = file.name
+    # Match extensions case-insensitively so "DATA.CSV" is accepted too.
+    lowered = path.lower()
+    try:
+        if lowered.endswith('.csv'):
+            df = pd.read_csv(path, encoding=encoding)
+        elif lowered.endswith('.ndjson'):
+            # Newline-delimited JSON holds one record per line.
+            df = pd.read_json(path, orient='records', lines=True)
+        elif lowered.endswith('.json'):
+            df = pd.read_json(path, orient='records')
+        elif lowered.endswith(('.xlsx', '.xls')):
+            df = pd.read_excel(path)
+        else:
+            raise gr.Error("Unsupported file format!")
+        # Handling Missing Values (fillna(method=...) is deprecated in pandas 2.x)
+        df = df.ffill().bfill()
+        # Categorical Encoding
+        for col in df.select_dtypes(include=['object']).columns:
+            # Cast to str first: a column mixing strings with other values
+            # cannot be sorted by LabelEncoder.
+            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
+        # Feature Scaling
+        scalers = {'StandardScaler': StandardScaler, 'MinMaxScaler': MinMaxScaler}
+        scaler_cls = scalers.get(scale_method)
+        if scaler_cls is not None:
+            scaled = scaler_cls().fit_transform(df)
+            df = pd.DataFrame(scaled, columns=df.columns, index=df.index)
+        # Feature Selection
+        if feature_selection:
+            pca = PCA(n_components=0.95)
+            df = pd.DataFrame(pca.fit_transform(df))
+        return df.head()
+    except gr.Error:
+        # Already a user-facing error; do not wrap it a second time.
+        raise
+    except Exception as e:
+        raise gr.Error(f"Error processing data: {str(e)}") from e
+# SHAP Feature Importance Plot
+def feature_importance_plot(file):
+    """Fit an XGBoost classifier on the upload and save a SHAP summary plot.
+
+    The last column is treated as the target and all other columns as
+    features. Missing values are forward- then back-filled and object
+    columns are label-encoded before fitting.
+
+    Args:
+        file: Upload object whose ``name`` attribute is the path on disk.
+
+    Returns:
+        Path of the PNG file containing the plot.
+
+    Raises:
+        gr.Error: For an unsupported extension or any failure while loading,
+            fitting, or plotting, so the message is surfaced as an error
+            instead of being handed to the Image output as a string.
+    """
+    import tempfile
+    path = file.name
+    # Match extensions case-insensitively so "DATA.CSV" is accepted too.
+    lowered = path.lower()
+    try:
+        if lowered.endswith('.csv'):
+            df = pd.read_csv(path)
+        elif lowered.endswith('.ndjson'):
+            # Newline-delimited JSON holds one record per line.
+            df = pd.read_json(path, orient='records', lines=True)
+        elif lowered.endswith('.json'):
+            df = pd.read_json(path, orient='records')
+        elif lowered.endswith(('.xlsx', '.xls')):
+            df = pd.read_excel(path)
+        else:
+            raise gr.Error("Unsupported file format!")
+        # fillna(method=...) is deprecated in pandas 2.x
+        df = df.ffill().bfill()
+        # Encoding categorical columns
+        for col in df.select_dtypes(include=['object']).columns:
+            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
+        # Assuming last column is the target variable
+        X = df.iloc[:, :-1]
+        # Re-encode the target so class ids are contiguous integers from 0,
+        # which is what the classifier accepts (labels such as 1/2/3 are not).
+        y = LabelEncoder().fit_transform(df.iloc[:, -1])
+        import xgboost as xgb
+        model = xgb.XGBClassifier()
+        model.fit(X, y)
+        explainer = shap.Explainer(model)
+        shap_values = explainer(X)
+        fig = plt.figure(figsize=(10, 6))
+        try:
+            # show=False keeps the figure alive; the default displays it
+            # first, which can leave nothing for savefig to write.
+            shap.summary_plot(shap_values, X, show=False)
+            # A unique file per call so concurrent requests do not overwrite
+            # each other's image.
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                out_path = tmp.name
+            plt.savefig(out_path, bbox_inches="tight")
+        finally:
+            # Release the figure; pyplot keeps every open figure in memory.
+            plt.close(fig)
+        return out_path
+    except gr.Error:
+        # Already a user-facing error; do not wrap it a second time.
+        raise
+    except Exception as e:
+        raise gr.Error(f"Error in feature importance plot: {str(e)}") from e
 # Gradio Interface
+def gradio_app():
+    """Assemble the Blocks UI and return it without launching it.
+
+    Creates, in order: the title text, the file upload, the encoding and
+    scaling dropdowns, the PCA checkbox, the preprocess button with its
+    table output, and the feature-importance button with its image output.
+    """
+    with gr.Blocks() as demo:
+        gr.Markdown("""
+        # 🚀 Advanced Data Preprocessing & Feature Engineering App
+        Upload a dataset to preprocess and extract features.
+        """)
+        upload = gr.File(label="Upload Data File")
+        encoding_choice = gr.Dropdown(
+            ["utf-8", "ISO-8859-1"],
+            label="Select Encoding",
+            value="utf-8",
+        )
+        scaling_choice = gr.Dropdown(
+            ["None", "StandardScaler", "MinMaxScaler"],
+            label="Scaling Method",
+            value="None",
+        )
+        use_pca = gr.Checkbox(label="Apply PCA for Feature Selection", value=False)
+        run_preprocess = gr.Button("Preprocess Data")
+        preview_table = gr.Dataframe()
+        # Inputs are listed in preprocess_data's parameter order.
+        run_preprocess.click(
+            preprocess_data,
+            inputs=[upload, encoding_choice, scaling_choice, use_pca],
+            outputs=preview_table,
+        )
+        run_importance = gr.Button("Feature Importance Plot")
+        importance_image = gr.Image()
+        run_importance.click(
+            feature_importance_plot,
+            inputs=[upload],
+            outputs=importance_image,
+        )
+    return demo
+if __name__ == "__main__":
+    # Build the interface and start serving it only when run as a script.
+    gradio_app().launch()