ilsa15 committed on
Commit
da84785
·
verified ·
1 Parent(s): 96fe723

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -0
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import io
7
+ from PIL import Image
8
+
9
+ # 🔹 Main Cleaning Function
10
def clean_csv(file_path):
    """Clean an uploaded CSV and summarise what changed.

    Pipeline: drop duplicate rows, fill numeric gaps with the column mean,
    fill remaining gaps with the column mode, then drop rows lying more than
    three standard deviations from the mean of any numeric column.

    Args:
        file_path: Path to the CSV file, or ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(report, heatmap, cleaned_path, preview_html)``:
        a Markdown report string, a PIL image of the pre-cleaning missing-value
        heatmap (``None`` when the CSV has no cells to plot), the path of the
        cleaned CSV written to disk, and an HTML table of the first five
        cleaned rows. When no file is given or any step fails, the first item
        is a user-facing message and the remaining three are ``None``.
    """
    # Function-scope imports keep this block self-contained; both are stdlib.
    import os
    import tempfile

    if file_path is None:
        return "❌ Please upload a CSV.", None, None, None

    try:
        raw = pd.read_csv(file_path)

        # --- Stats of the untouched data ---
        # The mask is reused for the heatmap, so the file is parsed only once.
        missing_mask = raw.isnull()
        before_shape = raw.shape
        before_missing = missing_mask.sum().sum()
        before_duplicates = raw.duplicated().sum()

        # --- Cleaning steps ---
        df = raw.drop_duplicates()
        df = df.fillna(df.mean(numeric_only=True))  # numeric NaN -> mean
        df = df.apply(_fill_with_mode)              # remaining NaN -> mode
        df = _drop_outliers(df)

        # --- Stats of the cleaned data ---
        after_shape = df.shape
        after_missing = df.isnull().sum().sum()
        after_duplicates = df.duplicated().sum()

        heatmap_img = _render_missing_heatmap(missing_mask)

        # Write into a fresh per-call directory: a fixed name in the working
        # directory would be overwritten by a concurrent request. The file
        # name itself is kept so the download is still "cleaned_data.csv".
        out_dir = tempfile.mkdtemp(prefix="cleaned_csv_")
        cleaned_path = os.path.join(out_dir, "cleaned_data.csv")
        df.to_csv(cleaned_path, index=False)

        # Built from unindented lines so Markdown never sees leading spaces
        # (indented lines would render as a code block).
        report = "\n".join([
            "",
            "## 🧹 Data Cleaning Report",
            "",
            "**Before Cleaning**",
            f"- Shape: {before_shape}",
            f"- Missing values: {before_missing}",
            f"- Duplicates: {before_duplicates}",
            "",
            "**After Cleaning**",
            f"- Shape: {after_shape}",
            f"- Missing values: {after_missing}",
            f"- Duplicates: {after_duplicates}",
            "",
        ])

        # --- Preview (first 5 rows) ---
        preview_html = df.head(5).to_html(index=False)

        return report, heatmap_img, cleaned_path, preview_html

    except Exception as e:
        # UI boundary: surface the failure as a message instead of crashing
        # the event handler.
        return f"⚠️ Error: {e}", None, None, None


def _fill_with_mode(column):
    """Fill missing values in *column* with its most frequent value."""
    if not column.isnull().any():
        return column
    modes = column.mode()
    if modes.empty:
        # Every value is missing, so there is nothing to impute from;
        # leave the column as-is rather than failing the whole clean.
        return column
    return column.fillna(modes.iloc[0])


def _drop_outliers(df, z_limit=3):
    """Drop rows beyond *z_limit* standard deviations in any numeric column.

    Columns are processed in order and the mean/std of each column is taken
    from the rows that survived the previous columns' filters.
    """
    for col in df.select_dtypes(include="number").columns:
        mean, std = df[col].mean(), df[col].std()
        # Skips constant columns (std == 0) and columns whose std is NaN.
        if std > 0:
            lower, upper = mean - z_limit * std, mean + z_limit * std
            df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df


def _render_missing_heatmap(missing_mask):
    """Render the boolean missing-value mask as a PIL image, or None if empty."""
    if missing_mask.size == 0:
        # No cells to draw (e.g. a header-only CSV).
        return None

    # Explicit figure/axes instead of pyplot's implicit "current figure",
    # and close the figure even when drawing fails.
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(missing_mask, cbar=False, cmap="viridis", ax=ax)
        ax.set_title("Missing Values Heatmap (Before Cleaning)")
        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
77
+
78
+
79
+ # 🎨 Gradio UI
80
# Layout, top to bottom: title, upload + button, report/heatmap row,
# download/preview row.
with gr.Blocks(theme="soft") as demo:
    gr.Markdown(
        "## 🧹 AI Data Cleaner\n"
        "Upload a CSV → Clean it → Get Report + Heatmap + Download"
    )

    # type="filepath" makes the component pass a path string to clean_csv.
    file_input = gr.File(label="📂 Upload CSV", file_types=[".csv"], type="filepath")
    clean_btn = gr.Button("🚀 Clean Data")

    with gr.Row():
        report_output = gr.Markdown()
        heatmap_output = gr.Image(type="pil", label="Missing Values Heatmap")

    with gr.Row():
        download_output = gr.File(label="⬇️ Download Cleaned CSV")
        preview_output = gr.HTML(label="🔍 Preview (First 5 Rows)")

    # The order of `outputs` must match the 4-tuple returned by clean_csv:
    # (report, heatmap, cleaned file path, preview HTML).
    clean_btn.click(
        fn=clean_csv,
        inputs=file_input,
        outputs=[report_output, heatmap_output, download_output, preview_output],
    )

# 🚀 Launch. Guarded so importing this module (tests, tooling) does not start
# a server. NOTE(review): assumes Hugging Face Spaces executes app.py as a
# script, in which case the guard is satisfied — confirm for this Space.
if __name__ == "__main__":
    demo.launch()