# Amaanali01's picture
# Update app.py
# 63c7a29 verified
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# ------------------ Data Cleaning ------------------
def clean_data(file_path, remove_na, fill_na, method, remove_duplicates, outlier, convert_dtype, dtype_column, dtype):
    """Clean an uploaded CSV/Excel file and build its dashboard images.

    The enabled steps always run in this order: drop rows with missing
    values, fill missing values, drop duplicate rows, remove IQR outliers,
    convert one column's dtype.

    Args:
        file_path: Path of the uploaded file, or None when nothing was
            uploaded. A path ending in ".csv" (any letter case) is read with
            ``pd.read_csv``; anything else is handed to ``pd.read_excel``.
        remove_na: Drop every row that contains a missing value.
        fill_na: Fill missing values using ``method``.
        method: "Mean" or "Median" (numeric columns only); any other value
            fills each column with its first mode.
        remove_duplicates: Drop duplicated rows.
        outlier: Drop rows where any numeric column lies more than 1.5 * IQR
            below its first quartile or above its third quartile.
        convert_dtype: Enable the dtype conversion of ``dtype_column``.
        dtype_column: Name of the column to convert; a falsy value disables
            the conversion.
        dtype: "int" or "float"; any other value converts to ``str``.

    Returns:
        ``(output_path, dashboard_images)`` where ``output_path`` is the CSV
        written to "cleaned_output.csv" and ``dashboard_images`` is whatever
        ``generate_dashboard`` returns, or ``(None, None)`` when no file was
        given or the dtype conversion failed.
    """
    import logging

    if file_path is None:
        return None, None

    # Compare the extension case-insensitively so "DATA.CSV" is not sent to
    # the Excel reader.
    if file_path.lower().endswith(".csv"):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    if remove_na:
        df = df.dropna()

    if fill_na:
        if method == "Mean":
            df = df.fillna(df.mean(numeric_only=True))
        elif method == "Median":
            df = df.fillna(df.median(numeric_only=True))
        else:
            modes = df.mode()
            # A frame without rows has no mode row to take the fill values from.
            if not modes.empty:
                df = df.fillna(modes.iloc[0])

    if remove_duplicates:
        df = df.drop_duplicates()

    if outlier:
        # Quantiles and range comparisons are only defined for numeric
        # columns; text/date columns are left out of the test but stay in the
        # result.
        numeric = df.select_dtypes(include=np.number)
        if not numeric.empty:
            q1 = numeric.quantile(0.25)
            q3 = numeric.quantile(0.75)
            iqr = q3 - q1
            out_of_range = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
            df = df[~out_of_range.any(axis=1)]

    if convert_dtype and dtype_column:
        target_type = {"int": int, "float": float}.get(dtype, str)
        try:
            df[dtype_column] = df[dtype_column].astype(target_type)
        except Exception as exc:
            # UI boundary: keep the "no outputs" contract, but leave a trace
            # of why the conversion was rejected instead of hiding it.
            logging.getLogger(__name__).warning(
                "Could not convert column %r to %s: %s", dtype_column, dtype, exc
            )
            return None, None

    # Save cleaned file
    output_path = "cleaned_output.csv"
    df.to_csv(output_path, index=False)

    # Generate dashboard images
    dashboard_images = generate_dashboard(df)
    return output_path, dashboard_images
# ------------------ Dashboard Analytics ------------------
def generate_dashboard(df):
    """Render the dashboard plots for ``df`` and return their file paths.

    Writes a missing-value heatmap plus one histogram (with KDE) per numeric
    column as PNG files under "dashboard_images/".

    Args:
        df: The cleaned DataFrame to visualise.

    Returns:
        A list of image paths, heatmap first, then one histogram per numeric
        column in column order. An empty frame yields an empty list.
    """
    # Nothing to plot; returning early also avoids handing the plotting
    # calls an empty matrix.
    if df.empty:
        return []

    os.makedirs("dashboard_images", exist_ok=True)
    images = []

    # Missing value heatmap
    heatmap_file = "dashboard_images/missing_heatmap.png"
    fig = plt.figure(figsize=(6, 4))
    try:
        sns.heatmap(df.isna(), cbar=False)
        plt.title("Missing Values Heatmap")
        plt.savefig(heatmap_file)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)
    images.append(heatmap_file)

    # Numeric histograms
    numeric = df.select_dtypes(include=np.number)
    for position, col in enumerate(numeric.columns):
        # Column names may contain path separators or other characters that
        # are unsafe in a file name; the position keeps names unique even
        # when two columns sanitise to the same text or share a name.
        safe_name = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in str(col)
        )
        hist_file = f"dashboard_images/hist_{position}_{safe_name}.png"
        fig = plt.figure(figsize=(5, 3))
        try:
            # Select by position so duplicated column names still give a
            # single Series.
            sns.histplot(numeric.iloc[:, position].dropna(), kde=True)
            plt.title(f"Histogram of {col}")
            plt.savefig(hist_file)
        finally:
            plt.close(fig)
        images.append(hist_file)

    return images
# ------------------ Guide Popup ------------------
def show_guide_popup():
    """Return the Markdown help text describing the cleaning tool.

    The text starts and ends with a newline and covers the statistical terms,
    the cleaning options, the dashboard plots and a recommended workflow.
    """
    guide_lines = (
        "",
        "## 🧹 Data Cleaning Tool Guide",
        "### 1️⃣ Basic Statistical Terms",
        "- **Mean (Average)**: Sum of all numeric values divided by the number of values.",
        "- **Median**: Middle value when sorted.",
        "- **Mode**: Most frequent value.",
        "### 2️⃣ Cleaning Options",
        "- **Remove Missing Values**: Deletes rows with missing values.",
        "- **Fill Missing Values**: Replace NaNs with Mean/Median/Mode.",
        "- **Remove Duplicates**: Deletes repeated rows.",
        "- **Outlier Removal (IQR Method)**: Removes extreme values beyond 1.5*IQR.",
        "- **Convert Column Data Type**: Change a column to int, float, or string.",
        "### 3️⃣ Dashboard Analytics",
        "- **Missing Values Heatmap**: See which columns have missing data.",
        "- **Numeric Histograms**: Check distribution of numeric columns.",
        "### 4️⃣ Recommended Workflow",
        "1. Upload CSV/Excel.",
        "2. Remove/fill missing values.",
        "3. Remove duplicates/outliers.",
        "4. Convert column types.",
        "5. Explore dashboard.",
        "6. Download cleaned dataset.",
        "**Tip:** Always keep a backup of the original file!",
        "",
    )
    # The empty first/last entries reproduce the leading and trailing newline.
    return "\n".join(guide_lines)
# ------------------ Update Columns for Dtype ------------------
def update_columns(file_path):
    """Refresh the "column to convert" dropdown from the uploaded file.

    Args:
        file_path: Path of the uploaded file, or None when the upload was
            cleared. A path ending in ".csv" (any letter case) is read as
            CSV; anything else is handed to ``pd.read_excel``.

    Returns:
        A ``gr.update`` carrying the file's column names as choices and the
        first column as the selected value (no choices and no value when
        there is no file or it has no columns).
    """
    if file_path is None:
        return gr.update(choices=[], value=None)

    # Only the header row is needed here, so skip parsing the data rows.
    if file_path.lower().endswith(".csv"):
        header = pd.read_csv(file_path, nrows=0)
    else:
        header = pd.read_excel(file_path, nrows=0)

    cols = list(header.columns)
    return gr.update(choices=cols, value=cols[0] if cols else None)
# ------------------ Gradio Interface ------------------
# Build the UI: upload + options on top, results below, guide on demand.
with gr.Blocks() as demo:
    gr.Markdown("## 🧹 Data Cleaning Tool with Dashboard Analytics")

    # NOTE(review): assumes only the upload widget and the guide button share
    # this row — confirm against the intended layout.
    with gr.Row():
        file_input = gr.File(label="Upload CSV or Excel", type="filepath")
        guide_button = gr.Button("Show Guide")

    # Cleaning options, passed to clean_data in this order.
    remove_na = gr.Checkbox(label="Remove rows with missing values")
    fill_na = gr.Checkbox(label="Fill missing values")
    method = gr.Dropdown(["Mean", "Median", "Mode"], label="Fill method", value="Mean")
    remove_duplicates = gr.Checkbox(label="Remove duplicate rows")
    outlier = gr.Checkbox(label="Remove outliers (IQR Method)")
    convert_dtype = gr.Checkbox(label="Convert column data type")
    dtype_column = gr.Dropdown([], label="Column to convert")
    dtype = gr.Dropdown(["int", "float", "string"], label="Convert to", value="int")

    # Repopulate the column dropdown whenever a different file is uploaded.
    file_input.change(update_columns, inputs=file_input, outputs=dtype_column)

    clean_button = gr.Button("Apply Cleaning")
    cleaned_output = gr.File(label="Download Cleaned Data")
    dashboard_output = gr.Gallery(label="Dashboard Analytics", show_label=True)

    # Guide panel (hidden initially). The guide text is Markdown, so it is
    # rendered with a Markdown component rather than shown as raw markup in a
    # text box.
    guide_popup = gr.Markdown(value="", visible=False)

    def show_guide_and_popup():
        """Fill the guide panel with the help text and make it visible."""
        return gr.update(value=show_guide_popup(), visible=True)

    # Show guide on button click
    guide_button.click(
        show_guide_and_popup,
        inputs=None,
        outputs=guide_popup,
    )

    # Cleaning function
    clean_button.click(
        clean_data,
        inputs=[
            file_input,
            remove_na,
            fill_na,
            method,
            remove_duplicates,
            outlier,
            convert_dtype,
            dtype_column,
            dtype,
        ],
        outputs=[cleaned_output, dashboard_output],
    )

# Start the server only when run as a script, so importing this module (for
# tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()