Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import os | |
| # ------------------ Data Cleaning ------------------ | |
def clean_data(file_path, remove_na, fill_na, method, remove_duplicates, outlier, convert_dtype, dtype_column, dtype):
    """Load a CSV/Excel file, apply the selected cleaning steps, and save the result.

    Steps run in a fixed order: drop rows with missing values, fill missing
    values, drop duplicate rows, remove IQR outliers, convert one column's
    dtype. Each step only runs when its flag is truthy.

    Args:
        file_path: Path of the uploaded file, or None when nothing is uploaded.
            Paths ending in ".csv" (any case) are read as CSV; everything else
            is handed to ``pd.read_excel``.
        remove_na: Drop every row that contains a missing value.
        fill_na: Fill missing values using ``method``.
        method: "Mean", "Median", or anything else for mode. Mean/median only
            fill numeric columns; mode fills every column.
        remove_duplicates: Drop fully duplicated rows.
        outlier: Drop rows where any numeric column lies outside
            [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        convert_dtype: Convert ``dtype_column`` to ``dtype``.
        dtype_column: Name of the column to convert.
        dtype: "int", "float", or anything else for ``str``.

    Returns:
        ``(output_path, dashboard_images)`` on success, where ``output_path``
        is the written CSV and ``dashboard_images`` is whatever
        ``generate_dashboard`` returns. ``(None, None)`` when no file was
        given or the dtype conversion failed.
    """
    if file_path is None:
        return None, None

    # Read file. The extension check is case-insensitive so that "DATA.CSV"
    # is not sent to the Excel reader.
    if file_path.lower().endswith(".csv"):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    # Cleaning operations
    if remove_na:
        df = df.dropna()

    if fill_na:
        if method == "Mean":
            df = df.fillna(df.mean(numeric_only=True))
        elif method == "Median":
            df = df.fillna(df.median(numeric_only=True))
        else:
            # mode() has no rows when the frame itself has none (e.g. after
            # dropna removed everything), so there is nothing to fill with.
            modes = df.mode()
            if not modes.empty:
                df = df.fillna(modes.iloc[0])

    if remove_duplicates:
        df = df.drop_duplicates()

    if outlier:
        # Quantiles and ordering comparisons are only defined for numeric
        # data; running them on the whole frame raises TypeError as soon as
        # a text column is present.
        numeric = df.select_dtypes(include=np.number)
        if not numeric.empty:
            q1 = numeric.quantile(0.25)
            q3 = numeric.quantile(0.75)
            iqr = q3 - q1
            is_outlier = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
            df = df[~is_outlier.any(axis=1)]

    if convert_dtype and dtype_column:
        try:
            if dtype == "int":
                df[dtype_column] = df[dtype_column].astype(int)
            elif dtype == "float":
                df[dtype_column] = df[dtype_column].astype(float)
            else:
                df[dtype_column] = df[dtype_column].astype(str)
        except Exception as exc:
            # Keep the original contract (empty outputs) but tell the user
            # why nothing came back instead of failing silently.
            gr.Warning(f"Could not convert column '{dtype_column}' to {dtype}: {exc}")
            return None, None

    # Save cleaned file
    output_path = "cleaned_output.csv"
    df.to_csv(output_path, index=False)

    # Generate dashboard images
    dashboard_images = generate_dashboard(df)
    return output_path, dashboard_images
| # ------------------ Dashboard Analytics ------------------ | |
def generate_dashboard(df):
    """Render summary plots for ``df`` and return the saved image paths.

    Writes PNG files into the ``dashboard_images`` directory (created if
    missing): one missing-value heatmap for the whole frame and one
    histogram (with KDE) per numeric column.

    Args:
        df: The DataFrame to visualise.

    Returns:
        List of image file paths in the order they were written. Empty when
        ``df`` has no data to plot.
    """
    os.makedirs("dashboard_images", exist_ok=True)
    images = []

    # Missing value heatmap. An empty frame (e.g. every row was dropped
    # during cleaning) has nothing to plot, so skip it rather than pass
    # zero-size data to seaborn.
    if not df.empty:
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            sns.heatmap(df.isna(), cbar=False, ax=ax)
            ax.set_title("Missing Values Heatmap")
            heatmap_file = "dashboard_images/missing_heatmap.png"
            fig.savefig(heatmap_file)
        finally:
            # Always release the figure, even when plotting fails.
            plt.close(fig)
        images.append(heatmap_file)

    # Numeric histograms
    numeric_cols = df.select_dtypes(include=np.number).columns
    for position, col in enumerate(numeric_cols):
        values = df[col].dropna()
        if values.empty:
            # Column holds only missing values: no distribution to draw.
            continue

        # Column names may contain path separators or other characters that
        # are invalid in file names; the position keeps names unique when two
        # columns sanitise to the same text.
        safe_name = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in str(col))
        hist_file = f"dashboard_images/hist_{position}_{safe_name}.png"

        fig, ax = plt.subplots(figsize=(5, 3))
        try:
            sns.histplot(values, kde=True, ax=ax)
            ax.set_title(f"Histogram of {col}")
            fig.savefig(hist_file)
        finally:
            plt.close(fig)
        images.append(hist_file)

    return images
| # ------------------ Guide Popup ------------------ | |
def show_guide_popup():
    """Return the Markdown-formatted user guide for the cleaning tool.

    The text covers basic statistical terms, each cleaning option, the
    dashboard plots, and a recommended workflow.
    """
    return """
## 🧹 Data Cleaning Tool Guide
### 1️⃣ Basic Statistical Terms
- **Mean (Average)**: Sum of all numeric values divided by the number of values.
- **Median**: Middle value when sorted.
- **Mode**: Most frequent value.
### 2️⃣ Cleaning Options
- **Remove Missing Values**: Deletes rows with missing values.
- **Fill Missing Values**: Replace NaNs with Mean/Median/Mode.
- **Remove Duplicates**: Deletes repeated rows.
- **Outlier Removal (IQR Method)**: Removes extreme values beyond 1.5*IQR.
- **Convert Column Data Type**: Change a column to int, float, or string.
### 3️⃣ Dashboard Analytics
- **Missing Values Heatmap**: See which columns have missing data.
- **Numeric Histograms**: Check distribution of numeric columns.
### 4️⃣ Recommended Workflow
1. Upload CSV/Excel.
2. Remove/fill missing values.
3. Remove duplicates/outliers.
4. Convert column types.
5. Explore dashboard.
6. Download cleaned dataset.
**Tip:** Always keep a backup of the original file!
"""
| # ------------------ Update Columns for Dtype ------------------ | |
def update_columns(file_path):
    """Refresh the column dropdown to list the columns of the uploaded file.

    Args:
        file_path: Path of the uploaded file, or None when the upload was
            cleared. Paths ending in ".csv" (any case) are read as CSV;
            everything else is handed to ``pd.read_excel``.

    Returns:
        A ``gr.update`` carrying the column names as choices, with the first
        column preselected (or no choices and no value when there is no file
        or the file has no columns).
    """
    if file_path is None:
        return gr.update(choices=[], value=None)

    # Only the column names are needed, so ask pandas for zero data rows
    # instead of parsing the whole dataset on every upload.
    if file_path.lower().endswith(".csv"):
        header = pd.read_csv(file_path, nrows=0)
    else:
        header = pd.read_excel(file_path, nrows=0)

    cols = list(header.columns)
    return gr.update(choices=cols, value=cols[0] if cols else None)
| # ------------------ Gradio Interface ------------------ | |
with gr.Blocks() as demo:
    gr.Markdown("## 🧹 Data Cleaning Tool with Dashboard Analytics")

    # NOTE(review): the original indentation was lost; this assumes only the
    # upload widget and the guide button share the row — confirm the layout.
    with gr.Row():
        file_input = gr.File(label="Upload CSV or Excel", type="filepath")
        guide_button = gr.Button("Show Guide")

    # Cleaning options, passed positionally to clean_data in this order.
    remove_na = gr.Checkbox(label="Remove rows with missing values")
    fill_na = gr.Checkbox(label="Fill missing values")
    method = gr.Dropdown(["Mean", "Median", "Mode"], label="Fill method", value="Mean")
    remove_duplicates = gr.Checkbox(label="Remove duplicate rows")
    outlier = gr.Checkbox(label="Remove outliers (IQR Method)")
    convert_dtype = gr.Checkbox(label="Convert column data type")
    dtype_column = gr.Dropdown([], label="Column to convert")
    dtype = gr.Dropdown(["int", "float", "string"], label="Convert to", value="int")

    # Repopulate the column dropdown whenever a file is uploaded or cleared.
    file_input.change(update_columns, inputs=file_input, outputs=dtype_column)

    clean_button = gr.Button("Apply Cleaning")
    cleaned_output = gr.File(label="Download Cleaned Data")
    dashboard_output = gr.Gallery(label="Dashboard Analytics", show_label=True)

    # Guide popup (hidden initially). The guide text is Markdown, so it is
    # rendered with a Markdown component; a plain Textbox would show the
    # "##" and "**" markers literally.
    guide_popup = gr.Markdown(value="", visible=False)

    def show_guide_and_popup():
        """Fill the guide component with the guide text and make it visible."""
        return gr.update(value=show_guide_popup(), visible=True)

    # Show guide on button click
    guide_button.click(
        show_guide_and_popup,
        inputs=None,
        outputs=guide_popup,
    )

    # Run the cleaning pipeline; outputs are the cleaned file and the plots.
    clean_button.click(
        clean_data,
        inputs=[file_input, remove_na, fill_na, method, remove_duplicates, outlier, convert_dtype, dtype_column, dtype],
        outputs=[cleaned_output, dashboard_output],
    )

demo.launch()