File size: 12,901 Bytes
136b539
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed275df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#==============================================================
#            Dependencies
#===============================================================

import gradio as gr
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import io
import numpy as np
import tempfile
import os


#==================================================================
#        Other Dependencies
#==================================================================

from helper_functions import file_summary, load_csv
from helper_functions import check_duplicate_columns, remove_duplicate_columns, check_duplicate_rows, remove_duplicate_rows, check_missing_columns, drop_high_missing, delete_column
from helper_functions import get_missing_columns, detect_column_type, apply_missing_value
from helper_functions import show_value_counts, encode_column
from helper_functions import normalize_column_names, rename_single_column
from helper_functions import get_numeric_columns, show_current_dtype, change_column_dtype
from helper_functions import get_continuous_columns, show_column_stats, handle_outliers
from helper_functions import make_csv_download

from report_generation import generate_profile_report



# ===========================================================
#                     Gradio Layout
# ===========================================================

with gr.Blocks(theme="soft") as demo:
    # Page header: centred title, two centred introductory paragraphs, and a
    # horizontal rule separating the header from the panels below.
    gr.Markdown(
        value="# <div align = 'center'> **Clean Data Dashboard** </div>",
    )
    gr.Markdown(
        value="<div align = 'center'>In every machine learning workflow, data cleaning is one of the most time-consuming and repetitive tasks. yet, as ML engineers, our true focus should be on building models, crafting architectures, and solving real problems - not spending endless hours handling missing values, formatting inconsistencies and unwanted noise in CSV files.</div>",
    )
    gr.Markdown(
        value="<div align = 'center'> That's exactly why I build this CSV Data Cleaning App. This tool helps you clean your data in few steps. All you need to do is to click on the button the operation you want to apply on the file. After applying all the operations, you can download the final cleaned CSV File.</div>",
    )
    gr.Markdown(value="---")

    # Two-column page body. This first (narrow) column declares every control
    # the user operates; the handlers are attached in the Event Bindings
    # section further down.
    with gr.Row():
        with gr.Column(scale=1, min_width=400):
            # NOTE(review): this opening <div> and the closing one at the end
            # of the column are two separate HTML components; it looks like the
            # widgets declared between them are not actually nested inside the
            # div, so the scroll styling may have no effect — confirm.
            gr.HTML("<div style='max-height: 90vh; overflow-y: auto; padding-right: 10px;'>")

            gr.Markdown("# โš™๏ธ Tools Panel")

            # --- File loading: upload widget (CSV only), trigger, status ---
            file_input = gr.File(label="Choose CSV", file_types=[".csv"])
            load_btn = gr.Button("๐Ÿ“‚ Load CSV")
            status_box = gr.Textbox(label="Status", interactive=False)
            gr.Markdown("---")

            # --- Column deletion ---
            # Declared without choices; the load handler lists this dropdown
            # among its outputs.
            delete_col = gr.Dropdown(label="Select Column to Delete")
            gr.Markdown("Delete Columns which you don't need!")
            delete_btn = gr.Button("๐Ÿ—‘๏ธ Delete Column")
            # NOTE(review): the only status box declared interactive=True; all
            # other status boxes are read-only — confirm this is intended.
            delete_status = gr.Textbox(label="Delete Status", interactive=True)
            gr.Markdown("---")

            # --- Duplicate columns: one check button, one removal button ---
            dup_col_status = gr.Textbox(label="Duplicate Columns", interactive=False)
            dup_col_check = gr.Button("๐Ÿ” Check Duplicate Columns")
            dup_col_btn = gr.Button("๐Ÿงฌ Remove Duplicate Columns")
            gr.Markdown("---")

            # --- Duplicate rows: one check button, one removal button ---
            dup_row_status = gr.Textbox(label="Duplicate Rows", interactive=False)
            dup_row_check = gr.Button("๐Ÿ” Check Duplicate Rows")
            dup_row_btn = gr.Button("๐Ÿ“„ Remove Duplicate Rows")
            gr.Markdown("---")

            # --- Missing-value overview and bulk drop ---
            missing_status = gr.Textbox(label="Missing Columns Check", interactive=False)
            check_missing_btn = gr.Button("๐Ÿ” Check Columns with Missing Values")
            drop_high_missing_btn = gr.Button("๐Ÿงฎ Drop Columns with >50% Missing Values")
            gr.Markdown("---")

            # --- Per-column missing-value filling ---
            gr.Markdown("### ๐Ÿงฉ Handle Missing Values")
            missing_col = gr.Dropdown(label="Select Column with Missing Values")
            detect_type_box = gr.Textbox(label="Column Type", interactive=False)
            # Starts with no choices; it is an output of the missing_col change
            # handler, which supplies them.
            fill_method = gr.Dropdown(label="Select Fill Method", choices=[])
            apply_fill_btn = gr.Button("โœจ Apply Fill Method")
            fill_status = gr.Textbox(label="Fill Operation Status", interactive=False)
            gr.Markdown("---")

            # --- Categorical encoding ---
            gr.Markdown("### ๐Ÿ”ค Encoding Section")
            encode_col = gr.Dropdown(label="Select Column to Encode")
            encode_method = gr.Radio(["Label Encoding", "Ordinal Encoding"], label="Encoding Type", value="Label Encoding")
            value_counts_box = gr.Textbox(label="Value Counts (for Ordinal Encoding)", interactive=False, lines=8)
            # Free-text category order; the encode handler splits it on commas.
            encode_order = gr.Textbox(label="If Ordinal, Enter Order (comma-separated)")
            encode_status = gr.Textbox(label="Encoding Status", interactive=False)
            encode_btn = gr.Button("โš™๏ธ Apply Encoding")
            gr.Markdown("---")

            # --- Column-name normalization and single-column rename ---
            gr.Markdown("### ๐Ÿท๏ธ Column Name Normalization & Renaming")
            normalize_btn = gr.Button("๐Ÿ”ก Normalize Column Names")
            normalize_status = gr.Textbox(label="Normalization Status", interactive=False)
            rename_col = gr.Dropdown(label="Select Column to Rename")
            new_col_name = gr.Textbox(label="Enter New Column Name")
            rename_btn = gr.Button("โœ๏ธ Rename Column")
            rename_status = gr.Textbox(label="Rename Status", interactive=False)

            gr.Markdown("---")
            # --- Numeric dtype conversion ---
            gr.Markdown("### ๐Ÿ”ข Change Data Type of Columns")
            numeric_detect_btn = gr.Button("๐Ÿ” Detect Numeric Columns")
            numeric_detect_status = gr.Textbox(label="Numeric Column Detection", interactive=False)
            dtype_col = gr.Dropdown(label="Select Numeric Column")
            current_dtype_box = gr.Textbox(label="Current Data Type", interactive=False)

            # Target dtype selection
            dtype_choices = [
                "int8", "int16", "int32", "int64",
                "float16", "float32", "float64",
                "complex64", "complex128"
            ]
            new_dtype = gr.Dropdown(label="Select New Data Type", choices=dtype_choices)
            convert_dtype_btn = gr.Button("๐Ÿ” Convert Data Type")
            convert_dtype_status = gr.Textbox(label="Data Type Conversion Status", interactive=False)
            gr.Markdown("---")

            # --- Outlier handling ---
            gr.Markdown("### ๐Ÿšจ Outlier Detection & Handling")
            detect_cont_col_btn = gr.Button("๐Ÿ” Detect Continuous Columns")
            cont_col_status = gr.Textbox(label="Continuous Columns Detection", interactive=False)
            outlier_col = gr.Dropdown(label="Select Continuous Column")
            col_stats_box = gr.Textbox(label="Column Statistics", interactive=False)

            # Technique + threshold
            outlier_method = gr.Radio(
                ["IQR", "Z-score", "Winsorization", "MinMax"],
                label="Select Outlier Handling Technique",
                value="IQR"
            )
            # Collected as text; it is handed to the outlier handler as typed.
            threshold_value = gr.Textbox(label="Enter Threshold Value (e.g., 1.5 for IQR, 3 for Z-score, etc.)")

            # Apply technique
            apply_outlier_btn = gr.Button("๐Ÿงฎ Apply Technique")
            outlier_status = gr.Textbox(label="Outlier Handling Status", interactive=False)

            # --- Reset and export ---
            gr.Markdown("---")
            reset_btn = gr.Button("โ™ป๏ธ Reset to Original")
            download_trigger = gr.Button("๐Ÿ“ฅ Generate & Download Cleaned CSV")
            download_file = gr.File(label="Your Cleaned CSV File Will Appear Below ๐Ÿ‘‡")
            # Closing tag paired with the gr.HTML at the top of this column.
            gr.HTML("</div>")


        # Wide right-hand column: the summary table and the two dataset grids
        # (the untouched original and the working copy the tools act on).
        with gr.Column(scale=3):
            gr.Markdown(value="# Data Panel")
            summary_table = gr.DataFrame(
                label="๐Ÿ“Š File Summary",
                wrap=True,
                interactive=True,
            )
            gr.Markdown(value="---")
            gr.Markdown(value="## ๐Ÿงพ Data Preview")
            original_df = gr.DataFrame(
                label="๐Ÿ“˜ Original Dataset",
                interactive=False,
                wrap=True,
            )
            working_df = gr.DataFrame(
                label="๐Ÿงช Working Dataset",
                wrap=True,
            )

    # Report section, placed below both panels: heading, trigger button, an
    # HTML status area and a file slot for the generated report.
    gr.Markdown(value="---")
    gr.Markdown(value="### ๐Ÿงพ Generate Detailed Data Report")

    generate_report_btn = gr.Button(
        value="๐Ÿ“ˆ Create Data Report (It might take time)",
    )
    report_status = gr.HTML(
        label="Report Status",
    )
    report_file = gr.File(
        label="Download or View Report",
    )


    # ===========================================================
    #                    Event Bindings
    # ===========================================================

    def list_columns_for_rename(df):
        """Build the choices update for the Rename dropdown from *df*.

        Args:
            df: Current contents of the working-dataset grid.

        Returns:
            A ``gr.update`` carrying the column names of *df*, or an empty
            choice list when *df* is ``None`` or has no rows.
        """
        # NOTE(review): an unloaded grid presumably arrives as an empty frame
        # with placeholder headers; treating "empty" as "no columns" keeps
        # such placeholders out of the dropdown — confirm against Gradio.
        if df is None or df.empty:
            return gr.update(choices=[])
        return gr.update(choices=df.columns.tolist())

    def apply_encoding(df, col, method, order_str):
        """Parse the comma-separated category order and run ``encode_column``.

        Args:
            df: Working dataset.
            col: Name of the column to encode.
            method: Selected encoding type (the radio button's value).
            order_str: Raw text of the order box; may be empty or ``None``.

        Returns:
            Whatever ``encode_column`` returns (bound to the working grid and
            the encoding status box).
        """
        order = None
        if order_str:
            # Drop blank entries so a stray or trailing comma ("a, b,") does
            # not turn into an empty-string category.
            order = [tok.strip() for tok in order_str.split(",") if tok.strip()] or None
        return encode_column(df, col, method, order)

    # load_csv returns updates for the Delete and Encode dropdowns but has no
    # output for the Rename dropdown, which would therefore stay empty until
    # the column names were normalized. Fill it in a follow-up step.
    load_btn.click(
        load_csv,
        inputs=file_input,
        outputs=[original_df, working_df, summary_table, delete_col, encode_col, status_box],
    ).then(
        list_columns_for_rename,
        inputs=working_df,
        outputs=rename_col,
    )

    delete_btn.click(delete_column, inputs=[working_df, delete_col], outputs=[working_df, delete_status])
    dup_col_check.click(check_duplicate_columns, inputs=working_df, outputs=dup_col_status)
    dup_col_btn.click(remove_duplicate_columns, inputs=working_df, outputs=[working_df, dup_col_status])
    dup_row_check.click(check_duplicate_rows, inputs=working_df, outputs=dup_row_status)
    dup_row_btn.click(remove_duplicate_rows, inputs=working_df, outputs=[working_df, dup_row_status])
    drop_high_missing_btn.click(drop_high_missing, inputs=working_df, outputs=[working_df, missing_status])

    # Missing values section
    # Both handlers write to missing_status. Registered as two independent
    # click listeners they would finish in no defined order, so they are
    # chained: the check runs first, then the dropdown is populated.
    check_missing_btn.click(
        check_missing_columns,
        inputs=working_df,
        outputs=missing_status,
    ).then(
        get_missing_columns,
        inputs=working_df,
        outputs=[missing_col, missing_status],
    )
    missing_col.change(detect_column_type, inputs=[working_df, missing_col], outputs=[detect_type_box, fill_method])
    apply_fill_btn.click(apply_missing_value, inputs=[working_df, missing_col, fill_method], outputs=[working_df, fill_status])

    # Encoding section
    # The value-counts box is refreshed whenever the column or the encoding
    # type changes.
    encode_col.change(show_value_counts, inputs=[working_df, encode_col, encode_method], outputs=value_counts_box)
    encode_method.change(show_value_counts, inputs=[working_df, encode_col, encode_method], outputs=value_counts_box)
    encode_btn.click(
        apply_encoding,
        inputs=[working_df, encode_col, encode_method, encode_order],
        outputs=[working_df, encode_status],
    )


    # Normalize column names
    def normalize_and_update(df):
        """Run ``normalize_column_names`` and rebuild the column dropdowns.

        Args:
            df: Working dataset.

        Returns:
            A 5-tuple: the frame returned by ``normalize_column_names``, three
            dropdown updates carrying that frame's column names (an empty list
            when the frame is ``None``), and the status message.
        """
        normalized, status = normalize_column_names(df)
        names = [] if normalized is None else normalized.columns.tolist()
        # One independent update object per dropdown in the outputs list.
        pickers = tuple(gr.update(choices=names) for _ in range(3))
        return (normalized, *pickers, status)

    # Normalizing rewrites the working grid and refreshes the Delete, Rename
    # and Encode dropdowns, then posts the status message.
    normalize_btn.click(
        fn=normalize_and_update,
        inputs=[working_df],
        outputs=[working_df, delete_col, rename_col, encode_col, normalize_status],
    )

    # rename columns
    def rename_and_update(df, old_col, new_col):
        """Rename one column and refresh the column dropdowns.

        Args:
            df: Working dataset.
            old_col: Column currently selected in the Rename dropdown.
            new_col: Replacement name typed by the user.

        Returns:
            A 5-tuple matching the click binding's outputs: the frame returned
            by ``rename_single_column``, updates for the Delete, Rename and
            Encode dropdowns, and the status message.
        """
        df, msg = rename_single_column(df, old_col, new_col)
        cols = [] if df is None else df.columns.tolist()

        if old_col in cols:
            # The old name is still a column (nothing was renamed), so the
            # user's selection remains valid and is left alone.
            rename_update = gr.update(choices=cols)
        else:
            # The selected name no longer exists; clear it so the dropdown
            # does not keep showing a value that is absent from its choices.
            rename_update = gr.update(choices=cols, value=None)

        return df, gr.update(choices=cols), rename_update, gr.update(choices=cols), msg

    # Renaming takes the working grid, the selected column and the new name;
    # it rewrites the grid, refreshes the three column dropdowns and posts
    # the status message.
    rename_btn.click(
        fn=rename_and_update,
        inputs=[working_df, rename_col, new_col_name],
        outputs=[working_df, delete_col, rename_col, encode_col, rename_status],
    )

    # ====================== Data Type Change Section ======================

    # Detect numeric columns
    numeric_detect_btn.click(
        fn=get_numeric_columns,
        inputs=[working_df],
        outputs=[dtype_col, numeric_detect_status],
    )

    # Selecting a column writes to the current-dtype box.
    dtype_col.change(
        fn=show_current_dtype,
        inputs=[working_df, dtype_col],
        outputs=[current_dtype_box],
    )

    # Conversion rewrites the working grid and posts a status message.
    convert_dtype_btn.click(
        fn=change_column_dtype,
        inputs=[working_df, dtype_col, new_dtype],
        outputs=[working_df, convert_dtype_status],
    )

    # ---------------------- Outlier detection section ----------------------

    # The detect button writes to the column picker and its status box.
    detect_cont_col_btn.click(
        fn=get_continuous_columns,
        inputs=[working_df],
        outputs=[outlier_col, cont_col_status],
    )

    # Selecting a column writes to the statistics box.
    outlier_col.change(
        fn=show_column_stats,
        inputs=[working_df, outlier_col],
        outputs=[col_stats_box],
    )

    # Run the chosen technique with the user-entered threshold.
    apply_outlier_btn.click(
        fn=handle_outliers,
        inputs=[working_df, outlier_col, outlier_method, threshold_value],
        outputs=[working_df, outlier_status],
    )


    # Reset: overwrite the working grid with a copy of the original grid and
    # post a confirmation in the main status box.
    reset_btn.click(
        fn=lambda df_orig: (df_orig.copy(), "โœ… Reset to original dataset."),
        inputs=[original_df],
        outputs=[working_df, status_box],
    )

    # Export: hand the working grid to make_csv_download; its result goes to
    # the download file slot.
    download_trigger.click(
        fn=make_csv_download,
        inputs=[working_df],
        outputs=[download_file],
    )

    # Report: results go to the report file slot and the HTML status area.
    generate_report_btn.click(
        fn=generate_profile_report,
        inputs=[working_df],
        outputs=[report_file, report_status],
    )

# Start the Gradio app assembled above. The call is unguarded, so it runs
# whenever this module is executed or imported.
demo.launch()