Upload 4 files
- app.py +124 -0
- clean.py +275 -0
- report.py +271 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,124 @@
import gradio as gr
from pyspark.sql import SparkSession
import os
import pandas as pd
from datetime import datetime
from clean import clean_data, get_numeric_columns
from report import create_full_report, REPORT_DIR


def clean_and_visualize(file, primary_key_column, progress=gr.Progress()):
    # Create a Spark session
    spark = SparkSession.builder.appName("DataCleaner").getOrCreate()

    # Read the CSV file
    progress(0.05, desc="Reading CSV file")
    df = spark.read.csv(file.name, header=True, inferSchema=True)

    # Clean the data
    progress(0.1, desc="Starting data cleaning")
    cleaned_df, nonconforming_cells_before, process_times = clean_data(spark, df, primary_key_column, progress)
    progress(0.8, desc="Data cleaning completed")

    # Calculate removed columns and rows
    removed_columns = len(df.columns) - len(cleaned_df.columns)
    removed_rows = df.count() - cleaned_df.count()

    # Generate full visualization report
    progress(0.9, desc="Generating report")
    create_full_report(
        df,
        cleaned_df,
        nonconforming_cells_before,
        process_times,
        removed_columns,
        removed_rows,
        primary_key_column
    )

    # Convert PySpark DataFrame to Pandas DataFrame and save as CSV
    progress(0.95, desc="Saving cleaned data")
    pandas_df = cleaned_df.toPandas()

    # Generate cleaned CSV file name with current date and time
    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    cleaned_csv_path = os.path.join(f"cleaned_data_{current_time}.csv")

    pandas_df.to_csv(cleaned_csv_path, index=False)

    # Collect all generated images
    image_files = [os.path.join(REPORT_DIR, f) for f in os.listdir(REPORT_DIR) if f.endswith('.png')]

    # Stop the Spark session
    spark.stop()

    progress(1.0, desc="Process completed")
    return cleaned_csv_path, image_files


def launch_app():
    with gr.Blocks() as app:
        gr.Markdown("# Data Cleaner")

        with gr.Row():
            file_input = gr.File(label="Upload CSV File", file_count="single", file_types=[".csv"])

        with gr.Row():
            primary_key_dropdown = gr.Dropdown(label="Select Primary Key Column", choices=[], interactive=True)

        with gr.Row():
            clean_button = gr.Button("Start Cleaning")

        with gr.Row():
            progress_bar = gr.Progress()

        with gr.Row():
            cleaned_file_output = gr.File(label="Cleaned CSV", visible=True)

        with gr.Row():
            output_gallery = gr.Gallery(
                label="Visualization Results",
                show_label=True,
                elem_id="gallery",
                columns=[3],
                rows=[3],
                object_fit="contain",
                height="auto",
                visible=False
            )

        def update_primary_key_options(file):
            if file is None:
                return gr.Dropdown(choices=[])

            spark = SparkSession.builder.appName("DataCleaner").getOrCreate()
            df = spark.read.csv(file.name, header=True, inferSchema=True)
            numeric_columns = get_numeric_columns(df)
            spark.stop()

            return gr.Dropdown(choices=numeric_columns)

        def process_and_show_results(file, primary_key_column):
            cleaned_csv_path, image_files = clean_and_visualize(file, primary_key_column, progress=progress_bar)
            return (
                cleaned_csv_path,
                gr.Gallery(visible=True, value=image_files)
            )

        file_input.change(
            fn=update_primary_key_options,
            inputs=file_input,
            outputs=primary_key_dropdown
        )

        clean_button.click(
            fn=process_and_show_results,
            inputs=[file_input, primary_key_dropdown],
            outputs=[cleaned_file_output, output_gallery]
        )

    app.launch()


if __name__ == "__main__":
    launch_app()
clean.py
ADDED
@@ -0,0 +1,275 @@
import re

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, lower, regexp_replace, to_date, to_timestamp, udf, \
    levenshtein, array, lit, trim, size, coalesce
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType, TimestampType, ArrayType
from pyspark.sql.utils import AnalysisException
import time
from time import perf_counter

# Constants
EMPTY_THRESHOLD = 0.5
LOW_COUNT_THRESHOLD = 2
VALID_DATA_THRESHOLD = 0.5


def print_dataframe_info(df, step=""):
    num_columns = len(df.columns)
    num_rows = df.count()
    num_cells = num_columns * num_rows
    print(f"{step}Dataframe info:")
    print(f" Number of columns: {num_columns}")
    print(f" Number of rows: {num_rows}")
    print(f" Total number of cells: {num_cells}")


def check_and_normalize_column_headers(df):
    print("Checking and normalizing column headers...")

    for old_name in df.columns:
        # Create the new name using string manipulation
        new_name = old_name.lower().replace(' ', '_')

        # Remove any non-alphanumeric characters (excluding underscores)
        new_name = re.sub(r'[^0-9a-zA-Z_]', '', new_name)

        # Rename the column
        df = df.withColumnRenamed(old_name, new_name)

    print("Column names have been normalized.")
    return df


def remove_empty_columns(df, threshold=EMPTY_THRESHOLD):
    print(f"Removing columns with less than {threshold * 100}% valid data...")

    # Calculate the percentage of non-null values for each column
    df_stats = df.select(
        [((count(when(col(c).isNotNull(), c)) / count('*')) >= threshold).alias(c) for c in df.columns])
    valid_columns = [c for c in df_stats.columns if df_stats.select(c).first()[0]]

    return df.select(valid_columns)


def remove_empty_rows(df, threshold=EMPTY_THRESHOLD):
    print(f"Removing rows with less than {threshold * 100}% valid data...")

    # Count the number of non-null values for each row
    expr = sum([when(col(c).isNotNull(), lit(1)).otherwise(lit(0)) for c in df.columns])
    df_valid_count = df.withColumn('valid_count', expr)

    # Filter rows based on the threshold
    total_columns = len(df.columns)
    df_filtered = df_valid_count.filter(col('valid_count') >= threshold * total_columns)

    print('count of valid rows:', df_filtered.count())

    return df_filtered.drop('valid_count')


def drop_rows_with_nas(df, threshold=VALID_DATA_THRESHOLD):
    print(f"Dropping rows with NAs for columns with more than {threshold * 100}% valid data...")

    # Calculate the percentage of non-null values for each column
    df_stats = df.select([((count(when(col(c).isNotNull(), c)) / count('*'))).alias(c) for c in df.columns])

    # Get columns with more than threshold valid data
    valid_columns = [c for c in df_stats.columns if df_stats.select(c).first()[0] > threshold]

    # Drop rows with NAs only for the valid columns
    for column in valid_columns:
        df = df.filter(col(column).isNotNull())

    return df


def check_typos(df, column_name, threshold=2, top_n=100):
    # Check if the column is of StringType
    if not isinstance(df.schema[column_name].dataType, StringType):
        print(f"Skipping typo check for column {column_name} as it is not a string type.")
        return None

    print(f"Checking for typos in column: {column_name}")

    try:
        # Get value counts for the specific column
        value_counts = df.groupBy(column_name).count().orderBy("count", ascending=False)

        # Take top N most frequent values
        top_values = [row[column_name] for row in value_counts.limit(top_n).collect()]

        # Broadcast the top values to all nodes
        broadcast_top_values = df.sparkSession.sparkContext.broadcast(top_values)

        # Define UDF to find similar strings
        @udf(returnType=ArrayType(StringType()))
        def find_similar_strings(value):
            if value is None:
                return []
            similar = []
            for top_value in broadcast_top_values.value:
                if value != top_value and levenshtein(value, top_value) <= threshold:
                    similar.append(top_value)
            return similar

        # Apply the UDF to the column
        df_with_typos = df.withColumn("possible_typos", find_similar_strings(col(column_name)))

        # Filter rows with possible typos and select only the relevant columns
        typos_df = df_with_typos.filter(size("possible_typos") > 0).select(column_name, "possible_typos")

        # Check if there are any potential typos
        typo_count = typos_df.count()
        if typo_count > 0:
            print(f"Potential typos found in column {column_name}: {typo_count}")
            typos_df.show(10, truncate=False)
            return typos_df
        else:
            print(f"No potential typos found in column {column_name}")
            return None

    except AnalysisException as e:
        print(f"Error analyzing column {column_name}: {str(e)}")
        return None
    except Exception as e:
        print(f"Unexpected error in check_typos for column {column_name}: {str(e)}")
        return None


def transform_string_column(df, column_name):
    print(f"Transforming string column: {column_name}")
    # Lower case transformation (if applicable)
    df = df.withColumn(column_name, lower(col(column_name)))
    # Remove leading and trailing spaces
    df = df.withColumn(column_name, trim(col(column_name)))
    # Replace multiple spaces with a single space
    df = df.withColumn(column_name, regexp_replace(col(column_name), "\\s+", " "))
    # Remove special characters except those used in dates and times
    df = df.withColumn(column_name, regexp_replace(col(column_name), "[^a-zA-Z0-9\\s/:.-]", ""))
    return df


def clean_column(df, column_name):
    print(f"Cleaning column: {column_name}")
    start_time = perf_counter()
    # Get the data type of the current column
    column_type = df.schema[column_name].dataType

    if isinstance(column_type, StringType):
        # Skip date detection and directly process as string
        # For string columns, check for typos and transform
        typos_df = check_typos(df, column_name)
        if typos_df is not None and typos_df.count() > 0:
            print(f"Detailed typos for column {column_name}:")
            typos_df.show(truncate=False)
        df = transform_string_column(df, column_name)

    elif isinstance(column_type, (DoubleType, IntegerType)):
        # For numeric columns, we'll do a simple null check
        df = df.withColumn(column_name, when(col(column_name).isNull(), lit(None)).otherwise(col(column_name)))

    end_time = perf_counter()
    print(f"Time taken to clean {column_name}: {end_time - start_time:.6f} seconds")
    return df


# Update the remove_outliers function to work on a single column
def remove_outliers(df, column):
    print(f"Removing outliers from column: {column}")

    stats = df.select(column).summary("25%", "75%").collect()
    q1 = float(stats[0][1])
    q3 = float(stats[1][1])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df.filter((col(column) >= lower_bound) & (col(column) <= upper_bound))

    return df


def calculate_nonconforming_cells(df):
    nonconforming_cells = {}
    for column in df.columns:
        nonconforming_count = df.filter(col(column).isNull() | isnan(column)).count()
        nonconforming_cells[column] = nonconforming_count
    return nonconforming_cells


def get_numeric_columns(df):
    return [field.name for field in df.schema.fields if isinstance(field.dataType, (IntegerType, DoubleType))]


def remove_duplicates_from_primary_key(df, primary_key_column):
    print(f"Removing duplicates based on primary key column: {primary_key_column}")
    return df.dropDuplicates([primary_key_column])


def clean_data(spark, df, primary_key_column, progress):
    start_time = time.time()
    process_times = {}

    print("Starting data validation and cleaning...")
    print_dataframe_info(df, "Initial - ")

    # Calculate nonconforming cells before cleaning
    nonconforming_cells_before = calculate_nonconforming_cells(df)

    # Step 1: Normalize column headers
    progress(0.1, desc="Normalizing column headers")
    step_start_time = time.time()
    df = check_and_normalize_column_headers(df)
    process_times['Normalize headers'] = time.time() - step_start_time

    # Step 2: Remove empty columns
    progress(0.2, desc="Removing empty columns")
    step_start_time = time.time()
    df = remove_empty_columns(df)
    print('2) count of valid rows:', df.count())
    process_times['Remove empty columns'] = time.time() - step_start_time

    # Step 3: Remove empty rows
    progress(0.3, desc="Removing empty rows")
    step_start_time = time.time()
    df = remove_empty_rows(df)
    print('3) count of valid rows:', df.count())
    process_times['Remove empty rows'] = time.time() - step_start_time

    # Step 4: Drop rows with NAs for columns with more than 50% valid data
    progress(0.4, desc="Dropping rows with NAs")
    step_start_time = time.time()
    df = drop_rows_with_nas(df)
    print('4) count of valid rows:', df.count())
    process_times['Drop rows with NAs'] = time.time() - step_start_time

    # Step 5: Clean columns (including typo checking and string transformation)
    column_cleaning_times = {}
    total_columns = len(df.columns)
    for index, column in enumerate(df.columns):
        progress(0.5 + (0.2 * (index / total_columns)), desc=f"Cleaning column: {column}")
        column_start_time = time.time()
        df = clean_column(df, column)
        print('5) count of valid rows:', df.count())
        column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
    process_times.update(column_cleaning_times)

    # Step 6: Remove outliers from numeric columns (excluding primary key)
    progress(0.7, desc="Removing outliers")
    step_start_time = time.time()
    numeric_columns = get_numeric_columns(df)
    numeric_columns = [col for col in numeric_columns if col != primary_key_column]
    for column in numeric_columns:
        df = remove_outliers(df, column)
    print('6) count of valid rows:', df.count())
    process_times['Remove outliers'] = time.time() - step_start_time

    # Step 7: Remove duplicates from primary key column
    progress(0.8, desc="Removing duplicates from primary key")
    step_start_time = time.time()
    df = remove_duplicates_from_primary_key(df, primary_key_column)
    print('7) count of valid rows:', df.count())
    process_times['Remove duplicates from primary key'] = time.time() - step_start_time

    print("Cleaning process completed.")
    print_dataframe_info(df, "Final - ")

    return df, nonconforming_cells_before, process_times
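Note on the typo check in clean.py: pyspark.sql.functions.levenshtein operates on Column expressions, so inside a plain Python UDF (which receives ordinary strings) the comparison cannot be evaluated row by row as written. A minimal sketch of an equivalent UDF built around a pure-Python edit-distance helper follows; edit_distance and make_typo_udf are illustrative names introduced here, not part of the repository, and the top_values/threshold arguments correspond to the variables of the same role in check_typos.

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


def edit_distance(a, b):
    # Classic dynamic-programming Levenshtein distance on plain Python strings.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (ca != cb)))  # substitution
        prev = curr
    return prev[-1]


def make_typo_udf(top_values, threshold=2):
    # Build a UDF that flags values within `threshold` edits of any frequent value.
    @udf(returnType=ArrayType(StringType()))
    def find_similar_strings(value):
        if value is None:
            return []
        return [t for t in top_values
                if value != t and edit_distance(value, t) <= threshold]
    return find_similar_strings

It could be applied as df.withColumn("possible_typos", make_typo_udf(top_values)(col(column_name))). Closing over the plain Python list lets Spark serialize it with each task, so the explicit broadcast used in check_typos becomes an optional optimization for large candidate lists.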
report.py
ADDED
@@ -0,0 +1,271 @@
import os
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, count, when, lit, isnan
from pyspark.sql.types import DoubleType, IntegerType, LongType, FloatType, StringType, DateType, TimestampType

REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(REPORT_DIR, exist_ok=True)


def save_plot(fig, filename):
    fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
    plt.close(fig)


def plot_heatmap(df, title):
    # Calculate the percentage of null values for each column
    null_percentages = df.select([
        (100 * count(when(col(c).isNull() | isnan(c), c)) / count('*')).alias(c)
        for c in df.columns
    ]).toPandas()

    plt.figure(figsize=(12, 8))
    sns.heatmap(null_percentages, cbar=True, cmap='Reds', annot=True, fmt='.1f')
    plt.title(title)
    plt.ylabel('Percentage of Missing Values')
    plt.tight_layout()
    save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')


def plot_column_schemas(df):
    # Get the data types of all columns
    schema = df.schema
    data_types = []
    for field in schema.fields:
        dtype_name = field.dataType.typeName()
        print(f"Column '{field.name}' has data type '{dtype_name}'")
        data_types.append(dtype_name.capitalize())

    # Count the occurrences of each data type
    type_counts = Counter(data_types)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Generate a color palette with as many colors as there are bars
    colors = plt.cm.tab20(np.linspace(0, 1, len(type_counts)))

    # Plot the bars
    bars = ax.bar(type_counts.keys(), type_counts.values(), color=colors)

    ax.set_title('Column Data Types')
    ax.set_xlabel('Data Type')
    ax.set_ylabel('Count')

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f'{int(height)}',
                ha='center', va='bottom')

    plt.xticks(rotation=45)
    plt.tight_layout()
    save_plot(fig, 'column_schemas.png')


def plot_nonconforming_cells(nonconforming_cells):
    # Ensure that nonconforming_cells is a dictionary
    if isinstance(nonconforming_cells, dict):
        # Proceed with plotting if it's a dictionary
        fig, ax = plt.subplots(figsize=(12, 6))

        # Generate a color palette with as many colors as there are bars
        colors = plt.cm.rainbow(np.linspace(0, 1, len(nonconforming_cells)))

        # Plot the bars
        bars = ax.bar(list(nonconforming_cells.keys()), list(nonconforming_cells.values()), color=colors)

        ax.set_title('Nonconforming Cells by Column')
        ax.set_xlabel('Columns')
        ax.set_ylabel('Number of Nonconforming Cells')
        plt.xticks(rotation=90)

        # Add value labels on top of each bar
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{height:,}',
                    ha='center', va='bottom')

        save_plot(fig, 'nonconforming_cells.png')
    else:
        print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")


def plot_column_distributions(cleaned_df, primary_key_column):
    print("Plotting distribution charts for numeric columns in the cleaned DataFrame...")

    def get_numeric_columns(df):
        return [field.name for field in df.schema.fields
                if isinstance(field.dataType, (IntegerType, LongType, FloatType, DoubleType))
                and field.name != primary_key_column]

    numeric_columns = get_numeric_columns(cleaned_df)
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found in the cleaned DataFrame for distribution plots.")
        return

    # Create subplots for distributions
    ncols = 3
    nrows = (num_columns + ncols - 1) // ncols  # Ceiling division
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5 * nrows))
    axes = axes.flatten() if num_columns > 1 else [axes]

    for i, column in enumerate(numeric_columns):
        # Convert to pandas for plotting
        cleaned_data = cleaned_df.select(column).toPandas()[column].dropna()

        sns.histplot(cleaned_data, ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.7)
        axes[i].set_title(f'{column} - Distribution After Cleaning')
        axes[i].legend()

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'distributions_after_cleaning.png')


def plot_boxplot_with_outliers(original_df, primary_key_column):
    print("Plotting boxplots for numeric columns in the original DataFrame...")

    def get_numeric_columns(df):
        return [field.name for field in df.schema.fields
                if isinstance(field.dataType, (IntegerType, LongType, FloatType, DoubleType))
                and field.name != primary_key_column]

    numeric_columns = get_numeric_columns(original_df)
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found in the original DataFrame for boxplots.")
        return

    # Create subplots based on the number of numeric columns
    ncols = 3
    nrows = (num_columns + ncols - 1) // ncols  # Ceiling division
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5 * nrows))
    axes = axes.flatten() if num_columns > 1 else [axes]

    for i, column in enumerate(numeric_columns):
        # Convert data to pandas for plotting
        data = original_df.select(column).toPandas()[column].dropna()

        sns.boxplot(x=data, ax=axes[i], color='blue', orient='h')
        axes[i].set_title(f'Boxplot of {column} (Before Cleaning)')

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'boxplots_before_cleaning.png')


def plot_correlation_heatmap(df, primary_key_column):
    # Select only numeric columns
    numeric_columns = [field.name for field in df.schema.fields
                       if isinstance(field.dataType, (IntegerType, LongType, FloatType, DoubleType))
                       and field.name != primary_key_column]

    if not numeric_columns:
        print("No numeric columns found for correlation heatmap.")
        return

    # Create a vector column of numeric columns
    assembler = VectorAssembler(inputCols=numeric_columns, outputCol="features")
    df_vector = assembler.transform(df).select("features")

    # Compute correlation matrix
    matrix = Correlation.corr(df_vector, "features").collect()[0][0]
    corr_matrix = matrix.toArray().tolist()

    # Convert to pandas DataFrame for plotting
    corr_df = pd.DataFrame(corr_matrix, columns=numeric_columns, index=numeric_columns)

    # Plot the heatmap
    plt.figure(figsize=(15, 10))
    sns.heatmap(corr_df, annot=True, fmt=".2f", cmap='coolwarm', cbar_kws={'label': 'Correlation'})
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    save_plot(plt.gcf(), 'correlation_heatmap.png')


def plot_process_times(process_times):
    # Convert seconds to minutes
    process_times_minutes = {k: v / 60 for k, v in process_times.items()}

    # Separate main processes and column cleaning processes
    main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
    column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}

    # Create the plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # Plot main processes
    bars1 = ax1.bar(main_processes.keys(), main_processes.values())
    ax1.set_title('Main Process Times')
    ax1.set_ylabel('Time (minutes)')
    ax1.tick_params(axis='x', rotation=45)

    # Plot column cleaning processes
    bars2 = ax2.bar(column_processes.keys(), column_processes.values())
    ax2.set_title('Column Cleaning Times')
    ax2.set_ylabel('Time (minutes)')
    ax2.tick_params(axis='x', rotation=90)

    # Add value labels on top of each bar
    for ax, bars in zip([ax1, ax2], [bars1, bars2]):
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{height:.2f}', ha='center', va='bottom')

    # Add total time to the plot
    total_time = sum(process_times_minutes.values())
    fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)

    plt.tight_layout()
    save_plot(fig, 'process_times.png')


def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns,
                       removed_rows, primary_key_column):
    os.makedirs(REPORT_DIR, exist_ok=True)

    sns.set_style("whitegrid")
    plt.rcParams['figure.dpi'] = 400

    print("Plotting nonconforming cells before cleaning...")
    plot_nonconforming_cells(nonconforming_cells_before)

    print("Plotting column distributions...")
    plot_column_distributions(cleaned_df, primary_key_column)

    print("Plotting boxplots for original data...")
    plot_boxplot_with_outliers(original_df, primary_key_column)

    print("Plotting process times...")
    plot_process_times(process_times)

    print("Plotting heatmaps...")
    plot_heatmap(original_df, "Missing Values Before Cleaning")

    print("Plotting correlation heatmap...")
    plot_correlation_heatmap(cleaned_df, primary_key_column)

    print("Plotting column schemas...")
    plot_column_schemas(cleaned_df)

    print(f"All visualization reports saved in directory: {REPORT_DIR}")
requirements.txt
ADDED
@@ -0,0 +1,6 @@
numpy
pandas
seaborn
matplotlib
pyspark
gradio
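Note on requirements.txt: the six dependencies are unpinned, so each rebuild of the Space resolves whatever releases are current at build time. A pinned variant is sketched below with purely illustrative version numbers chosen by the editor; the versions the Space was actually built against are not recorded in this commit.

numpy==1.26.4
pandas==2.2.2
seaborn==0.13.2
matplotlib==3.8.4
pyspark==3.5.1
gradio==4.44.0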