File size: 6,018 Bytes
f49e649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a7180d
f49e649
 
ce50072
f49e649
 
 
 
 
 
 
fb7ef82
 
f49e649
ce50072
 
 
 
 
 
f49e649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d0c05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f49e649
ce50072
 
 
 
 
 
f49e649
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# data_clean_simple.py

import pandas as pd
import streamlit as st
import re
from functools import lru_cache

# Dictionary of common strategies for data cleaning.
# Keys are the logical column types produced by get_column_type();
# values are the human-readable suggestion text surfaced to the user
# by suggest_fill_strategies() (and ultimately the Streamlit report).
cleaning_strategies = {
    "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.",
    "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.",
    "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.",
    "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'."
}

@lru_cache(maxsize=32)
def get_column_type(column_name):
    """Infer a logical column type from its name.

    Returns one of ``'date'``, ``'numeric'``, ``'text'`` or
    ``'categorical'`` based on keyword heuristics; falls back to a
    regex check for count/measurement-style names, then ``'text'``.
    """
    lowered = column_name.lower()

    # Ordered (type, keywords) table — order matters, e.g. a name containing
    # both 'price' and 'category' is classified as numeric.
    keyword_table = (
        ('date', ('date', 'year', 'time', 'timestamp')),
        ('numeric', ('price', 'amount', 'cost', 'quantity', 'value')),
        ('text', ('name', 'description', 'text', 'comment')),
        ('categorical', ('category', 'type', 'label', 'class')),
    )
    for inferred_type, keywords in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return inferred_type

    # Fallback heuristic: names hinting at counts or measurements are numeric.
    if re.search(r'(num|amt|count|age|height|weight|total|\d+)', lowered):
        return 'numeric'
    return 'text'
    
def suggest_fill_strategies(column_name, examples):
    """Build a human-readable fill-strategy suggestion for one column.

    Parameters
    ----------
    column_name : str
        Name of the column (used to infer its logical type).
    examples : iterable
        Sample values from the column; NaN and empty strings are ignored.

    Returns
    -------
    str
        Suggestion text: a type-specific strategy, optionally followed by
        example values and (for numeric columns) mean/median statistics.
    """
    column_type = get_column_type(column_name)

    # Keep only usable sample values for the report.
    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']

    # Base suggestion comes from the shared strategy table.
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])

    if valid_examples:
        suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}."

        if column_type == 'numeric':
            # BUG FIX: the column may be *named* numerically (e.g. 'price')
            # yet contain non-numeric text; astype(float) then raised an
            # unhandled ValueError. Skip the statistics in that case.
            try:
                numeric_series = pd.Series(valid_examples).astype(float)
            except (ValueError, TypeError):
                numeric_series = None
            if numeric_series is not None:
                # BUG FIX: the original appended this sentence directly after
                # the previous period with no separator, producing garbled text.
                suggestion += (
                    f"\n\nConsider replacing values with "
                    f"Mean: {numeric_series.mean():.2f}, "
                    f"Median: {numeric_series.median():.2f}."
                )

    return suggestion

# Placeholder tokens treated as missing values.  NOTE: the original code used
# two slightly different patterns (r'\?+missing' in one place, r'\?+|missing'
# in another); the alternation form is clearly the intended one.
_MISSING_VALUE_PATTERN = r'none|null|n/a|na|\?+|missing|unknown'


def _normalize_text_column(series, column_name):
    """Trim/lower-case an object column and scrub formatting artifacts."""
    series = series.str.strip().str.lower()

    # Collapse embedded newlines and carriage returns to spaces.
    # BUG FIX: the original chained Series.replace('\r', ...) after the
    # str.replace, which only matched cells whose *entire* value was '\r'.
    series = series.str.replace('\n', ' ', regex=False)
    series = series.str.replace('\r', ' ', regex=False)

    if any(keyword in column_name.lower()
           for keyword in ('date', 'year', 'time', 'timestamp')):
        series = series.str.replace(r'[^\d]', '', regex=True)
        # Normalize 4-digit year ranges (e.g. 2000-2001, 20002001) -> '2000-2001'.
        series = series.replace(
            r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
        )

    # Remove currency symbols and thousands separators so numeric coercion works.
    if series.astype(str).str.contains(r'[$,]', na=False, regex=True).any():
        series = series.str.replace(r'[$,]', '', regex=True)
    return series


def _sample_good_examples(series, sample_size=5):
    """Return up to ``sample_size`` distinct non-missing values from ``series``."""
    try:
        # BUG FIX: the original called Series.contains (no such method); the
        # AttributeError was swallowed by a bare except, so examples were
        # never actually sampled.
        good = series[
            series.notnull()
            & (series.astype(str) != '')
            & ~series.astype(str).str.contains(_MISSING_VALUE_PATTERN,
                                               na=False, case=False, regex=True)
        ]
        if good.empty:
            return []
        good = good.drop_duplicates()
        return good.sample(n=min(sample_size, len(good)), random_state=1).tolist()
    except (TypeError, ValueError):
        # Best-effort sampling: odd dtypes simply yield no examples.
        return []


def clean_data(file_path, progress_callback=None):
    """Load a CSV/TSV file, normalize its columns, and collect fill suggestions.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv`` or ``.tsv`` file.
    progress_callback : callable, optional
        Called with a float in ``[0, 1]`` as each column is processed, and
        once more with ``1.0`` at the end.

    Returns
    -------
    tuple[pandas.DataFrame, list[tuple[str, str]]]
        The deduplicated, cleaned frame (index reset) and a list of
        ``(column, suggestion)`` pairs for columns containing missing or
        placeholder values.
    """
    separator = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=separator).drop_duplicates().copy()

    suggestions_log = []
    total_columns = len(df.columns)

    for i, col in enumerate(df.columns):
        if progress_callback:
            progress_callback(i / total_columns)

        if df[col].dtype == 'object':
            df[col] = _normalize_text_column(df[col], col)

        # Coerce to numeric where possible; ID-like columns stay as strings.
        # BUG FIX: pd.to_numeric(errors='ignore') is deprecated, and the old
        # bare `except: return None` broke the (df, log) return contract —
        # emulate 'ignore' semantics explicitly instead.
        if 'id' not in col.lower():
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                pass  # genuinely non-numeric column: keep as-is

        # Detect missing or placeholder values.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        pattern_matches = df[col].astype(str).str.contains(
            _MISSING_VALUE_PATTERN, na=False, case=False, regex=True
        ).sum()

        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            good_examples = _sample_good_examples(df[col])
            suggestions_log.append((col, suggest_fill_strategies(col, good_examples)))

    # Final progress update.
    if progress_callback:
        progress_callback(1.0)

    # Reset index for consistency after drop_duplicates.
    return df.reset_index(drop=True), suggestions_log

def display_suggestions_report(suggestions_log):
    """Render the collected (column, suggestion) pairs in the Streamlit UI.

    Does nothing when ``suggestions_log`` is empty.
    """
    if not suggestions_log:
        return

    st.subheader("🤖 Data Cleaning Suggestions")
    for col, suggestion in suggestions_log:
        st.markdown(f"**Column:** `{col}`")
        if suggestion:
            st.code(suggestion, language="python")
        else:
            st.write("No suggestions or response error.")