File size: 6,018 Bytes
f49e649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a7180d
f49e649
 
ce50072
f49e649
 
 
 
 
 
 
fb7ef82
 
f49e649
ce50072
 
 
 
 
 
f49e649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d0c05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f49e649
ce50072
 
 
 
 
 
f49e649
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# data_clean_simple.py

import pandas as pd
import streamlit as st
import re
from functools import lru_cache

# Dictionary of common strategies for data cleaning.
# Keys are the logical column types produced by get_column_type();
# values are the human-readable suggestion text surfaced to the user
# by suggest_fill_strategies() (and ultimately the Streamlit report).
cleaning_strategies = {
    "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.",
    "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.",
    "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.",
    "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'."
}

@lru_cache(maxsize=32)
def get_column_type(column_name):
    """Infer a logical column type from its name.

    Returns one of ``'date'``, ``'numeric'``, ``'text'`` or
    ``'categorical'`` based on keyword heuristics; falls back to a
    regex check for count/measurement-style names, then ``'text'``.
    """
    lowered = column_name.lower()

    # Ordered (type, keywords) table — order matters, e.g. a name containing
    # both 'price' and 'category' is classified as numeric.
    keyword_table = (
        ('date', ('date', 'year', 'time', 'timestamp')),
        ('numeric', ('price', 'amount', 'cost', 'quantity', 'value')),
        ('text', ('name', 'description', 'text', 'comment')),
        ('categorical', ('category', 'type', 'label', 'class')),
    )
    for inferred_type, keywords in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return inferred_type

    # Fallback heuristic: names hinting at counts or measurements are numeric.
    if re.search(r'(num|amt|count|age|height|weight|total|\d+)', lowered):
        return 'numeric'
    return 'text'
    
def suggest_fill_strategies(column_name, examples):
    """Build a human-readable fill-strategy suggestion for one column.

    Parameters
    ----------
    column_name : str
        Name of the column (used to infer its logical type).
    examples : iterable
        Sample values from the column; NaN and empty strings are ignored.

    Returns
    -------
    str
        Suggestion text: a type-specific strategy, optionally followed by
        example values and (for numeric columns) mean/median statistics.
    """
    column_type = get_column_type(column_name)

    # Keep only usable sample values for the report.
    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']

    # Base suggestion comes from the shared strategy table.
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])

    if valid_examples:
        suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}."

        if column_type == 'numeric':
            # BUG FIX: the column may be *named* numerically (e.g. 'price')
            # yet contain non-numeric text; astype(float) then raised an
            # unhandled ValueError. Skip the statistics in that case.
            try:
                numeric_series = pd.Series(valid_examples).astype(float)
            except (ValueError, TypeError):
                numeric_series = None
            if numeric_series is not None:
                # BUG FIX: the original appended this sentence directly after
                # the previous period with no separator, producing garbled text.
                suggestion += (
                    f"\n\nConsider replacing values with "
                    f"Mean: {numeric_series.mean():.2f}, "
                    f"Median: {numeric_series.median():.2f}."
                )

    return suggestion

# Placeholder tokens treated as missing values.  NOTE: the original code used
# two slightly different patterns (r'\?+missing' in one place, r'\?+|missing'
# in another); the alternation form is clearly the intended one.
_MISSING_VALUE_PATTERN = r'none|null|n/a|na|\?+|missing|unknown'


def _normalize_text_column(series, column_name):
    """Trim/lower-case an object column and scrub formatting artifacts."""
    series = series.str.strip().str.lower()

    # Collapse embedded newlines and carriage returns to spaces.
    # BUG FIX: the original chained Series.replace('\r', ...) after the
    # str.replace, which only matched cells whose *entire* value was '\r'.
    series = series.str.replace('\n', ' ', regex=False)
    series = series.str.replace('\r', ' ', regex=False)

    if any(keyword in column_name.lower()
           for keyword in ('date', 'year', 'time', 'timestamp')):
        series = series.str.replace(r'[^\d]', '', regex=True)
        # Normalize 4-digit year ranges (e.g. 2000-2001, 20002001) -> '2000-2001'.
        series = series.replace(
            r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
        )

    # Remove currency symbols and thousands separators so numeric coercion works.
    if series.astype(str).str.contains(r'[$,]', na=False, regex=True).any():
        series = series.str.replace(r'[$,]', '', regex=True)
    return series


def _sample_good_examples(series, sample_size=5):
    """Return up to ``sample_size`` distinct non-missing values from ``series``."""
    try:
        # BUG FIX: the original called Series.contains (no such method); the
        # AttributeError was swallowed by a bare except, so examples were
        # never actually sampled.
        good = series[
            series.notnull()
            & (series.astype(str) != '')
            & ~series.astype(str).str.contains(_MISSING_VALUE_PATTERN,
                                               na=False, case=False, regex=True)
        ]
        if good.empty:
            return []
        good = good.drop_duplicates()
        return good.sample(n=min(sample_size, len(good)), random_state=1).tolist()
    except (TypeError, ValueError):
        # Best-effort sampling: odd dtypes simply yield no examples.
        return []


def clean_data(file_path, progress_callback=None):
    """Load a CSV/TSV file, normalize its columns, and collect fill suggestions.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv`` or ``.tsv`` file.
    progress_callback : callable, optional
        Called with a float in ``[0, 1]`` as each column is processed, and
        once more with ``1.0`` at the end.

    Returns
    -------
    tuple[pandas.DataFrame, list[tuple[str, str]]]
        The deduplicated, cleaned frame (index reset) and a list of
        ``(column, suggestion)`` pairs for columns containing missing or
        placeholder values.
    """
    separator = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=separator).drop_duplicates().copy()

    suggestions_log = []
    total_columns = len(df.columns)

    for i, col in enumerate(df.columns):
        if progress_callback:
            progress_callback(i / total_columns)

        if df[col].dtype == 'object':
            df[col] = _normalize_text_column(df[col], col)

        # Coerce to numeric where possible; ID-like columns stay as strings.
        # BUG FIX: pd.to_numeric(errors='ignore') is deprecated, and the old
        # bare `except: return None` broke the (df, log) return contract —
        # emulate 'ignore' semantics explicitly instead.
        if 'id' not in col.lower():
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                pass  # genuinely non-numeric column: keep as-is

        # Detect missing or placeholder values.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        pattern_matches = df[col].astype(str).str.contains(
            _MISSING_VALUE_PATTERN, na=False, case=False, regex=True
        ).sum()

        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            good_examples = _sample_good_examples(df[col])
            suggestions_log.append((col, suggest_fill_strategies(col, good_examples)))

    # Final progress update.
    if progress_callback:
        progress_callback(1.0)

    # Reset index for consistency after drop_duplicates.
    return df.reset_index(drop=True), suggestions_log

def display_suggestions_report(suggestions_log):
    """Render the collected (column, suggestion) pairs in the Streamlit UI.

    Does nothing when ``suggestions_log`` is empty.
    """
    if not suggestions_log:
        return

    st.subheader("🤖 Data Cleaning Suggestions")
    for col, suggestion in suggestions_log:
        st.markdown(f"**Column:** `{col}`")
        if suggestion:
            st.code(suggestion, language="python")
        else:
            st.write("No suggestions or response error.")