# data_clean_simple.py
import pandas as pd
import streamlit as st
import re
from functools import lru_cache
# Canned, human-readable fill-strategy advice, keyed by the coarse column
# type that get_column_type() infers. 'text' doubles as the fallback entry
# (see suggest_fill_strategies). Values are shown verbatim to the user.
cleaning_strategies = {
    "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.",
    "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.",
    "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.",
    "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'."
}
@lru_cache(maxsize=32)
def get_column_type(column_name):
    """Infer a coarse semantic type for a column from its name.

    Returns one of 'date', 'numeric', 'text' or 'categorical'. Results
    are memoized since the same column names recur across files.
    """
    lowered = column_name.lower()
    # Keyword groups, checked in priority order (first hit wins).
    keyword_groups = (
        ('date', ('date', 'year', 'time', 'timestamp')),
        ('numeric', ('price', 'amount', 'cost', 'quantity', 'value')),
        ('text', ('name', 'description', 'text', 'comment')),
        ('categorical', ('category', 'type', 'label', 'class')),
    )
    for inferred_type, keywords in keyword_groups:
        if any(word in lowered for word in keywords):
            return inferred_type
    # No keyword hit: sniff for numeric-looking name fragments or digits.
    if re.search(r'(num|amt|count|age|height|weight|total|\d+)', lowered):
        return 'numeric'
    return 'text'
def suggest_fill_strategies(column_name, examples):
    """Build a human-readable fill-strategy suggestion for one column.

    Parameters
    ----------
    column_name : str
        Column name; used to infer the column's semantic type.
    examples : iterable
        Sample values from the column; NaN and empty strings are ignored.

    Returns
    -------
    str
        Advice text, annotated with up to three example values and (for
        numeric columns whose samples parse as floats) mean/median stats.
    """
    column_type = get_column_type(column_name)
    # Keep only usable sample values for the report.
    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']
    # Start from the canned advice for the inferred type ('text' fallback).
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])
    if valid_examples:
        suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}."
        if column_type == 'numeric':
            # A 'numeric'-named column can still hold unparseable text;
            # skip the statistics rather than crash in that case.
            try:
                series = pd.Series(valid_examples).astype(float)
            except (ValueError, TypeError):
                series = None
            if series is not None:
                # Separate sentence (the original concatenated it directly
                # after the examples sentence, producing garbled output).
                suggestion += (
                    f"\n\nConsider replacing values with "
                    f"Mean: {series.mean():.2f}, Median: {series.median():.2f}."
                )
    return suggestion
def clean_data(file_path, progress_callback=None):
    """Load a CSV/TSV file, normalize its columns, and flag missing data.

    Parameters
    ----------
    file_path : str
        Path to a .csv or .tsv file; .tsv is read with a tab separator.
    progress_callback : callable, optional
        Called with a float in [0, 1] as each column is processed.

    Returns
    -------
    tuple(pandas.DataFrame, list)
        The cleaned, deduplicated frame (index reset) and a list of
        (column_name, suggestion_text) pairs for columns containing
        missing or placeholder-looking values.
    """
    # NOTE(review): substring match, so e.g. 'na' also hits 'name'/'banana' —
    # intentionally broad per the original; tighten with anchors if needed.
    # Unified here ('\?+|missing'); the original used '\?+missing' in one
    # place and '\?+|missing' in the other.
    missing_pattern = r'none|null|n/a|na|\?+|missing|unknown'
    sep = '\t' if file_path.endswith('.tsv') else ','
    # Load data and drop exact duplicate rows.
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()
    suggestions_log = []
    total_columns = len(df.columns)
    for i, col in enumerate(df.columns):
        if progress_callback:
            progress_callback(i / total_columns)
        if df[col].dtype == 'object':
            # Normalize text: trim whitespace, lowercase, flatten newlines.
            df[col] = df[col].str.strip().str.lower()
            # BUGFIX: '\r' must be removed element-wise with .str.replace;
            # the original used Series.replace, which only swapped cells
            # that were exactly the string '\r'.
            df[col] = (df[col]
                       .str.replace('\n', ' ', regex=False)
                       .str.replace('\r', ' ', regex=False))
            if any(k in col.lower() for k in ['date', 'year', 'time', 'timestamp']):
                # Strip every non-digit character first...
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                # ...so in practice only contiguous 8-digit year ranges
                # remain to re-hyphenate (e.g. '20002001' -> '2000-2001').
                df[col] = df[col].replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and thousands separators.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)
        # Try numeric conversion, skipping ID-like columns (kept as strings).
        if col.lower().find('id') == -1:
            # BUGFIX: errors='ignore' is deprecated in pandas, and the old
            # bare `except: return None` broke the (df, log) return
            # contract. Keep the column unchanged if conversion fails.
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                pass  # leave as-is, matching the old errors='ignore' intent
        # Count missing-ish values three ways: NaN, empty string, and
        # placeholder text such as 'none'/'null'/'unknown'.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        pattern_matches = df[col].astype(str).str.contains(
            missing_pattern, na=False, case=False, regex=True).sum()
        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Collect clean example values to ground the suggestion text.
            try:
                as_str = df[col].astype(str)
                # BUGFIX: the original called `.contains` directly on the
                # Series (no such method); the AttributeError was silently
                # swallowed, so examples were always empty. `.str.contains`
                # is the intended call.
                good_values = df[col][
                    df[col].notnull() &
                    (as_str != "") &
                    ~as_str.str.contains(missing_pattern, na=False,
                                         case=False, regex=True)
                ]
                # BUGFIX: sample size must come from the deduplicated
                # values, otherwise low-cardinality columns raised in
                # .sample and lost their examples.
                unique_values = good_values.drop_duplicates()
                if len(unique_values) > 0:
                    sample_size = min(5, len(unique_values))
                    good_examples = unique_values.sample(
                        n=sample_size, random_state=1).tolist()
                else:
                    good_examples = []
            except (AttributeError, TypeError, ValueError):
                good_examples = []
            suggestions_log.append((col, suggest_fill_strategies(col, good_examples)))
    # Final progress update.
    if progress_callback:
        progress_callback(1.0)
    # Reset index for consistency after the duplicate drop.
    return df.reset_index(drop=True), suggestions_log
def display_suggestions_report(suggestions_log):
    """Render the collected cleaning suggestions in the Streamlit UI.

    suggestions_log is a list of (column_name, suggestion_text) pairs as
    produced by clean_data(); nothing is rendered for an empty list.
    """
    if suggestions_log:
        st.subheader("🤖 Data Cleaning Suggestions")
        for col, suggestion in suggestions_log:
            # One markdown label plus a code panel per column.
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                # NOTE(review): source indentation was mangled; this else is
                # read as the per-column fallback for an empty/falsy
                # suggestion — confirm against the original intent.
                st.write("No suggestions or response error.")