Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# data_clean_simple.py
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import re
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
|
| 8 |
+
# Generic fill-strategy hints, keyed by the column-type labels that
# get_column_type returns: 'date', 'numeric', 'text' and 'categorical'.
cleaning_strategies = {
    "date": (
        "For date columns, consider forward-fill or backward-fill from "
        "existing dates, or converting to datetime format."
    ),
    "numeric": (
        "For numeric columns, consider filling missing values with the "
        "mean or median of the column."
    ),
    "text": (
        "For text columns, consider filling missing values with a "
        "placeholder like 'Unknown' or the most frequent value."
    ),
    "categorical": (
        "For categorical columns, consider filling missing values with "
        "the mode or a placeholder like 'Unknown'."
    ),
}
|
| 15 |
+
|
| 16 |
+
@lru_cache(maxsize=32)
def get_column_type(column_name):
    """Guess a column's semantic type from its name alone.

    The lower-cased name is checked for keyword substrings, group by group,
    in the order date -> numeric -> text -> categorical; the first group
    with a hit wins. If no keyword matches, a regex for common numeric
    fragments (or any digit run) decides between 'numeric' and 'text'.

    Args:
        column_name: Column label. Non-string labels (e.g. integer column
            positions) are converted with ``str()`` before matching.

    Returns:
        One of 'date', 'numeric', 'text' or 'categorical'.
    """
    # str() keeps non-string labels from raising AttributeError on .lower().
    column_lower = str(column_name).lower()

    # Order matters: earlier groups take precedence over later ones.
    keyword_groups = (
        ('date', ('date', 'year', 'time', 'timestamp')),
        ('numeric', ('price', 'amount', 'cost', 'quantity', 'value')),
        ('text', ('name', 'description', 'text', 'comment')),
        ('categorical', ('category', 'type', 'label', 'class')),
    )
    for column_type, keywords in keyword_groups:
        if any(keyword in column_lower for keyword in keywords):
            return column_type

    # Fallback: substring match, so e.g. 'age' also hits inside longer words.
    if re.search(r'(num|amt|count|age|height|weight|total|\d+)', column_lower):
        return 'numeric'
    return 'text'
|
| 32 |
+
|
| 33 |
+
def suggest_fill_strategies(column_name, examples):
    """Build a plain-text fill suggestion for one column.

    The base text comes from ``cleaning_strategies`` for the type that
    ``get_column_type`` assigns to ``column_name``. Up to three usable
    example values are appended, and for 'numeric' columns the mean and
    median of the examples that parse as numbers are appended as well.

    Args:
        column_name: Column label; only the name decides the column type.
        examples: Iterable of sample cell values. Null and empty-string
            entries are ignored.

    Returns:
        The suggestion text as a single string.
    """
    column_type = get_column_type(column_name)
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])

    # Keep only examples that carry a value.
    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']
    if not valid_examples:
        return suggestion

    suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}."

    if column_type == 'numeric':
        # The type is inferred from the column NAME, so the examples may
        # still hold non-numeric text (e.g. 'unknown'); coerce those to NaN
        # and drop them instead of letting a float conversion raise.
        numeric_values = pd.to_numeric(
            pd.Series(valid_examples), errors='coerce'
        ).dropna()
        if not numeric_values.empty:
            mean_value = numeric_values.mean()
            median_value = numeric_values.median()
            suggestion += (
                "\n\nConsider replacing values with "
                f"Mean: {mean_value:.2f}, Median: {median_value:.2f}."
            )

    return suggestion
|
| 52 |
+
|
| 53 |
+
# Whole-cell values (after trimming, case-insensitive) treated as "no data".
_PLACEHOLDER_PATTERN = r'(?:none|null|n/a|na|nan|\?+|\?*missing|unknown)'

# Column-name fragments that mark a column as date-like.
_DATE_KEYWORDS = ('date', 'year', 'time', 'timestamp')


def _normalize_text_column(series, column_name):
    """Return a cleaned copy of a text column, numeric when fully parseable."""
    try:
        text = series.str
    except AttributeError:
        # Object column without string content (e.g. booleans plus NaN).
        return series

    cleaned = text.strip().str.lower()
    # Both replacements must go through .str so they act on substrings.
    cleaned = cleaned.str.replace('\n', ' ', regex=False)
    cleaned = cleaned.str.replace('\r', ' ', regex=False)

    name_lower = str(column_name).lower()
    if any(keyword in name_lower for keyword in _DATE_KEYWORDS):
        # NOTE(review): this drops every non-digit, so a full date such as
        # '2020-01-15' becomes '20200115' and is then rewritten as
        # '2020-0115'. Kept as in the original; confirm this is intended.
        cleaned = cleaned.str.replace(r'[^\d]', '', regex=True)
        # Normalize 4-digit year ranges (e.g. 20002001 -> 2000-2001).
        cleaned = cleaned.replace(
            r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
        )

    # Remove currency symbols and thousands separators.
    cleaned = cleaned.str.replace(r'[$,]', '', regex=True)

    # Columns whose name contains 'id' stay as strings.
    if 'id' in name_lower:
        return cleaned
    try:
        return pd.to_numeric(cleaned)
    except (ValueError, TypeError):
        # At least one value is not numeric: keep the cleaned text.
        return cleaned


def _missing_value_mask(series):
    """Return a boolean mask of null, blank and placeholder cells."""
    is_null = series.isna()
    # Work on non-null cells only so NaN never becomes the string 'nan'.
    present_text = series[~is_null].astype(str).str.strip()
    is_blank = present_text.eq('').astype(bool)
    is_placeholder = present_text.str.fullmatch(
        _PLACEHOLDER_PATTERN, case=False
    ).astype(bool)
    looks_missing = (is_blank | is_placeholder).reindex(
        series.index, fill_value=False
    ).astype(bool)
    return is_null | looks_missing


def _sample_examples(values, limit=5):
    """Return up to *limit* reproducibly sampled values as a list."""
    if values.empty:
        return []
    return values.sample(n=min(limit, len(values)), random_state=1).tolist()


def clean_data(file_path):
    """Load a CSV/TSV file, normalize its columns and log fill suggestions.

    Steps: read the file (tab-separated when the name ends in '.tsv',
    comma-separated otherwise), drop duplicate rows, then for each text
    column trim and lower-case it, replace line breaks with spaces, apply
    the date-column digit cleanup, strip '$' and ',', and convert to a
    numeric dtype when every value parses (skipped for columns whose name
    contains 'id'). Any column with null, blank or placeholder cells gets
    an entry in the suggestions log.

    Args:
        file_path: Path string, or any path-like / file-like object that
            ``pandas.read_csv`` accepts. For objects, the ``name``
            attribute (if present) is used to detect the '.tsv' suffix.

    Returns:
        A ``(df, suggestions_log)`` tuple. ``df`` is the cleaned DataFrame
        with a fresh 0..n-1 index. ``suggestions_log`` is a list of
        ``{'col': <column name>, 'suggestion': <text or None>}`` dicts;
        ``suggestion`` is None when building the text failed.
    """
    source_name = str(getattr(file_path, 'name', file_path))
    separator = '\t' if source_name.lower().endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=separator).drop_duplicates()

    suggestions_log = []

    for col in df.columns:
        is_text = (pd.api.types.is_object_dtype(df[col])
                   or pd.api.types.is_string_dtype(df[col]))
        if is_text:
            df[col] = _normalize_text_column(df[col], col)

        missing_mask = _missing_value_mask(df[col])
        if not missing_mask.any():
            continue

        # A few distinct usable values plus a few problem cells, combined
        # as a plain list (adding two Series would align on index instead).
        good_values = df[col][~missing_mask].drop_duplicates()
        bad_values = df[col][missing_mask]
        examples = _sample_examples(good_values) + _sample_examples(bad_values)

        try:
            suggestion = suggest_fill_strategies(col, examples)
        except (ValueError, TypeError):
            # Keep cleaning the remaining columns; the entry records that
            # no suggestion could be produced for this one.
            suggestion = None
        suggestions_log.append({
            'col': col,
            'suggestion': suggestion
        })

    return df.reset_index(drop=True), suggestions_log
|
| 117 |
+
|
| 118 |
+
def display_suggestions_report(suggestions_log):
    """Render the cleaning suggestions in the Streamlit page.

    Nothing is rendered when ``suggestions_log`` is empty. Otherwise a
    subheader is written, followed by one section per entry with the column
    name and either its suggestion text or a fallback message.

    Args:
        suggestions_log: List of entries, each either a dict with 'col' and
            'suggestion' keys (the shape ``clean_data`` appends) or a
            ``(col, suggestion)`` pair.
    """
    if not suggestions_log:
        return

    st.subheader("🤖 Data Cleaning Suggestions")
    for entry in suggestions_log:
        # Unpacking a dict directly would yield its KEYS ('col',
        # 'suggestion'), so dict entries are read by key instead.
        if isinstance(entry, dict):
            col = entry.get('col')
            suggestion = entry.get('suggestion')
        else:
            col, suggestion = entry

        st.markdown(f"**Column:** `{col}`")
        if suggestion:
            st.code(suggestion, language="python")
        else:
            st.write("No suggestions or response error.")
|
| 127 |
+
|