# data_clean_simple.py import pandas as pd import streamlit as st import re from functools import lru_cache # Dictionary of common strategies for data cleaning cleaning_strategies = { "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.", "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.", "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.", "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'." } @lru_cache(maxsize=32) def get_column_type(column_name): column_lower = column_name.lower() if any(keyword in column_lower for keyword in ['date', 'year', 'time', 'timestamp']): return 'date' elif any(keyword in column_lower for keyword in ['price', 'amount', 'cost', 'quantity', 'value']): return 'numeric' elif any(keyword in column_lower for keyword in ['name', 'description', 'text', 'comment']): return 'text' elif any(keyword in column_lower for keyword in ['category', 'type', 'label', 'class']): return 'categorical' else: if re.search(r'(num|amt|count|age|height|weight|total|\d+)', column_lower): return 'numeric' return 'text' def suggest_fill_strategies(column_name, examples): column_type = get_column_type(column_name) # Create a basic analysis of the data valid_examples = [ex for ex in examples if pd.notna(ex) and ex != ''] # Build a simple suggestion based on column type and examples suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text']) if valid_examples: suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}." # For numeric data, add statistics if column_type == 'numeric': mean_value = pd.Series(valid_examples).astype(float).mean() median_value = pd.Series(valid_examples).astype(float).median() suggestion += f"Consider replacing values with\n\nMean: {mean_value:.2f}, Median: {median_value:.2f}." return suggestion def clean_data(file_path, progress_callback=None): # Support CSV and TSV files # Load data and drop duplicates # Clean data with progress updates if file_path.endswith('.tsv'): df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy() else: df = pd.read_csv(file_path).drop_duplicates().copy() suggestions_log = [] total_columns = len(df.columns) # Convert column types for i, col in enumerate(df.columns): # Update progress if callback provided if progress_callback: progress = i / total_columns progress_callback(progress) if df[col].dtype == 'object': df[col] = df[col].str.strip().str.lower() # Normalize text # Escape newline characters df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False) if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']): df[col] = df[col].str.replace(r'[^\d]', '', regex=True) # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001) df[col] = df[col].replace( r'(? 0 or empty_str_count > 0 or pattern_matches > 0: # Get non-null, non-empty examples for analysis - handle empty dataframes try: good_df = df[col][ df[col].notnull() & (df[col].astype(str) != "") & ~df[col].astype(str).contains(r'none|null|n/a|na|\?+|missing|unknown', na=False, case=False, regex=True) ] if len(good_df) > 0: sample_size = min(5, len(good_df)) good_examples = good_df.drop_duplicates().sample(n=sample_size, random_state=1).tolist() else: good_examples = [] except: good_examples = [] # Generate suggestions suggestion = suggest_fill_strategies(col, good_examples) suggestions_log.append((col, suggestion)) # Final progress update if progress_callback: progress_callback(1.0) # Reset index for consistency df = df.reset_index(drop=True) return df, suggestions_log def display_suggestions_report(suggestions_log): if suggestions_log: st.subheader("🤖 Data Cleaning Suggestions") for col, suggestion in suggestions_log: st.markdown(f"**Column:** `{col}`") if suggestion: st.code(suggestion, language="python") else: st.write("No suggestions or response error.")