Spaces:

translators-will
/

Data-Cleaner

Sleeping

App Files Files Community

translators-will committed on Apr 9, 2025

Commit

f49e649

verified ·

1 Parent(s): b230fab

Create app.py

Browse files

Files changed (1) hide show

app.py +127 -0

app.py ADDED Viewed

	@@ -0,0 +1,127 @@

+# data_clean_simple.py
+import pandas as pd
+import streamlit as st
+import re
+from functools import lru_cache
# Generic, human-readable cleaning advice keyed by the inferred column kind
# ('date', 'numeric', 'text', 'categorical').
cleaning_strategies = dict(
    date=(
        "For date columns, consider forward-fill or backward-fill from existing dates, "
        "or converting to datetime format."
    ),
    numeric=(
        "For numeric columns, consider filling missing values with the mean or median "
        "of the column."
    ),
    text=(
        "For text columns, consider filling missing values with a placeholder like "
        "'Unknown' or the most frequent value."
    ),
    categorical=(
        "For categorical columns, consider filling missing values with the mode or a "
        "placeholder like 'Unknown'."
    ),
)
# Ordered (kind, keywords) rules: the first rule with a keyword contained in the
# lower-cased column name wins, so order matters for names matching several rules.
_COLUMN_KIND_RULES = (
    ('date', ('date', 'year', 'time', 'timestamp')),
    ('numeric', ('price', 'amount', 'cost', 'quantity', 'value')),
    ('text', ('name', 'description', 'text', 'comment')),
    ('categorical', ('category', 'type', 'label', 'class')),
)

# Fallback hint: abbreviations or digits that usually indicate a numeric column.
_NUMERIC_NAME_HINT = re.compile(r'(num|amt|count|age|height|weight|total|\d+)')


# typed=True keeps labels that compare equal but print differently
# (e.g. 1 and True) from sharing one cache entry.
@lru_cache(maxsize=32, typed=True)
def get_column_type(column_name):
    """Guess a column's kind from its name alone.

    Args:
        column_name: Column label. Any hashable label is accepted; it is
            converted with ``str()`` so non-string headers (for example
            integer column labels) do not raise.

    Returns:
        One of ``'date'``, ``'numeric'``, ``'text'`` or ``'categorical'``.
        Names that match no keyword rule fall back to ``'numeric'`` when they
        contain a numeric-looking abbreviation or a digit, else ``'text'``.
    """
    column_lower = str(column_name).lower()
    for kind, keywords in _COLUMN_KIND_RULES:
        if any(keyword in column_lower for keyword in keywords):
            return kind
    if _NUMERIC_NAME_HINT.search(column_lower):
        return 'numeric'
    return 'text'
def suggest_fill_strategies(column_name, examples):
    """Build a plain-text cleaning suggestion for one column.

    Args:
        column_name: Column label; its kind is inferred by ``get_column_type``.
        examples: Iterable of sample cell values. Null and empty-string
            entries are ignored.

    Returns:
        The generic advice for the column kind. When usable examples exist,
        up to three of them are appended, and for numeric columns the mean
        and median of the examples that parse as numbers are appended too.
    """
    column_type = get_column_type(column_name)
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])

    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']
    if not valid_examples:
        return suggestion

    preview = ', '.join(map(str, valid_examples[:3]))
    suggestion += f"\n\nExample values: {preview}."

    if column_type == 'numeric':
        # The kind is guessed from the column *name*, so the values may still
        # be non-numeric text; coerce and drop those instead of raising.
        numbers = pd.to_numeric(pd.Series(valid_examples), errors='coerce').dropna()
        if not numbers.empty:
            # Leading blank line keeps this sentence from being glued onto
            # the "Example values" sentence.
            suggestion += (
                "\n\nConsider replacing values with\n\n"
                f"Mean: {numbers.mean():.2f}, Median: {numbers.median():.2f}."
            )
    return suggestion
# Whole-cell tokens that stand in for a missing value (matched case-insensitively).
# The group is optional so blank / whitespace-only cells match as well; 'nan' also
# covers real nulls after they have been rendered with astype(str).
_MISSING_TOKEN_PATTERN = r'^\s*(?:none|null|nan|n/?a|\?+|missing|unknown)?\s*$'
_YEAR_RANGE_PATTERN = r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)'
_DATE_KEYWORDS = ('date', 'year', 'time', 'timestamp')
_MAX_EXAMPLES = 5


def _is_text_column(series):
    """Return True for object- or string-typed columns."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def _normalize_text(series, is_date_like):
    """Return a stripped, lower-cased copy without line breaks, '$' or ','."""
    text = series.str.strip().str.lower()
    # Both substitutions must go through `.str`: Series.replace('\r', ...) only
    # swaps cells that are exactly '\r' and leaves embedded ones untouched.
    text = text.str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)
    if is_date_like:
        # Keep digits only, then re-insert the hyphen of 4-digit year ranges
        # (e.g. 2000-2001, 2000--2001, 20002001 -> 2000-2001).
        # NOTE(review): this also turns a full date such as "2021-03-04" into
        # "2021-0304"; confirm that date-like columns only hold years/ranges.
        text = text.str.replace(r'[^\d]', '', regex=True)
        text = text.str.replace(_YEAR_RANGE_PATTERN, r'\1-\2', regex=True)
    return text.str.replace(r'[$,]', '', regex=True)


def _to_numeric_if_possible(series):
    """Return the column converted to a numeric dtype, or unchanged if it cannot be."""
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        return series


def _missing_mask(series):
    """Return a boolean mask of cells that are null, blank or a placeholder token."""
    looks_missing = series.astype(str).str.contains(
        _MISSING_TOKEN_PATTERN, case=False, regex=True, na=False
    )
    return series.isna() | looks_missing.astype(bool)


def _sample_values(series, limit=_MAX_EXAMPLES):
    """Return up to *limit* reproducibly sampled values of *series* as a list."""
    if series.empty:
        return []
    return series.sample(n=min(limit, len(series)), random_state=1).tolist()


def clean_data(file_path):
    """Load a CSV/TSV file, normalize its columns and log fill suggestions.

    Steps: drop duplicate rows; strip/lower-case text columns and remove line
    breaks, '$' and ','; convert columns to numeric where every value parses
    (columns whose name contains 'id' are left as they are); then, for every
    column that still has null, blank or placeholder cells, ask
    ``suggest_fill_strategies`` for advice.

    Args:
        file_path: Path to the data file, or a file-like object exposing a
            ``name`` attribute. Names ending in ``.tsv`` (any case) are read
            as tab-separated; everything else as comma-separated.

    Returns:
        A ``(df, suggestions_log)`` tuple: the cleaned DataFrame with a fresh
        0..n-1 index, and a list of ``{'col': ..., 'suggestion': ...}`` dicts,
        one per column with missing-looking values. ``suggestion`` is ``None``
        when the suggestion helper could not process the examples.
    """
    source_name = str(getattr(file_path, 'name', file_path)).lower()
    separator = '\t' if source_name.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=separator).drop_duplicates().copy()

    suggestions_log = []
    for col in df.columns:
        col_key = str(col).lower()

        if _is_text_column(df[col]):
            is_date_like = any(keyword in col_key for keyword in _DATE_KEYWORDS)
            df[col] = _normalize_text(df[col], is_date_like)

        # ID-like columns should remain as strings.
        if 'id' not in col_key:
            df[col] = _to_numeric_if_possible(df[col])

        missing = _missing_mask(df[col])
        if not missing.any():
            continue

        # Good examples first, so they lead the preview built by the helper.
        good_examples = _sample_values(df.loc[~missing, col].drop_duplicates())
        bad_examples = _sample_values(df.loc[missing, col])
        try:
            suggestion = suggest_fill_strategies(col, good_examples + bad_examples)
        except (ValueError, TypeError):
            # One unparseable column must not abort the whole cleaning run.
            suggestion = None
        suggestions_log.append({
            'col': col,
            'suggestion': suggestion
        })

    return df.reset_index(drop=True), suggestions_log
def display_suggestions_report(suggestions_log):
    """Render the per-column cleaning suggestions on the Streamlit page.

    Args:
        suggestions_log: List of entries as produced by ``clean_data``, i.e.
            dicts with ``'col'`` and ``'suggestion'`` keys. ``(col, suggestion)``
            pairs are accepted as well. Nothing is rendered for an empty log.
    """
    if suggestions_log:
        st.subheader("🤖 Data Cleaning Suggestions")
        for entry in suggestions_log:
            # Entries are dicts; tuple-unpacking a dict yields its *keys*, which
            # would print the literal words 'col' and 'suggestion'.
            if isinstance(entry, dict):
                col = entry.get('col')
                suggestion = entry.get('suggestion')
            else:
                col, suggestion = entry
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or response error.")