translators-will committed on
Commit
f49e649
·
verified ·
1 Parent(s): b230fab

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -0
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # data_clean_simple.py
2
+
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import re
6
+ from functools import lru_cache
7
+
8
# Dictionary of common strategies for data cleaning, keyed by the coarse
# column type inferred from the column NAME (see get_column_type below).
# Values are advice strings appended to the suggestion report for columns
# that contain missing or placeholder values.
cleaning_strategies = {
    "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.",
    "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.",
    "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.",
    "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'."
}
15
+
16
@lru_cache(maxsize=32)
def get_column_type(column_name):
    """Infer a coarse semantic type for a column from its NAME alone.

    Returns one of 'date', 'numeric', 'text' or 'categorical'.  The name
    is lowercased and matched against keyword buckets in priority order;
    if nothing matches, a regex heuristic decides between numeric-looking
    names and a generic 'text' fallback.
    """
    lowered = column_name.lower()

    # Keyword buckets, checked in priority order (first match wins).
    keyword_buckets = (
        ('date', ('date', 'year', 'time', 'timestamp')),
        ('numeric', ('price', 'amount', 'cost', 'quantity', 'value')),
        ('text', ('name', 'description', 'text', 'comment')),
        ('categorical', ('category', 'type', 'label', 'class')),
    )
    for type_name, keywords in keyword_buckets:
        if any(keyword in lowered for keyword in keywords):
            return type_name

    # No keyword matched: names containing numeric-ish fragments or digits
    # are treated as numeric; everything else defaults to text.
    if re.search(r'(num|amt|count|age|height|weight|total|\d+)', lowered):
        return 'numeric'
    return 'text'
32
+
33
def suggest_fill_strategies(column_name, examples):
    """Build a human-readable fill-strategy suggestion for one column.

    Parameters
    ----------
    column_name : str
        Column name; used to infer its semantic type via get_column_type.
    examples : iterable
        Sample values from the column (may contain NaN / empty strings).

    Returns
    -------
    str
        The canned strategy text for the inferred type, optionally
        extended with up to three example values and, for numeric
        columns, mean/median statistics.
    """
    column_type = get_column_type(column_name)

    # Keep only usable sample values (drop NaN and empty strings).
    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']

    # Start from the canned strategy for the inferred type ('text' fallback).
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])

    if valid_examples:
        suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}."

    # For numeric columns, append mean/median statistics — but only when
    # the sampled values actually convert to numbers.  The type is inferred
    # from the column NAME, so the values may still be non-numeric strings;
    # the old unconditional astype(float) raised ValueError in that case,
    # and an empty sample formatted as "Mean: nan".
    if column_type == 'numeric':
        numeric_values = pd.to_numeric(pd.Series(valid_examples, dtype=object), errors='coerce')
        if numeric_values.notna().any():
            mean_value = numeric_values.mean()
            median_value = numeric_values.median()
            suggestion += f"Consider replacing values with\n\nMean: {mean_value:.2f}, Median: {median_value:.2f}."

    return suggestion
52
+
53
def clean_data(file_path):
    """Load a CSV/TSV file, normalize its columns, and collect fill
    suggestions for columns with missing or placeholder values.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv`` or ``.tsv`` file.

    Returns
    -------
    tuple[pandas.DataFrame, list[dict]]
        The cleaned, de-duplicated frame (index reset) and a list of
        ``{'col': name, 'suggestion': text}`` entries.
    """
    # Support CSV and TSV files; drop exact duplicate rows up front.
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()

    suggestions_log = []

    for col in df.columns:
        if df[col].dtype == 'object':
            # Normalize text: trim surrounding whitespace, lowercase.
            df[col] = df[col].str.strip().str.lower()

            # Collapse embedded newlines AND carriage returns into spaces.
            # The old code chained plain Series.replace for '\r', which
            # matches whole values only — substring '\r' was never removed.
            df[col] = df[col].str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)

            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                # Strip every non-digit character.
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                # Normalize 4-digit year ranges (e.g., 20002001 -> 2000-2001).
                # Separators were already stripped above, so in practice only
                # the fused form occurs; the optional separator is harmless.
                df[col] = df[col].replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and thousands separators.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)

        # Try to convert to numeric; leave the column untouched when it is
        # not fully numeric.  (pd.to_numeric(errors='ignore') is deprecated,
        # and the old bare `except: return None` silently broke the
        # (df, suggestions_log) return contract.)
        if col.lower().find('id') == -1:  # keep ID-like columns as strings
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                pass

        # Count missing and placeholder ("weird") values.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        placeholder_mask = df[col].astype(str).str.contains(
            r'none|null|n/a|na|\?+missing|unknown', na=False, case=False, regex=True
        )
        pattern_matches = placeholder_mask.sum()

        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Sample up to five distinct good (non-null, non-empty) values.
            # Bound the sample by the pool size — the old min(5, len(df))
            # raised ValueError when fewer than that many distinct values
            # existed.
            good_pool = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates()
            good_examples = good_pool.sample(n=min(5, len(good_pool)), random_state=1)

            # Sample up to five bad (missing / placeholder) values, bounded
            # by the actual number of bad rows (the old code used
            # isna().sum(), which did not match the filter below).
            bad_pool = df[col][df[col].isna() | (df[col] == '') | placeholder_mask]
            bad_examples = bad_pool.sample(n=min(5, len(bad_pool)), random_state=1)

            # Combine with concat — the old `good + bad` did index-aligned
            # element-wise ADDITION, producing NaNs instead of a union.
            examples = pd.concat([good_examples, bad_examples])

            if not examples.empty:
                suggestion = suggest_fill_strategies(col, examples)
                suggestions_log.append({
                    'col': col,
                    'suggestion': suggestion
                })

    df = df.reset_index(drop=True)

    return df, suggestions_log
117
+
118
def display_suggestions_report(suggestions_log):
    """Render the cleaning suggestions in the Streamlit UI.

    Parameters
    ----------
    suggestions_log : list[dict]
        Entries of the form ``{'col': name, 'suggestion': text}`` as
        produced by ``clean_data``.  Nothing is rendered when empty.
    """
    if suggestions_log:
        st.subheader("🤖 Data Cleaning Suggestions")
        # Each entry is a dict.  The old `for col, suggestion in
        # suggestions_log` unpacked the dict KEYS, so every row rendered
        # the literal strings 'col' and 'suggestion' instead of the data.
        for entry in suggestions_log:
            st.markdown(f"**Column:** `{entry['col']}`")
            if entry['suggestion']:
                st.code(entry['suggestion'], language="python")
            else:
                st.write("No suggestions or response error.")
127
+