rairo committed on
Commit
3bfe933
·
verified ·
1 Parent(s): 9cc35d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -118
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
@@ -14,119 +13,123 @@ genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
14
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
15
 
16
def clean_column_name(col_name):
    """Normalise a column label into a lowercase snake_case identifier.

    Punctuation becomes a separator, runs of whitespace collapse to a single
    underscore, and a currency marker ("usd" / "zw...") that is glued to the
    preceding word is split off, e.g. "SalaryUSD" -> "salary_usd".

    Args:
        col_name: Any column label; non-strings are converted with ``str``.

    Returns:
        The cleaned column name.
    """
    cleaned = re.sub(r"[^\w\s]", " ", str(col_name).lower())
    cleaned = re.sub(r"\s+", "_", cleaned.strip())
    # Insert the separator only where the marker directly follows a letter or
    # digit. An unconditional replace produced "salary__usd" for names that
    # were already separated, a leading underscore for "usd_salary", and never
    # looked at "zw" when "usd" was also present.
    return re.sub(r"(?<=[^\W_])(usd|zw)", r"_\1", cleaned)
26
 
27
def clean_tin_value(val):
    """Normalise a taxpayer identification number (TIN) for comparison.

    The value is upper-cased, a trailing ".0" (left behind when a numeric
    column was read as float) is removed, and every character other than
    letters, digits, underscores and hyphens is dropped.

    Args:
        val: Raw cell value of any type.

    Returns:
        The cleaned TIN, or an empty string for ``None`` / NaN.
    """
    # Missing cells must not turn into the literal identifiers "NONE"/"NAN",
    # which would later match each other when used as a merge key.
    if val is None or (isinstance(val, float) and val != val):
        return ""
    val_str = str(val).strip().upper()
    # Remove trailing .0 but keep hyphens and letters
    val_str = re.sub(r"\.0$", "", val_str)
    return re.sub(r"[^\w-]", "", val_str)
 
 
 
 
 
33
 
34
def _merge_columns(df, selected, target):
    """Collapse the columns flagged in *selected* into one column *target*."""
    # Row-wise first non-null value across the flagged columns.
    combined = df.loc[:, selected].bfill(axis=1).iloc[:, 0]
    # Drop the sources BEFORE assigning the result: when a source is already
    # called *target*, assigning first and dropping afterwards would delete
    # the merged column together with its inputs.
    kept = [not flag for flag in selected]
    df = df.loc[:, kept].copy()
    df[target] = combined
    return df


def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise payroll column names, with multi-currency support.

    Steps, in order:
      1. Rename identifier columns to 'tin', pay columns to
         '<base>[_usd|_zwl]' and remaining '*name*' columns to
         'employee_name'.
      2. Merge columns that share a pay base name into a single '<base>'
         column (first non-null value per row).
      3. Merge duplicated 'tin' / 'employee_name' columns.
      4. Build 'employee_name' from 'first_name' + 'last_name' when absent.
      5. Clean the 'tin' column with ``clean_tin_value``.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new, standardised DataFrame.
    """
    currency_keywords = {
        'salary': ['salary', 'wage', 'earning'],
        'overtime': ['overtime'],
        'bonus': ['bonus'],
        'gratuity': ['gratuity'],
        'housing': ['housing'],
        'vehicle': ['vehicle'],
        'pension': ['pension'],
        'nssa': ['nssa'],
    }

    rename_map = {}
    for col in df.columns:
        # str() keeps non-string labels (e.g. numeric headers) from crashing.
        col_lower = str(col).lower()

        # Handle TIN first
        if any(kw in col_lower for kw in ('tin', 'personal_id', 'tax_id')):
            rename_map[col] = 'tin'
            continue

        # Handle currency columns
        base_name = next(
            (
                base
                for base, keywords in currency_keywords.items()
                if any(kw in col_lower for kw in keywords)
            ),
            None,
        )
        if base_name is not None:
            if 'usd' in col_lower:
                currency = '_usd'
            elif 'zw' in col_lower:  # also covers 'zwl' and 'zwg'
                currency = '_zwl'
            else:
                currency = ''
            rename_map[col] = f"{base_name}{currency}"
        elif 'name' in col_lower and col_lower not in ('first_name', 'last_name'):
            # first_name/last_name are left alone so they can be combined
            # below; renaming both to 'employee_name' made that step
            # unreachable and produced two identically named columns.
            rename_map[col] = 'employee_name'

    df = df.rename(columns=rename_map)

    # Merge similar columns.
    # NOTE(review): this also folds USD and ZWL variants of one base into a
    # single column, as the previous code did - confirm that is intended.
    for base in currency_keywords:
        selected = [str(c).startswith(base) for c in df.columns]
        if sum(selected) > 1:
            df = _merge_columns(df, selected, base)

    # Several source columns may have been renamed to the same key name;
    # fold them so df['tin'] / df['employee_name'] is a Series, not a frame.
    for key in ('tin', 'employee_name'):
        selected = [c == key for c in df.columns]
        if sum(selected) > 1:
            df = _merge_columns(df, selected, key)

    # Create employee_name if split
    if 'employee_name' not in df.columns and {'first_name', 'last_name'}.issubset(df.columns):
        df['employee_name'] = df['first_name'] + ' ' + df['last_name']

    # Clean TIN column (astype(str) keeps .str usable on an empty column)
    if 'tin' in df.columns:
        df['tin'] = df['tin'].apply(clean_tin_value).astype(str).str.strip()

    return df
87
 
88
def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
    """Ask the Gemini model to describe the columns of a payroll file.

    The first three rows are embedded in the prompt and the reply is parsed
    as JSON after removing Markdown code fences.

    Args:
        df: Standardised data of the uploaded file.
        filename: Name of the uploaded file, included in the prompt.

    Returns:
        The parsed analysis dict. On any failure (model error, invalid JSON,
        non-object JSON) a default naming 'tin' and 'employee_name' as key
        columns is returned instead - this call is deliberately best-effort.
    """
    fallback = {"key_columns": ["tin", "employee_name"]}
    try:
        sample_data = df.head(3).to_dict()
        prompt = (
            f"Analyze this payroll data from {filename}. "
            "Focus on currency columns (USD/ZWL) and employee identifiers.\n"
            "Return JSON with columns, key fields, and merging suggestions. "
            f"Sample: {sample_data}"
        )
        response = model.generate_content(prompt)
        parsed = json.loads(response.text.replace('```json', '').replace('```', ''))
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        return fallback
    # Callers index the result like a dict; reject lists/strings/numbers.
    return parsed if isinstance(parsed, dict) else fallback
99
-
100
def merge_with_master(processed_files):
    """Left-join every processed file onto the master payroll file.

    The master is the first entry whose filename contains "paye"
    (case-insensitive); if there is none, the first entry is used. Each other
    file is joined on a cleaned TIN and/or a lower-cased, stripped employee
    name, whichever of those columns both frames have. Columns of the other
    file that clash with an existing column are discarded.

    Args:
        processed_files: List of dicts with at least the keys "filename"
            (str) and "df" (DataFrame). The frames are not modified.

    Returns:
        The merged DataFrame (including the helper key columns 'clean_tin' /
        'clean_name' that were used), or None when the list is empty.
    """
    if not processed_files:
        return None

    # Pick the master *entry*, not its frame: truth-testing a DataFrame
    # (`if not master_df`) raises ValueError.
    master_file = next(
        (f for f in processed_files if "paye" in f["filename"].lower()),
        processed_files[0],
    )
    master_df = master_file["df"].copy()

    for other in processed_files:
        # Compare entries; master_df is rebound after every merge, so an
        # identity check against it would stop recognising the master file.
        if other is master_file:
            continue

        other_df = other["df"].copy()
        merge_keys = []
        if 'tin' in master_df.columns and 'tin' in other_df.columns:
            master_df['clean_tin'] = master_df['tin'].apply(clean_tin_value)
            other_df['clean_tin'] = other_df['tin'].apply(clean_tin_value)
            merge_keys.append('clean_tin')

        if 'employee_name' in master_df.columns and 'employee_name' in other_df.columns:
            master_df['clean_name'] = master_df['employee_name'].str.lower().str.strip()
            other_df['clean_name'] = other_df['employee_name'].str.lower().str.strip()
            merge_keys.append('clean_name')

        if merge_keys:
            master_df = pd.merge(
                master_df, other_df, on=merge_keys, how='left', suffixes=('', '_drop')
            )
            # Keep the master's version of any column present in both frames.
            master_df = master_df.loc[:, ~master_df.columns.str.endswith('_drop')]

    return master_df
127
 
128
  def read_excel_file(file) -> pd.DataFrame:
129
- """Read Excel file with improved error handling"""
130
  try:
131
  return pd.read_excel(file, engine="openpyxl")
132
  except Exception as e1:
@@ -136,63 +139,83 @@ def read_excel_file(file) -> pd.DataFrame:
136
  st.error(f"Failed to read Excel file: {str(e2)}")
137
  return None
138
 
139
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return an all-string copy of *df* with missing values shown as blanks.

    Args:
        df: Frame to be rendered with ``st.dataframe``.

    Returns:
        A new DataFrame of strings; every missing cell (None, NaN, NaT,
        pd.NA) and every cell holding the text "nan" or "None" is "".
    """
    text = df.astype(str)
    # Blank real missing cells by position. Matching their string form alone
    # left "NaT" (dates) and "<NA>" (nullable dtypes) visible.
    text = text.mask(df.isna().to_numpy(), "")
    return text.replace({"nan": "", "None": ""})
144
 
145
-
146
  def main():
147
  st.title("Smart CSV Processor")
148
  st.write("Upload CSV or Excel files for intelligent analysis and merging.")
149
-
150
  uploaded_files = st.file_uploader(
151
  "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
152
  )
153
-
154
  if uploaded_files:
155
  st.write("### Processing Files")
156
  processed_files = []
157
-
158
  for uploaded_file in uploaded_files:
159
  st.write(f"#### Analyzing: {uploaded_file.name}")
160
-
161
  try:
162
  if uploaded_file.name.endswith((".xlsx", ".xls")):
163
  df = read_excel_file(uploaded_file)
164
  else:
165
  df = pd.read_csv(uploaded_file)
166
-
167
  if df is not None:
168
  df.columns = [clean_column_name(col) for col in df.columns]
169
  df = standardize_dataframe(df)
170
-
171
  st.write("Initial Preview:")
172
  st.dataframe(df.head())
173
-
174
  with st.spinner("Analyzing columns..."):
175
  analysis = analyze_columns(df, uploaded_file.name)
176
-
177
  if analysis:
178
  st.write("Column Analysis:")
179
  st.json(analysis)
 
 
 
180
  processed_files.append(
181
  {"filename": uploaded_file.name, "df": df, "analysis": analysis}
182
  )
183
-
184
  except Exception as e:
185
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
186
  continue
187
-
188
  if len(processed_files) > 1:
189
  st.write("### Merging DataFrames with Earnings Schedule as Master")
190
  merged_df = merge_with_master(processed_files)
191
-
192
  if merged_df is not None:
193
  st.write("### Preview of Merged Data")
194
  st.dataframe(safe_display_df(merged_df.head()))
195
-
196
  try:
197
  csv = merged_df.to_csv(index=False)
198
  st.download_button(
@@ -201,11 +224,9 @@ def main():
201
  file_name="merged_data.csv",
202
  mime="text/csv",
203
  )
204
-
205
  st.write("### Dataset Statistics")
206
  st.write(f"Total rows: {len(merged_df)}")
207
  st.write(f"Total columns: {len(merged_df.columns)}")
208
-
209
  st.write("### Data Quality Metrics")
210
  missing_df = pd.DataFrame(
211
  {
@@ -215,15 +236,12 @@ def main():
215
  }
216
  )
217
  st.dataframe(missing_df)
218
-
219
  duplicates = merged_df.duplicated().sum()
220
  st.write(f"Number of duplicate rows: {duplicates}")
221
-
222
  except Exception as e:
223
  st.error(f"Error preparing download: {str(e)}")
224
  else:
225
  st.warning("Please upload at least 2 files to merge.")
226
 
227
-
228
  if __name__ == "__main__":
229
  main()
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
13
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
14
 
15
def clean_column_name(col_name):
    """Clean column names to be compatible with Arrow.

    Punctuation is replaced by spaces, the result is stripped and
    lower-cased, and each run of whitespace becomes one underscore.

    Args:
        col_name: Column label of any type.

    Returns:
        The cleaned name as a string.
    """
    # Non-string labels (dates or tuples coming from Excel headers, numbers)
    # go through the same normalisation instead of being returned verbatim
    # with their spaces and punctuation intact.
    text = col_name if isinstance(col_name, str) else str(col_name)
    cleaned = re.sub(r"[^\w\s]", " ", text)
    return re.sub(r"\s+", "_", cleaned.strip().lower())
 
 
 
 
21
 
22
def clean_tin_value(val):
    """Clean a TIN value read from a spreadsheet.

    Whitespace is stripped and a trailing ".0" - the artefact of an integer
    column having been read as float - is removed when everything before it
    is digits.

    Args:
        val: Raw cell value of any type.

    Returns:
        The cleaned TIN as a string.
    """
    val_str = str(val).strip()
    digits = val_str[:-2]
    # Remove the suffix textually. Converting through float()/int() rounds
    # identifiers above 2**53 and drops leading zeros, both of which change
    # the identifier.
    if val_str.endswith('.0') and digits.isdecimal():
        return digits
    return val_str
33
 
34
def _coalesce_duplicate_columns(df, name):
    """Fold all columns labelled *name* into the first one, left to right."""
    positions = [i for i, col in enumerate(df.columns) if col == name]
    if len(positions) < 2:
        return df
    # Row-wise first non-null value across the duplicates.
    combined = df.iloc[:, positions].bfill(axis=1).iloc[:, 0]
    # Remove only the later duplicates of *name*; other columns that happen
    # to be duplicated are left untouched.
    surplus = set(positions[1:])
    keep = [i for i in range(df.shape[1]) if i not in surplus]
    df = df.iloc[:, keep].copy()
    df[name] = combined
    return df


def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize DataFrame column names and data types.
    - Renames synonyms to common names (e.g., 'tin', 'salary').
    - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one,
      taking the first non-null value per row.
    - Creates an 'employee_name' column if missing but first_name and last_name exist.
    - Converts 'salary' to numeric (unparseable values become NaN).
    - Forces the key columns 'tin' and 'employee_name' to be strings.

    The input frame is not modified; a new DataFrame is returned.
    """
    tin_aliases = ('personal id', 'personal_id', 'tax id', 'taxid')
    salary_keywords = ('salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation')

    rename_map = {}
    for col in df.columns:
        col_lower = str(col).lower()
        if (
            col_lower in tin_aliases
            or "personal_id_of_employee" in col_lower
            or 'tin' in col_lower
        ):
            rename_map[col] = 'tin'
        # Deliberately a separate `if`: as before, a salary keyword overrides
        # a TIN match on the same column.
        if any(keyword in col_lower for keyword in salary_keywords):
            rename_map[col] = 'salary'

    # rename() always yields a new object, so the assignments below never
    # write into the caller's frame (previously they did when nothing matched).
    df = df.rename(columns=rename_map)

    for key in ('salary', 'tin', 'employee_name'):
        df = _coalesce_duplicate_columns(df, key)

    if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
        # fillna('') first: astype(str) alone turns a missing part into "nan".
        first = df['first_name'].fillna('').astype(str).str.strip()
        last = df['last_name'].fillna('').astype(str).str.strip()
        df['employee_name'] = (first + ' ' + last).str.strip()
    if 'salary' in df.columns:
        df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
    if 'tin' in df.columns:
        df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
    if 'employee_name' in df.columns:
        df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
    return df
70
 
71
def _failed_analysis(subject, issue):
    """Build the placeholder analysis returned when no usable result exists."""
    return {
        "subject": subject,
        "columns": [],
        "key_columns": [],
        "issues": [issue],
        "suggested_renames": {},
    }


def _extract_json_text(text):
    """Return the outermost ``{...}`` span of *text*, or *text* stripped."""
    text = text.strip()
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text


def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
    """Analyze DataFrame columns using Gemini AI with improved error handling.

    Args:
        df: Data of the uploaded file; its first five rows are sent as CSV.
        filename: Name of the uploaded file, included in the prompt.

    Returns:
        The analysis dict parsed from the model reply. If the call fails or
        the reply is not a JSON object, the error is shown in the Streamlit
        UI and a placeholder dict with empty "columns", "key_columns" and
        "suggested_renames" is returned.
    """
    try:
        # NOTE(review): this puts up to five rows of the uploaded data into
        # the prompt sent to the Gemini model.
        # Converting the whole frame at once also works with duplicate labels.
        sample_csv = df.head(5).astype(str).to_csv(index=False)
        prompt = f"""
    Analyze this CSV data and provide analysis in JSON format.
    Filename: {filename}
    Sample data:
    {sample_csv}
    Respond with only a valid JSON object in this format:
    {{
        "subject": "Employee payroll data",
        "columns": [
            {{
                "name": "column_name",
                "type": "string/number/date",
                "description": "Brief description"
            }}
        ],
        "key_columns": ["employee_id", "tin"],
        "issues": ["Missing values in salary column"],
        "suggested_renames": {{
            "old_name": "new_name"
        }}
    }}
    """
        response = model.generate_content(prompt)
        # Slice from the first "{" to the last "}" rather than cutting fixed
        # offsets: that truncated the JSON whenever the closing code fence was
        # missing or followed by more text.
        response_text = _extract_json_text(response.text)
        try:
            analysis = json.loads(response_text)
        except json.JSONDecodeError as je:
            st.error(f"JSON parsing error: {str(je)}")
            st.text("Raw response:")
            st.text(response_text)
            return _failed_analysis("Error parsing analysis", "Error analyzing columns")
        if not isinstance(analysis, dict):
            # Callers use the result as a dict; a bare list or string is unusable.
            st.error("JSON parsing error: expected a JSON object")
            st.text("Raw response:")
            st.text(response_text)
            return _failed_analysis("Error parsing analysis", "Error analyzing columns")
        return analysis
    except Exception as e:
        st.error(f"Error in column analysis: {str(e)}")
        return _failed_analysis("Error in analysis", str(e))
 
 
130
 
131
  def read_excel_file(file) -> pd.DataFrame:
132
+ """Read Excel file with improved error handling."""
133
  try:
134
  return pd.read_excel(file, engine="openpyxl")
135
  except Exception as e1:
 
139
  st.error(f"Failed to read Excel file: {str(e2)}")
140
  return None
141
 
142
def merge_with_master(processed_files):
    """
    Merge multiple DataFrames using the earnings schedule file as the master.

    The master file is identified by having 'earnings' in its filename (case
    insensitive); when several files match, the last one is the master and the
    others are merged like any other file. Without a match the first file is
    the master.

    Other files are left-joined onto the master on the key columns named by
    the AI analysis of both files, restricted to columns that actually exist
    in both frames. Files with no usable common key are skipped with a warning.

    Args:
        processed_files: List of dicts with the keys "filename", "df" and
            "analysis".

    Returns:
        The merged DataFrame, or None when the list is empty.
    """
    if not processed_files:
        return None

    candidates = [
        info for info in processed_files if "earnings" in info["filename"].lower()
    ]
    if candidates:
        master_file = candidates[-1]
    else:
        st.warning("No master file with 'earnings' found. Using the first file as master.")
        master_file = processed_files[0]
    # Everything except the master is merged, including additional
    # "earnings" files, which used to be dropped silently.
    other_files = [info for info in processed_files if info is not master_file]

    master_df = master_file["df"]
    master_keys = (master_file.get("analysis") or {}).get("key_columns") or []
    st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")

    merged_df = master_df
    for other in other_files:
        other_df = other["df"]
        other_keys = (other.get("analysis") or {}).get("key_columns") or []
        # Keep the master's key order (a set gives an arbitrary one) and drop
        # keys the model named but a frame does not contain; merging on a
        # missing column raises KeyError.
        common_keys = [
            key
            for key in dict.fromkeys(master_keys)
            if key in other_keys
            and key in merged_df.columns
            and key in other_df.columns
        ]
        if common_keys:
            st.write(f"Merging '{other['filename']}' on keys: {common_keys}")
            merged_df = merged_df.merge(other_df, on=common_keys, how="left")
        else:
            st.warning(f"No common keys found for merging '{other['filename']}'. Skipping this file.")
    return merged_df
173
 
174
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure DataFrame is safe for display in Streamlit.

    Every cell is converted to a string. Missing cells (None, NaN, NaT,
    pd.NA) and cells holding the text "nan" or "None" are shown as "".

    Args:
        df: Frame to display.

    Returns:
        A new all-string DataFrame; *df* itself is left unchanged.
    """
    as_text = df.astype(str)
    # Use the positions of the real missing values: their string forms differ
    # by dtype ("nan", "None", "NaT", "<NA>") and only two were covered before.
    as_text = as_text.mask(df.isna().to_numpy(), "")
    return as_text.replace({"nan": "", "None": ""})
177
 
 
178
  def main():
179
  st.title("Smart CSV Processor")
180
  st.write("Upload CSV or Excel files for intelligent analysis and merging.")
 
181
  uploaded_files = st.file_uploader(
182
  "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
183
  )
 
184
  if uploaded_files:
185
  st.write("### Processing Files")
186
  processed_files = []
 
187
  for uploaded_file in uploaded_files:
188
  st.write(f"#### Analyzing: {uploaded_file.name}")
 
189
  try:
190
  if uploaded_file.name.endswith((".xlsx", ".xls")):
191
  df = read_excel_file(uploaded_file)
192
  else:
193
  df = pd.read_csv(uploaded_file)
 
194
  if df is not None:
195
  df.columns = [clean_column_name(col) for col in df.columns]
196
  df = standardize_dataframe(df)
 
197
  st.write("Initial Preview:")
198
  st.dataframe(df.head())
 
199
  with st.spinner("Analyzing columns..."):
200
  analysis = analyze_columns(df, uploaded_file.name)
 
201
  if analysis:
202
  st.write("Column Analysis:")
203
  st.json(analysis)
204
+ # Apply suggested renames
205
+ if 'suggested_renames' in analysis:
206
+ df = df.rename(columns=analysis['suggested_renames'])
207
  processed_files.append(
208
  {"filename": uploaded_file.name, "df": df, "analysis": analysis}
209
  )
 
210
  except Exception as e:
211
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
212
  continue
 
213
  if len(processed_files) > 1:
214
  st.write("### Merging DataFrames with Earnings Schedule as Master")
215
  merged_df = merge_with_master(processed_files)
 
216
  if merged_df is not None:
217
  st.write("### Preview of Merged Data")
218
  st.dataframe(safe_display_df(merged_df.head()))
 
219
  try:
220
  csv = merged_df.to_csv(index=False)
221
  st.download_button(
 
224
  file_name="merged_data.csv",
225
  mime="text/csv",
226
  )
 
227
  st.write("### Dataset Statistics")
228
  st.write(f"Total rows: {len(merged_df)}")
229
  st.write(f"Total columns: {len(merged_df.columns)}")
 
230
  st.write("### Data Quality Metrics")
231
  missing_df = pd.DataFrame(
232
  {
 
236
  }
237
  )
238
  st.dataframe(missing_df)
 
239
  duplicates = merged_df.duplicated().sum()
240
  st.write(f"Number of duplicate rows: {duplicates}")
 
241
  except Exception as e:
242
  st.error(f"Error preparing download: {str(e)}")
243
  else:
244
  st.warning("Please upload at least 2 files to merge.")
245
 
 
246
  if __name__ == "__main__":
247
  main()