rairo committed on
Commit
5d021f4
·
verified ·
1 Parent(s): d05ef69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -165
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
@@ -12,150 +13,115 @@ import re
12
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
13
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
14
 
15
-
16
  def clean_column_name(col_name):
17
  """Clean column names to be compatible with Arrow"""
18
- if not isinstance(col_name, str):
19
- return str(col_name)
20
- cleaned = re.sub(r"[^\w\s]", " ", col_name)
21
- return re.sub(r"\s+", "_", cleaned.strip().lower())
22
-
 
23
 
24
  def clean_tin_value(val):
25
- """
26
- Clean the TIN value by stripping whitespace and, if it ends with '.0', converting it to an integer string.
27
- """
28
- val_str = str(val).strip()
29
- if val_str.endswith('.0'):
30
- try:
31
- return str(int(float(val_str)))
32
- except Exception:
33
- return val_str
34
- return val_str
35
-
36
 
37
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
38
- """
39
- Standardize DataFrame column names and data types.
40
- - Renames synonyms to common names (e.g., 'tin', 'salary').
41
- - In particular, any header containing 'personal_id_of_employee' (or similar) or 'tin' is renamed to 'tin'.
42
- - Creates an 'employee_name' column if missing but first_name and last_name exist.
43
- - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
44
- - Forces the key columns 'tin' and 'employee_name' to be strings.
45
- """
46
  rename_map = {}
 
 
 
 
 
 
 
 
 
 
47
 
48
  for col in df.columns:
49
  col_lower = col.lower()
50
- # Rename headers to 'tin'
51
- if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
52
- rename_map[col] = 'tin'
53
- elif 'tin' in col_lower:
54
  rename_map[col] = 'tin'
55
- # Rename headers to 'salary'
56
- if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
57
- rename_map[col] = 'salary'
58
-
59
- if rename_map:
60
- df = df.rename(columns=rename_map)
61
-
62
- # Combine duplicate columns for 'salary'
63
- if 'salary' in df.columns and list(df.columns).count('salary') > 1:
64
- salary_cols = [col for col in df.columns if col == 'salary']
65
- df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
66
- df = df.loc[:, ~df.columns.duplicated()]
67
-
68
- # Combine duplicate columns for 'tin'
69
- if 'tin' in df.columns and list(df.columns).count('tin') > 1:
70
- tin_cols = [col for col in df.columns if col == 'tin']
71
- df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
72
- df = df.loc[:, ~df.columns.duplicated()]
73
-
74
- # Create employee_name if not present but first_name and last_name exist
75
- if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
76
- df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
77
-
78
- # Ensure salary column is numeric (to avoid conversion errors later)
79
- if 'salary' in df.columns:
80
- df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
81
-
82
- # Force key columns to be strings, filling NaNs with empty strings
 
 
83
  if 'tin' in df.columns:
84
- df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
85
- if 'employee_name' in df.columns:
86
- df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
87
-
88
  return df
89
 
90
-
91
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
92
- """Analyze DataFrame columns using Gemini AI with improved error handling"""
93
  try:
94
- display_df = df.head(5).copy()
95
- for col in display_df.columns:
96
- display_df[col] = display_df[col].astype(str)
97
- sample_csv = display_df.to_csv(index=False)
98
-
99
- prompt = f"""
100
- Analyze this CSV data and provide analysis in JSON format.
101
- Filename: {filename}
102
-
103
- Sample data:
104
- {sample_csv}
105
-
106
- Respond with only a valid JSON object in this format:
107
- {{
108
- "subject": "Employee payroll data",
109
- "columns": [
110
- {{
111
- "name": "column_name",
112
- "type": "string/number/date",
113
- "description": "Brief description"
114
- }}
115
- ],
116
- "key_columns": ["employee_id", "tin"],
117
- "issues": ["Missing values in salary column"],
118
- "suggested_renames": {{
119
- "old_name": "new_name"
120
- }}
121
- }}
122
- """
123
-
124
  response = model.generate_content(prompt)
125
- response_text = response.text.strip()
126
-
127
- if response_text.startswith("```json"):
128
- response_text = response_text[7:-3]
129
- elif response_text.startswith("```"):
130
- response_text = response_text[3:-3]
131
-
132
- response_text = response_text.strip()
133
-
134
- try:
135
- analysis = json.loads(response_text)
136
- return analysis
137
- except json.JSONDecodeError as je:
138
- st.error(f"JSON parsing error: {str(je)}")
139
- st.text("Raw response:")
140
- st.text(response_text)
141
- return {
142
- "subject": "Error parsing analysis",
143
- "columns": [],
144
- "key_columns": [],
145
- "issues": ["Error analyzing columns"],
146
- "suggested_renames": {},
147
- }
148
-
149
- except Exception as e:
150
- st.error(f"Error in column analysis: {str(e)}")
151
- return {
152
- "subject": "Error in analysis",
153
- "columns": [],
154
- "key_columns": [],
155
- "issues": [str(e)],
156
- "suggested_renames": {},
157
- }
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  def read_excel_file(file) -> pd.DataFrame:
161
  """Read Excel file with improved error handling"""
@@ -169,45 +135,6 @@ def read_excel_file(file) -> pd.DataFrame:
169
  return None
170
 
171
 
172
- def merge_with_master(processed_files):
173
- """
174
- Merge multiple DataFrames using the earnings schedule file as the master.
175
- The master file is identified by having 'earnings' in its filename (case insensitive).
176
- Other files are merged onto the master using key columns (e.g., 'tin', 'employee_name').
177
- """
178
- master_file = None
179
- other_files = []
180
-
181
- for file_info in processed_files:
182
- if "earnings" in file_info["filename"].lower():
183
- master_file = file_info
184
- else:
185
- other_files.append(file_info)
186
-
187
- if not master_file:
188
- st.warning("No master file with 'earnings' found. Using the first file as master.")
189
- master_file = processed_files[0]
190
- other_files = processed_files[1:]
191
-
192
- master_df = master_file["df"]
193
- st.write(f"Using '{master_file['filename']}' as master for merging.")
194
-
195
- default_keys = ['tin', 'employee_name']
196
- merged_df = master_df
197
-
198
- for other in other_files:
199
- other_df = other["df"]
200
- keys_to_use = [key for key in default_keys if key in other_df.columns and key in merged_df.columns]
201
- if not keys_to_use:
202
- keys_to_use = list(set(merged_df.columns).intersection(set(other_df.columns)))
203
- if keys_to_use:
204
- st.write(f"Merging '{other['filename']}' on keys: {keys_to_use}")
205
- merged_df = merged_df.merge(other_df, on=keys_to_use, how="left")
206
- else:
207
- st.warning(f"No common keys found for merging '{other['filename']}'. Skipping this file.")
208
-
209
- return merged_df
210
-
211
 
212
  def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
213
  """Ensure DataFrame is safe for display in Streamlit"""
 
1
+
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
 
13
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
14
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
15
 
 
16
  def clean_column_name(col_name):
17
  """Clean column names to be compatible with Arrow"""
18
+ cleaned = re.sub(r"[^\w\s]", " ", str(col_name).lower())
19
+ cleaned = re.sub(r"\s+", "_", cleaned.strip())
20
+ # Preserve currency indicators
21
+ if "usd" in cleaned: return cleaned.replace("usd", "_usd")
22
+ if "zw" in cleaned: return cleaned.replace("zw", "_zw")
23
+ return cleaned
24
 
25
  def clean_tin_value(val):
26
+ """Clean TIN while preserving format"""
27
+ val_str = str(val).strip().upper()
28
+ # Remove trailing .0 but keep hyphens and letters
29
+ val_str = re.sub(r"\.0$", "", val_str)
30
+ return re.sub(r"[^\w-]", "", val_str)
 
 
 
 
 
 
31
 
32
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
33
+ """Enhanced standardization for multi-currency support"""
 
 
 
 
 
 
 
34
  rename_map = {}
35
+ currency_keywords = {
36
+ 'salary': ['salary', 'wage', 'earning'],
37
+ 'overtime': ['overtime'],
38
+ 'bonus': ['bonus'],
39
+ 'gratuity': ['gratuity'],
40
+ 'housing': ['housing'],
41
+ 'vehicle': ['vehicle'],
42
+ 'pension': ['pension'],
43
+ 'nssa': ['nssa']
44
+ }
45
 
46
  for col in df.columns:
47
  col_lower = col.lower()
48
+ # Handle TIN first
49
+ if any(kw in col_lower for kw in ['tin', 'personal_id', 'tax_id']):
 
 
50
  rename_map[col] = 'tin'
51
+ continue
52
+
53
+ # Handle currency columns
54
+ found = False
55
+ for base_name, keywords in currency_keywords.items():
56
+ if any(kw in col_lower for kw in keywords):
57
+ currency = '_usd' if 'usd' in col_lower else '_zwl' if any(kw in col_lower for kw in ['zw', 'zwl', 'zwg']) else ''
58
+ new_name = f"{base_name}{currency}"
59
+ rename_map[col] = new_name
60
+ found = True
61
+ break
62
+ if not found:
63
+ if 'name' in col_lower:
64
+ rename_map[col] = 'employee_name'
65
+
66
+ # Apply renaming and handle duplicates
67
+ df = df.rename(columns=rename_map)
68
+
69
+ # Merge similar columns
70
+ for base in currency_keywords.keys():
71
+ cols = [c for c in df.columns if c.startswith(base)]
72
+ if len(cols) > 1:
73
+ df[base] = df[cols].bfill(axis=1).iloc[:, 0]
74
+ df = df.drop(columns=cols)
75
+
76
+ # Create employee_name if split
77
+ if 'employee_name' not in df.columns and {'first_name', 'last_name'}.issubset(df.columns):
78
+ df['employee_name'] = df['first_name'] + ' ' + df['last_name']
79
+
80
+ # Clean TIN column
81
  if 'tin' in df.columns:
82
+ df['tin'] = df['tin'].apply(clean_tin_value).str.strip()
83
+
 
 
84
  return df
85
 
 
86
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
87
+ """Improved analysis prompt for financial data"""
88
  try:
89
+ sample_data = df.head(3).to_dict()
90
+ prompt = f"""Analyze this payroll data from {filename}. Focus on currency columns (USD/ZWL) and employee identifiers.
91
+ Return JSON with columns, key fields, and merging suggestions. Sample: {sample_data}"""
92
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  response = model.generate_content(prompt)
94
+ return json.loads(response.text.replace('```json', '').replace('```', ''))
95
+ except:
96
+ return {"key_columns": ["tin", "employee_name"]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
def merge_with_master(processed_files):
    """Left-merge every processed file onto a master DataFrame.

    The master is the first file whose name contains ``paye``
    (case-insensitive); if there is none, the first file is used. Every other
    file is merged onto it with a left join, matching on a normalized TIN
    and/or a normalized (lower-cased, trimmed) employee name, depending on
    which of those columns both frames have. When both are available a row
    must match on both. Files sharing neither key with the master are skipped.

    Columns of the other file that clash with existing master columns are
    discarded; the master's values win. The input DataFrames are not modified
    and the temporary key columns are removed from the result.

    Args:
        processed_files: List of dicts with at least ``"filename"`` (str) and
            ``"df"`` (DataFrame) entries.

    Returns:
        The merged DataFrame, or an empty DataFrame when no files are given.
    """
    if not processed_files:
        return pd.DataFrame()

    # Track the master by position: comparing DataFrames by identity stops
    # working once the first merge replaces master_df with a new object.
    master_idx = next(
        (
            idx
            for idx, file_info in enumerate(processed_files)
            if "paye" in file_info["filename"].lower()
        ),
        0,
    )
    master_df = processed_files[master_idx]["df"].copy()
    helper_columns = set()

    for idx, other in enumerate(processed_files):
        if idx == master_idx:
            continue

        # Work on a copy so the caller's frame does not gain helper columns.
        other_df = other["df"].copy()
        merge_keys = []

        if 'tin' in master_df.columns and 'tin' in other_df.columns:
            master_df['clean_tin'] = master_df['tin'].apply(clean_tin_value)
            other_df['clean_tin'] = other_df['tin'].apply(clean_tin_value)
            merge_keys.append('clean_tin')

        if 'employee_name' in master_df.columns and 'employee_name' in other_df.columns:
            master_df['clean_name'] = (
                master_df['employee_name'].fillna('').astype(str).str.lower().str.strip()
            )
            other_df['clean_name'] = (
                other_df['employee_name'].fillna('').astype(str).str.lower().str.strip()
            )
            merge_keys.append('clean_name')

        if merge_keys:
            helper_columns.update(merge_keys)
            master_df = pd.merge(
                master_df, other_df, on=merge_keys, how='left', suffixes=('', '_drop')
            )
            # Drop the other file's clashing columns. A boolean mask is used
            # because column labels are not guaranteed to be strings.
            keep = [not str(col).endswith('_drop') for col in master_df.columns]
            master_df = master_df.loc[:, keep]

    return master_df.drop(columns=list(helper_columns), errors='ignore')
125
 
126
  def read_excel_file(file) -> pd.DataFrame:
127
  """Read Excel file with improved error handling"""
 
135
  return None
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
140
  """Ensure DataFrame is safe for display in Streamlit"""