File size: 13,157 Bytes
943ef38
 
fddc41f
943ef38
 
7236409
943ef38
 
fddc41f
943ef38
 
531fbac
3a17237
943ef38
91d26e7
fc7b3ea
8018c05
fc7b3ea
3bfe933
 
 
 
91d26e7
d6d231e
3bfe933
fc7b3ea
 
3bfe933
 
 
 
 
 
 
 
d6d231e
5e8d055
3bfe933
fc7b3ea
8018c05
 
 
 
 
3bfe933
8018c05
9c17313
 
 
 
8018c05
fc7b3ea
 
8018c05
5e8d055
 
fc7b3ea
5e8d055
8e09dee
3bfe933
fc7b3ea
3bfe933
 
 
fc7b3ea
8018c05
3bfe933
 
 
 
 
 
 
 
fc7b3ea
8018c05
3bfe933
 
fc7b3ea
8018c05
3bfe933
 
55b92d6
3bfe933
 
 
fc7b3ea
5e8d055
91d26e7
943ef38
fc7b3ea
8018c05
fc7b3ea
943ef38
3bfe933
 
 
 
 
8018c05
3bfe933
320193a
3bfe933
8018c05
 
3bfe933
68a00f6
3bfe933
 
 
 
 
 
 
 
 
 
 
 
 
8018c05
3bfe933
 
8018c05
364e421
68a00f6
fc7b3ea
 
 
68a00f6
364e421
68a00f6
 
364e421
68a00f6
 
 
943ef38
3bfe933
 
8018c05
 
 
 
 
3bfe933
9c17313
c71c6c1
 
fc7b3ea
8018c05
3bfe933
c71c6c1
 
9c17313
c71c6c1
 
 
 
a66eb56
9c17313
c71c6c1
9c17313
c71c6c1
8018c05
9c17313
 
 
 
 
c71c6c1
9c17313
c71c6c1
8018c05
c71c6c1
fc7b3ea
d2c0f12
8018c05
8e09dee
 
 
 
9c17313
 
8018c05
9c17313
 
 
 
3bfe933
8018c05
c71c6c1
8018c05
c71c6c1
8018c05
8e09dee
 
8018c05
a66eb56
 
8018c05
9c17313
fc7b3ea
9c17313
a66eb56
9c17313
 
a66eb56
9c17313
 
 
3bfe933
364e421
 
fc7b3ea
8018c05
fc7b3ea
364e421
 
943ef38
 
 
8018c05
943ef38
 
 
 
 
6250aa7
364e421
68a00f6
6250aa7
 
 
c71c6c1
fc7b3ea
c71c6c1
5e8d055
91d26e7
68a00f6
 
6250aa7
 
 
 
3bfe933
 
8018c05
320193a
c71c6c1
6250aa7
 
 
320193a
364e421
c71c6c1
 
320193a
c71c6c1
320193a
 
 
 
8018c05
320193a
 
 
 
fc7b3ea
 
 
 
 
320193a
 
 
 
 
c71c6c1
320193a
 
 
364e421
 
 
943ef38
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import streamlit as st
import pandas as pd
import numpy as np
import os
from io import BytesIO
import google.generativeai as genai
import json
import tempfile
import re

# Initialize Google Gemini AI client
# NOTE(review): os.environ[...] raises KeyError at import time when
# GOOGLE_API_KEY is unset — confirm this fail-fast behavior is intended.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# Module-level model handle reused by analyze_columns() below.
model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')

def clean_column_name(col_name):
    """
    Normalize a column header for merging.

    Punctuation (anything that is neither a word character nor whitespace)
    becomes a space, runs of whitespace collapse to a single underscore,
    and the result is lower-cased — e.g. "Employee Name" -> "employee_name".
    Non-string inputs are simply stringified.
    """
    if not isinstance(col_name, str):
        return str(col_name)
    spaced = re.sub(r"[^\w\s]", " ", col_name).strip().lower()
    return re.sub(r"\s+", "_", spaced)

def clean_tin_value(val):
    """
    Normalize a TIN to a plain string.

    Surrounding whitespace is stripped. A trailing '.0' — the artifact of
    a numeric column round-tripping through float — is removed by
    converting back through int; values that cannot be parsed numerically
    are returned stripped but otherwise untouched.
    """
    text = str(val).strip()
    if not text.endswith('.0'):
        return text
    try:
        return str(int(float(text)))
    except Exception:
        return text

def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize DataFrame column names and data types:
      - Clean column names (e.g. "Employee Name" becomes "employee_name").
      - Drop any middle name columns.
      - Rename synonyms (e.g., "Personal ID of Employee" to "tin";
        salary/wage/fee-style columns to "salary").
      - Collapse duplicate 'tin'/'salary' columns produced by the renames,
        keeping the first non-null value per row.
      - If missing, construct an 'employee_name' column from first and last names.
      - Ensure key columns (tin and employee_name) are strings and salary is numeric.

    Operates on a copy; the caller's DataFrame is never mutated.
    """
    # The `df.columns = ...` assignment below would otherwise mutate the
    # caller's DataFrame in place.
    df = df.copy()

    # Clean all column names first so every check below sees a uniform
    # snake_case form.
    df.columns = [clean_column_name(col) for col in df.columns]

    # Drop columns containing 'middle_name'. This must happen AFTER name
    # cleaning: a raw header like "Middle Name" only matches once it has
    # been normalized to "middle_name".
    middle_name_cols = [col for col in df.columns if 'middle_name' in col]
    if middle_name_cols:
        df = df.drop(columns=middle_name_cols)

    # Rename synonyms for TIN and salary
    rename_map = {}
    for col in df.columns:
        if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
            rename_map[col] = 'tin'
        elif 'tin' in col:
            # NOTE(review): substring match — an unrelated column whose name
            # happens to contain "tin" (e.g. "rating") would also be renamed
            # here; verify against the expected input headers.
            rename_map[col] = 'tin'
        if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
            rename_map[col] = 'salary'
    if rename_map:
        df = df.rename(columns=rename_map)

    # Combine duplicate columns if necessary — the renames above can map
    # several source columns onto the same target name. bfill across the
    # duplicates keeps the first non-null value per row.
    if 'salary' in df.columns and list(df.columns).count('salary') > 1:
        salary_cols = [col for col in df.columns if col == 'salary']
        df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
        df = df.loc[:, ~df.columns.duplicated()]
    if 'tin' in df.columns and list(df.columns).count('tin') > 1:
        tin_cols = [col for col in df.columns if col == 'tin']
        df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
        df = df.loc[:, ~df.columns.duplicated()]

    # Construct employee_name if missing
    if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
        df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()

    # Ensure proper types for key columns
    if 'salary' in df.columns:
        df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
    if 'tin' in df.columns:
        df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
    if 'employee_name' in df.columns:
        df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()

    return df

def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
    """
    Use Gemini AI to analyze DataFrame columns and suggest key columns and renames.

    Sends the first five rows (stringified, as CSV) together with the
    originating filename to the model and expects a JSON object back.
    On any failure a fallback dict with the same keys is returned so
    callers never crash.
    """
    try:
        display_df = df.head(5).copy()
        # Stringify every column so mixed dtypes serialize cleanly into the prompt.
        for col in display_df.columns:
            display_df[col] = display_df[col].astype(str)
        sample_csv = display_df.to_csv(index=False)
        # Bug fix: the `filename` parameter was previously unused and the
        # prompt hardcoded a placeholder; interpolate the real name so the
        # model can use filename hints (earnings/paye/template).
        prompt = f"""
        Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info for payroll processing.
        Filename: {filename}
        Sample data (first 5 rows):
        {sample_csv}
        Identify potential key columns for merging and suggest renames.
        Respond with a valid JSON object.
        """
        response = model.generate_content(prompt)
        response_text = response.text.strip()
        # Strip a Markdown code fence if the model wrapped its JSON in one.
        # Opening and closing fences are handled independently: the old
        # `[7:-3]` slice assumed a closing fence and chopped real payload
        # bytes whenever it was missing.
        if response_text.startswith("```json"):
            response_text = response_text[len("```json"):]
        elif response_text.startswith("```"):
            response_text = response_text[3:]
        if response_text.endswith("```"):
            response_text = response_text[:-3]
        response_text = response_text.strip()
        try:
            analysis = json.loads(response_text)
            return analysis
        except json.JSONDecodeError as je:
            st.error(f"JSON parsing error: {str(je)}")
            st.text("Raw response:")
            st.text(response_text)
            return {"subject": "Error parsing analysis", "columns": [], "key_columns": [], "issues": ["Error analyzing columns"], "suggested_renames": {}}
    except Exception as e:
        st.error(f"Error in column analysis: {str(e)}")
        return {"subject": "Error in analysis", "columns": [], "key_columns": [], "issues": [str(e)], "suggested_renames": {}}

def read_excel_file(file) -> pd.DataFrame:
    """
    Read an uploaded Excel file with engine fallback.

    Tries openpyxl first (modern .xlsx), then xlrd (legacy .xls). When
    both engines fail, the last error is surfaced in the Streamlit UI and
    None is returned.
    """
    last_error = None
    for engine in ("openpyxl", "xlrd"):
        try:
            return pd.read_excel(file, engine=engine)
        except Exception as exc:
            last_error = exc
    st.error(f"Failed to read Excel file: {str(last_error)}")
    return None

def merge_with_master(processed_files):
    """
    Merge DataFrames in two steps:
      1. Use the earnings file as master (dropping its inaccurate 'tin').
      2. Merge the template file (which supplies the trusted TIN via its first column)
         with the earnings data using 'employee_name'.
      3. Finally, merge the combined data with the PAYE file using 'tin'.

    Parameters:
        processed_files: list of dicts with keys "filename" and "df"
            (as built by main()).

    Returns:
        The merged DataFrame. May return the earnings+template merge only,
        when no trusted 'tin' column is available for the PAYE step.
    """
    earnings_file = None
    paye_file = None
    template_file = None

    # Identify files by filename keywords
    # (if several files match the same keyword, the last upload wins)
    for file_info in processed_files:
        lower_filename = file_info["filename"].lower()
        if "earnings" in lower_filename:
            earnings_file = file_info
        elif "paye" in lower_filename:
            paye_file = file_info
        elif "template" in lower_filename:
            template_file = file_info

    if not earnings_file:
        # NOTE(review): assumes processed_files is non-empty — the caller in
        # main() only invokes this with >= 2 files, but an empty list would
        # raise IndexError here.
        st.warning("No earnings file found as master. Using the first file as master.")
        earnings_file = processed_files[0]
    
    # Process earnings file: drop its inaccurate TIN column
    # (the trusted TIN comes from the template file, merged in below)
    earnings_df = earnings_file["df"]
    if 'tin' in earnings_df.columns:
        earnings_df = earnings_df.drop(columns=['tin'])
    if 'middle_name' in earnings_df.columns:
        earnings_df = earnings_df.drop(columns=['middle_name'])
    
    merged_df = earnings_df.copy()
    
    # Process and merge the template file using employee_name
    if template_file is not None:
        st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
        template_df = template_file["df"].copy()
        # Force the first column (Personal ID of Employee) to be 'tin'
        # — positional by design: the template's first column is trusted
        # regardless of its header text.
        if not template_df.empty:
            cols = list(template_df.columns)
            cols[0] = "tin"
            template_df.columns = cols
        if 'middle_name' in template_df.columns:
            template_df = template_df.drop(columns=['middle_name'])
        # If employee_name is not present, construct it from first_name and last_name
        if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
            template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
        if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
            # Left join keeps every earnings row even when a name has no
            # template match; template-side duplicates get a '_template' suffix.
            merged_df = merged_df.merge(template_df, on='employee_name', how='left', suffixes=('', '_template'))
        else:
            st.warning("Column 'employee_name' missing in either earnings or template file. Skipping template merge.")
    else:
        st.warning("No template file detected. Cannot proceed without a trusted TIN.")
    
    # Check for a trusted 'tin' column after merging earnings and template;
    # without it the PAYE join key does not exist, so stop here.
    if 'tin' not in merged_df.columns or merged_df['tin'].isnull().all():
        st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge. "
                 "Ensure the template file's first column (Personal ID of Employee) is correctly populated.")
        return merged_df

    # Merge PAYE file using the trusted 'tin'
    if paye_file is not None:
        st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
        paye_df = paye_file["df"]
        if 'tin' in paye_df.columns:
            merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
        else:
            st.warning("Column 'tin' missing in the PAYE file. Skipping PAYE merge.")
    else:
        st.warning("No PAYE file detected.")
    
    return merged_df

def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a display-friendly copy of *df* for Streamlit tables: every
    cell is stringified, and the literal placeholders "nan" and "None"
    are blanked out so missing values render as empty cells.
    """
    stringified = df.astype(str)
    return stringified.replace({"nan": "", "None": ""})

def main():
    """
    Streamlit entry point: upload CSV/Excel files, standardize and
    AI-analyze each one, then merge them (earnings file as master) and
    offer the result for download along with basic quality metrics.
    """
    st.title("Smart CSV Processor")
    st.write("Upload CSV or Excel files for intelligent analysis and merging.")
    uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"])
    if uploaded_files:
        st.write("### Processing Files")
        processed_files = []
        for uploaded_file in uploaded_files:
            st.write(f"#### Analyzing: {uploaded_file.name}")
            try:
                # Excel files go through the engine-fallback reader; anything
                # else is assumed to be CSV.
                if uploaded_file.name.endswith((".xlsx", ".xls")):
                    df = read_excel_file(uploaded_file)
                else:
                    df = pd.read_csv(uploaded_file)
                if df is not None:
                    if df.empty:
                        st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
                        continue
                    df = standardize_dataframe(df)
                    st.write("Initial Preview:")
                    st.dataframe(df.head())
                    with st.spinner("Analyzing columns..."):
                        analysis = analyze_columns(df, uploaded_file.name)
                    if analysis:
                        st.write("Column Analysis:")
                        st.json(analysis)
                        # Apply AI-suggested renames on top of the rule-based
                        # standardization already performed above.
                        if 'suggested_renames' in analysis:
                            df = df.rename(columns=analysis['suggested_renames'])
                        processed_files.append({"filename": uploaded_file.name, "df": df, "analysis": analysis})
                else:
                    st.error(f"Could not read data from '{uploaded_file.name}'.")
            except Exception as e:
                # One bad file should not abort the rest of the batch.
                st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                continue

        # Merging needs at least two successfully processed files.
        if len(processed_files) > 1:
            if not any(file_info["df"].empty for file_info in processed_files):
                st.write("### Merging DataFrames (Earnings as Master)")
                merged_df = merge_with_master(processed_files)
                if merged_df is not None and not merged_df.empty:
                    st.write("### Preview of Merged Data")
                    st.dataframe(safe_display_df(merged_df.head()))
                    try:
                        csv = merged_df.to_csv(index=False)
                        st.download_button(label="Download Merged CSV", data=csv, file_name="merged_data.csv", mime="text/csv")
                        st.write("### Dataset Statistics")
                        st.write(f"Total rows: {len(merged_df)}")
                        st.write(f"Total columns: {len(merged_df.columns)}")
                        st.write("### Data Quality Metrics")
                        # Per-column missing-value counts and percentages.
                        missing_df = pd.DataFrame({
                            "Column": merged_df.columns,
                            "Missing Values": merged_df.isnull().sum().values,
                            "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
                        })
                        st.dataframe(missing_df)
                        duplicates = merged_df.duplicated().sum()
                        st.write(f"Number of duplicate rows: {duplicates}")
                    except Exception as e:
                        st.error(f"Error preparing download: {str(e)}")
                elif merged_df is not None and merged_df.empty:
                    st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
            else:
                st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
        else:
            st.warning("Please upload at least 2 files to merge.")

if __name__ == "__main__":
    main()