rairo committed on
Commit
68a00f6
·
verified ·
1 Parent(s): fddc41f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -194
app.py CHANGED
@@ -17,7 +17,9 @@ def clean_column_name(col_name):
17
  """Clean column names to be compatible with Arrow"""
18
  if not isinstance(col_name, str):
19
  return str(col_name)
20
- return col_name.strip()
 
 
21
 
22
  def is_salary_column(column_name: str) -> bool:
23
  """Check if column name suggests it contains salary/monetary data"""
@@ -32,9 +34,7 @@ def clean_monetary_value(value):
32
  if isinstance(value, (int, float)):
33
  return float(value)
34
 
35
- # Convert to string if not already
36
  value_str = str(value)
37
-
38
  # Remove currency symbols, commas, and every other character except digits, decimal points, and minus signs
39
  cleaned = re.sub(r'[^0-9.-]', '', value_str)
40
 
@@ -43,132 +43,92 @@ def clean_monetary_value(value):
43
  except (ValueError, TypeError):
44
  return np.nan
45
 
46
- def safe_convert_column(df: pd.DataFrame, column: str) -> pd.Series:
47
- """Safely convert a column to the appropriate type"""
48
- series = df[column].copy()
49
-
50
- # Handle salary/monetary columns
51
- if is_salary_column(column):
52
- return series.apply(clean_monetary_value)
53
-
54
- # Try numeric conversion first
55
- numeric_series = pd.to_numeric(series, errors='coerce')
56
- if numeric_series.notna().any():
57
- return numeric_series
58
-
59
- # If not numeric, convert to string
60
- return series.astype(str)
61
-
62
- def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
63
- """Clean DataFrame to ensure Arrow compatibility"""
64
- # Create a copy to avoid modifying the original
65
- cleaned_df = df.copy()
66
-
67
- # Clean column names
68
- cleaned_df.columns = [clean_column_name(col) for col in cleaned_df.columns]
69
-
70
- # Process each column
71
- for column in cleaned_df.columns:
72
- try:
73
- cleaned_df[column] = safe_convert_column(cleaned_df, column)
74
- except Exception as e:
75
- st.warning(f"Error processing column {column}: {str(e)}")
76
- # Fallback to string conversion
77
- cleaned_df[column] = cleaned_df[column].astype(str)
78
-
79
- return cleaned_df
80
-
81
- def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
82
- """Prepare DataFrame for safe display in Streamlit"""
83
- display_df = df.copy()
84
-
85
- # Convert all columns to string for display
86
- for col in display_df.columns:
87
- try:
88
- if is_salary_column(col):
89
- # Format monetary values
90
- display_df[col] = display_df[col].apply(lambda x: f"${x:,.2f}" if pd.notna(x) else "")
91
- else:
92
- # Convert other columns to string
93
- display_df[col] = display_df[col].astype(str).apply(lambda x: "" if x == "nan" else x)
94
- except Exception as e:
95
- display_df[col] = display_df[col].astype(str)
96
-
97
- return display_df
98
-
99
- def convert_excel_to_csv(excel_file):
100
- """Convert Excel file to CSV and return the DataFrame"""
101
- try:
102
- df = pd.read_excel(excel_file)
103
- return clean_dataframe(df)
104
- except Exception as e:
105
- st.error(f"Error converting Excel file: {str(e)}")
106
- return None
107
-
108
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
109
- """Analyze DataFrame columns using Gemini AI"""
110
- # Prepare sample data for analysis
111
- display_df = safe_display_df(df.head(5))
112
- sample_csv = display_df.to_csv(index=False)
113
-
114
- analysis_prompt = f"""
115
- Analyze this CSV data from file '{filename}' and provide the following in JSON format:
116
-
117
- CSV Data:
118
- {sample_csv}
119
-
120
- Provide analysis in this exact JSON format:
121
- {{
122
- "subject": "string describing main subject of dataset",
123
- "columns": [
124
- {{"name": "column_name", "type": "data_type", "description": "column description"}}
125
- ],
126
- "key_columns": ["potential columns for merging"],
127
- "issues": ["list of data quality issues found"],
128
- "suggested_renames": {{"old_name": "new_name"}}
129
- }}
130
-
131
- Only respond with the JSON object, no additional text.
132
- """
133
-
134
  try:
135
- response = model.generate_content(analysis_prompt)
136
- return json.loads(response.text)
137
- except Exception as e:
138
- st.error(f"Error analyzing columns: {str(e)}")
139
- return None
140
-
141
- def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
142
- """Merge all DataFrames using specified common columns"""
143
- if not dataframes:
144
- return None
145
-
146
- try:
147
- # Start with the first DataFrame
148
- merged_df = dataframes[0]['df'].copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- # Merge with remaining DataFrames
151
- for df_info in dataframes[1:]:
152
- # Ensure common columns have matching types
153
- for col in common_columns:
154
- if col in merged_df.columns and col in df_info['df'].columns:
155
- # Convert to string if types don't match
156
- if merged_df[col].dtype != df_info['df'][col].dtype:
157
- merged_df[col] = merged_df[col].astype(str)
158
- df_info['df'][col] = df_info['df'][col].astype(str)
159
 
160
- merged_df = pd.merge(
161
- merged_df,
162
- df_info['df'],
163
- on=common_columns,
164
- how='outer',
165
- suffixes=(None, f'_{df_info["filename"]}')
166
- )
167
 
168
- return clean_dataframe(merged_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  except Exception as e:
170
- st.error(f"Error merging DataFrames: {str(e)}")
171
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  def main():
174
  st.title("Smart CSV Processor")
@@ -188,22 +148,24 @@ def main():
188
  st.write(f"#### Analyzing: {uploaded_file.name}")
189
 
190
  try:
191
- # Read and clean data
192
  if uploaded_file.name.endswith(('.xlsx', '.xls')):
193
- df = convert_excel_to_csv(uploaded_file)
194
  else:
195
  df = pd.read_csv(uploaded_file)
196
- df = clean_dataframe(df)
197
 
198
  if df is not None:
 
 
 
199
  # Show initial data preview
200
  st.write("Initial Preview:")
201
- st.dataframe(safe_display_df(df.head()))
202
 
203
- # Analyze columns
204
- with st.spinner("Analyzing columns with AI..."):
205
  analysis = analyze_columns(df, uploaded_file.name)
206
-
207
  if analysis:
208
  st.write("Column Analysis:")
209
  st.json(analysis)
@@ -214,73 +176,11 @@ def main():
214
  'analysis': analysis
215
  })
216
 
217
- if 'suggested_renames' in analysis and analysis['suggested_renames']:
218
- df.rename(columns=analysis['suggested_renames'], inplace=True)
219
- st.write("Updated Preview (after renaming):")
220
- st.dataframe(safe_display_df(df.head()))
221
-
222
  except Exception as e:
223
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
224
  continue
225
-
226
- if len(processed_files) > 1:
227
- st.write("### Merging DataFrames")
228
-
229
- # Find common columns
230
- common_columns = list(set.intersection(*[
231
- set(df_info['df'].columns) for df_info in processed_files
232
- ]))
233
-
234
- if common_columns:
235
- st.write("Common columns found:", common_columns)
236
-
237
- selected_columns = st.multiselect(
238
- "Select columns to use for merging",
239
- options=common_columns,
240
- default=common_columns
241
- )
242
-
243
- if selected_columns:
244
- with st.spinner("Merging datasets..."):
245
- merged_df = merge_dataframes(processed_files, selected_columns)
246
-
247
- if merged_df is not None:
248
- st.write("### Preview of Merged Data")
249
- st.dataframe(safe_display_df(merged_df.head()))
250
-
251
- # Create downloadable CSV
252
- try:
253
- csv = merged_df.to_csv(index=False)
254
- st.download_button(
255
- label="Download Merged CSV",
256
- data=csv,
257
- file_name="merged_data.csv",
258
- mime="text/csv"
259
- )
260
-
261
- # Show statistics
262
- st.write("### Dataset Statistics")
263
- st.write(f"Total rows: {len(merged_df)}")
264
- st.write(f"Total columns: {len(merged_df.columns)}")
265
-
266
- # Data quality metrics
267
- st.write("### Data Quality Metrics")
268
- missing_df = pd.DataFrame({
269
- 'Column': merged_df.columns,
270
- 'Missing Values': merged_df.isnull().sum().values,
271
- 'Missing Percentage': (merged_df.isnull().sum().values / len(merged_df) * 100).round(2)
272
- })
273
- st.dataframe(missing_df)
274
-
275
- duplicates = merged_df.duplicated().sum()
276
- st.write(f"Number of duplicate rows: {duplicates}")
277
-
278
- except Exception as e:
279
- st.error(f"Error preparing download: {str(e)}")
280
- else:
281
- st.warning("No common columns found across datasets.")
282
- else:
283
- st.warning("Please upload at least 2 files to merge.")
284
 
 
 
285
  if __name__ == "__main__":
286
  main()
 
17
  """Clean column names to be compatible with Arrow"""
18
  if not isinstance(col_name, str):
19
  return str(col_name)
20
+ # Remove special characters and extra spaces
21
+ cleaned = re.sub(r'[^\w\s]', ' ', col_name)
22
+ return re.sub(r'\s+', '_', cleaned.strip().lower())
23
 
24
  def is_salary_column(column_name: str) -> bool:
25
  """Check if column name suggests it contains salary/monetary data"""
 
34
  if isinstance(value, (int, float)):
35
  return float(value)
36
 
 
37
  value_str = str(value)
 
38
  # Remove currency symbols, commas, and every other character except digits, decimal points, and minus signs
39
  cleaned = re.sub(r'[^0-9.-]', '', value_str)
40
 
 
43
  except (ValueError, TypeError):
44
  return np.nan
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
47
+ """Analyze DataFrame columns using Gemini AI with improved error handling"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  try:
49
+ # Prepare sample data for analysis
50
+ display_df = df.head(5).copy()
51
+
52
+ # Convert all columns to string for display
53
+ for col in display_df.columns:
54
+ display_df[col] = display_df[col].astype(str)
55
+
56
+ sample_csv = display_df.to_csv(index=False)
57
+
58
+ # Create a more structured prompt
59
+ prompt = f"""
60
+ Analyze this CSV data and provide analysis in JSON format.
61
+ Filename: {filename}
62
+
63
+ Sample data:
64
+ {sample_csv}
65
+
66
+ Respond with only a valid JSON object in this format:
67
+ {{
68
+ "subject": "Employee payroll data",
69
+ "columns": [
70
+ {{
71
+ "name": "column_name",
72
+ "type": "string/number/date",
73
+ "description": "Brief description"
74
+ }}
75
+ ],
76
+ "key_columns": ["employee_id", "tin"],
77
+ "issues": ["Missing values in salary column"],
78
+ "suggested_renames": {{
79
+ "old_name": "new_name"
80
+ }}
81
+ }}
82
+ """
83
 
84
+ response = model.generate_content(prompt)
85
+ response_text = response.text.strip()
86
+
87
+ # Handle potential markdown code block
88
+ if response_text.startswith('```json'):
89
+ response_text = response_text[7:-3] # Remove ```json and ```
90
+ elif response_text.startswith('```'):
91
+ response_text = response_text[3:-3] # Remove ``` and ```
 
92
 
93
+ response_text = response_text.strip()
 
 
 
 
 
 
94
 
95
+ try:
96
+ analysis = json.loads(response_text)
97
+ return analysis
98
+ except json.JSONDecodeError as je:
99
+ st.error(f"JSON parsing error: {str(je)}")
100
+ st.text("Raw response:")
101
+ st.text(response_text)
102
+ return {
103
+ "subject": "Error parsing analysis",
104
+ "columns": [],
105
+ "key_columns": [],
106
+ "issues": ["Error analyzing columns"],
107
+ "suggested_renames": {}
108
+ }
109
+
110
  except Exception as e:
111
+ st.error(f"Error in column analysis: {str(e)}")
112
+ return {
113
+ "subject": "Error in analysis",
114
+ "columns": [],
115
+ "key_columns": [],
116
+ "issues": [str(e)],
117
+ "suggested_renames": {}
118
+ }
119
+
120
+ def read_excel_file(file) -> pd.DataFrame:
121
+ """Read Excel file with improved error handling"""
122
+ try:
123
+ # Try the openpyxl engine first (explicitly requested, not pandas' automatic engine selection)
124
+ return pd.read_excel(file, engine='openpyxl')
125
+ except Exception as e1:
126
+ try:
127
+ # Fall back to the xlrd engine for older Excel files
128
+ return pd.read_excel(file, engine='xlrd')
129
+ except Exception as e2:
130
+ st.error(f"Failed to read Excel file: {str(e2)}")
131
+ return None
132
 
133
  def main():
134
  st.title("Smart CSV Processor")
 
148
  st.write(f"#### Analyzing: {uploaded_file.name}")
149
 
150
  try:
151
+ # Read the file
152
  if uploaded_file.name.endswith(('.xlsx', '.xls')):
153
+ df = read_excel_file(uploaded_file)
154
  else:
155
  df = pd.read_csv(uploaded_file)
 
156
 
157
  if df is not None:
158
+ # Clean column names
159
+ df.columns = [clean_column_name(col) for col in df.columns]
160
+
161
  # Show initial data preview
162
  st.write("Initial Preview:")
163
+ st.dataframe(df.head())
164
 
165
+ # Analyze columns with improved error handling
166
+ with st.spinner("Analyzing columns..."):
167
  analysis = analyze_columns(df, uploaded_file.name)
168
+
169
  if analysis:
170
  st.write("Column Analysis:")
171
  st.json(analysis)
 
176
  'analysis': analysis
177
  })
178
 
 
 
 
 
 
179
  except Exception as e:
180
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
181
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
+ # TODO: merging step is missing — this commit removed merge_dataframes() and the merge/download UI; restore them here.
184
+
185
  if __name__ == "__main__":
186
  main()