rairo committed on
Commit
364e421
·
verified ·
1 Parent(s): 68a00f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -41
app.py CHANGED
@@ -13,56 +13,69 @@ import re
13
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
14
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
15
 
 
16
  def clean_column_name(col_name):
17
  """Clean column names to be compatible with Arrow"""
18
  if not isinstance(col_name, str):
19
  return str(col_name)
20
  # Remove special characters and extra spaces
21
- cleaned = re.sub(r'[^\w\s]', ' ', col_name)
22
- return re.sub(r'\s+', '_', cleaned.strip().lower())
 
23
 
24
  def is_salary_column(column_name: str) -> bool:
25
  """Check if column name suggests it contains salary/monetary data"""
26
- salary_keywords = ['salary', 'wage', 'income', 'earning', 'commission', 'fee', 'payment', 'compensation']
 
 
 
 
 
 
 
 
 
27
  column_lower = column_name.lower()
28
  return any(keyword in column_lower for keyword in salary_keywords)
29
 
 
30
  def clean_monetary_value(value):
31
  """Clean monetary values by removing currency symbols and converting to float"""
32
  if pd.isna(value):
33
  return np.nan
34
  if isinstance(value, (int, float)):
35
  return float(value)
36
-
37
  value_str = str(value)
38
  # Remove currency symbols, commas, and other non-numeric characters except decimal points
39
- cleaned = re.sub(r'[^0-9.-]', '', value_str)
40
-
41
  try:
42
  return float(cleaned)
43
  except (ValueError, TypeError):
44
  return np.nan
45
 
 
46
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
47
  """Analyze DataFrame columns using Gemini AI with improved error handling"""
48
  try:
49
  # Prepare sample data for analysis
50
  display_df = df.head(5).copy()
51
-
52
  # Convert all columns to string for display
53
  for col in display_df.columns:
54
  display_df[col] = display_df[col].astype(str)
55
-
56
  sample_csv = display_df.to_csv(index=False)
57
-
58
  # Create a more structured prompt
59
  prompt = f"""
60
  Analyze this CSV data and provide analysis in JSON format.
61
  Filename: {filename}
62
-
63
  Sample data:
64
  {sample_csv}
65
-
66
  Respond with only a valid JSON object in this format:
67
  {{
68
  "subject": "Employee payroll data",
@@ -80,18 +93,18 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
80
  }}
81
  }}
82
  """
83
-
84
  response = model.generate_content(prompt)
85
  response_text = response.text.strip()
86
-
87
  # Handle potential markdown code block
88
- if response_text.startswith('```json'):
89
  response_text = response_text[7:-3] # Remove ```json and ```
90
- elif response_text.startswith('```'):
91
  response_text = response_text[3:-3] # Remove ``` and ```
92
-
93
  response_text = response_text.strip()
94
-
95
  try:
96
  analysis = json.loads(response_text)
97
  return analysis
@@ -104,9 +117,9 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
104
  "columns": [],
105
  "key_columns": [],
106
  "issues": ["Error analyzing columns"],
107
- "suggested_renames": {}
108
  }
109
-
110
  except Exception as e:
111
  st.error(f"Error in column analysis: {str(e)}")
112
  return {
@@ -114,73 +127,142 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
114
  "columns": [],
115
  "key_columns": [],
116
  "issues": [str(e)],
117
- "suggested_renames": {}
118
  }
119
 
 
120
  def read_excel_file(file) -> pd.DataFrame:
121
  """Read Excel file with improved error handling"""
122
  try:
123
  # Try reading with default engine
124
- return pd.read_excel(file, engine='openpyxl')
125
  except Exception as e1:
126
  try:
127
  # Fallback to xlrd engine for older Excel files
128
- return pd.read_excel(file, engine='xlrd')
129
  except Exception as e2:
130
  st.error(f"Failed to read Excel file: {str(e2)}")
131
  return None
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def main():
134
  st.title("Smart CSV Processor")
135
  st.write("Upload CSV or Excel files for intelligent analysis and merging.")
136
 
137
  uploaded_files = st.file_uploader(
138
- "Choose files",
139
- accept_multiple_files=True,
140
- type=['csv', 'xlsx', 'xls']
141
  )
142
 
143
  if uploaded_files:
144
  st.write("### Processing Files")
145
  processed_files = []
146
-
147
  for uploaded_file in uploaded_files:
148
  st.write(f"#### Analyzing: {uploaded_file.name}")
149
-
150
  try:
151
  # Read the file
152
- if uploaded_file.name.endswith(('.xlsx', '.xls')):
153
  df = read_excel_file(uploaded_file)
154
  else:
155
  df = pd.read_csv(uploaded_file)
156
-
157
  if df is not None:
158
  # Clean column names
159
  df.columns = [clean_column_name(col) for col in df.columns]
160
-
161
  # Show initial data preview
162
  st.write("Initial Preview:")
163
  st.dataframe(df.head())
164
-
165
  # Analyze columns with improved error handling
166
  with st.spinner("Analyzing columns..."):
167
  analysis = analyze_columns(df, uploaded_file.name)
168
-
169
  if analysis:
170
  st.write("Column Analysis:")
171
  st.json(analysis)
172
-
173
- processed_files.append({
174
- 'filename': uploaded_file.name,
175
- 'df': df,
176
- 'analysis': analysis
177
- })
178
-
179
  except Exception as e:
180
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
181
  continue
182
 
183
- # Rest of the merging logic remains the same...
184
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  if __name__ == "__main__":
186
  main()
 
13
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
14
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
15
 
16
+
def clean_column_name(col_name):
    """Normalize a column label into a lowercase snake_case name.

    Every character that is neither a word character nor whitespace is
    treated as a separator, runs of whitespace collapse into a single
    underscore, and the result is lower-cased.

    Args:
        col_name: Column label. Non-string labels (ints, tuples, ...) are
            converted with ``str()`` and then cleaned like any other name.

    Returns:
        The cleaned name, or ``"unnamed"`` when nothing usable remains
        (for example a label made only of punctuation).
    """
    # Non-string labels previously skipped cleaning altogether; coerce them
    # first so every label goes through the same normalization.
    text = col_name if isinstance(col_name, str) else str(col_name)
    # Remove special characters and extra spaces
    spaced = re.sub(r"[^\w\s]", " ", text)
    cleaned = re.sub(r"\s+", "_", spaced.strip().lower())
    # An empty column name is awkward to display and select; use a placeholder.
    return cleaned or "unnamed"
24
+
25
 
def is_salary_column(column_name: str) -> bool:
    """Report whether a column name hints at salary or other monetary data.

    The check is a case-insensitive substring search against a fixed set
    of pay-related keywords, so a keyword embedded in a longer word also
    counts as a match.
    """
    lowered = column_name.lower()
    for hint in (
        "salary",
        "wage",
        "income",
        "earning",
        "commission",
        "fee",
        "payment",
        "compensation",
    ):
        if hint in lowered:
            return True
    return False
40
 
41
+
def clean_monetary_value(value):
    """Convert a monetary value to ``float``, ignoring currency formatting.

    Args:
        value: A number, a missing value, or text such as ``"$1,234.50"``.
            Text wrapped in parentheses, e.g. ``"(1,234.50)"``, is read as
            a negative amount (accounting notation).

    Returns:
        The numeric value as ``float``, or ``np.nan`` when the input is
        missing or no number can be parsed from it.
    """
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)

    value_str = str(value).strip()
    # The regex below drops the parentheses, so remember them first;
    # otherwise "(1,234.50)" would silently come back as a positive amount.
    is_parenthesized = value_str.startswith("(") and value_str.endswith(")")
    # Remove currency symbols, commas, and other non-numeric characters except decimal points
    cleaned = re.sub(r"[^0-9.-]", "", value_str)

    try:
        number = float(cleaned)
    except (ValueError, TypeError):
        return np.nan
    # -abs() keeps "(-5)" negative instead of flipping it back to positive.
    return -abs(number) if is_parenthesized else number
57
 
58
+
59
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
60
  """Analyze DataFrame columns using Gemini AI with improved error handling"""
61
  try:
62
  # Prepare sample data for analysis
63
  display_df = df.head(5).copy()
64
+
65
  # Convert all columns to string for display
66
  for col in display_df.columns:
67
  display_df[col] = display_df[col].astype(str)
68
+
69
  sample_csv = display_df.to_csv(index=False)
70
+
71
  # Create a more structured prompt
72
  prompt = f"""
73
  Analyze this CSV data and provide analysis in JSON format.
74
  Filename: {filename}
75
+
76
  Sample data:
77
  {sample_csv}
78
+
79
  Respond with only a valid JSON object in this format:
80
  {{
81
  "subject": "Employee payroll data",
 
93
  }}
94
  }}
95
  """
96
+
97
  response = model.generate_content(prompt)
98
  response_text = response.text.strip()
99
+
100
  # Handle potential markdown code block
101
+ if response_text.startswith("```json"):
102
  response_text = response_text[7:-3] # Remove ```json and ```
103
+ elif response_text.startswith("```"):
104
  response_text = response_text[3:-3] # Remove ``` and ```
105
+
106
  response_text = response_text.strip()
107
+
108
  try:
109
  analysis = json.loads(response_text)
110
  return analysis
 
117
  "columns": [],
118
  "key_columns": [],
119
  "issues": ["Error analyzing columns"],
120
+ "suggested_renames": {},
121
  }
122
+
123
  except Exception as e:
124
  st.error(f"Error in column analysis: {str(e)}")
125
  return {
 
127
  "columns": [],
128
  "key_columns": [],
129
  "issues": [str(e)],
130
+ "suggested_renames": {},
131
  }
132
 
133
+
def read_excel_file(file) -> pd.DataFrame:
    """Read an Excel workbook, trying ``openpyxl`` first and ``xlrd`` second.

    Args:
        file: Path or binary file-like object passed to ``pd.read_excel``.

    Returns:
        The parsed DataFrame, or ``None`` when both engines fail. In that
        case the last error is reported through ``st.error``.
    """
    last_error = None
    # openpyxl is tried first; xlrd is the fallback for older Excel files.
    for engine in ("openpyxl", "xlrd"):
        # A failed attempt may leave a stream part-way through; rewind so
        # each engine starts parsing from the first byte.
        if hasattr(file, "seek"):
            file.seek(0)
        try:
            return pd.read_excel(file, engine=engine)
        except Exception as exc:  # best effort: any failure moves on to the next engine
            last_error = exc

    st.error(f"Failed to read Excel file: {str(last_error)}")
    return None
146
 
147
+
def merge_dataframes(processed_files, selected_columns):
    """Outer-merge the DataFrames of all processed files on shared key columns.

    Args:
        processed_files: Sequence of dicts, each holding a DataFrame under
            the ``"df"`` key. Frames are merged left to right.
        selected_columns: Column names used as join keys.

    Returns:
        The merged DataFrame (the first frame itself when only one file is
        given), or ``None`` when ``processed_files`` is empty.

    Non-key columns that appear in more than one frame keep their plain
    name for the first occurrence and get a ``_file<N>`` suffix (N is the
    1-based file position) for later ones. The default ``_x``/``_y``
    suffixes repeat on every merge and can produce duplicate column names
    once several frames share a column.
    """
    if not processed_files:
        return None

    merged_df = processed_files[0]["df"]
    for position, df_info in enumerate(processed_files[1:], start=2):
        merged_df = merged_df.merge(
            df_info["df"],
            on=selected_columns,
            how="outer",
            suffixes=("", f"_file{position}"),
        )
    return merged_df
154
+
155
+
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a string-only copy of ``df`` for display in Streamlit.

    Every cell is cast to ``str``; cells whose text is exactly ``"nan"``
    or ``"None"`` are then blanked out.
    """
    blank_out = {"nan": "", "None": ""}
    as_text = df.astype(str)
    return as_text.replace(blank_out)
159
+
160
+
def _load_uploaded_file(uploaded_file):
    """Read one uploaded file into a DataFrame, picking the reader by extension."""
    # Lower-case the name first so "REPORT.XLSX" is not routed to the CSV reader.
    if uploaded_file.name.lower().endswith((".xlsx", ".xls")):
        return read_excel_file(uploaded_file)
    return pd.read_csv(uploaded_file)


def _show_merged_results(merged_df):
    """Render the merged-data preview, the CSV download button, and quality metrics."""
    st.write("### Preview of Merged Data")
    st.dataframe(safe_display_df(merged_df.head()))

    # Create downloadable CSV
    try:
        csv_text = merged_df.to_csv(index=False)
        st.download_button(
            label="Download Merged CSV",
            data=csv_text,
            file_name="merged_data.csv",
            mime="text/csv",
        )

        # Show statistics
        row_count = len(merged_df)
        st.write("### Dataset Statistics")
        st.write(f"Total rows: {row_count}")
        st.write(f"Total columns: {len(merged_df.columns)}")

        # Data quality metrics
        st.write("### Data Quality Metrics")
        missing_counts = merged_df.isnull().sum().values
        missing_df = pd.DataFrame(
            {
                "Column": merged_df.columns,
                "Missing Values": missing_counts,
                # max(..., 1) keeps an empty merge result from dividing by zero.
                "Missing Percentage": (missing_counts / max(row_count, 1) * 100).round(2),
            }
        )
        st.dataframe(missing_df)

        duplicates = merged_df.duplicated().sum()
        st.write(f"Number of duplicate rows: {duplicates}")

    except Exception as e:
        st.error(f"Error preparing download: {str(e)}")


def main():
    """Run the Streamlit page: upload files, analyze each one, then merge them.

    Each uploaded CSV/Excel file is read, its column names are cleaned, a
    preview is shown, and the columns are analyzed. When at least two files
    were processed, the user picks join keys among the columns common to
    all of them and the outer-merged result is previewed, offered as a CSV
    download, and summarized.
    """
    st.title("Smart CSV Processor")
    st.write("Upload CSV or Excel files for intelligent analysis and merging.")

    uploaded_files = st.file_uploader(
        "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
    )

    if not uploaded_files:
        return

    st.write("### Processing Files")
    processed_files = []

    for uploaded_file in uploaded_files:
        st.write(f"#### Analyzing: {uploaded_file.name}")

        try:
            # Read the file
            df = _load_uploaded_file(uploaded_file)
            if df is None:
                # The reader already reported the failure; skip this file.
                continue

            # Clean column names
            df.columns = [clean_column_name(col) for col in df.columns]

            # Show initial data preview
            st.write("Initial Preview:")
            st.dataframe(df.head())

            # Analyze columns with improved error handling
            with st.spinner("Analyzing columns..."):
                analysis = analyze_columns(df, uploaded_file.name)

            if analysis:
                st.write("Column Analysis:")
                st.json(analysis)

                processed_files.append(
                    {"filename": uploaded_file.name, "df": df, "analysis": analysis}
                )

        except Exception as e:
            # One bad file must not stop the remaining uploads from being processed.
            st.error(f"Error processing {uploaded_file.name}: {str(e)}")
            continue

    if len(processed_files) <= 1:
        st.warning("Please upload at least 2 files to merge.")
        return

    st.write("### Merging DataFrames")

    # Find common columns; sorted so the option order is the same on every rerun
    # (plain set iteration order is not stable between processes).
    common_columns = sorted(
        set.intersection(*[set(df_info["df"].columns) for df_info in processed_files])
    )

    if not common_columns:
        st.warning("No common columns found across datasets.")
        return

    st.write("Common columns found:", common_columns)

    selected_columns = st.multiselect(
        "Select columns to use for merging", options=common_columns, default=common_columns
    )

    if not selected_columns:
        return

    with st.spinner("Merging datasets..."):
        merged_df = merge_dataframes(processed_files, selected_columns)

    if merged_df is not None:
        _show_merged_results(merged_df)
265
+
266
+
267
  if __name__ == "__main__":
268
  main()