rairo commited on
Commit
91d26e7
·
verified ·
1 Parent(s): 6250aa7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -111
app.py CHANGED
@@ -11,38 +11,88 @@ import tempfile
11
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
12
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
15
  """Clean DataFrame to ensure Arrow compatibility"""
16
- for column in df.columns:
17
- # Convert column name to string if it's not already
18
- if not isinstance(column, str):
19
- df.rename(columns={column: str(column)}, inplace=True)
20
-
21
- # Handle mixed types in columns
22
- if df[column].dtype == 'object':
23
- # Try to convert to numeric, coerce errors to NaN
24
- numeric_conversion = pd.to_numeric(df[column], errors='coerce')
25
- if numeric_conversion.notna().any():
26
- df[column] = numeric_conversion
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  else:
28
- # If not numeric, ensure string type
29
- df[column] = df[column].astype(str)
30
-
31
- # Convert any remaining object types to string
32
- if df[column].dtype == 'object':
33
- df[column] = df[column].astype(str)
34
-
35
- # Handle special cases for numeric columns
36
- if pd.api.types.is_numeric_dtype(df[column]):
37
- # Check if column contains large numbers that might cause overflow
38
- if df[column].max() > 1e9 or df[column].min() < -1e9:
39
- df[column] = df[column].astype('float64')
40
-
41
- # Replace infinity values with NaN
42
- if pd.api.types.is_numeric_dtype(df[column]):
43
- df[column] = df[column].replace([np.inf, -np.inf], np.nan)
44
 
45
- return df
46
 
47
  def convert_excel_to_csv(excel_file):
48
  """Convert Excel file to CSV and return the DataFrame"""
@@ -55,12 +105,9 @@ def convert_excel_to_csv(excel_file):
55
 
56
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
57
  """Analyze DataFrame columns using Gemini AI"""
58
- # Convert sample of DataFrame to CSV string
59
- sample_df = df.head(5).copy()
60
- # Convert all columns to string for analysis
61
- for col in sample_df.columns:
62
- sample_df[col] = sample_df[col].astype(str)
63
- sample_csv = sample_df.to_csv(index=False)
64
 
65
  analysis_prompt = f"""
66
  Analyze this CSV data from file '{filename}' and provide the following in JSON format:
@@ -84,40 +131,23 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
84
 
85
  try:
86
  response = model.generate_content(analysis_prompt)
87
- # Parse JSON response
88
- analysis = json.loads(response.text)
89
- return analysis
90
  except Exception as e:
91
  st.error(f"Error analyzing columns: {str(e)}")
92
  return None
93
 
94
- def find_common_columns(dataframes: List[Dict]) -> List[str]:
95
- """Find potential common columns across all DataFrames based on Gemini analysis"""
96
- all_key_columns = []
97
- for df_info in dataframes:
98
- if df_info['analysis'] and 'key_columns' in df_info['analysis']:
99
- all_key_columns.extend(df_info['analysis']['key_columns'])
100
-
101
- # Count frequency of each column
102
- from collections import Counter
103
- column_freq = Counter(all_key_columns)
104
-
105
- # Return columns that appear in multiple datasets
106
- common_columns = [col for col, freq in column_freq.items() if freq > 1]
107
- return common_columns
108
-
109
  def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
110
  """Merge all DataFrames using specified common columns"""
111
  if not dataframes:
112
  return None
113
 
114
- # Start with the first DataFrame
115
- merged_df = dataframes[0]['df'].copy()
116
-
117
- # Merge with remaining DataFrames
118
- for df_info in dataframes[1:]:
119
- try:
120
- # Ensure common columns have the same type before merging
121
  for col in common_columns:
122
  if col in merged_df.columns and col in df_info['df'].columns:
123
  # Convert to string if types don't match
@@ -132,30 +162,16 @@ def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.Da
132
  how='outer',
133
  suffixes=(None, f'_{df_info["filename"]}')
134
  )
135
- except Exception as e:
136
- st.error(f"Error merging {df_info['filename']}: {str(e)}")
137
- continue
138
-
139
- return clean_dataframe(merged_df)
140
-
141
- def display_dataframe_sample(df: pd.DataFrame, title: str = "Data Preview"):
142
- """Safely display a DataFrame sample in Streamlit"""
143
- try:
144
- st.write(title)
145
- # Create a clean copy for display
146
- display_df = df.head().copy()
147
- # Convert all columns to string for safe display
148
- for col in display_df.columns:
149
- display_df[col] = display_df[col].astype(str)
150
- st.dataframe(display_df)
151
  except Exception as e:
152
- st.error(f"Error displaying DataFrame: {str(e)}")
 
153
 
154
  def main():
155
  st.title("Smart CSV Processor")
156
  st.write("Upload CSV or Excel files for intelligent analysis and merging.")
157
 
158
- # File uploader
159
  uploaded_files = st.file_uploader(
160
  "Choose files",
161
  accept_multiple_files=True,
@@ -164,15 +180,13 @@ def main():
164
 
165
  if uploaded_files:
166
  st.write("### Processing Files")
167
-
168
- # Process each file and store DataFrames with their analysis
169
  processed_files = []
170
 
171
  for uploaded_file in uploaded_files:
172
  st.write(f"#### Analyzing: {uploaded_file.name}")
173
 
174
  try:
175
- # Read file into DataFrame
176
  if uploaded_file.name.endswith(('.xlsx', '.xls')):
177
  df = convert_excel_to_csv(uploaded_file)
178
  else:
@@ -181,9 +195,10 @@ def main():
181
 
182
  if df is not None:
183
  # Show initial data preview
184
- display_dataframe_sample(df, "Initial Preview:")
 
185
 
186
- # Analyze columns using Gemini
187
  with st.spinner("Analyzing columns with AI..."):
188
  analysis = analyze_columns(df, uploaded_file.name)
189
 
@@ -191,18 +206,16 @@ def main():
191
  st.write("Column Analysis:")
192
  st.json(analysis)
193
 
194
- # Store DataFrame and its analysis
195
  processed_files.append({
196
  'filename': uploaded_file.name,
197
  'df': df,
198
  'analysis': analysis
199
  })
200
 
201
- # Apply suggested column renames if any
202
  if 'suggested_renames' in analysis and analysis['suggested_renames']:
203
  df.rename(columns=analysis['suggested_renames'], inplace=True)
204
- st.write("Applied suggested column renames.")
205
- display_dataframe_sample(df, "Updated Preview:")
206
 
207
  except Exception as e:
208
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
@@ -212,12 +225,13 @@ def main():
212
  st.write("### Merging DataFrames")
213
 
214
  # Find common columns
215
- common_columns = find_common_columns(processed_files)
 
 
216
 
217
  if common_columns:
218
- st.write("Detected common columns:", common_columns)
219
 
220
- # Let user select columns to use for merging
221
  selected_columns = st.multiselect(
222
  "Select columns to use for merging",
223
  options=common_columns,
@@ -225,15 +239,14 @@ def main():
225
  )
226
 
227
  if selected_columns:
228
- # Merge DataFrames
229
  with st.spinner("Merging datasets..."):
230
  merged_df = merge_dataframes(processed_files, selected_columns)
231
 
232
  if merged_df is not None:
233
  st.write("### Preview of Merged Data")
234
- display_dataframe_sample(merged_df)
235
 
236
- # Download button for merged CSV
237
  try:
238
  csv = merged_df.to_csv(index=False)
239
  st.download_button(
@@ -242,26 +255,26 @@ def main():
242
  file_name="merged_data.csv",
243
  mime="text/csv"
244
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  except Exception as e:
246
- st.error(f"Error creating download file: {str(e)}")
247
-
248
- # Show statistics
249
- st.write("### Dataset Statistics")
250
- st.write(f"Total rows: {len(merged_df)}")
251
- st.write(f"Total columns: {len(merged_df.columns)}")
252
-
253
- # Show data quality metrics
254
- st.write("### Data Quality Metrics")
255
- missing_values = merged_df.isnull().sum()
256
- st.write("Missing values per column:")
257
- st.dataframe(pd.DataFrame({
258
- 'Column': missing_values.index,
259
- 'Missing Values': missing_values.values
260
- }))
261
-
262
- # Show duplicate check
263
- duplicates = merged_df.duplicated().sum()
264
- st.write(f"Number of duplicate rows: {duplicates}")
265
  else:
266
  st.warning("No common columns found across datasets.")
267
  else:
 
11
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
12
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
13
 
14
def clean_column_name(col_name):
    """Normalize a column label for Arrow compatibility.

    Returns the label as a string with surrounding whitespace removed.
    Non-string labels (e.g. integer headers coming out of Excel) are
    converted with ``str`` first, so they are stripped consistently too —
    the original only stripped labels that were already strings.
    """
    # str() is a no-op for strings, so a single path covers both cases.
    return str(col_name).strip()
19
+
20
def is_salary_column(column_name: str) -> bool:
    """Return True when the column name hints at monetary/salary data.

    Performs a case-insensitive substring match against a fixed list of
    compensation-related keywords.
    """
    money_hints = ('salary', 'wage', 'income', 'earning', 'commission',
                   'fee', 'payment', 'compensation')
    lowered = column_name.lower()
    for hint in money_hints:
        if hint in lowered:
            return True
    return False
25
+
26
def clean_monetary_value(value):
    """Parse a monetary value into a float.

    Accepts raw numbers and strings containing currency symbols and
    thousands separators (e.g. "$1,234.56"). Accounting-style
    parentheses are treated as a negative sign ("(100)" -> -100.0) —
    the original stripped the parentheses and silently returned a
    positive amount. Missing or unparseable values become NaN.
    """
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)

    value_str = str(value).strip()

    # Accounting notation: a fully parenthesized figure is negative.
    negative = value_str.startswith('(') and value_str.endswith(')')

    # Drop currency symbols, commas, spaces, etc. — keep digits, '.', '-'.
    cleaned = re.sub(r'[^0-9.-]', '', value_str)

    try:
        amount = float(cleaned)
    except (ValueError, TypeError):
        return np.nan
    return -amount if negative else amount
43
+
44
def safe_convert_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return *column* from *df* coerced to an appropriate dtype.

    Monetary columns (per ``is_salary_column``) are parsed with
    ``clean_monetary_value``; otherwise a numeric coercion is attempted,
    falling back to plain strings when nothing converts.
    """
    values = df[column].copy()

    # Monetary columns get dedicated currency parsing.
    if is_salary_column(column):
        return values.apply(clean_monetary_value)

    # NOTE(review): pd.to_numeric wins if ANY entry converts, which
    # NaN-outs the non-numeric entries of a mixed column — confirm this
    # is intended before relying on it for text-heavy columns.
    as_numbers = pd.to_numeric(values, errors='coerce')
    if as_numbers.notna().any():
        return as_numbers

    # Nothing numeric at all: normalize everything to strings.
    return values.astype(str)
59
+
60
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return an Arrow-compatible copy of *df*.

    Column labels are normalized via ``clean_column_name`` and each
    column is type-coerced via ``safe_convert_column``; a column that
    fails coercion is stringified as a fallback and a Streamlit warning
    is shown. The caller's DataFrame is never mutated.
    """
    result = df.copy()

    # Normalize every column label up front.
    result.columns = [clean_column_name(label) for label in result.columns]

    # Coerce column-by-column so one bad column can't abort the rest.
    for name in result.columns:
        try:
            result[name] = safe_convert_column(result, name)
        except Exception as exc:
            st.warning(f"Error processing column {name}: {str(exc)}")
            # Last resort: plain string conversion always succeeds.
            result[name] = result[name].astype(str)

    return result
78
+
79
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* rendered as display-safe text.

    Monetary columns (per ``is_salary_column``) are formatted as
    "$1,234.56" with missing values shown blank; every other column is
    stringified with the literal "nan" blanked out. Any per-column
    formatting failure falls back to a plain string cast.
    """
    rendered = df.copy()

    def _money(v):
        # Blank cell for missing amounts, currency format otherwise.
        return f"${v:,.2f}" if pd.notna(v) else ""

    def _text(v):
        s = str(v)
        return "" if s == "nan" else s

    for name in rendered.columns:
        try:
            if is_salary_column(name):
                rendered[name] = rendered[name].apply(_money)
            else:
                rendered[name] = rendered[name].astype(str).apply(_text)
        except Exception:
            # Formatting failed (e.g. non-numeric salary column): fall
            # back to a straight string cast.
            rendered[name] = rendered[name].astype(str)

    return rendered
96
 
97
  def convert_excel_to_csv(excel_file):
98
  """Convert Excel file to CSV and return the DataFrame"""
 
105
 
106
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
107
  """Analyze DataFrame columns using Gemini AI"""
108
+ # Prepare sample data for analysis
109
+ display_df = safe_display_df(df.head(5))
110
+ sample_csv = display_df.to_csv(index=False)
 
 
 
111
 
112
  analysis_prompt = f"""
113
  Analyze this CSV data from file '{filename}' and provide the following in JSON format:
 
131
 
132
  try:
133
  response = model.generate_content(analysis_prompt)
134
+ return json.loads(response.text)
 
 
135
  except Exception as e:
136
  st.error(f"Error analyzing columns: {str(e)}")
137
  return None
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
140
  """Merge all DataFrames using specified common columns"""
141
  if not dataframes:
142
  return None
143
 
144
+ try:
145
+ # Start with the first DataFrame
146
+ merged_df = dataframes[0]['df'].copy()
147
+
148
+ # Merge with remaining DataFrames
149
+ for df_info in dataframes[1:]:
150
+ # Ensure common columns have matching types
151
  for col in common_columns:
152
  if col in merged_df.columns and col in df_info['df'].columns:
153
  # Convert to string if types don't match
 
162
  how='outer',
163
  suffixes=(None, f'_{df_info["filename"]}')
164
  )
165
+
166
+ return clean_dataframe(merged_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  except Exception as e:
168
+ st.error(f"Error merging DataFrames: {str(e)}")
169
+ return None
170
 
171
  def main():
172
  st.title("Smart CSV Processor")
173
  st.write("Upload CSV or Excel files for intelligent analysis and merging.")
174
 
 
175
  uploaded_files = st.file_uploader(
176
  "Choose files",
177
  accept_multiple_files=True,
 
180
 
181
  if uploaded_files:
182
  st.write("### Processing Files")
 
 
183
  processed_files = []
184
 
185
  for uploaded_file in uploaded_files:
186
  st.write(f"#### Analyzing: {uploaded_file.name}")
187
 
188
  try:
189
+ # Read and clean data
190
  if uploaded_file.name.endswith(('.xlsx', '.xls')):
191
  df = convert_excel_to_csv(uploaded_file)
192
  else:
 
195
 
196
  if df is not None:
197
  # Show initial data preview
198
+ st.write("Initial Preview:")
199
+ st.dataframe(safe_display_df(df.head()))
200
 
201
+ # Analyze columns
202
  with st.spinner("Analyzing columns with AI..."):
203
  analysis = analyze_columns(df, uploaded_file.name)
204
 
 
206
  st.write("Column Analysis:")
207
  st.json(analysis)
208
 
 
209
  processed_files.append({
210
  'filename': uploaded_file.name,
211
  'df': df,
212
  'analysis': analysis
213
  })
214
 
 
215
  if 'suggested_renames' in analysis and analysis['suggested_renames']:
216
  df.rename(columns=analysis['suggested_renames'], inplace=True)
217
+ st.write("Updated Preview (after renaming):")
218
+ st.dataframe(safe_display_df(df.head()))
219
 
220
  except Exception as e:
221
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
 
225
  st.write("### Merging DataFrames")
226
 
227
  # Find common columns
228
+ common_columns = list(set.intersection(*[
229
+ set(df_info['df'].columns) for df_info in processed_files
230
+ ]))
231
 
232
  if common_columns:
233
+ st.write("Common columns found:", common_columns)
234
 
 
235
  selected_columns = st.multiselect(
236
  "Select columns to use for merging",
237
  options=common_columns,
 
239
  )
240
 
241
  if selected_columns:
 
242
  with st.spinner("Merging datasets..."):
243
  merged_df = merge_dataframes(processed_files, selected_columns)
244
 
245
  if merged_df is not None:
246
  st.write("### Preview of Merged Data")
247
+ st.dataframe(safe_display_df(merged_df.head()))
248
 
249
+ # Create downloadable CSV
250
  try:
251
  csv = merged_df.to_csv(index=False)
252
  st.download_button(
 
255
  file_name="merged_data.csv",
256
  mime="text/csv"
257
  )
258
+
259
+ # Show statistics
260
+ st.write("### Dataset Statistics")
261
+ st.write(f"Total rows: {len(merged_df)}")
262
+ st.write(f"Total columns: {len(merged_df.columns)}")
263
+
264
+ # Data quality metrics
265
+ st.write("### Data Quality Metrics")
266
+ missing_df = pd.DataFrame({
267
+ 'Column': merged_df.columns,
268
+ 'Missing Values': merged_df.isnull().sum().values,
269
+ 'Missing Percentage': (merged_df.isnull().sum().values / len(merged_df) * 100).round(2)
270
+ })
271
+ st.dataframe(missing_df)
272
+
273
+ duplicates = merged_df.duplicated().sum()
274
+ st.write(f"Number of duplicate rows: {duplicates}")
275
+
276
  except Exception as e:
277
+ st.error(f"Error preparing download: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  else:
279
  st.warning("No common columns found across datasets.")
280
  else: