rairo committed on
Commit
6250aa7
·
verified ·
1 Parent(s): 3a17237

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -41
app.py CHANGED
@@ -11,11 +11,44 @@ import tempfile
11
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
12
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def convert_excel_to_csv(excel_file):
15
  """Convert Excel file to CSV and return the DataFrame"""
16
  try:
17
  df = pd.read_excel(excel_file)
18
- return df
19
  except Exception as e:
20
  st.error(f"Error converting Excel file: {str(e)}")
21
  return None
@@ -23,7 +56,11 @@ def convert_excel_to_csv(excel_file):
23
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
24
  """Analyze DataFrame columns using Gemini AI"""
25
  # Convert sample of DataFrame to CSV string
26
- sample_csv = df.head(5).to_csv(index=False)
 
 
 
 
27
 
28
  analysis_prompt = f"""
29
  Analyze this CSV data from file '{filename}' and provide the following in JSON format:
@@ -80,6 +117,14 @@ def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.Da
80
  # Merge with remaining DataFrames
81
  for df_info in dataframes[1:]:
82
  try:
 
 
 
 
 
 
 
 
83
  merged_df = pd.merge(
84
  merged_df,
85
  df_info['df'],
@@ -91,7 +136,20 @@ def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.Da
91
  st.error(f"Error merging {df_info['filename']}: {str(e)}")
92
  continue
93
 
94
- return merged_df
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  def main():
97
  st.title("Smart CSV Processor")
@@ -113,38 +171,42 @@ def main():
113
  for uploaded_file in uploaded_files:
114
  st.write(f"#### Analyzing: {uploaded_file.name}")
115
 
116
- # Read file into DataFrame
117
- if uploaded_file.name.endswith(('.xlsx', '.xls')):
118
- df = convert_excel_to_csv(uploaded_file)
119
- else:
120
- df = pd.read_csv(uploaded_file)
121
-
122
- if df is not None:
123
- # Show initial data preview
124
- st.write("Initial Preview:")
125
- st.dataframe(df.head())
126
 
127
- # Analyze columns using Gemini
128
- with st.spinner("Analyzing columns with AI..."):
129
- analysis = analyze_columns(df, uploaded_file.name)
130
-
131
- if analysis:
132
- st.write("Column Analysis:")
133
- st.json(analysis)
134
 
135
- # Store DataFrame and its analysis
136
- processed_files.append({
137
- 'filename': uploaded_file.name,
138
- 'df': df,
139
- 'analysis': analysis
140
- })
141
 
142
- # Apply suggested column renames if any
143
- if 'suggested_renames' in analysis and analysis['suggested_renames']:
144
- df.rename(columns=analysis['suggested_renames'], inplace=True)
145
- st.write("Applied suggested column renames.")
146
- st.write("Updated Preview:")
147
- st.dataframe(df.head())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  if len(processed_files) > 1:
150
  st.write("### Merging DataFrames")
@@ -169,16 +231,19 @@ def main():
169
 
170
  if merged_df is not None:
171
  st.write("### Preview of Merged Data")
172
- st.dataframe(merged_df.head())
173
 
174
  # Download button for merged CSV
175
- csv = merged_df.to_csv(index=False)
176
- st.download_button(
177
- label="Download Merged CSV",
178
- data=csv,
179
- file_name="merged_data.csv",
180
- mime="text/csv"
181
- )
 
 
 
182
 
183
  # Show statistics
184
  st.write("### Dataset Statistics")
@@ -189,7 +254,10 @@ def main():
189
  st.write("### Data Quality Metrics")
190
  missing_values = merged_df.isnull().sum()
191
  st.write("Missing values per column:")
192
- st.dataframe(missing_values)
 
 
 
193
 
194
  # Show duplicate check
195
  duplicates = merged_df.duplicated().sum()
 
11
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
12
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
13
 
14
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Clean DataFrame to ensure Arrow compatibility.

    The frame is modified in place and is also returned.

    Steps applied:
      * every column label is converted to ``str``;
      * an ``object`` column becomes numeric only when *all* of its
        non-missing values parse as numbers; otherwise its values are
        converted to ``str`` and missing entries stay missing;
      * numeric columns holding values beyond +/-1e9 are cast to ``float64``;
      * +/-infinity in numeric columns is replaced by NaN.

    Assumes column labels are unique once stringified.

    Args:
        df: DataFrame to clean.

    Returns:
        The same DataFrame object, cleaned.
    """
    # Rename every label before touching any data: renaming inside the loop
    # would leave the loop variable pointing at a label that no longer exists
    # and the following df[column] lookup would raise KeyError.
    df.columns = [str(label) for label in df.columns]

    for column in list(df.columns):
        series = df[column]

        # Handle mixed types in columns
        if series.dtype == 'object':
            present = series.notna()
            as_numbers = pd.to_numeric(series, errors='coerce')
            if present.any() and as_numbers[present].notna().all():
                # Every real value is numeric, so the conversion is lossless.
                df[column] = as_numbers
            else:
                # Coercing here would erase the non-numeric values, so keep
                # the column as text. Masking with `present` stops missing
                # entries from becoming literal "nan"/"None" strings, which
                # would hide them from later isnull() checks.
                df[column] = series.astype(str).where(present)
            series = df[column]

        # Handle special cases for numeric columns
        if pd.api.types.is_numeric_dtype(series):
            # Large magnitudes are widened to float64 (kept from the original
            # as its guard against overflow).
            if series.max() > 1e9 or series.min() < -1e9:
                series = series.astype('float64')
            # Replace infinity values with NaN
            df[column] = series.replace([np.inf, -np.inf], np.nan)

    return df
46
+
47
  def convert_excel_to_csv(excel_file):
48
  """Convert Excel file to CSV and return the DataFrame"""
49
  try:
50
  df = pd.read_excel(excel_file)
51
+ return clean_dataframe(df)
52
  except Exception as e:
53
  st.error(f"Error converting Excel file: {str(e)}")
54
  return None
 
56
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
57
  """Analyze DataFrame columns using Gemini AI"""
58
  # Convert sample of DataFrame to CSV string
59
+ sample_df = df.head(5).copy()
60
+ # Convert all columns to string for analysis
61
+ for col in sample_df.columns:
62
+ sample_df[col] = sample_df[col].astype(str)
63
+ sample_csv = sample_df.to_csv(index=False)
64
 
65
  analysis_prompt = f"""
66
  Analyze this CSV data from file '{filename}' and provide the following in JSON format:
 
117
  # Merge with remaining DataFrames
118
  for df_info in dataframes[1:]:
119
  try:
120
+ # Ensure common columns have the same type before merging
121
+ for col in common_columns:
122
+ if col in merged_df.columns and col in df_info['df'].columns:
123
+ # Convert to string if types don't match
124
+ if merged_df[col].dtype != df_info['df'][col].dtype:
125
+ merged_df[col] = merged_df[col].astype(str)
126
+ df_info['df'][col] = df_info['df'][col].astype(str)
127
+
128
  merged_df = pd.merge(
129
  merged_df,
130
  df_info['df'],
 
136
  st.error(f"Error merging {df_info['filename']}: {str(e)}")
137
  continue
138
 
139
+ return clean_dataframe(merged_df)
140
+
141
def display_dataframe_sample(df: pd.DataFrame, title: str = "Data Preview"):
    """Show *title* followed by the first rows of *df* in Streamlit.

    The preview is built from a stringified copy of ``df.head()``, so the
    caller's frame is left untouched. Any exception raised while writing
    the title or rendering the preview is reported with ``st.error``
    rather than propagated.
    """
    try:
        st.write(title)
        # astype(str) returns a new frame with every column as text, which is
        # what gets displayed.
        preview = df.head().astype(str)
        st.dataframe(preview)
    except Exception as e:
        st.error(f"Error displaying DataFrame: {str(e)}")
153
 
154
  def main():
155
  st.title("Smart CSV Processor")
 
171
  for uploaded_file in uploaded_files:
172
  st.write(f"#### Analyzing: {uploaded_file.name}")
173
 
174
+ try:
175
+ # Read file into DataFrame
176
+ if uploaded_file.name.endswith(('.xlsx', '.xls')):
177
+ df = convert_excel_to_csv(uploaded_file)
178
+ else:
179
+ df = pd.read_csv(uploaded_file)
180
+ df = clean_dataframe(df)
 
 
 
181
 
182
+ if df is not None:
183
+ # Show initial data preview
184
+ display_dataframe_sample(df, "Initial Preview:")
 
 
 
 
185
 
186
+ # Analyze columns using Gemini
187
+ with st.spinner("Analyzing columns with AI..."):
188
+ analysis = analyze_columns(df, uploaded_file.name)
 
 
 
189
 
190
+ if analysis:
191
+ st.write("Column Analysis:")
192
+ st.json(analysis)
193
+
194
+ # Store DataFrame and its analysis
195
+ processed_files.append({
196
+ 'filename': uploaded_file.name,
197
+ 'df': df,
198
+ 'analysis': analysis
199
+ })
200
+
201
+ # Apply suggested column renames if any
202
+ if 'suggested_renames' in analysis and analysis['suggested_renames']:
203
+ df.rename(columns=analysis['suggested_renames'], inplace=True)
204
+ st.write("Applied suggested column renames.")
205
+ display_dataframe_sample(df, "Updated Preview:")
206
+
207
+ except Exception as e:
208
+ st.error(f"Error processing {uploaded_file.name}: {str(e)}")
209
+ continue
210
 
211
  if len(processed_files) > 1:
212
  st.write("### Merging DataFrames")
 
231
 
232
  if merged_df is not None:
233
  st.write("### Preview of Merged Data")
234
+ display_dataframe_sample(merged_df)
235
 
236
  # Download button for merged CSV
237
+ try:
238
+ csv = merged_df.to_csv(index=False)
239
+ st.download_button(
240
+ label="Download Merged CSV",
241
+ data=csv,
242
+ file_name="merged_data.csv",
243
+ mime="text/csv"
244
+ )
245
+ except Exception as e:
246
+ st.error(f"Error creating download file: {str(e)}")
247
 
248
  # Show statistics
249
  st.write("### Dataset Statistics")
 
254
  st.write("### Data Quality Metrics")
255
  missing_values = merged_df.isnull().sum()
256
  st.write("Missing values per column:")
257
+ st.dataframe(pd.DataFrame({
258
+ 'Column': missing_values.index,
259
+ 'Missing Values': missing_values.values
260
+ }))
261
 
262
  # Show duplicate check
263
  duplicates = merged_df.duplicated().sum()