pavanmutha committed on
Commit
fcdbea4
·
verified ·
1 Parent(s): 89c639a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -24
app.py CHANGED
@@ -58,30 +58,40 @@ def set_target_column(col_name):
58
  return f"✅ Target column set to: {col_name}"
59
 
60
  def clean_data(df):
61
- # Drop rows and columns where all values are NaN
 
 
 
62
  df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
63
-
64
- # Convert columns with object (string) data type to strings
65
- for col in df.select_dtypes(include='object').columns:
66
- df[col] = df[col].astype(str)
67
-
68
- # Ensure that the 'Amount' column is treated as a string before using str accessor
69
- if 'Amount' in df.columns:
70
- df['Amount'] = df['Amount'].astype(str).str.replace(',', '').str.replace('$', '').str.strip()
71
-
72
- # Convert categorical columns to numeric using LabelEncoder
 
 
73
  for col in df.select_dtypes(include='object').columns:
74
- if col != 'Amount': # Skip 'Amount' column as it was already cleaned
 
75
  df[col] = LabelEncoder().fit_transform(df[col])
76
-
77
- # Fill missing values in numeric columns with the mean of each column
 
 
78
  df = df.fillna(df.mean(numeric_only=True))
79
-
80
  return df
81
 
82
 
83
 
84
 
 
 
85
  # Extract the JSON payload when the CodeAgent output is not already in the expected format
86
 
87
  def extract_json_from_codeagent_output(raw_output):
@@ -114,11 +124,25 @@ def extract_json_from_codeagent_output(raw_output):
114
  # Return an error if JSON extraction fails
115
  return {"error": "Failed to extract structured JSON"}
116
 
 
 
 
117
  def analyze_data(csv_file, additional_notes=""):
118
  start_time = time.time()
119
  process = psutil.Process(os.getpid())
120
  initial_memory = process.memory_info().rss / 1024 ** 2
121
 
 
 
 
 
 
 
 
 
 
 
 
122
  # Clear or create figures folder
123
  if os.path.exists('./figures'):
124
  shutil.rmtree('./figures')
@@ -129,16 +153,17 @@ def analyze_data(csv_file, additional_notes=""):
129
  run = wandb.init(project="huggingface-data-analysis", config={
130
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
131
  "additional_notes": additional_notes,
132
- "source_file": csv_file.name if csv_file else None
133
  })
134
 
 
135
  agent = CodeAgent(
136
  tools=[],
137
  model=model,
138
  additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
139
  )
140
 
141
- # Run the CodeAgent
142
  raw_output = agent.run("""
143
  You are a data analysis agent. Follow these instructions EXACTLY:
144
  1. Load the data from the given `source_file` ONLY. DO NOT create your OWN DATA.
@@ -146,7 +171,7 @@ def analyze_data(csv_file, additional_notes=""):
146
  3. Save all figures to `./figures` as PNG using matplotlib or seaborn.
147
  4. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
148
  5. DO NOT return any explanations, thoughts, or narration outside the final JSON block
149
- 6. Run only 5 iteration and return output quickly.
150
  7. Output ONLY the following JSON code block format, exactly:
151
  {
152
  'observations': {
@@ -158,14 +183,14 @@ def analyze_data(csv_file, additional_notes=""):
158
  ...
159
  }
160
  }
161
- """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
162
 
163
- # Parse agent output
164
  parsed_result = extract_json_from_codeagent_output(raw_output) or {
165
  "error": "Failed to extract structured JSON"
166
  }
167
 
168
- # Record execution time and memory usage
169
  execution_time = time.time() - start_time
170
  final_memory = process.memory_info().rss / 1024 ** 2
171
  memory_usage = final_memory - initial_memory
@@ -175,14 +200,14 @@ def analyze_data(csv_file, additional_notes=""):
175
  "memory_usage_mb": round(memory_usage, 2)
176
  })
177
 
178
- # Collect generated visualizations
179
  visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
180
  for viz in visuals:
181
  wandb.log({os.path.basename(viz): wandb.Image(viz)})
182
 
183
  run.finish()
184
 
185
- # Generate summary HTML
186
  summary_html = "<h3>📊 Data Analysis Summary</h3>"
187
  if "observations" in parsed_result:
188
  summary_html += "<h4>🔍 Observations</h4><ul>" + "".join(
@@ -195,11 +220,11 @@ def analyze_data(csv_file, additional_notes=""):
195
  if "error" in parsed_result:
196
  summary_html += f"<p style='color:red'><b>Error:</b> {parsed_result['error']}</p>"
197
 
198
- # Return summary HTML and visual paths for gr.HTML + gr.Gallery
199
  return summary_html, visuals
200
 
201
 
202
 
 
203
  def format_analysis_report(raw_output, visuals):
204
  import json
205
 
 
58
  return f"✅ Target column set to: {col_name}"
59
 
60
  def clean_data(df):
61
+ from sklearn.preprocessing import LabelEncoder
62
+ import numpy as np
63
+
64
+ # Drop completely empty rows/columns
65
  df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
66
+
67
+ # Sanitize 'Amount' or similar money/number-looking columns
68
+ for col in df.columns:
69
+ if df[col].dtype == 'object':
70
+ # Attempt cleaning for common currency/number strings
71
+ try:
72
+ cleaned = df[col].str.replace(r'[$,]', '', regex=True).str.strip()
73
+ df[col] = pd.to_numeric(cleaned, errors='ignore') # Keep original if conversion fails
74
+ except Exception:
75
+ pass
76
+
77
+ # Encode any remaining object-type columns
78
  for col in df.select_dtypes(include='object').columns:
79
+ try:
80
+ df[col] = df[col].astype(str)
81
  df[col] = LabelEncoder().fit_transform(df[col])
82
+ except Exception:
83
+ pass
84
+
85
+ # Fill remaining NaNs
86
  df = df.fillna(df.mean(numeric_only=True))
87
+
88
  return df
89
 
90
 
91
 
92
 
93
+
94
+
95
  # Extract the JSON payload when the CodeAgent output is not already in the expected format
96
 
97
  def extract_json_from_codeagent_output(raw_output):
 
124
  # Return an error if JSON extraction fails
125
  return {"error": "Failed to extract structured JSON"}
126
 
127
+ import pandas as pd
128
+ import tempfile
129
+
130
  def analyze_data(csv_file, additional_notes=""):
131
  start_time = time.time()
132
  process = psutil.Process(os.getpid())
133
  initial_memory = process.memory_info().rss / 1024 ** 2
134
 
135
+ # Load and clean the data BEFORE passing to the agent
136
+ try:
137
+ df = pd.read_csv(csv_file)
138
+ df = clean_data(df)
139
+ except Exception as e:
140
+ return f"<p style='color:red'><b>Error loading or cleaning CSV:</b> {e}</p>", []
141
+
142
+ # Save cleaned data to a temporary file
143
+ tmp_cleaned = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode='w')
144
+ df.to_csv(tmp_cleaned.name, index=False)
145
+
146
  # Clear or create figures folder
147
  if os.path.exists('./figures'):
148
  shutil.rmtree('./figures')
 
153
  run = wandb.init(project="huggingface-data-analysis", config={
154
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
155
  "additional_notes": additional_notes,
156
+ "source_file": tmp_cleaned.name
157
  })
158
 
159
+ # Initialize agent
160
  agent = CodeAgent(
161
  tools=[],
162
  model=model,
163
  additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
164
  )
165
 
166
+ # Run the agent on the cleaned file
167
  raw_output = agent.run("""
168
  You are a data analysis agent. Follow these instructions EXACTLY:
169
  1. Load the data from the given `source_file` ONLY. DO NOT create your OWN DATA.
 
171
  3. Save all figures to `./figures` as PNG using matplotlib or seaborn.
172
  4. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
173
  5. DO NOT return any explanations, thoughts, or narration outside the final JSON block
174
+ 6. Run agent efficiently and remove repetitive task and complete in less than 40 seconds.
175
  7. Output ONLY the following JSON code block format, exactly:
176
  {
177
  'observations': {
 
183
  ...
184
  }
185
  }
186
+ """, additional_args={"additional_notes": additional_notes, "source_file": tmp_cleaned})
187
 
188
+ # Parse output
189
  parsed_result = extract_json_from_codeagent_output(raw_output) or {
190
  "error": "Failed to extract structured JSON"
191
  }
192
 
193
+ # Log execution stats
194
  execution_time = time.time() - start_time
195
  final_memory = process.memory_info().rss / 1024 ** 2
196
  memory_usage = final_memory - initial_memory
 
200
  "memory_usage_mb": round(memory_usage, 2)
201
  })
202
 
203
+ # Upload any figures
204
  visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
205
  for viz in visuals:
206
  wandb.log({os.path.basename(viz): wandb.Image(viz)})
207
 
208
  run.finish()
209
 
210
+ # HTML Summary
211
  summary_html = "<h3>📊 Data Analysis Summary</h3>"
212
  if "observations" in parsed_result:
213
  summary_html += "<h4>🔍 Observations</h4><ul>" + "".join(
 
220
  if "error" in parsed_result:
221
  summary_html += f"<p style='color:red'><b>Error:</b> {parsed_result['error']}</p>"
222
 
 
223
  return summary_html, visuals
224
 
225
 
226
 
227
+
228
  def format_analysis_report(raw_output, visuals):
229
  import json
230