pavanmutha committed on
Commit
e989ad4
·
verified ·
1 Parent(s): 06536ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -78
app.py CHANGED
@@ -68,114 +68,111 @@ def clean_data(df):
68
 
69
  # Extract JSON from the CodeAgent output when it is not in the expected format
70
 
71
def extract_json_from_codeagent_output(raw_output):
    """Best-effort extraction of a result dict from CodeAgent output.

    Search order:
      1. Inside each fenced code block (plain, ``py`` or ``python``): the
         dict literal passed to ``print(json.dumps({...}))`` or
         ``json.dumps({...})``, then the one assigned via ``result = {...}``.
      2. Any dict literal anywhere in the raw output.

    Each candidate is parsed as strict JSON first and as a Python literal
    second. A candidate that fails to parse is skipped instead of aborting
    the whole search.

    Args:
        raw_output: Text produced by the agent. A dict is returned
            unchanged; any other non-string value is converted with
            ``str()`` before searching.

    Returns:
        The first candidate that parses to a dict, or ``None`` when nothing
        usable is found.
    """
    import ast
    import json
    import re

    def balanced_braces(text, start):
        # Return the substring from text[start] == "{" up to its matching "}".
        # A lazy regex stops at the first "}", which truncates nested dicts,
        # so braces are counted here while skipping quoted strings.
        depth = 0
        quote = None
        escaped = False
        for idx in range(start, len(text)):
            ch = text[idx]
            if quote is not None:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == quote:
                    quote = None
            elif ch in "\"'":
                quote = ch
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[start:idx + 1]
        return None

    def parse_dict(candidate):
        # Strict JSON first, Python literal second; anything that is not a
        # dict (or does not parse) yields None.
        if candidate is None:
            return None
        try:
            value = json.loads(candidate)
        except ValueError:
            try:
                value = ast.literal_eval(candidate)
            except (ValueError, TypeError, SyntaxError,
                    MemoryError, RecursionError):
                return None
        return value if isinstance(value, dict) else None

    # Each anchor ends right before the opening brace of the dict literal.
    anchors = (
        r"print\(\s*json\.dumps\(\s*(?=\{)",
        r"json\.dumps\(\s*(?=\{)",
        r"result\s*=\s*(?=\{)",
    )

    try:
        if isinstance(raw_output, dict):
            # Already structured; nothing to extract.
            return raw_output
        if not isinstance(raw_output, str):
            raw_output = str(raw_output)

        code_blocks = re.findall(r"```(?:py|python)?\n(.*?)```", raw_output, re.DOTALL)
        for block in code_blocks:
            for anchor in anchors:
                for match in re.finditer(anchor, block):
                    parsed = parse_dict(balanced_braces(block, match.end()))
                    if parsed is not None:
                        return parsed

        # Final fallback: look for any dict-like thing in the entire output.
        pos = raw_output.find("{")
        while pos != -1:
            parsed = parse_dict(balanced_braces(raw_output, pos))
            if parsed is not None:
                return parsed
            pos = raw_output.find("{", pos + 1)
    except Exception as e:
        # Deliberate best-effort boundary: extraction must never crash the
        # caller, so report and fall through to None.
        print(f"extract_json_from_codeagent_output() failed: {e}")
    return None
116
-
117
-
118
-
119
 
120
- # Data Analysis Function with CodeAgent
121
  def analyze_data(csv_file, additional_notes=""):
122
  start_time = time.time()
123
  process = psutil.Process(os.getpid())
124
  initial_memory = process.memory_info().rss / 1024 ** 2
125
-
 
 
 
 
 
 
 
126
  if os.path.exists('./figures'):
127
  shutil.rmtree('./figures')
128
  os.makedirs('./figures', exist_ok=True)
129
-
 
130
  wandb.login(key=os.environ.get('WANDB_API_KEY'))
131
  run = wandb.init(project="huggingface-data-analysis", config={
132
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
133
  "additional_notes": additional_notes,
134
- "source_file": csv_file.name if csv_file else None
135
  })
136
-
137
- agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"])
138
- analysis_result = agent.run("""
 
 
 
 
 
 
139
  You are a helpful data analysis agent. Follow these instructions EXACTLY:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- 1. Load the data from the given `source_file` ONLY.
142
- 2. Analyze the data structure and generate up to 5 visualizations and 5 insights.
143
- 3. Save all figures to `./figures` as PNG using matplotlib or seaborn.
144
- 4. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
145
- 5. DO NOT return any explanations, thoughts, or narration outside the final output block.
146
- 6. Run only 5 iteration and return output quickly.
147
-
148
- ⚠️ Output ONLY the following code block format, exactly:
149
- {
150
- 'observations': {
151
- 'observation_1_key': 'observation_1_value',
152
- 'observation_2_key': 'observation_2_value',
153
- ...
154
- },
155
- 'insights': {
156
- 'insight_1_key': 'insight_1_value',
157
- 'insight_2_key': 'insight_2_value',
158
- ...
159
- }
160
- }
161
- """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
162
-
163
  execution_time = time.time() - start_time
164
  final_memory = process.memory_info().rss / 1024 ** 2
165
  memory_usage = final_memory - initial_memory
166
  wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
167
-
168
- visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
 
169
  for viz in visuals:
170
  wandb.log({os.path.basename(viz): wandb.Image(viz)})
171
-
172
- run.finish()
173
- return format_analysis_report(analysis_result, visuals)
174
-
175
-
176
-
177
 
 
178
 
 
 
 
 
 
 
179
 
180
 
181
 
 
68
 
69
  # Extract JSON from the CodeAgent output when it is not in the expected format
70
 
71
+ import os, json, shutil, time, psutil, tempfile, re, ast
72
+ import pandas as pd
73
+ import wandb
74
 
75
def extract_json_from_codeagent_output(raw_output):
    """Pull the structured result dict out of raw CodeAgent output.

    Fenced code blocks (plain, ``py`` or ``python``) are searched first for
    the dict passed to ``print(json.dumps({...}))`` / ``json.dumps({...})``
    or assigned via ``result = {...}``. If none of those parse, every
    ``{...}`` span in the whole output is tried in order.

    Candidates are matched with brace counting rather than a lazy regex, so
    nested dicts such as ``{"observations": {...}, "insights": {...}}`` are
    captured whole. Each candidate is parsed as JSON first, then as a Python
    literal; unparseable candidates are skipped.

    Args:
        raw_output: Agent output. A dict is returned as-is; other non-string
            values are converted with ``str()`` before searching.

    Returns:
        The first dict successfully parsed, or
        ``{"error": "Failed to extract structured JSON"}`` when extraction
        fails for any reason (including "nothing found").
    """
    failure = {"error": "Failed to extract structured JSON"}
    # Each anchor stops right before the opening brace of the dict literal.
    anchors = (
        r"print\(\s*json\.dumps\(\s*(?=\{)",
        r"json\.dumps\(\s*(?=\{)",
        r"result\s*=\s*(?=\{)",
    )
    try:
        if isinstance(raw_output, dict):
            return raw_output
        if not isinstance(raw_output, str):
            raw_output = str(raw_output)

        code_blocks = re.findall(r"```(?:py|python)?\n(.*?)```", raw_output, re.DOTALL)
        for block in code_blocks:
            for anchor in anchors:
                for match in re.finditer(anchor, block):
                    parsed = _parse_dict_literal(_balanced_braces(block, match.end()))
                    if parsed is not None:
                        return parsed

        # Fallback: any dict literal anywhere in the output.
        pos = raw_output.find("{")
        while pos != -1:
            parsed = _parse_dict_literal(_balanced_braces(raw_output, pos))
            if parsed is not None:
                return parsed
            pos = raw_output.find("{", pos + 1)
    except Exception as e:
        # Best-effort boundary: never let extraction crash the analysis.
        print(f"[extract_json] Error: {e}")
    return failure


def _balanced_braces(text, start):
    """Return the span from ``text[start] == "{"`` to its matching ``}``, or None."""
    depth = 0
    quote = None
    escaped = False
    for idx in range(start, len(text)):
        ch = text[idx]
        if quote is not None:
            # Inside a string literal: braces do not count.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == quote:
                quote = None
        elif ch in "\"'":
            quote = ch
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:idx + 1]
    return None


def _parse_dict_literal(candidate):
    """Parse *candidate* as JSON, then as a Python literal; return a dict or None."""
    if candidate is None:
        return None
    try:
        value = json.loads(candidate)
    except ValueError:
        try:
            value = ast.literal_eval(candidate)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return None
    return value if isinstance(value, dict) else None
 
 
 
96
 
 
97
  def analyze_data(csv_file, additional_notes=""):
98
  start_time = time.time()
99
  process = psutil.Process(os.getpid())
100
  initial_memory = process.memory_info().rss / 1024 ** 2
101
+
102
+ # Load and trim dataset
103
+ df = pd.read_csv(csv_file)
104
+ df_trimmed = df.iloc[:300, :10] # Limit rows and columns for performance
105
+ temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
106
+ df_trimmed.to_csv(temp_path, index=False)
107
+
108
+ # Clear figures
109
  if os.path.exists('./figures'):
110
  shutil.rmtree('./figures')
111
  os.makedirs('./figures', exist_ok=True)
112
+
113
+ # Start W&B
114
  wandb.login(key=os.environ.get('WANDB_API_KEY'))
115
  run = wandb.init(project="huggingface-data-analysis", config={
116
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
117
  "additional_notes": additional_notes,
118
+ "source_file": csv_file.name
119
  })
120
+
121
+ # Create CodeAgent instance
122
+ agent = CodeAgent(
123
+ tools=[],
124
+ model=model,
125
+ additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
126
+ )
127
+
128
+ prompt = f"""
129
  You are a helpful data analysis agent. Follow these instructions EXACTLY:
130
+ 1. Load the data from `source_file` ONLY.
131
+ 2. Generate up to 3 observations and 3 visualizations.
132
+ 3. Save all figures to ./figures as PNGs using matplotlib/seaborn.
133
+ 4. Use only: pandas, numpy, matplotlib.pyplot, seaborn, json.
134
+ 5. ⚠️ Output ONLY the following JSON format inside a single code block:
135
+ {{
136
+ "observations": {{
137
+ "key": "value"
138
+ }},
139
+ "insights": {{
140
+ "key": "value"
141
+ }}
142
+ }}
143
+ 6. Do not include comments or narration.
144
+ 7. Complete the analysis quickly (limit iterations).
145
+ """
146
 
147
+ try:
148
+ raw_output = agent.run(prompt, additional_args={
149
+ "source_file": open(temp_path, "rb"),
150
+ "additional_notes": additional_notes
151
+ })
152
+ parsed_result = extract_json_from_codeagent_output(raw_output)
153
+ except Exception as e:
154
+ print(f"[analyze_data] Agent failed: {e}")
155
+ parsed_result = {"error": str(e)}
156
+
157
+ # Log performance
 
 
 
 
 
 
 
 
 
 
 
158
  execution_time = time.time() - start_time
159
  final_memory = process.memory_info().rss / 1024 ** 2
160
  memory_usage = final_memory - initial_memory
161
  wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
162
+
163
+ # Upload visuals
164
+ visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
165
  for viz in visuals:
166
  wandb.log({os.path.basename(viz): wandb.Image(viz)})
 
 
 
 
 
 
167
 
168
+ run.finish()
169
 
170
+ return {
171
+ "summary": parsed_result,
172
+ "visuals": visuals,
173
+ "execution_time_sec": round(execution_time, 2),
174
+ "memory_usage_mb": round(memory_usage, 2)
175
+ }
176
 
177
 
178