AIDataAgentProjectFinal

Paused

App Files Files Community

pavanmutha committed on Apr 18, 2025

Commit

e989ad4

verified ·

1 Parent(s): 06536ec

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -78

app.py CHANGED Viewed

@@ -68,114 +68,111 @@ def clean_data(df):
 # Extract JSON from the CodeAgent output when it is not in the expected format
-def extract_json_from_codeagent_output(raw_output):
-    import re, json, ast
     try:
-        # Extract code blocks from ```python ... ```
         code_blocks = re.findall(r"```(?:py|python)?\n(.*?)```", raw_output, re.DOTALL)
         for block in code_blocks:
-            # Try extracting from print(json.dumps({...}))
-            json_match = re.search(
                 r"print\(\s*json\.dumps\(\s*(\{[\s\S]*?\})\s*\)\s*\)",
-                block,
-                re.DOTALL
-            ) or re.search(
                 r"json\.dumps\(\s*(\{[\s\S]*?\})\s*\)",
-                block,
-                re.DOTALL
-            )
-            if json_match:
-                return json.loads(json_match.group(1))
-            # Try extracting from: result = {...}
-            result_match = re.search(
-                r"result\s*=\s*(\{[\s\S]*?\})",
-                block,
-                re.DOTALL
-            )
-            if result_match:
-                raw_dict = result_match.group(1)
-                try:
-                    return json.loads(raw_dict)  # Try strict JSON
-                except json.JSONDecodeError:
-                    return ast.literal_eval(raw_dict)  # Try Python dict parsing
-        # Final fallback: look for any dict-like thing in entire output
-        fallback_match = re.search(r"\{[\s\S]+\}", raw_output)
-        if fallback_match:
-            raw_dict = fallback_match.group(0)
-            try:
-                return json.loads(raw_dict)
-            except json.JSONDecodeError:
-                return ast.literal_eval(raw_dict)
     except Exception as e:
-        print(f"extract_json_from_codeagent_output() failed: {e}")
-        return None
-# Data Analysis Function with CodeAgent
 def analyze_data(csv_file, additional_notes=""):
     start_time = time.time()
     process = psutil.Process(os.getpid())
     initial_memory = process.memory_info().rss / 1024 ** 2
     if os.path.exists('./figures'):
         shutil.rmtree('./figures')
     os.makedirs('./figures', exist_ok=True)
     wandb.login(key=os.environ.get('WANDB_API_KEY'))
     run = wandb.init(project="huggingface-data-analysis", config={
         "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "additional_notes": additional_notes,
-        "source_file": csv_file.name if csv_file else None
     })
-    agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"])
-    analysis_result = agent.run("""
         You are a helpful data analysis agent. Follow these instructions EXACTLY:
-        1. Load the data from the given `source_file` ONLY.
-        2. Analyze the data structure and generate up to 5 visualizations and 5 insights.
-        3. Save all figures to `./figures` as PNG using matplotlib or seaborn.
-        4. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
-        5. DO NOT return any explanations, thoughts, or narration outside the final output block.
-        6. Run only 5 iteration and return output quickly.
-        ⚠️ Output ONLY the following code block format, exactly:
-        {
-            'observations': {
-                'observation_1_key': 'observation_1_value',
-                'observation_2_key': 'observation_2_value',
-                ...
-            },
-            'insights': {
-                'insight_1_key': 'insight_1_value',
-                'insight_2_key': 'insight_2_value',
-                ...
-            }
-        }
-    """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
     execution_time = time.time() - start_time
     final_memory = process.memory_info().rss / 1024 ** 2
     memory_usage = final_memory - initial_memory
     wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
-    visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
     for viz in visuals:
         wandb.log({os.path.basename(viz): wandb.Image(viz)})
-    run.finish()
-    return format_analysis_report(analysis_result, visuals)

 # Extract JSON from the CodeAgent output when it is not in the expected format
+import os, json, shutil, time, psutil, tempfile, re, ast
+import pandas as pd
+import wandb
+def extract_json_from_codeagent_output(raw_output):
     try:
         code_blocks = re.findall(r"```(?:py|python)?\n(.*?)```", raw_output, re.DOTALL)
         for block in code_blocks:
+            for pattern in [
                 r"print\(\s*json\.dumps\(\s*(\{[\s\S]*?\})\s*\)\s*\)",
                 r"json\.dumps\(\s*(\{[\s\S]*?\})\s*\)",
+                r"result\s*=\s*(\{[\s\S]*?\})"
+            ]:
+                match = re.search(pattern, block, re.DOTALL)
+                if match:
+                    try:
+                        return json.loads(match.group(1))
+                    except json.JSONDecodeError:
+                        return ast.literal_eval(match.group(1))
+        fallback = re.search(r"\{[\s\S]+?\}", raw_output)
+        if fallback:
+            return json.loads(fallback.group(0))
     except Exception as e:
+        print(f"[extract_json] Error: {e}")
+    return {"error": "Failed to extract structured JSON"}
 def analyze_data(csv_file, additional_notes=""):
     start_time = time.time()
     process = psutil.Process(os.getpid())
     initial_memory = process.memory_info().rss / 1024 ** 2
+    # Load and trim dataset
+    df = pd.read_csv(csv_file)
+    df_trimmed = df.iloc[:300, :10]  # Limit rows and columns for performance
+    temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
+    df_trimmed.to_csv(temp_path, index=False)
+    # Clear figures
     if os.path.exists('./figures'):
         shutil.rmtree('./figures')
     os.makedirs('./figures', exist_ok=True)
+    # Start W&B
     wandb.login(key=os.environ.get('WANDB_API_KEY'))
     run = wandb.init(project="huggingface-data-analysis", config={
         "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "additional_notes": additional_notes,
+        "source_file": csv_file.name
     })
+    # Create CodeAgent instance
+    agent = CodeAgent(
+        tools=[],
+        model=model,
+        additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
+    )
+    prompt = f"""
         You are a helpful data analysis agent. Follow these instructions EXACTLY:
+        1. Load the data from `source_file` ONLY.
+        2. Generate up to 3 observations and 3 visualizations.
+        3. Save all figures to ./figures as PNGs using matplotlib/seaborn.
+        4. Use only: pandas, numpy, matplotlib.pyplot, seaborn, json.
+        5. ⚠️ Output ONLY the following JSON format inside a single code block:
+        {{
+            "observations": {{
+                "key": "value"
+            }},
+            "insights": {{
+                "key": "value"
+            }}
+        }}
+        6. Do not include comments or narration.
+        7. Complete the analysis quickly (limit iterations).
+    """
+    try:
+        raw_output = agent.run(prompt, additional_args={
+            "source_file": open(temp_path, "rb"),
+            "additional_notes": additional_notes
+        })
+        parsed_result = extract_json_from_codeagent_output(raw_output)
+    except Exception as e:
+        print(f"[analyze_data] Agent failed: {e}")
+        parsed_result = {"error": str(e)}
+    # Log performance
     execution_time = time.time() - start_time
     final_memory = process.memory_info().rss / 1024 ** 2
     memory_usage = final_memory - initial_memory
     wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
+    # Upload visuals
+    visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
     for viz in visuals:
         wandb.log({os.path.basename(viz): wandb.Image(viz)})
+    run.finish()
+    return {
+        "summary": parsed_result,
+        "visuals": visuals,
+        "execution_time_sec": round(execution_time, 2),
+        "memory_usage_mb": round(memory_usage, 2)
+    }