Update app.py
Browse files
app.py
CHANGED
|
@@ -95,84 +95,89 @@ def extract_json_from_codeagent_output(raw_output):
|
|
| 95 |
return {"error": "Failed to extract structured JSON"}
|
| 96 |
|
| 97 |
def analyze_data(csv_file, additional_notes=""):
|
|
|
|
|
|
|
|
|
|
| 98 |
start_time = time.time()
|
| 99 |
process = psutil.Process(os.getpid())
|
| 100 |
initial_memory = process.memory_info().rss / 1024 ** 2
|
| 101 |
|
| 102 |
-
#
|
| 103 |
-
df = pd.read_csv(csv_file)
|
| 104 |
-
df_trimmed = df.iloc[:300, :10] # Limit rows and columns for performance
|
| 105 |
-
temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
|
| 106 |
-
df_trimmed.to_csv(temp_path, index=False)
|
| 107 |
-
|
| 108 |
-
# Clear figures
|
| 109 |
if os.path.exists('./figures'):
|
| 110 |
shutil.rmtree('./figures')
|
| 111 |
os.makedirs('./figures', exist_ok=True)
|
| 112 |
|
| 113 |
-
# Start W&B
|
| 114 |
wandb.login(key=os.environ.get('WANDB_API_KEY'))
|
| 115 |
run = wandb.init(project="huggingface-data-analysis", config={
|
| 116 |
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 117 |
"additional_notes": additional_notes,
|
| 118 |
-
"source_file": csv_file.name
|
| 119 |
})
|
| 120 |
|
| 121 |
-
# Create CodeAgent instance
|
| 122 |
agent = CodeAgent(
|
| 123 |
tools=[],
|
| 124 |
-
model=model,
|
| 125 |
additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
|
| 126 |
)
|
| 127 |
|
| 128 |
-
|
|
|
|
| 129 |
You are a helpful data analysis agent. Follow these instructions EXACTLY:
|
| 130 |
-
1. Load the data from `source_file` ONLY.
|
| 131 |
-
2.
|
| 132 |
-
3. Save all figures to
|
| 133 |
-
4. Use only: pandas
|
| 134 |
-
5.
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
}
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
})
|
| 152 |
-
parsed_result = extract_json_from_codeagent_output(raw_output)
|
| 153 |
-
except Exception as e:
|
| 154 |
-
print(f"[analyze_data] Agent failed: {e}")
|
| 155 |
-
parsed_result = {"error": str(e)}
|
| 156 |
|
| 157 |
-
# Log performance
|
| 158 |
execution_time = time.time() - start_time
|
| 159 |
final_memory = process.memory_info().rss / 1024 ** 2
|
| 160 |
memory_usage = final_memory - initial_memory
|
| 161 |
-
wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
|
| 162 |
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
|
| 165 |
for viz in visuals:
|
| 166 |
wandb.log({os.path.basename(viz): wandb.Image(viz)})
|
| 167 |
|
| 168 |
run.finish()
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
"
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
|
| 178 |
|
|
|
|
| 95 |
return {"error": "Failed to extract structured JSON"}
|
| 96 |
|
| 97 |
def analyze_data(csv_file, additional_notes=""):
    """Run the CodeAgent on an uploaded CSV and summarise the result as HTML.

    The function resets the ``./figures`` folder, starts a Weights & Biases
    run, asks the agent to analyse ``csv_file``, logs execution time, memory
    delta and every generated figure to W&B, and finally renders the agent's
    observations/insights as an HTML fragment.

    Args:
        csv_file: The uploaded file object handed to the agent as
            ``source_file``; its ``.name`` is recorded in the W&B config.
            May be falsy, in which case ``None`` is recorded.
        additional_notes: Free-form user notes forwarded to the agent and
            stored in the W&B config.

    Returns:
        A ``(summary_html, visuals)`` tuple: the HTML summary string and the
        sorted list of image paths found in ``./figures`` (intended for
        ``gr.HTML`` + ``gr.Gallery``).

    Agent and parsing failures are not raised; they are reported through the
    red "Error" paragraph of the returned HTML.
    """
    import html
    import os
    import shutil
    import time

    import psutil

    extraction_error = {"error": "Failed to extract structured JSON"}

    def render_section(heading, entries):
        # Agent output is model-generated text: escape it so it cannot inject
        # markup into the page, and tolerate a non-dict payload when the model
        # ignores the requested schema.
        if not isinstance(entries, dict):
            return f"<h4>{heading}</h4><p>{html.escape(str(entries))}</p>"
        items = "".join(
            f"<li><b>{html.escape(str(k))}:</b> {html.escape(str(v))}</li>"
            for k, v in entries.items()
        )
        return f"<h4>{heading}</h4><ul>{items}</ul>"

    start_time = time.time()
    process = psutil.Process(os.getpid())
    initial_memory = process.memory_info().rss / 1024 ** 2

    # Start from an empty figures folder so only this run's plots are returned.
    if os.path.exists('./figures'):
        shutil.rmtree('./figures')
    os.makedirs('./figures', exist_ok=True)

    wandb.login(key=os.environ.get('WANDB_API_KEY'))
    run = wandb.init(project="huggingface-data-analysis", config={
        "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "additional_notes": additional_notes,
        "source_file": csv_file.name if csv_file else None
    })

    # Everything after wandb.init runs under try/finally so the W&B run is
    # always closed, even when the agent or the logging calls raise.
    try:
        try:
            agent = CodeAgent(
                tools=[],
                model=model,
                additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
            )

            # Run the CodeAgent
            raw_output = agent.run("""
You are a helpful data analysis agent. Follow these instructions EXACTLY:
1. Load the data from the given `source_file` ONLY.
2. Analyze the data structure and generate up to 5 visualizations and 5 insights.
3. Save all figures to `./figures` as PNG using matplotlib or seaborn.
4. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
5. DO NOT return any explanations, thoughts, or narration outside the final output block.
6. Run only 5 iteration and return output quickly.
⚠️ Output ONLY the following code block format, exactly:
{
'observations': {
'observation_1_key': 'observation_1_value',
...
},
'insights': {
'insight_1_key': 'insight_1_value',
...
}
}
""", additional_args={"additional_notes": additional_notes, "source_file": csv_file})

            # Parse agent output
            parsed_result = extract_json_from_codeagent_output(raw_output) or extraction_error
        except Exception as e:
            # Boundary handler: surface the failure in the UI instead of
            # crashing the request; the error branch below displays it.
            print(f"[analyze_data] Agent failed: {e}")
            parsed_result = {"error": str(e)}

        # The rendering below indexes by key, so anything that is not a
        # mapping is treated as a failed extraction.
        if not isinstance(parsed_result, dict):
            parsed_result = extraction_error

        execution_time = time.time() - start_time
        final_memory = process.memory_info().rss / 1024 ** 2
        memory_usage = final_memory - initial_memory

        wandb.log({
            "execution_time_sec": round(execution_time, 2),
            "memory_usage_mb": round(memory_usage, 2)
        })

        # Collect generated visualizations (sorted: os.listdir order is arbitrary)
        visuals = sorted(
            os.path.join('./figures', f)
            for f in os.listdir('./figures')
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )
        for viz in visuals:
            wandb.log({os.path.basename(viz): wandb.Image(viz)})
    finally:
        run.finish()

    # Generate summary HTML
    summary_html = "<h3>📊 Data Analysis Summary</h3>"
    if "observations" in parsed_result:
        summary_html += render_section("🔍 Observations", parsed_result["observations"])
    if "insights" in parsed_result:
        summary_html += render_section("💡 Insights", parsed_result["insights"])
    if "error" in parsed_result:
        summary_html += f"<p style='color:red'><b>Error:</b> {html.escape(str(parsed_result['error']))}</p>"

    # Return summary HTML and visual paths for gr.HTML + gr.Gallery
    return summary_html, visuals
|
| 181 |
|
| 182 |
|
| 183 |
|