Spaces:
Running
Running
Bhavya committed on
Commit ·
fec76d9
1
Parent(s): 9b80da8
added agent running and e2e eval activities
Browse files- analysis_src/utils.py +70 -0
- download_run_scenario.ipynb +0 -0
- evaluation.ipynb +0 -0
analysis_src/utils.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
from pathlib import Path
|
|
|
|
| 3 |
|
| 4 |
# Model display names (short for figures)
|
| 5 |
# Follows ArtificialAnalysis.ai naming conventions
|
|
@@ -173,3 +174,72 @@ def find_latest_rollout_file(trial_dir: Path) -> Path:
|
|
| 173 |
# Sort by modification time and return the latest
|
| 174 |
return max(rollout_files, key=lambda p: p.stat().st_mtime)
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
from pathlib import Path
|
| 3 |
+
import pandas as pd
|
| 4 |
|
| 5 |
# Model display names (short for figures)
|
| 6 |
# Follows ArtificialAnalysis.ai naming conventions
|
|
|
|
| 174 |
# Sort by modification time and return the latest
|
| 175 |
return max(rollout_files, key=lambda p: p.stat().st_mtime)
|
| 176 |
|
| 177 |
+
def json_to_filtered_df(path: str) -> pd.DataFrame:
    """
    Load a JSON / JSON Lines event log into a tidy, time-ordered DataFrame.

    The file is parsed, nested objects are flattened into dotted column
    names with ``pd.json_normalize``, rows are kept only when their
    ``payload.type`` is ``agent_message``, ``function_call`` or
    ``function_call_output``, and the result is reduced to a fixed set of
    columns and sorted by timestamp ascending.

    Parameters
    ----------
    path : str
        Path to the file (anything ``pathlib.Path`` accepts). Files whose
        suffix is ``.jsonl`` or ``.ndjson`` (case-insensitive) are read as
        one JSON document per non-blank line; any other suffix is read as a
        single JSON document, and a top-level value that is not a list is
        treated as a single record.

    Returns
    -------
    pd.DataFrame
        Frame with exactly these columns, in this order: ``timestamp``,
        ``payload.type``, ``payload.message``, ``payload.role``,
        ``payload.content``, ``payload.name``, ``payload.arguments``,
        ``payload.call_id``, ``payload.output``. Columns absent from the
        data are filled with ``pd.NA``. ``timestamp`` is parsed with
        ``pd.to_datetime(errors="coerce")``, so unparseable values become
        ``NaT`` (sorted last by pandas' default). Rows with equal
        timestamps keep their order from the file, and the index is reset
        to ``0..n-1``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    json.JSONDecodeError
        If the file (or one line of a JSON Lines file) is not valid JSON.
    KeyError
        If no record yields a ``payload.type`` column after flattening
        (this includes a file with no records at all).
    """
    DESIRED_TYPES = {
        "agent_message",
        "function_call",
        "function_call_output",
    }

    # Union of all "useful" columns across the desired payload types
    USEFUL_COLS = [
        "timestamp",
        "payload.type",
        "payload.message",
        "payload.role",
        "payload.content",
        "payload.name",
        "payload.arguments",
        "payload.call_id",
        "payload.output",
    ]

    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"{path} does not exist")

    # 1. Load the records -----------------------------------------------------
    if path.suffix.lower() in {".jsonl", ".ndjson"}:
        records = []
        with path.open("r", encoding="utf-8") as f:
            for lineno, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # tolerate blank lines between records
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as err:
                    # Same exception type, but say which line of which file
                    # is broken (the stock message only knows the one line).
                    raise json.JSONDecodeError(
                        f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                    ) from err
    else:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
        records = data if isinstance(data, list) else [data]

    # 2. Flatten nested JSON --------------------------------------------------
    df = pd.json_normalize(records)

    # 3. Filter by payload.type -----------------------------------------------
    if "payload.type" not in df.columns:
        raise KeyError("'payload.type' column missing from data")
    df = df[df["payload.type"].isin(DESIRED_TYPES)].copy()

    # 4. Ensure all useful columns exist (add empty if missing) ---------------
    for col in USEFUL_COLS:
        if col not in df.columns:
            df[col] = pd.NA

    # 5. Subset to useful columns only ----------------------------------------
    # Explicit copy: the next step assigns a column, and that write should
    # target an independent frame rather than a selection of the wider one.
    df = df[USEFUL_COLS].copy()

    # 6. Sort by timestamp ----------------------------------------------------
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    # Stable sort: events that share a timestamp (e.g. a call and its output)
    # must keep the order in which they were logged.
    df = df.sort_values("timestamp", kind="mergesort", ignore_index=True)

    return df
|
| 245 |
+
|
download_run_scenario.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evaluation.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|