ITBench-Lite

Running

App Files Community

Bhavya committed on Jan 19

Commit

fec76d9

1 Parent(s): 9b80da8

added agent running and e2e eval activities

Browse files

Files changed (3) hide show

analysis_src/utils.py +70 -0
download_run_scenario.ipynb +0 -0
evaluation.ipynb +0 -0

analysis_src/utils.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import json
 from pathlib import Path
 # Model display names (short for figures)
 # Follows ArtificialAnalysis.ai naming conventions
@@ -173,3 +174,72 @@ def find_latest_rollout_file(trial_dir: Path) -> Path:
     # Sort by modification time and return the latest
     return max(rollout_files, key=lambda p: p.stat().st_mtime)

 import json
 from pathlib import Path
+import pandas as pd
 # Model display names (short for figures)
 # Follows ArtificialAnalysis.ai naming conventions
     # Sort by modification time and return the latest
     return max(rollout_files, key=lambda p: p.stat().st_mtime)
def json_to_filtered_df(path: str) -> pd.DataFrame:
    """
    Load a JSON / JSON Lines event log into a filtered, time-ordered DataFrame.

    The file is parsed, nested objects are flattened into dotted column
    names (``payload.type``, ``payload.name``, ...), only rows whose
    ``payload.type`` is one of ``DESIRED_TYPES`` are kept, the columns are
    reduced to ``USEFUL_COLS`` and the rows are sorted by timestamp ascending.

    Parameters
    ----------
    path : str
        Path to the input file (a ``pathlib.Path`` works as well). Files with
        a ``.jsonl`` or ``.ndjson`` suffix (case-insensitive) are read as one
        JSON document per non-blank line; any other suffix is read as a
        single JSON document, which may be a list of records or one record.

    Returns
    -------
    pd.DataFrame
        One row per retained record, with exactly the ``USEFUL_COLS``
        columns in that order and a fresh 0..n-1 index. Columns absent from
        the data are filled with ``pd.NA``. ``timestamp`` is converted with
        ``pd.to_datetime``; unparseable values become ``NaT`` and sort last.
        Rows with equal timestamps keep their original file order.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    KeyError
        If no record yields a ``payload.type`` column (this includes an
        input that contains no records at all).
    """
    DESIRED_TYPES = {
        "agent_message",
        "function_call",
        "function_call_output",
    }
    # Union of all "useful" columns across the desired payload types.
    USEFUL_COLS = [
        "timestamp",
        "payload.type",
        "payload.message",
        "payload.role",
        "payload.content",
        "payload.name",
        "payload.arguments",
        "payload.call_id",
        "payload.output",
    ]

    # Use a local name instead of rebinding the ``path`` parameter.
    file_path = Path(path)
    if not file_path.exists():
        raise FileNotFoundError(f"{file_path} does not exist")

    # 1. Load the records -----------------------------------------------------
    with file_path.open("r", encoding="utf-8") as f:
        if file_path.suffix.lower() in {".jsonl", ".ndjson"}:
            # One JSON document per line; blank lines are ignored.
            records = [json.loads(line) for line in f if line.strip()]
        else:
            data = json.load(f)
            records = data if isinstance(data, list) else [data]

    # 2. Flatten nested JSON --------------------------------------------------
    df = pd.json_normalize(records)

    # 3. Filter by payload.type ----------------------------------------------
    if "payload.type" not in df.columns:
        raise KeyError("'payload.type' column missing from data")
    df = df[df["payload.type"].isin(DESIRED_TYPES)].copy()

    # 4. Ensure all useful columns exist (add empty if missing) --------------
    for col in USEFUL_COLS:
        if col not in df.columns:
            df[col] = pd.NA

    # 5. Subset to useful columns only ---------------------------------------
    # Explicit copy: the timestamp assignment below must write to an
    # independent frame, not to a selection of the wider one (avoids
    # pandas' chained-assignment / SettingWithCopy warning).
    df = df[USEFUL_COLS].copy()

    # 6. Sort by timestamp ----------------------------------------------------
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    # A stable sort keeps file order for events that share a timestamp
    # (e.g. a call and its output logged in the same instant); the default
    # quicksort gives no such guarantee.
    return df.sort_values("timestamp", kind="mergesort", ignore_index=True)

download_run_scenario.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

evaluation.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff