taagarwa committed on
Commit
be7275a
·
1 Parent(s): 96d72d6

✨ Add new/updated metrics

Browse files
app.py CHANGED
@@ -61,7 +61,8 @@ def init_leaderboard(dataframe):
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
63
 
64
- # Make ColumnFilter choices from md format
 
65
  dataset_choices = sorted({(extract_body(v), v) for v in dataframe["Dataset"]})
66
 
67
  return Leaderboard(
@@ -73,6 +74,7 @@ def init_leaderboard(dataframe):
73
  datatype="markdown",
74
  search_columns=SEARCH_COLUMNS,
75
  filter_columns=[
 
76
  ColumnFilter(label="Dataset", column="Dataset", type="checkboxgroup", choices=dataset_choices),
77
  ColumnFilter(label="Model License", column="Model License", type="checkboxgroup"),
78
  ColumnFilter(label="Harness License", column="Harness License", type="checkboxgroup"),
 
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
63
 
64
+ # Make ColumnFilter choices
65
+ label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
66
  dataset_choices = sorted({(extract_body(v), v) for v in dataframe["Dataset"]})
67
 
68
  return Leaderboard(
 
74
  datatype="markdown",
75
  search_columns=SEARCH_COLUMNS,
76
  filter_columns=[
77
+ ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
78
  ColumnFilter(label="Dataset", column="Dataset", type="checkboxgroup", choices=dataset_choices),
79
  ColumnFilter(label="Model License", column="Model License", type="checkboxgroup"),
80
  ColumnFilter(label="Harness License", column="Harness License", type="checkboxgroup"),
results/claude-opus-4-7-internal.json CHANGED
@@ -24,8 +24,6 @@
24
  "url": "https://www.anthropic.com/news/claude-opus-4-7"
25
  },
26
  "metrics": {
27
- "score": 0.876,
28
- "time": null,
29
- "costUSD": null
30
  }
31
  }
 
24
  "url": "https://www.anthropic.com/news/claude-opus-4-7"
25
  },
26
  "metrics": {
27
+ "score": 0.876
 
 
28
  }
29
  }
results/qwen3-6-35b-internal.json CHANGED
@@ -24,8 +24,6 @@
24
  "url": "https://qwen.ai/blog?id=qwen3.6-35b-a3b"
25
  },
26
  "metrics": {
27
- "score": 0.734,
28
- "time": null,
29
- "costUSD": null
30
  }
31
  }
 
24
  "url": "https://qwen.ai/blog?id=qwen3.6-35b-a3b"
25
  },
26
  "metrics": {
27
+ "score": 0.734
 
 
28
  }
29
  }
results/qwen3-6-35b-nvfp4-claude-code.json CHANGED
@@ -37,8 +37,20 @@
37
  "url": "https://github.com/harbor-framework/harbor"
38
  },
39
  "metrics": {
40
- "score": 0.632,
41
- "time": 21600,
42
- "costUSD": 48.00
 
 
 
 
 
 
 
 
 
 
 
 
43
  }
44
  }
 
37
  "url": "https://github.com/harbor-framework/harbor"
38
  },
39
  "metrics": {
40
+ "n_tasks": 500,
41
+ "n_errors": 1,
42
+ "score": 0.63,
43
+ "n_input_tokens": 1106618897,
44
+ "n_cache_tokens": 0,
45
+ "n_output_tokens": 5733245,
46
+ "n_total_tokens": 1112352142,
47
+ "time_seconds": 122808,
48
+ "cost_usd": 34.11,
49
+ "mean_input_tokens_per_task": 2213237,
50
+ "mean_cache_tokens_per_task": 0,
51
+ "mean_output_tokens_per_task": 11466,
52
+ "mean_tokens_per_task": 2224704,
53
+ "mean_cost_usd_per_task": 0.07,
54
+ "mean_time_seconds_per_task": 245
55
  }
56
  }
results/qwen3-6-35b-nvfp4-opencode.json CHANGED
@@ -38,8 +38,20 @@
38
  "url": "https://github.com/harbor-framework/harbor"
39
  },
40
  "metrics": {
41
- "score": 0.548,
42
- "time": 29940,
43
- "costUSD": 66.53
 
 
 
 
 
 
 
 
 
 
 
 
44
  }
45
  }
 
38
  "url": "https://github.com/harbor-framework/harbor"
39
  },
40
  "metrics": {
41
+ "n_tasks": 500,
42
+ "n_errors": 4,
43
+ "score": 0.55,
44
+ "n_input_tokens": 469806650,
45
+ "n_cache_tokens": 0,
46
+ "n_output_tokens": 4937761,
47
+ "n_total_tokens": 474744411,
48
+ "time_seconds": 120473,
49
+ "cost_usd": 29.75,
50
+ "mean_input_tokens_per_task": 939613,
51
+ "mean_cache_tokens_per_task": 0,
52
+ "mean_output_tokens_per_task": 9875,
53
+ "mean_tokens_per_task": 949488,
54
+ "mean_cost_usd_per_task": 0.06,
55
+ "mean_time_seconds_per_task": 240
56
  }
57
  }
results/qwen3-6-36b-nvfp4-pi.json CHANGED
@@ -38,8 +38,20 @@
38
  "url": "https://github.com/harbor-framework/harbor"
39
  },
40
  "metrics": {
41
- "score": 0.650,
42
- "time": 23160,
43
- "costUSD": 51.47
 
 
 
 
 
 
 
 
 
 
 
 
44
  }
45
  }
 
38
  "url": "https://github.com/harbor-framework/harbor"
39
  },
40
  "metrics": {
41
+ "n_tasks": 500,
42
+ "n_errors": 6,
43
+ "score": 0.65,
44
+ "n_input_tokens": 791183735,
45
+ "n_cache_tokens": 0,
46
+ "n_output_tokens": 6333798,
47
+ "n_total_tokens": 797517533,
48
+ "time_seconds": 154531,
49
+ "cost_usd": 38.16,
50
+ "mean_input_tokens_per_task": 1582367,
51
+ "mean_cache_tokens_per_task": 0,
52
+ "mean_output_tokens_per_task": 12667,
53
+ "mean_tokens_per_task": 1595035,
54
+ "mean_cost_usd_per_task": 0.08,
55
+ "mean_time_seconds_per_task": 309
56
  }
57
  }
src/leaderboard.py CHANGED
@@ -7,6 +7,7 @@ from src.models import Result
7
  RESULTS_DIR = Path(__file__).parent.parent / "results"
8
 
9
  DISPLAY_BY_DEFAULT = [
 
10
  "Dataset",
11
  "Harness",
12
  "Model",
@@ -43,6 +44,7 @@ def get_leaderboard_df():
43
  for result in results:
44
  rows.append(
45
  {
 
46
  "Dataset": f'[{result.dataset.name}]({result.dataset.url})',
47
  "Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
48
  "Model": result.model.name,
@@ -51,8 +53,10 @@ def get_leaderboard_df():
51
  "Skills": str(result.harness.skills) if result.harness.skills else "None",
52
  "Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',
53
  "Score": result.metrics.score,
54
- "Cost (USD)": result.metrics.costUSD,
55
- "Time": format_time(result.metrics.time),
 
 
56
  "Model License": "FOSS" if result.model.is_oss else "Proprietary",
57
  "Harness License": "FOSS" if result.harness.is_oss else "Proprietary",
58
  "Model Num Params (B)": result.model.num_params,
 
7
  RESULTS_DIR = Path(__file__).parent.parent / "results"
8
 
9
  DISPLAY_BY_DEFAULT = [
10
+ " ",
11
  "Dataset",
12
  "Harness",
13
  "Model",
 
44
  for result in results:
45
  rows.append(
46
  {
47
+ " ": "🟠" if result.model.is_oss and result.harness.is_oss else "🔶",
48
  "Dataset": f'[{result.dataset.name}]({result.dataset.url})',
49
  "Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
50
  "Model": result.model.name,
 
53
  "Skills": str(result.harness.skills) if result.harness.skills else "None",
54
  "Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',
55
  "Score": result.metrics.score,
56
+ "Avg Cost Per Task (USD)": result.metrics.mean_cost_usd_per_task,
57
+ "Avg Seconds Per Task": result.metrics.mean_time_seconds_per_task,
58
+ "Avg Input Tokens Per Task": result.metrics.mean_input_tokens_per_task,
59
+ "Avg Output Tokens Per Task": result.metrics.mean_output_tokens_per_task,
60
  "Model License": "FOSS" if result.model.is_oss else "Proprietary",
61
  "Harness License": "FOSS" if result.harness.is_oss else "Proprietary",
62
  "Model Num Params (B)": result.model.num_params,
src/models.py CHANGED
@@ -33,9 +33,22 @@ class Environment(BaseModel):
33
 
34
 
35
  class Metrics(BaseModel):
 
36
  score: float
37
- time: Optional[int] = None
38
- costUSD: Optional[float] = None
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
 
41
  class Result(BaseModel):
 
33
 
34
 
35
  class Metrics(BaseModel):
36
+
37
  score: float
38
+ n_tasks: Optional[int] = None
39
+ n_errors: Optional[int] = None
40
+ n_input_tokens: Optional[int] = None
41
+ n_cache_tokens: Optional[int] = None
42
+ n_output_tokens: Optional[int] = None
43
+ n_total_tokens: Optional[int] = None
44
+ time_seconds: Optional[int] = None
45
+ cost_usd: Optional[float] = None
46
+ mean_input_tokens_per_task: Optional[int] = None
47
+ mean_cache_tokens_per_task: Optional[int] = None
48
+ mean_output_tokens_per_task: Optional[int] = None
49
+ mean_tokens_per_task: Optional[int] = None
50
+ mean_cost_usd_per_task: Optional[float] = None
51
+ mean_time_seconds_per_task: Optional[int] = None
52
 
53
 
54
  class Result(BaseModel):