✨ Add new/updated metrics
Browse files
- app.py +3 -1
- results/claude-opus-4-7-internal.json +1 -3
- results/qwen3-6-35b-internal.json +1 -3
- results/qwen3-6-35b-nvfp4-claude-code.json +15 -3
- results/qwen3-6-35b-nvfp4-opencode.json +15 -3
- results/qwen3-6-36b-nvfp4-pi.json +15 -3
- src/leaderboard.py +6 -2
- src/models.py +15 -2
app.py
CHANGED
|
@@ -61,7 +61,8 @@ def init_leaderboard(dataframe):
|
|
| 61 |
if dataframe is None or dataframe.empty:
|
| 62 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 63 |
|
| 64 |
-
# Make ColumnFilter choices
|
|
|
|
| 65 |
dataset_choices = sorted({(extract_body(v), v) for v in dataframe["Dataset"]})
|
| 66 |
|
| 67 |
return Leaderboard(
|
|
@@ -73,6 +74,7 @@ def init_leaderboard(dataframe):
|
|
| 73 |
datatype="markdown",
|
| 74 |
search_columns=SEARCH_COLUMNS,
|
| 75 |
filter_columns=[
|
|
|
|
| 76 |
ColumnFilter(label="Dataset", column="Dataset", type="checkboxgroup", choices=dataset_choices),
|
| 77 |
ColumnFilter(label="Model License", column="Model License", type="checkboxgroup"),
|
| 78 |
ColumnFilter(label="Harness License", column="Harness License", type="checkboxgroup"),
|
|
|
|
| 61 |
if dataframe is None or dataframe.empty:
|
| 62 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 63 |
|
| 64 |
+
# Make ColumnFilter choices
|
| 65 |
+
label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
|
| 66 |
dataset_choices = sorted({(extract_body(v), v) for v in dataframe["Dataset"]})
|
| 67 |
|
| 68 |
return Leaderboard(
|
|
|
|
| 74 |
datatype="markdown",
|
| 75 |
search_columns=SEARCH_COLUMNS,
|
| 76 |
filter_columns=[
|
| 77 |
+
ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
|
| 78 |
ColumnFilter(label="Dataset", column="Dataset", type="checkboxgroup", choices=dataset_choices),
|
| 79 |
ColumnFilter(label="Model License", column="Model License", type="checkboxgroup"),
|
| 80 |
ColumnFilter(label="Harness License", column="Harness License", type="checkboxgroup"),
|
results/claude-opus-4-7-internal.json
CHANGED
|
@@ -24,8 +24,6 @@
|
|
| 24 |
"url": "https://www.anthropic.com/news/claude-opus-4-7"
|
| 25 |
},
|
| 26 |
"metrics": {
|
| 27 |
-
"score": 0.876
|
| 28 |
-
"time": null,
|
| 29 |
-
"costUSD": null
|
| 30 |
}
|
| 31 |
}
|
|
|
|
| 24 |
"url": "https://www.anthropic.com/news/claude-opus-4-7"
|
| 25 |
},
|
| 26 |
"metrics": {
|
| 27 |
+
"score": 0.876
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
}
|
results/qwen3-6-35b-internal.json
CHANGED
|
@@ -24,8 +24,6 @@
|
|
| 24 |
"url": "https://qwen.ai/blog?id=qwen3.6-35b-a3b"
|
| 25 |
},
|
| 26 |
"metrics": {
|
| 27 |
-
"score": 0.734
|
| 28 |
-
"time": null,
|
| 29 |
-
"costUSD": null
|
| 30 |
}
|
| 31 |
}
|
|
|
|
| 24 |
"url": "https://qwen.ai/blog?id=qwen3.6-35b-a3b"
|
| 25 |
},
|
| 26 |
"metrics": {
|
| 27 |
+
"score": 0.734
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
}
|
results/qwen3-6-35b-nvfp4-claude-code.json
CHANGED
|
@@ -37,8 +37,20 @@
|
|
| 37 |
"url": "https://github.com/harbor-framework/harbor"
|
| 38 |
},
|
| 39 |
"metrics": {
|
| 40 |
-
"
|
| 41 |
-
"
|
| 42 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
}
|
|
|
|
| 37 |
"url": "https://github.com/harbor-framework/harbor"
|
| 38 |
},
|
| 39 |
"metrics": {
|
| 40 |
+
"n_tasks": 500,
|
| 41 |
+
"n_errors": 1,
|
| 42 |
+
"score": 0.63,
|
| 43 |
+
"n_input_tokens": 1106618897,
|
| 44 |
+
"n_cache_tokens": 0,
|
| 45 |
+
"n_output_tokens": 5733245,
|
| 46 |
+
"n_total_tokens": 1112352142,
|
| 47 |
+
"time_seconds": 122808,
|
| 48 |
+
"cost_usd": 34.11,
|
| 49 |
+
"mean_input_tokens_per_task": 2213237,
|
| 50 |
+
"mean_cache_tokens_per_task": 0,
|
| 51 |
+
"mean_output_tokens_per_task": 11466,
|
| 52 |
+
"mean_tokens_per_task": 2224704,
|
| 53 |
+
"mean_cost_usd_per_task": 0.07,
|
| 54 |
+
"mean_time_seconds_per_task": 245
|
| 55 |
}
|
| 56 |
}
|
results/qwen3-6-35b-nvfp4-opencode.json
CHANGED
|
@@ -38,8 +38,20 @@
|
|
| 38 |
"url": "https://github.com/harbor-framework/harbor"
|
| 39 |
},
|
| 40 |
"metrics": {
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
}
|
| 45 |
}
|
|
|
|
| 38 |
"url": "https://github.com/harbor-framework/harbor"
|
| 39 |
},
|
| 40 |
"metrics": {
|
| 41 |
+
"n_tasks": 500,
|
| 42 |
+
"n_errors": 4,
|
| 43 |
+
"score": 0.55,
|
| 44 |
+
"n_input_tokens": 469806650,
|
| 45 |
+
"n_cache_tokens": 0,
|
| 46 |
+
"n_output_tokens": 4937761,
|
| 47 |
+
"n_total_tokens": 474744411,
|
| 48 |
+
"time_seconds": 120473,
|
| 49 |
+
"cost_usd": 29.75,
|
| 50 |
+
"mean_input_tokens_per_task": 939613,
|
| 51 |
+
"mean_cache_tokens_per_task": 0,
|
| 52 |
+
"mean_output_tokens_per_task": 9875,
|
| 53 |
+
"mean_tokens_per_task": 949488,
|
| 54 |
+
"mean_cost_usd_per_task": 0.06,
|
| 55 |
+
"mean_time_seconds_per_task": 240
|
| 56 |
}
|
| 57 |
}
|
results/qwen3-6-36b-nvfp4-pi.json
CHANGED
|
@@ -38,8 +38,20 @@
|
|
| 38 |
"url": "https://github.com/harbor-framework/harbor"
|
| 39 |
},
|
| 40 |
"metrics": {
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
}
|
| 45 |
}
|
|
|
|
| 38 |
"url": "https://github.com/harbor-framework/harbor"
|
| 39 |
},
|
| 40 |
"metrics": {
|
| 41 |
+
"n_tasks": 500,
|
| 42 |
+
"n_errors": 6,
|
| 43 |
+
"score": 0.65,
|
| 44 |
+
"n_input_tokens": 791183735,
|
| 45 |
+
"n_cache_tokens": 0,
|
| 46 |
+
"n_output_tokens": 6333798,
|
| 47 |
+
"n_total_tokens": 797517533,
|
| 48 |
+
"time_seconds": 154531,
|
| 49 |
+
"cost_usd": 38.16,
|
| 50 |
+
"mean_input_tokens_per_task": 1582367,
|
| 51 |
+
"mean_cache_tokens_per_task": 0,
|
| 52 |
+
"mean_output_tokens_per_task": 12667,
|
| 53 |
+
"mean_tokens_per_task": 1595035,
|
| 54 |
+
"mean_cost_usd_per_task": 0.08,
|
| 55 |
+
"mean_time_seconds_per_task": 309
|
| 56 |
}
|
| 57 |
}
|
src/leaderboard.py
CHANGED
|
@@ -7,6 +7,7 @@ from src.models import Result
|
|
| 7 |
RESULTS_DIR = Path(__file__).parent.parent / "results"
|
| 8 |
|
| 9 |
DISPLAY_BY_DEFAULT = [
|
|
|
|
| 10 |
"Dataset",
|
| 11 |
"Harness",
|
| 12 |
"Model",
|
|
@@ -43,6 +44,7 @@ def get_leaderboard_df():
|
|
| 43 |
for result in results:
|
| 44 |
rows.append(
|
| 45 |
{
|
|
|
|
| 46 |
"Dataset": f'[{result.dataset.name}]({result.dataset.url})',
|
| 47 |
"Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
|
| 48 |
"Model": result.model.name,
|
|
@@ -51,8 +53,10 @@ def get_leaderboard_df():
|
|
| 51 |
"Skills": str(result.harness.skills) if result.harness.skills else "None",
|
| 52 |
"Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',
|
| 53 |
"Score": result.metrics.score,
|
| 54 |
-
"Cost (USD)": result.metrics.
|
| 55 |
-
"
|
|
|
|
|
|
|
| 56 |
"Model License": "FOSS" if result.model.is_oss else "Proprietary",
|
| 57 |
"Harness License": "FOSS" if result.harness.is_oss else "Proprietary",
|
| 58 |
"Model Num Params (B)": result.model.num_params,
|
|
|
|
| 7 |
RESULTS_DIR = Path(__file__).parent.parent / "results"
|
| 8 |
|
| 9 |
DISPLAY_BY_DEFAULT = [
|
| 10 |
+
" ",
|
| 11 |
"Dataset",
|
| 12 |
"Harness",
|
| 13 |
"Model",
|
|
|
|
| 44 |
for result in results:
|
| 45 |
rows.append(
|
| 46 |
{
|
| 47 |
+
" ": "🟠" if result.model.is_oss and result.harness.is_oss else "🔶",
|
| 48 |
"Dataset": f'[{result.dataset.name}]({result.dataset.url})',
|
| 49 |
"Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
|
| 50 |
"Model": result.model.name,
|
|
|
|
| 53 |
"Skills": str(result.harness.skills) if result.harness.skills else "None",
|
| 54 |
"Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',
|
| 55 |
"Score": result.metrics.score,
|
| 56 |
+
"Avg Cost Per Task (USD)": result.metrics.mean_cost_usd_per_task,
|
| 57 |
+
"Avg Seconds Per Task": result.metrics.mean_time_seconds_per_task,
|
| 58 |
+
"Avg Input Tokens Per Task": result.metrics.mean_input_tokens_per_task,
|
| 59 |
+
"Avg Output Tokens Per Task": result.metrics.mean_output_tokens_per_task,
|
| 60 |
"Model License": "FOSS" if result.model.is_oss else "Proprietary",
|
| 61 |
"Harness License": "FOSS" if result.harness.is_oss else "Proprietary",
|
| 62 |
"Model Num Params (B)": result.model.num_params,
|
src/models.py
CHANGED
|
@@ -33,9 +33,22 @@ class Environment(BaseModel):
|
|
| 33 |
|
| 34 |
|
| 35 |
class Metrics(BaseModel):
|
|
|
|
| 36 |
score: float
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
class Result(BaseModel):
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
class Metrics(BaseModel):
|
| 36 |
+
|
| 37 |
score: float
|
| 38 |
+
n_tasks: Optional[int] = None
|
| 39 |
+
n_errors: Optional[int] = None
|
| 40 |
+
n_input_tokens: Optional[int] = None
|
| 41 |
+
n_cache_tokens: Optional[int] = None
|
| 42 |
+
n_output_tokens: Optional[int] = None
|
| 43 |
+
n_total_tokens: Optional[int] = None
|
| 44 |
+
time_seconds: Optional[int] = None
|
| 45 |
+
cost_usd: Optional[float] = None
|
| 46 |
+
mean_input_tokens_per_task: Optional[int] = None
|
| 47 |
+
mean_cache_tokens_per_task: Optional[int] = None
|
| 48 |
+
mean_output_tokens_per_task: Optional[int] = None
|
| 49 |
+
mean_tokens_per_task: Optional[int] = None
|
| 50 |
+
mean_cost_usd_per_task: Optional[float] = None
|
| 51 |
+
mean_time_seconds_per_task: Optional[int] = None
|
| 52 |
|
| 53 |
|
| 54 |
class Result(BaseModel):
|