✨ Add Qwen3.6 35b baseline
Browse files
- app.py +3 -1
- results/qwen3-6-35b-internal.json +26 -0
- results/qwen3-6-35b-nvfp4-claude-code.json +1 -1
- src/leaderboard.py +5 -1
- src/models.py +4 -4
app.py
CHANGED
|
@@ -39,6 +39,7 @@ def init_leaderboard(dataframe):
|
|
| 39 |
filter_columns=[
|
| 40 |
ColumnFilter(label="Dataset", column="dataset", type="checkboxgroup"),
|
| 41 |
ColumnFilter(label="Number of Parameters (B)", column="model_num_params", type="slider", min=0.5, max=150),
|
|
|
|
| 42 |
],
|
| 43 |
interactive=False,
|
| 44 |
)
|
|
@@ -50,8 +51,9 @@ with demo:
|
|
| 50 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 51 |
|
| 52 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 53 |
-
with gr.TabItem("🏅
|
| 54 |
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
|
|
|
| 55 |
|
| 56 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 57 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
| 39 |
filter_columns=[
|
| 40 |
ColumnFilter(label="Dataset", column="dataset", type="checkboxgroup"),
|
| 41 |
ColumnFilter(label="Number of Parameters (B)", column="model_num_params", type="slider", min=0.5, max=150),
|
| 42 |
+
ColumnFilter(label="Precision", column="precision", type="checkboxgroup"),
|
| 43 |
],
|
| 44 |
interactive=False,
|
| 45 |
)
|
|
|
|
| 51 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 52 |
|
| 53 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 54 |
+
with gr.TabItem("🏅 Coding Agent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 55 |
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 56 |
+
gr.Markdown("\* `internal` refers to internal benchmarks performed by the model provider where the harness/environment were not made public")
|
| 57 |
|
| 58 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 59 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
results/qwen3-6-35b-internal.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset": {
|
| 3 |
+
"name": "swe-bench-verified",
|
| 4 |
+
"repo": "SWE-bench/SWE-bench_Verified",
|
| 5 |
+
"num_tasks": 500
|
| 6 |
+
},
|
| 7 |
+
"harness": {
|
| 8 |
+
"name": "internal",
|
| 9 |
+
"skills": []
|
| 10 |
+
},
|
| 11 |
+
"model": {
|
| 12 |
+
"name": "Qwen3.6-35B-A3B",
|
| 13 |
+
"repo": "Qwen/Qwen3.6-35B-A3B",
|
| 14 |
+
"is_oss": true,
|
| 15 |
+
"num_params": 35,
|
| 16 |
+
"precision": "bf16"
|
| 17 |
+
},
|
| 18 |
+
"environment": {
|
| 19 |
+
"name": "internal"
|
| 20 |
+
},
|
| 21 |
+
"metrics": {
|
| 22 |
+
"score": 0.734,
|
| 23 |
+
"time": null,
|
| 24 |
+
"costUSD": null
|
| 25 |
+
}
|
| 26 |
+
}
|
results/qwen3-6-35b-nvfp4-claude-code.json
CHANGED
|
@@ -9,7 +9,7 @@
|
|
| 9 |
"skills": []
|
| 10 |
},
|
| 11 |
"model": {
|
| 12 |
-
"name": "Qwen3.6-35B-A3B
|
| 13 |
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 14 |
"is_oss": true,
|
| 15 |
"num_params": 35,
|
|
|
|
| 9 |
"skills": []
|
| 10 |
},
|
| 11 |
"model": {
|
| 12 |
+
"name": "Qwen3.6-35B-A3B",
|
| 13 |
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 14 |
"is_oss": true,
|
| 15 |
"num_params": 35,
|
src/leaderboard.py
CHANGED
|
@@ -9,11 +9,11 @@ RESULTS_DIR = Path(__file__).parent.parent / "results"
|
|
| 9 |
DISPLAY_BY_DEFAULT = [
|
| 10 |
"dataset",
|
| 11 |
"model",
|
|
|
|
| 12 |
"harness",
|
| 13 |
"skills",
|
| 14 |
"environment",
|
| 15 |
"score",
|
| 16 |
-
"costUSD",
|
| 17 |
]
|
| 18 |
|
| 19 |
SEARCH_COLUMNS = [
|
|
@@ -24,6 +24,8 @@ SEARCH_COLUMNS = [
|
|
| 24 |
|
| 25 |
|
| 26 |
def format_time(seconds: int):
|
|
|
|
|
|
|
| 27 |
m, s = divmod(seconds, 60)
|
| 28 |
h, m = divmod(m, 60)
|
| 29 |
return f"{h}h{m}m{s}s"
|
|
@@ -43,6 +45,8 @@ def get_leaderboard_df():
|
|
| 43 |
{
|
| 44 |
"dataset": result.dataset.name,
|
| 45 |
"model": result.model.name,
|
|
|
|
|
|
|
| 46 |
"harness": result.harness.name,
|
| 47 |
"skills": str(result.harness.skills) if result.harness.skills else "None",
|
| 48 |
"environment": result.environment.name,
|
|
|
|
| 9 |
DISPLAY_BY_DEFAULT = [
|
| 10 |
"dataset",
|
| 11 |
"model",
|
| 12 |
+
"precision",
|
| 13 |
"harness",
|
| 14 |
"skills",
|
| 15 |
"environment",
|
| 16 |
"score",
|
|
|
|
| 17 |
]
|
| 18 |
|
| 19 |
SEARCH_COLUMNS = [
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def format_time(seconds: int):
|
| 27 |
+
if seconds is None:
|
| 28 |
+
return None
|
| 29 |
m, s = divmod(seconds, 60)
|
| 30 |
h, m = divmod(m, 60)
|
| 31 |
return f"{h}h{m}m{s}s"
|
|
|
|
| 45 |
{
|
| 46 |
"dataset": result.dataset.name,
|
| 47 |
"model": result.model.name,
|
| 48 |
+
"model_id": result.model.repo,
|
| 49 |
+
"precision": result.model.precision,
|
| 50 |
"harness": result.harness.name,
|
| 51 |
"skills": str(result.harness.skills) if result.harness.skills else "None",
|
| 52 |
"environment": result.environment.name,
|
src/models.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import Any
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
|
@@ -24,13 +24,13 @@ class Model(BaseModel):
|
|
| 24 |
|
| 25 |
class Environment(BaseModel):
|
| 26 |
name: str
|
| 27 |
-
config: dict[str, Any]
|
| 28 |
|
| 29 |
|
| 30 |
class Metrics(BaseModel):
|
| 31 |
score: float
|
| 32 |
-
time: int
|
| 33 |
-
costUSD: float
|
| 34 |
|
| 35 |
|
| 36 |
class Result(BaseModel):
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
|
|
|
| 24 |
|
| 25 |
class Environment(BaseModel):
|
| 26 |
name: str
|
| 27 |
+
config: Optional[dict[str, Any]] = None
|
| 28 |
|
| 29 |
|
| 30 |
class Metrics(BaseModel):
|
| 31 |
score: float
|
| 32 |
+
time: Optional[int] = None
|
| 33 |
+
costUSD: Optional[float] = None
|
| 34 |
|
| 35 |
|
| 36 |
class Result(BaseModel):
|