taagarwa committed on
Commit
b2a378e
·
1 Parent(s): 11eb494

✨ Add Qwen3.6 35b baseline

Browse files
app.py CHANGED
@@ -39,6 +39,7 @@ def init_leaderboard(dataframe):
39
  filter_columns=[
40
  ColumnFilter(label="Dataset", column="dataset", type="checkboxgroup"),
41
  ColumnFilter(label="Number of Parameters (B)", column="model_num_params", type="slider", min=0.5, max=150),
 
42
  ],
43
  interactive=False,
44
  )
@@ -50,8 +51,9 @@ with demo:
50
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
51
 
52
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
53
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
54
  leaderboard = init_leaderboard(LEADERBOARD_DF)
 
55
 
56
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
57
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
39
  filter_columns=[
40
  ColumnFilter(label="Dataset", column="dataset", type="checkboxgroup"),
41
  ColumnFilter(label="Number of Parameters (B)", column="model_num_params", type="slider", min=0.5, max=150),
42
+ ColumnFilter(label="Precision", column="precision", type="checkboxgroup"),
43
  ],
44
  interactive=False,
45
  )
 
51
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
52
 
53
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
54
+ with gr.TabItem("🏅 Coding Agent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
55
  leaderboard = init_leaderboard(LEADERBOARD_DF)
56
+ gr.Markdown("\* `internal` refers to internal benchmarks performed by the model provider where the harness/environment were not made public")
57
 
58
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
59
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
results/qwen3-6-35b-internal.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": {
3
+ "name": "swe-bench-verified",
4
+ "repo": "SWE-bench/SWE-bench_Verified",
5
+ "num_tasks": 500
6
+ },
7
+ "harness": {
8
+ "name": "internal",
9
+ "skills": []
10
+ },
11
+ "model": {
12
+ "name": "Qwen3.6-35B-A3B",
13
+ "repo": "Qwen/Qwen3.6-35B-A3B",
14
+ "is_oss": true,
15
+ "num_params": 35,
16
+ "precision": "bf16"
17
+ },
18
+ "environment": {
19
+ "name": "internal"
20
+ },
21
+ "metrics": {
22
+ "score": 0.734,
23
+ "time": null,
24
+ "costUSD": null
25
+ }
26
+ }
results/qwen3-6-35b-nvfp4-claude-code.json CHANGED
@@ -9,7 +9,7 @@
9
  "skills": []
10
  },
11
  "model": {
12
- "name": "Qwen3.6-35B-A3B-NVFP4",
13
  "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
14
  "is_oss": true,
15
  "num_params": 35,
 
9
  "skills": []
10
  },
11
  "model": {
12
+ "name": "Qwen3.6-35B-A3B",
13
  "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
14
  "is_oss": true,
15
  "num_params": 35,
src/leaderboard.py CHANGED
@@ -9,11 +9,11 @@ RESULTS_DIR = Path(__file__).parent.parent / "results"
9
  DISPLAY_BY_DEFAULT = [
10
  "dataset",
11
  "model",
 
12
  "harness",
13
  "skills",
14
  "environment",
15
  "score",
16
- "costUSD",
17
  ]
18
 
19
  SEARCH_COLUMNS = [
@@ -24,6 +24,8 @@ SEARCH_COLUMNS = [
24
 
25
 
26
  def format_time(seconds: int):
 
 
27
  m, s = divmod(seconds, 60)
28
  h, m = divmod(m, 60)
29
  return f"{h}h{m}m{s}s"
@@ -43,6 +45,8 @@ def get_leaderboard_df():
43
  {
44
  "dataset": result.dataset.name,
45
  "model": result.model.name,
 
 
46
  "harness": result.harness.name,
47
  "skills": str(result.harness.skills) if result.harness.skills else "None",
48
  "environment": result.environment.name,
 
9
  DISPLAY_BY_DEFAULT = [
10
  "dataset",
11
  "model",
12
+ "precision",
13
  "harness",
14
  "skills",
15
  "environment",
16
  "score",
 
17
  ]
18
 
19
  SEARCH_COLUMNS = [
 
24
 
25
 
26
  def format_time(seconds: int):
27
+ if seconds is None:
28
+ return None
29
  m, s = divmod(seconds, 60)
30
  h, m = divmod(m, 60)
31
  return f"{h}h{m}m{s}s"
 
45
  {
46
  "dataset": result.dataset.name,
47
  "model": result.model.name,
48
+ "model_id": result.model.repo,
49
+ "precision": result.model.precision,
50
  "harness": result.harness.name,
51
  "skills": str(result.harness.skills) if result.harness.skills else "None",
52
  "environment": result.environment.name,
src/models.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Any
2
 
3
  from pydantic import BaseModel
4
 
@@ -24,13 +24,13 @@ class Model(BaseModel):
24
 
25
  class Environment(BaseModel):
26
  name: str
27
- config: dict[str, Any]
28
 
29
 
30
  class Metrics(BaseModel):
31
  score: float
32
- time: int
33
- costUSD: float
34
 
35
 
36
  class Result(BaseModel):
 
1
+ from typing import Any, Optional
2
 
3
  from pydantic import BaseModel
4
 
 
24
 
25
  class Environment(BaseModel):
26
  name: str
27
+ config: Optional[dict[str, Any]] = None
28
 
29
 
30
  class Metrics(BaseModel):
31
  score: float
32
+ time: Optional[int] = None
33
+ costUSD: Optional[float] = None
34
 
35
 
36
  class Result(BaseModel):