zhilinw committed on
Commit
c55b65d
·
verified ·
1 Parent(s): 4b71159

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. report_generation.jsonl +2 -0
app.py CHANGED
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
111
  with gr.TabItem("Report Generation"):
112
  with gr.Row():
113
  with gr.Column(scale=7):
114
- gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 29 May 2026.")
115
 
116
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
117
  with gr.TabItem("Leaderboard"):
 
111
  with gr.TabItem("Report Generation"):
112
  with gr.Row():
113
  with gr.Column(scale=7):
114
+ gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 3 Jun 2026.")
115
 
116
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
117
  with gr.TabItem("Leaderboard"):
report_generation.jsonl CHANGED
@@ -83,3 +83,5 @@
83
  {"Model": "Google/Gemini-3.5-Flash (Thinking)", "Category": "Closed-source Reasoning", "Overall": 56.0, "Physics": 42.7, "Chemistry": 71.5, "Finance": 35.2, "Consulting": 74.8, "Extraction": 51.0, "Reasoning": 56.7, "Style": 66.4, "Response Characters": 9102, "Input Tokens": 479, "Output Tokens": 17344, "Cost": 25.09}
84
  {"Model": "Google/Gemini-3.5-Flash", "Category": "Closed-source Instruct", "Overall": 54.5, "Physics": 40.3, "Chemistry": 70.9, "Finance": 35.3, "Consulting": 71.6, "Extraction": 50.8, "Reasoning": 56.0, "Style": 64.9, "Response Characters": 8379, "Input Tokens": 479, "Output Tokens": 2593, "Cost": 3.85}
85
  {"Model": "Anthropic/claude-opus-4.8 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 56.8, "Physics": 38.9, "Chemistry": 73.5, "Finance": 44.8, "Consulting": 70.0, "Extraction": 49.4, "Reasoning": 56.6, "Style": 62.1, "Response Characters": 5125, "Input Tokens": 731, "Output Tokens": 12625, "Cost": 51.08}
 
 
 
83
  {"Model": "Google/Gemini-3.5-Flash (Thinking)", "Category": "Closed-source Reasoning", "Overall": 56.0, "Physics": 42.7, "Chemistry": 71.5, "Finance": 35.2, "Consulting": 74.8, "Extraction": 51.0, "Reasoning": 56.7, "Style": 66.4, "Response Characters": 9102, "Input Tokens": 479, "Output Tokens": 17344, "Cost": 25.09}
84
  {"Model": "Google/Gemini-3.5-Flash", "Category": "Closed-source Instruct", "Overall": 54.5, "Physics": 40.3, "Chemistry": 70.9, "Finance": 35.3, "Consulting": 71.6, "Extraction": 50.8, "Reasoning": 56.0, "Style": 64.9, "Response Characters": 8379, "Input Tokens": 479, "Output Tokens": 2593, "Cost": 3.85}
85
  {"Model": "Anthropic/claude-opus-4.8 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 56.8, "Physics": 38.9, "Chemistry": 73.5, "Finance": 44.8, "Consulting": 70.0, "Extraction": 49.4, "Reasoning": 56.6, "Style": 62.1, "Response Characters": 5125, "Input Tokens": 731, "Output Tokens": 12625, "Cost": 51.08}
86
+ {"Model": "MiniMax/MiniMax-M3 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 44.4, "Physics": 20.2, "Chemistry": 57.4, "Finance": 33.9, "Consulting": 66.3, "Extraction": 38.9, "Reasoning": 42.8, "Style": 51.7, "Response Characters": 6904, "Input Tokens": 617, "Output Tokens": 26030, "Cost": 5.03}
87
+ {"Model": "MiniMax/MiniMax-M3", "Category": "Open-weight Instruct", "Overall": 39.2, "Physics": 18.7, "Chemistry": 43.7, "Finance": 31.8, "Consulting": 62.4, "Extraction": 33.2, "Reasoning": 36.9, "Style": 53.6, "Response Characters": 6827, "Input Tokens": 621, "Output Tokens": 18799, "Cost": 3.64}