zhilinw committed on
Commit
5598f1c
·
verified ·
1 Parent(s): 0eb926a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. report_generation.jsonl +4 -0
app.py CHANGED
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
111
  with gr.TabItem("Report Generation"):
112
  with gr.Row():
113
  with gr.Column(scale=7):
114
- gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 17 Dec 2025.")
115
 
116
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
117
  with gr.TabItem("Leaderboard"):
 
111
  with gr.TabItem("Report Generation"):
112
  with gr.Row():
113
  with gr.Column(scale=7):
114
+ gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 2 Jan 2026.")
115
 
116
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
117
  with gr.TabItem("Leaderboard"):
report_generation.jsonl CHANGED
@@ -47,3 +47,7 @@
47
  {"Model": "OpenAI/GPT-5.2 (xhigh)", "Category": "Closed-source Reasoning", "Overall": 55.1, "Physics": 42.7, "Chemistry": 72.6, "Finance": 35.6, "Consulting": 69.4, "Extraction": 47.3, "Reasoning": 57.8, "Style": 75.7, "Response Characters": 7010, "Input Tokens": 3191, "Output Tokens": 22965, "Cost": 52.34}
48
  {"Model": "Google/Gemini-3-Flash-Preview (Thinking)", "Category": "Closed-source Reasoning", "Overall": 53.4, "Physics": 37.8, "Chemistry": 72.1, "Finance": 37.2, "Consulting": 66.6, "Extraction": 45.9, "Reasoning": 52.1, "Style": 68.5, "Response Characters": 6766, "Input Tokens": 489, "Output Tokens": 9120, "Cost": 4.42}
49
  {"Model": "Google/Gemini-3-Flash-Preview", "Category": "Closed-source Instruct", "Overall": 50.2, "Physics": 35.2, "Chemistry": 64.4, "Finance": 35.8, "Consulting": 65.3, "Extraction": 43.1, "Reasoning": 50.2, "Style": 68.2, "Response Characters": 3989, "Input Tokens": 480, "Output Tokens": 1335, "Cost": 0.68}
 
 
 
 
 
47
  {"Model": "OpenAI/GPT-5.2 (xhigh)", "Category": "Closed-source Reasoning", "Overall": 55.1, "Physics": 42.7, "Chemistry": 72.6, "Finance": 35.6, "Consulting": 69.4, "Extraction": 47.3, "Reasoning": 57.8, "Style": 75.7, "Response Characters": 7010, "Input Tokens": 3191, "Output Tokens": 22965, "Cost": 52.34}
48
  {"Model": "Google/Gemini-3-Flash-Preview (Thinking)", "Category": "Closed-source Reasoning", "Overall": 53.4, "Physics": 37.8, "Chemistry": 72.1, "Finance": 37.2, "Consulting": 66.6, "Extraction": 45.9, "Reasoning": 52.1, "Style": 68.5, "Response Characters": 6766, "Input Tokens": 489, "Output Tokens": 9120, "Cost": 4.42}
49
  {"Model": "Google/Gemini-3-Flash-Preview", "Category": "Closed-source Instruct", "Overall": 50.2, "Physics": 35.2, "Chemistry": 64.4, "Finance": 35.8, "Consulting": 65.3, "Extraction": 43.1, "Reasoning": 50.2, "Style": 68.2, "Response Characters": 3989, "Input Tokens": 480, "Output Tokens": 1335, "Cost": 0.68}
50
+ {"Model": "Z-AI/GLM-4.7 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 46.6, "Physics": 29.7, "Chemistry": 62.6, "Finance": 25.9, "Consulting": 68.2, "Extraction": 38.6, "Reasoning": 46.2, "Style": 54.0, "Response Characters": 5428, "Input Tokens": 475, "Output Tokens": 20648, "Cost": 4.99}
51
+ {"Model": "Z-AI/GLM-4.7", "Category": "Open-weight Instruct", "Overall": 38.9, "Physics": 20.2, "Chemistry": 43.7, "Finance": 25.9, "Consulting": 65.9, "Extraction": 30.4, "Reasoning": 36.8, "Style": 49.8, "Response Characters": 5232, "Input Tokens": 469, "Output Tokens": 17476, "Cost": 4.22}
52
+ {"Model": "MiniMax/MiniMax-M2.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.3, "Physics": 29.4, "Chemistry": 48.3, "Finance": 26.2, "Consulting": 61.2, "Extraction": 34.1, "Reasoning": 42.3, "Style": 48.8, "Response Characters": 17288, "Input Tokens": 488, "Output Tokens": 12649, "Cost": 2.45}
53
+ {"Model": "MiniMax/MiniMax-M2.1", "Category": "Open-weight Instruct", "Overall": 41.6, "Physics": 30.6, "Chemistry": 43.4, "Finance": 27.6, "Consulting": 64.9, "Extraction": 32.5, "Reasoning": 41.7, "Style": 49.8, "Response Characters": 17482, "Input Tokens": 486, "Output Tokens": 8518, "Cost": 1.66}