Add AIME 2026 evaluation result

by SaylorTwift HF Staff - opened Jun 3

base: refs/heads/main ← from: refs/pr/6

Discussion Files changed

+59 -140

Files changed (6) hide show

.eval_results/aime_2026.yaml +9 -0
README.md +7 -24
chat_template.jinja +41 -68
config.json +1 -1
model.safetensors +1 -1
tokenizer_config.json +0 -46

.eval_results/aime_2026.yaml ADDED Viewed

@@ -0,0 +1,9 @@

+- dataset:
+    id: MathArena/aime_2026
+    task_id: MathArena/aime_2026
+  value: 77.5
+  date: '2026-05-23'
+  notes: No tools
+  source:
+    url: https://huggingface.co/google/gemma-4-12B-it
+    name: Model Card

README.md CHANGED Viewed

@@ -16,8 +16,7 @@ base_model:
     <a href="https://huggingface.co/collections/google/gemma-4" target="_blank">Hugging Face</a> |
     <a href="https://github.com/google-gemma" target="_blank">GitHub</a> |
     <a href="https://blog.google/innovation-and-ai/technology/developers-tools/introducing-gemma-4-12B/" target="_blank">Launch Blog</a> |
-    <a href="https://ai.google.dev/gemma/docs/core" target="_blank">Documentation</a> |
-    <a href="https://arxiv.org/abs/2607.02770" target="_blank">Technical Report</a>
     <br>
     <b>License</b>: <a href="https://ai.google.dev/gemma/docs/gemma_4_license" target="_blank">Apache 2.0</a> | <b>Authors</b>: <a href="https://deepmind.google/models/gemma/" target="_blank">Google DeepMind</a>
 </p>
@@ -175,7 +174,7 @@ outputs = model.generate(**inputs, max_new_tokens=1024)
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
-processor.parse_response(response, prefix=inputs["input_ids"])
 ```
 To enable reasoning, set `enable_thinking=True` and the `parse_response` function will take care of parsing the thinking output.
@@ -235,7 +234,7 @@ outputs = model.generate(**inputs, max_new_tokens=512)
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
-processor.parse_response(response, prefix=inputs["input_ids"])
 ```
 </details>
@@ -293,7 +292,7 @@ outputs = model.generate(**inputs, max_new_tokens=512)
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
-processor.parse_response(response, prefix=inputs["input_ids"])
 ```
 </details>
@@ -352,7 +351,7 @@ outputs = model.generate(**inputs, max_new_tokens=512)
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
-processor.parse_response(response, prefix=inputs["input_ids"])
 ```
 </details>
@@ -386,7 +385,7 @@ Compared to Gemma 3, the models use standard `system`, `assistant`, and `user` r
 ### 3. Multi-Turn Conversations
-* **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final response. Thoughts from previous model turns must *not be added* before the next user turn begins, with the exception of tool call turns where thinking content should be preserved.
 ### 4. Modality order
@@ -525,20 +524,4 @@ The development of vision-language models (VLMs) raises several ethical concerns
 ### **Benefits**
-At the time of release, this family of models provides high-performance open vision-language model implementations designed from the ground up for responsible AI development compared to similarly sized models.
-## **Citation**
-If you find our work helpful, please consider citing it:
-```bibtex
-@misc{gemmateam2026gemma4,
-      title={Gemma 4 Technical Report},
-      author={Gemma Team},
-      year={2026},
-      eprint={2607.02770},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2607.02770},
-}
-```

     <a href="https://huggingface.co/collections/google/gemma-4" target="_blank">Hugging Face</a> |
     <a href="https://github.com/google-gemma" target="_blank">GitHub</a> |
     <a href="https://blog.google/innovation-and-ai/technology/developers-tools/introducing-gemma-4-12B/" target="_blank">Launch Blog</a> |
+    <a href="https://ai.google.dev/gemma/docs/core" target="_blank">Documentation</a>
     <br>
     <b>License</b>: <a href="https://ai.google.dev/gemma/docs/gemma_4_license" target="_blank">Apache 2.0</a> | <b>Authors</b>: <a href="https://deepmind.google/models/gemma/" target="_blank">Google DeepMind</a>
 </p>
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
+processor.parse_response(response)
 ```
 To enable reasoning, set `enable_thinking=True` and the `parse_response` function will take care of parsing the thinking output.
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
+processor.parse_response(response)
 ```
 </details>
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
+processor.parse_response(response)
 ```
 </details>
 response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
 # Parse output
+processor.parse_response(response)
 ```
 </details>
 ### 3. Multi-Turn Conversations
+* **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final response. Thoughts from previous model turns must *not be added* before the next user turn begins.
 ### 4. Modality order
 ### **Benefits**
+At the time of release, this family of models provides high-performance open vision-language model implementations designed from the ground up for responsible AI development compared to similarly sized models.

chat_template.jinja CHANGED Viewed

@@ -1,9 +1,3 @@
-{#
-  Template: Google Gemma 4 Canonical Chat Template
-  Author: Google Gemma Engineering Team
-  Published: 2026-07-09
-  Context: Fixed tool-calling loops, turn closures, and thinking content-ordering.
-#}
 {%- macro format_parameters(properties, required, filter_keys=false) -%}
     {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
     {%- set ns = namespace(found_first=false) -%}
@@ -122,9 +116,7 @@
     }
 {%- endmacro -%}
 {%- macro format_argument(argument, escape_keys=True) -%}
-    {%- if argument is none -%}
-        {{- 'null' -}}
-    {%- elif argument is string -%}
         {{- '<|"|>' + argument + '<|"|>' -}}
     {%- elif argument is boolean -%}
         {{- 'true' if argument else 'false' -}}
@@ -180,21 +172,18 @@
     {{- '<tool_response|>' -}}
 {%- endmacro -%}
-{#- ===== SETUP ===== -#}
-{%- set ns = namespace(prev_message_type=None, prev_non_tool_role=None) -%}
 {%- set loop_messages = messages -%}
-{%- set enable_thinking = enable_thinking | default(false) -%}
-{%- set preserve_thinking = preserve_thinking | default(false) -%}
 {{- bos_token -}}
 {#- Handle System/Tool Definitions Block -#}
-{%- if enable_thinking or tools or (messages and messages[0]['role'] in ['system', 'developer']) -%}
     {{- '<|turn>system\n' -}}
     {#- Inject Thinking token at the very top of the FIRST system turn -#}
-    {%- if enable_thinking -%}
         {{- '<|think|>\n' -}}
         {%- set ns.prev_message_type = 'think' -%}
     {%- endif -%}
-    {%- if messages and messages[0]['role'] in ['system', 'developer'] -%}
         {%- if messages[0]['content'] is string -%}
             {{- messages[0]['content'] | trim -}}
         {%- elif messages[0]['content'] is sequence -%}
@@ -228,22 +217,31 @@
     {%- if message['role'] != 'tool' -%}
     {%- set ns.prev_message_type = None -%}
     {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
-    {#- Detect continuation using tracked state — O(1) instead of O(n) backward scan -#}
-    {%- set continue_same_model_turn = (role == 'model' and ns.prev_non_tool_role == 'assistant') -%}
     {%- if not continue_same_model_turn -%}
         {{- '<|turn>' + role + '\n' }}
     {%- endif -%}
     {#- Render reasoning/reasoning_content as thinking channel -#}
     {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
-    {%- set thinking_gate = (loop.index0 > ns_turn.last_user_idx) or (preserve_thinking and message.get('tool_calls')) -%}
-    {%- if thinking_text and thinking_gate -%}
         {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
     {%- endif -%}
-            {%- if message.get('tool_calls') -%}
-                {%- for tool_call in message.get('tool_calls') -%}
                     {%- set function = tool_call['function'] -%}
                     {{- '<|tool_call>call:' + function['name'] + '{' -}}
                     {%- if function['arguments'] is mapping -%}
@@ -253,13 +251,8 @@
                             {%- set ns_args.found_first = true -%}
                             {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
                         {%- endfor -%}
-                    {%- elif function['arguments'] is none -%}
-                    {%- else -%}
-                        {{- raise_exception(
-                            "chat_template: tool_calls[].function.arguments must be a "
-                            "JSON object (mapping), not a string. Deserialize arguments "
-                            "before passing to the template."
-                        ) -}}
                     {%- endif -%}
                     {{- '}<tool_call|>' -}}
                 {%- endfor -%}
@@ -269,8 +262,8 @@
             {%- set ns_tr_out = namespace(flag=false) -%}
             {%- if message.get('tool_responses') -%}
                 {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
-                {%- for tool_response in message.get('tool_responses') -%}
-                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
                     {%- set ns_tr_out.flag = true -%}
                     {%- set ns.prev_message_type = 'tool_response' -%}
                 {%- endfor -%}
@@ -284,8 +277,8 @@
                     {%- else -%}
                         {%- set follow = loop_messages[k] -%}
                         {#- Resolve tool_call_id to function name -#}
-                        {%- set ns_tname = namespace(name=follow.get('name') or 'unknown') -%}
-                        {%- for tc in message.get('tool_calls') -%}
                             {%- if tc.get('id') == follow.get('tool_call_id') -%}
                                 {%- set ns_tname.name = tc['function']['name'] -%}
                             {%- endif -%}
@@ -303,9 +296,9 @@
                             {%- endfor -%}
                             {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
                             {%- for part in tool_body -%}
-                                {%- if part.get('type') in ['image', 'image_url'] -%}
                                     {{- '<|image|>' -}}
-                                {%- elif part.get('type') in ['audio', 'input_audio'] -%}
                                     {{- '<|audio|>' -}}
                                 {%- elif part.get('type') == 'video' -%}
                                     {{- '<|video|>' -}}
@@ -321,26 +314,29 @@
             {%- endif -%}
             {%- set captured_content -%}
-            {%- if message.get('content') is string -%}
                 {%- if role == 'model' -%}
                     {{- strip_thinking(message['content']) -}}
                 {%- else -%}
                     {{- message['content'] | trim -}}
                 {%- endif -%}
-            {%- elif message.get('content') is sequence -%}
                 {%- for item in message['content'] -%}
-                    {%- if item.get('type') == 'text' -%}
                         {%- if role == 'model' -%}
                             {{- strip_thinking(item['text']) -}}
                         {%- else -%}
                             {{- item['text'] | trim -}}
                         {%- endif -%}
-                    {%- elif item.get('type') in ['image', 'image_url'] -%}
                         {{- '<|image|>' -}}
-                    {%- elif item.get('type') in ['audio', 'input_audio'] -%}
                         {{- '<|audio|>' -}}
-                    {%- elif item.get('type') == 'video' -%}
                         {{- '<|video|>' -}}
                     {%- endif -%}
                 {%- endfor -%}
             {%- endif -%}
@@ -349,42 +345,19 @@
             {{- captured_content -}}
             {%- set has_content = captured_content | trim | length > 0 -%}
-        {#- Forward-scan: find next non-tool message role for continuation detection -#}
-        {%- set next_nt = namespace(role=None, found=false) -%}
-        {%- for j in range(loop.index0 + 1, loop_messages | length) -%}
-            {%- if not next_nt.found -%}
-                {%- if loop_messages[j]['role'] != 'tool' -%}
-                    {%- set next_nt.role = loop_messages[j]['role'] -%}
-                    {%- set next_nt.found = true -%}
-                {%- endif -%}
-            {%- endif -%}
-        {%- endfor -%}
-        {%- set continues_into_next = (
-            role == 'model'
-            and next_nt.role == 'assistant'
-            and (not message.get('tool_calls') or ns_tr_out.flag)
-        ) -%}
         {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
             {{- '<|tool_response>' -}}
-        {%- elif continues_into_next -%}
-        {%- elif not (ns_tr_out.flag and not has_content and not next_nt.found) -%}
             {{- '<turn|>\n' -}}
         {%- endif -%}
-    {#- Track previous non-tool role for next iteration (avoids O(n) backward scan) -#}
-    {%- set ns.prev_non_tool_role = message['role'] -%}
     {%- endif -%}
 {%- endfor -%}
 {%- if add_generation_prompt -%}
     {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
         {{- '<|turn>model\n' -}}
-        {%- if not enable_thinking -%}
             {{- '<|channel>thought\n<channel|>' -}}
         {%- endif -%}
-    {%- elif ns.prev_message_type == 'tool_response' and enable_thinking -%}
-        {{- '<|channel>thought\n' -}}
     {%- endif -%}
-{%- endif -%}

 {%- macro format_parameters(properties, required, filter_keys=false) -%}
     {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
     {%- set ns = namespace(found_first=false) -%}
     }
 {%- endmacro -%}
 {%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
         {{- '<|"|>' + argument + '<|"|>' -}}
     {%- elif argument is boolean -%}
         {{- 'true' if argument else 'false' -}}
     {{- '<tool_response|>' -}}
 {%- endmacro -%}
+{%- set ns = namespace(prev_message_type=None) -%}
 {%- set loop_messages = messages -%}
 {{- bos_token -}}
 {#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
     {{- '<|turn>system\n' -}}
     {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
         {{- '<|think|>\n' -}}
         {%- set ns.prev_message_type = 'think' -%}
     {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
         {%- if messages[0]['content'] is string -%}
             {{- messages[0]['content'] | trim -}}
         {%- elif messages[0]['content'] is sequence -%}
     {%- if message['role'] != 'tool' -%}
     {%- set ns.prev_message_type = None -%}
     {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
     {%- if not continue_same_model_turn -%}
         {{- '<|turn>' + role + '\n' }}
     {%- endif -%}
     {#- Render reasoning/reasoning_content as thinking channel -#}
     {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
         {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
     {%- endif -%}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
                     {%- set function = tool_call['function'] -%}
                     {{- '<|tool_call>call:' + function['name'] + '{' -}}
                     {%- if function['arguments'] is mapping -%}
                             {%- set ns_args.found_first = true -%}
                             {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
                         {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
                     {%- endif -%}
                     {{- '}<tool_call|>' -}}
                 {%- endfor -%}
             {%- set ns_tr_out = namespace(flag=false) -%}
             {%- if message.get('tool_responses') -%}
                 {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
                     {%- set ns_tr_out.flag = true -%}
                     {%- set ns.prev_message_type = 'tool_response' -%}
                 {%- endfor -%}
                     {%- else -%}
                         {%- set follow = loop_messages[k] -%}
                         {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
                             {%- if tc.get('id') == follow.get('tool_call_id') -%}
                                 {%- set ns_tname.name = tc['function']['name'] -%}
                             {%- endif -%}
                             {%- endfor -%}
                             {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
                             {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'image' -%}
                                     {{- '<|image|>' -}}
+                                {%- elif part.get('type') == 'audio' -%}
                                     {{- '<|audio|>' -}}
                                 {%- elif part.get('type') == 'video' -%}
                                     {{- '<|video|>' -}}
             {%- endif -%}
             {%- set captured_content -%}
+            {%- if message['content'] is string -%}
                 {%- if role == 'model' -%}
                     {{- strip_thinking(message['content']) -}}
                 {%- else -%}
                     {{- message['content'] | trim -}}
                 {%- endif -%}
+            {%- elif message['content'] is sequence -%}
                 {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
                         {%- if role == 'model' -%}
                             {{- strip_thinking(item['text']) -}}
                         {%- else -%}
                             {{- item['text'] | trim -}}
                         {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
                         {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
                         {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
                         {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
                     {%- endif -%}
                 {%- endfor -%}
             {%- endif -%}
             {{- captured_content -}}
             {%- set has_content = captured_content | trim | length > 0 -%}
         {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
             {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
             {{- '<turn|>\n' -}}
         {%- endif -%}
     {%- endif -%}
 {%- endfor -%}
 {%- if add_generation_prompt -%}
     {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
         {{- '<|turn>model\n' -}}
+        {%- if not enable_thinking | default(false) -%}
             {{- '<|channel>thought\n<channel|>' -}}
         {%- endif -%}
     {%- endif -%}
+{%- endif -%}

config.json CHANGED Viewed

@@ -106,7 +106,7 @@
       "sliding_attention",
       "full_attention"
     ],
-    "max_position_embeddings": 262144,
     "model_type": "gemma4_unified_text",
     "moe_intermediate_size": null,
     "num_attention_heads": 16,

       "sliding_attention",
       "full_attention"
     ],
+    "max_position_embeddings": 131072,
     "model_type": "gemma4_unified_text",
     "moe_intermediate_size": null,
     "num_attention_heads": 16,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5a84cb313260ac447237b890387116dfa8682e49a6b44bc585ae8353abbff18d
 size 23919549408

 version https://git-lfs.github.com/spec/v1
+oid sha256:366b79fc7e2ea81106d45e2b3ca10e144925f93dd9d456396692825ddb7bb788
 size 23919549408

tokenizer_config.json CHANGED Viewed

@@ -63,52 +63,6 @@
     "type": "object",
     "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
   },
-  "response_template": {
-    "defaults": {
-      "role": "assistant"
-    },
-    "fields": {
-      "content": {
-        "close": [
-          "<turn|>",
-          "<|tool_response>",
-          "<eos>"
-        ],
-        "content": "text"
-      },
-      "thinking": {
-        "close": "<channel|>",
-        "content": "text",
-        "open": "<|channel>thought\n"
-      },
-      "tool_calls": {
-        "close": "<tool_call|>",
-        "content": "json",
-        "content_args": {
-          "string_delims": [
-            [
-              "<|\"|>",
-              "<|\"|>"
-            ]
-          ],
-          "unquoted_keys": true
-        },
-        "open_pattern": "<\\|tool_call>call:(?P<name>\\w+)",
-        "repeats": true,
-        "transform": {
-          "function": {
-            "arguments": "{content}",
-            "name": "{name}"
-          },
-          "type": "function"
-        }
-      }
-    },
-    "start_anchor": [
-      "<|turn>model\n",
-      "<tool_response|>"
-    ]
-  },
   "soc_token": "<|channel>",
   "sot_token": "<|turn>",
   "stc_token": "<|tool_call>",

     "type": "object",
     "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
   },
   "soc_token": "<|channel>",
   "sot_token": "<|turn>",
   "stc_token": "<|tool_call>",