|
|
--- |
|
|
base_model: |
|
|
- Qwen/Qwen2.5-Coder-7B-Instruct |
|
|
library_name: transformers |
|
|
license: mit |
|
|
metrics: |
|
|
- accuracy |
|
|
pipeline_tag: text-generation |
|
|
--- |
|
|
|
|
|
<div align="center"> |
|
|
<h1 align="center"> |
|
|
Z1: Efficient Test-time Scaling with Code |
|
|
</h1> |
|
|
<p>Train Large Language Models to Reason with Shifted Thinking
|
|
</p> |
|
|
</div> |
|
|
<p align="center"> |
|
|
<a href="https://arxiv.org/abs/2504.00810"><b>[📜 Paper]</b></a> • |
|
|
<a href="https://huggingface.co/efficientscaling/Z1-7B"><b>[🤗 HF Models]</b></a> • |
|
|
<a href="https://github.com/efficientscaling/Z1"><b>[🐱 GitHub]</b></a> |
|
|
<!-- <a href="https://9557c5365a6f44dc84.gradio.live"><b>[🐯 Gradio Demo]</b></a> --> |
|
|
<br> |
|
|
|
|
|
<!-- <a href="#-quick-start">Quick Start</a> • --> |
|
|
<!-- <a href="#%EF%B8%8F-citation">Citation</a> --> |
|
|
</p> |
|
|
|
|
|
|
|
|
|
|
|
## Model Details |
|
|
To get started with the shifted thinking mode, please refer to https://github.com/efficientscaling/Z1.
|
|
|
|
|
## Evaluation |
|
|
|
|
|
<p align="left"> |
|
|
<img src="tts.png" width="800"> |
|
|
<br> |
|
|
<!-- <em>Test-time scaling comparison between Z1-7B and R1-Distill-Qwen-7B. </em> --> |
|
|
</p> |
|
|
|
|
|
<!-- ## Example |
|
|
|
|
|
<p align="center"> |
|
|
<img src="simple_reason.png" width="1200"> |
|
|
<br> |
|
|
<em>Simple Reasoning</em> |
|
|
<br> |
|
|
<img src="complex_reason.png" width="1200"> |
|
|
<br> |
|
|
<em>Complex Reasoning</em> |
|
|
<br> |
|
|
|
|
|
</p> --> |
|
|
|
|
|
## Gradio Demo |
|
|
|
|
|
```python |
|
|
import copy |
|
|
from typing import List |
|
|
from dataclasses import dataclass |
|
|
|
|
|
import gradio as gr |
|
|
from vllm import LLM, SamplingParams |
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
# LaTeX wrapper that the forced-answer suffix below asks the model to use.
BOX = r"\boxed{}"

# Suffixes that tell the model to stop reasoning and state its final answer.
# ANSWER_WITHOUT_BOX is appended to truncated reasoning traces by
# ThinkingLLM.thinking_generate; ANSWER_WITH_BOX is not referenced in this
# snippet.
ANSWER_WITH_BOX = f"\n\nI overthought it, the final answer in {BOX} should be:\n\n"
ANSWER_WITHOUT_BOX = "\n\nI overthought it, the final answer should be:\n\n"

# Model identifier passed to the engine as `model=`.
model_name = "efficientscaling/Z1-7B"
|
|
|
|
|
class ThinkingLLM(LLM):
    """``LLM`` subclass that adds a two-stage, budget-limited generation helper.

    The class is intentionally *not* a dataclass: it declares no fields, and
    ``@dataclass`` would only replace ``__eq__``/``__repr__`` and set
    ``__hash__`` to ``None``. Construction is inherited unchanged from ``LLM``.
    """

    def thinking_generate(self, prompts: List[str], sampling_params: SamplingParams = None, max_tokens_for_thinking: int = None):
        """Generate with a capped thinking budget, then force an answer if needed.

        Stage 1 runs ``self.generate`` with ``max_tokens`` temporarily set to
        ``max_tokens_for_thinking``. Every output whose ``finish_reason`` is
        ``'length'`` is treated as unfinished: its prompt, the generated text
        and ``ANSWER_WITHOUT_BOX`` are concatenated into a new prompt. Stage 2
        runs ``self.generate`` on those new prompts with the caller's original
        ``max_tokens`` and substitutes the results for the unfinished outputs.

        Args:
            prompts: Prompts to complete. A single string is also accepted and
                is treated as a one-element list.
            sampling_params: Sampling configuration; required. Its
                ``max_tokens`` is overridden during stage 1 and restored
                afterwards, even if generation raises.
            max_tokens_for_thinking: Token budget for stage 1. The value is
                assigned to ``sampling_params.max_tokens`` as-is, so ``None``
                is forwarded to the engine unchanged.

        Returns:
            A tuple ``(new_prompts, answers)``. ``new_prompts`` holds the
            stage-2 prompts (empty when nothing was truncated). ``answers``
            has one entry per input prompt, in input order: the stage-1 output
            when it finished on its own, otherwise the stage-2 output.

        Raises:
            ValueError: If ``sampling_params`` is ``None``.
        """
        if sampling_params is None:
            raise ValueError("Sampling_params can't be None!")

        # A bare string would otherwise be indexed character by character
        # when the continuation prompts are built below.
        if isinstance(prompts, str):
            prompts = [prompts]

        all_max_tokens = sampling_params.max_tokens
        print(f"All tokens: {all_max_tokens}")
        print(f"Tokens for thinking: {max_tokens_for_thinking}")

        # Stage 1: think under the reduced budget. The caller's object is
        # mutated in place, so always restore the original limit.
        sampling_params.max_tokens = max_tokens_for_thinking
        try:
            trajectories = self.generate(prompts, sampling_params)
        finally:
            sampling_params.max_tokens = all_max_tokens

        # Only list slots are replaced below, so a shallow copy is enough.
        answers = list(trajectories)

        unfinished_ids = []
        new_prompts = []
        for idx, traj in enumerate(trajectories):
            first = traj.outputs[0]
            if first.finish_reason == 'length':
                unfinished_ids.append(idx)
                new_prompts.append(prompts[idx] + first.text + ANSWER_WITHOUT_BOX)

        # Stage 2: continue the truncated traces with the full token budget.
        if new_prompts:
            print(new_prompts[0])
            forced = self.generate(
                new_prompts,
                sampling_params=sampling_params,
            )
            for idx, output in zip(unfinished_ids, forced):
                answers[idx] = output

        return new_prompts, answers
|
|
|
|
|
|
|
|
def generate_text(prompt, max_tokens, max_tokens_for_thinking, temperature, top_p):
    """Run shifted-thinking generation for one prompt and return display text.

    Args:
        prompt: The user's prompt string.
        max_tokens: Overall ``max_tokens`` given to ``SamplingParams``.
        max_tokens_for_thinking: Token budget for the first (thinking) stage.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        If the thinking stage was truncated: the forced-answer prompt (original
        prompt, partial reasoning and the answer cue), a blank line, and the
        final answer text. Otherwise only the generated text.
    """
    sampling_params = SamplingParams(
        temperature=temperature,
        # Coerce defensively in case the UI delivers slider values as floats.
        max_tokens=int(max_tokens),
        top_p=top_p,
        skip_special_tokens=False,
    )

    # thinking_generate indexes `prompts` per request, so the single prompt
    # must be wrapped in a list; passing the raw string would make
    # `prompts[0]` the first character only.
    forced_prompts, outputs = llm.thinking_generate(
        [prompt],
        sampling_params,
        max_tokens_for_thinking=int(max_tokens_for_thinking),
    )

    answer = outputs[0].outputs[0].text
    if forced_prompts:
        return forced_prompts[0] + '\n\n' + answer
    return answer
|
|
|
|
|
|
|
|
# Build the engine once at module load; generate_text() reads this global.
llm = ThinkingLLM(
    model=model_name, tensor_parallel_size=1, gpu_memory_utilization=0.96
)
|
|
|
|
|
|
|
|
# Gradio UI: the left column holds the prompt and sampling controls, the right
# column shows the generated text.
with gr.Blocks() as demo:
    gr.Markdown("# Reason with shifted thinking")

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Input",
                lines=5,
            )
            # Token budget for the first (thinking) stage.
            max_tokens_for_thinking_input = gr.Slider(
                label="shifted_thinking_window_size",
                minimum=1,
                # NOTE(review): 32786 may be a typo for 32768 - confirm.
                maximum=32786,
                value=4000,
                step=1,
            )
            # Overall max_tokens handed to SamplingParams.
            max_tokens_input = gr.Slider(
                label="all_max_tokens",
                minimum=1,
                maximum=32786,
                value=32786,
                step=1,
            )
            temperature_input = gr.Slider(
                label="Temperature",
                minimum=0,
                maximum=2.0,
                value=0,
                step=0.1,
            )
            top_p_input = gr.Slider(
                label="Top-p",
                minimum=0.0,
                maximum=1.0,
                value=1,
                step=0.01,
            )
            generate_button = gr.Button("Generate")

        with gr.Column():
            output_text = gr.Textbox(
                label="Shifted Thinking Window",
                placeholder="Text is here...",
                lines=10,
            )

    # Inputs are bound positionally, so their order must match
    # generate_text(prompt, max_tokens, max_tokens_for_thinking, temperature,
    # top_p). The overall budget therefore comes before the thinking budget.
    generate_button.click(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_tokens_input,
            max_tokens_for_thinking_input,
            temperature_input,
            top_p_input,
        ],
        outputs=output_text,
    )

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|
``` |
|
|
|
|
|
## Citation |
|
|
```bibtex
|
|
@misc{yu2025efficientscaling, |
|
|
title={Z1: Efficient Test-time Scaling with Code}, |
|
|
author={Zhaojian Yu and Yinghao Wu and Yilun Zhao and Arman Cohan and Xiao-Ping Zhang}, |
|
|
year={2025}, |
|
|
eprint={2504.00810}, |
|
|
archivePrefix={arXiv}, |
|
|
primaryClass={cs.CL}, |
|
|
url={https://arxiv.org/abs/2504.00810}, |
|
|
} |
|
|
``` |