Update inference examples to use the correct chat template
Hey there! 👋
I noticed that the current Python examples for `transformers` and `vllm` aren't using the chat template. It looks like these examples were written for the base model; since this is the Instruct version, skipping the model-specific formatting causes the model to generate unexpected or low-quality outputs.
I’ve updated the code snippets to use `apply_chat_template` so the prompts are formatted exactly how the model expects (handling the `<|im_start|>` tokens automatically). This should make the examples work much more smoothly for new users!
Thanks for releasing the model! 🚀
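
For anyone reviewing: a quick way to see what the template change actually does is to render the prompt as text rather than token ids. This is just an illustrative sketch; the exact rendered string is my assumption based on the `<|im_start|>`-style template mentioned above, not something I've verified against this checkpoint:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3-7B-Instruct")
message = [{"role": "user", "content": "Hello!"}]

# tokenize=False returns the formatted prompt as a string instead of token ids,
# so you can inspect exactly what the model will see.
prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected shape (assumed):
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```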
README.md
CHANGED

````diff
@@ -45,13 +45,13 @@ You can use OLMo with the standard HuggingFace transformers library:
 from transformers import AutoModelForCausalLM, AutoTokenizer
 olmo = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3-7B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3-7B-Instruct")
-message = ["Who would win in a fight - a dinosaur or a cow named Moo Moo?"]
-inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors='pt', return_dict=True)
 # optional verifying cuda
 # inputs = {k: v.to('cuda') for k,v in inputs.items()}
 # olmo = olmo.to('cuda')
 response = olmo.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
-print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
+print(tokenizer.decode(response[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
 >> 'This is a fun and imaginative question! Let’s break it down...'
 ```
 
@@ -184,8 +184,8 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
-prompt = "Who would win in a fight - a dinosaur or a cow named Moo Moo?"
-inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors='pt', return_dict=True).to(model.device)
 
 outputs = model.generate(
     **inputs,
@@ -194,7 +194,7 @@ outputs = model.generate(
     max_new_tokens=32768,
 )
 
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
 ```
 
 ### vllm Example
@@ -210,8 +210,8 @@ sampling_params = SamplingParams(
     max_tokens=32768,
 )
 
-prompt = "Who would win in a fight - a dinosaur or a cow named Moo Moo?"
-outputs = llm.generate(prompt, sampling_params)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+outputs = llm.chat(message, sampling_params)
 print(outputs[0].outputs[0].text)
 ```
````
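
One last note on the vllm change: `llm.chat` applies the tokenizer's chat template internally, so it should also handle multi-turn conversations without any manual formatting. A hypothetical extension of the example above (the follow-up messages are made up and not part of the diff; `llm` and `sampling_params` are as defined in the README's vllm example):

```python
# Hypothetical multi-turn conversation passed to llm.chat; the full history
# is rendered through the chat template before generation.
conversation = [
    {"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"},
    {"role": "assistant", "content": "Let's break it down..."},
    {"role": "user", "content": "What if Moo Moo has a jetpack?"},
]
outputs = llm.chat(conversation, sampling_params)
print(outputs[0].outputs[0].text)
```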