Update README.md

README.md CHANGED
@@ -31,8 +31,8 @@ Beyond standard competencies such as factual knowledge and conversational abilities
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

-model = AutoModelForCausalLM.from_pretrained("llm360/k2-v2", device_map="auto")
-tokenizer = AutoTokenizer.from_pretrained("llm360/k2-v2")
+model = AutoModelForCausalLM.from_pretrained("llm360/k2-v2-instruct", device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("llm360/k2-v2-instruct")

prompt = "Explain why the derivative of sin(x) is cos(x)."
messages = [
@@ -43,12 +43,41 @@ text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
-    add_generation_prompt=True
+    add_generation_prompt=True,
+    reasoning_effort="high"  # or "medium"/"low"
)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
+
+Alternatively, you can serve the model with vLLM:
+
+```
+vllm serve LLM360/K2-V2-Instruct --tensor-parallel-size 8 --port 8000 --revision "sft_final" &
+```
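+
+Once the server is up, you can confirm it is serving the model (an illustrative check, assuming the `openai` package; vLLM exposes an OpenAI-compatible API):
+
+```python
+from openai import OpenAI
+
+# Point the standard OpenAI client at the local vLLM server; any string works as the API key.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="key")
+
+# /v1/models lists the models the server has loaded.
+print([m.id for m in client.models.list()])
+```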
+
+K2-V2-Instruct reads `reasoning_effort="low"|"medium"|"high"` from the chat template to control how much reasoning the model performs. If you cannot call `tokenizer.apply_chat_template` yourself, you can pass the same argument through `chat_template_kwargs` in the request body (the OpenAI Python client sends this via `extra_body`):
+
+```
+curl -X POST "http://localhost:8000/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer key" \
+  -d '{
+    "model": "LLM360/K2-V2-Instruct",
+    "messages": [
+      {
+        "role": "user",
+        "content": "Explain why the derivative of sin(x) is cos(x)."
+      }
+    ],
+    "chat_template_kwargs": {
+      "reasoning_effort": "high"
+    }
+  }'
+```
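+
+The same request through the OpenAI Python client (a minimal sketch: `extra_body` is how this client forwards non-standard fields such as `chat_template_kwargs` into the request body):
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="key")
+
+response = client.chat.completions.create(
+    model="LLM360/K2-V2-Instruct",
+    messages=[
+        {"role": "user", "content": "Explain why the derivative of sin(x) is cos(x)."}
+    ],
+    # chat_template_kwargs is a vLLM extension; extra_body merges it into the JSON body.
+    extra_body={"chat_template_kwargs": {"reasoning_effort": "high"}},
+)
+print(response.choices[0].message.content)
+```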
+
---

## **Evaluation Summary**