Update README.md
Browse files
README.md
CHANGED
|
@@ -11,6 +11,8 @@ This is a low restriction, creative roleplay and conversational reasoning model
|
|
| 11 |
|
| 12 |
I have distilled and quantized the model using GPTQ 4-bit quantization (W4A16), meaning it can run on most GPUs.
|
| 13 |
|
|
|
|
|
|
|
| 14 |
Established by Staticaliza.
|
| 15 |
|
| 16 |
# vLLM: Usage Instructions
|
|
@@ -23,6 +25,14 @@ from vllm import LLM, SamplingParams
|
|
| 23 |
repo = snapshot_download(repo_id="Staticaliza/Reya-Reasoning", allow_patterns=["*.json", "*.bin", "*.safetensors"])
|
| 24 |
llm = LLM(model=repo, dtype="auto", tensor_parallel_size=torch.cuda.device_count(), enforce_eager=True, trust_remote_code=True)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
params = SamplingParams(
|
| 27 |
max_tokens=256,
|
| 28 |
temperature=1,
|
|
|
|
| 11 |
|
| 12 |
I have distilled and quantized the model using GPTQ 4-bit quantization (W4A16), meaning it can run on most GPUs.
|
| 13 |
|
| 14 |
+
This model uses the ChatML chat template and can think using "<think>" and "</think>" tokens.
|
| 15 |
+
|
| 16 |
Established by Staticaliza.
|
| 17 |
|
| 18 |
# vLLM: Usage Instructions
|
|
|
|
| 25 |
repo = snapshot_download(repo_id="Staticaliza/Reya-Reasoning", allow_patterns=["*.json", "*.bin", "*.safetensors"])
|
| 26 |
llm = LLM(model=repo, dtype="auto", tensor_parallel_size=torch.cuda.device_count(), enforce_eager=True, trust_remote_code=True)
|
| 27 |
|
| 28 |
+
# ChatML with think tokens is suggested
|
| 29 |
+
input = """<|im_start|>system
|
| 30 |
+
You are Reya.<|im_end|>
|
| 31 |
+
<|im_start|>user
|
| 32 |
+
Hi.<|im_end|>
|
| 33 |
+
<|im_start|>assistant
|
| 34 |
+
<think>"""
|
| 35 |
+
|
| 36 |
params = SamplingParams(
|
| 37 |
max_tokens=256,
|
| 38 |
temperature=1,
|