Update README.md
Browse files
README.md
CHANGED
|
@@ -82,6 +82,7 @@ The `embedl-models` package is required; it provides the optimized FlashHead imp
|
|
| 82 |
---
|
| 83 |
|
| 84 |
## Usage Examples
|
|
|
|
| 85 |
|
| 86 |
### vLLM Inference
|
| 87 |
|
|
@@ -93,7 +94,7 @@ model_id = "embedl/Llama-3.2-3B-Instruct-FlashHead-W4A16"
|
|
| 93 |
|
| 94 |
if __name__ == "__main__":
|
| 95 |
sampling = SamplingParams(max_tokens=128, temperature=0.0)
|
| 96 |
-
llm = LLM(model=model_id, trust_remote_code=True)
|
| 97 |
|
| 98 |
prompt = "Write a haiku about coffee."
|
| 99 |
output = llm.generate([prompt], sampling)
|
|
@@ -116,7 +117,8 @@ model_id = "embedl/Llama-3.2-3B-Instruct-FlashHead-W4A16"
|
|
| 116 |
if __name__ == "__main__":
|
| 117 |
asyncio.run(
|
| 118 |
run_repl(
|
| 119 |
-
model=model_id
|
|
|
|
| 120 |
)
|
| 121 |
)
|
| 122 |
```
|
|
|
|
| 82 |
---
|
| 83 |
|
| 84 |
## Usage Examples
|
| 85 |
+
**Note (vLLM context length):** `max_model_len=131072` may fail on GPUs without enough free VRAM for the KV cache. If you see a KV cache memory error, lower `max_model_len`, or increase `gpu_memory_utilization` to give vLLM a larger share of GPU memory.
|
| 86 |
|
| 87 |
### vLLM Inference
|
| 88 |
|
|
|
|
| 94 |
|
| 95 |
if __name__ == "__main__":
|
| 96 |
sampling = SamplingParams(max_tokens=128, temperature=0.0)
|
| 97 |
+
llm = LLM(model=model_id, trust_remote_code=True, max_model_len=131072)
|
| 98 |
|
| 99 |
prompt = "Write a haiku about coffee."
|
| 100 |
output = llm.generate([prompt], sampling)
|
|
|
|
| 117 |
if __name__ == "__main__":
|
| 118 |
asyncio.run(
|
| 119 |
run_repl(
|
| 120 |
+
model=model_id,
|
| 121 |
+
max_model_len=131072
|
| 122 |
)
|
| 123 |
)
|
| 124 |
```
|