---
license: apache-2.0
---

This is a copy of Qwen3-8B compiled with Neuron SDK 2.25 for the Neuron workshop:

https://github.com/aws-neuron/neuron-workshops

This checkpoint was generated with the following code:

```python
import os

from vllm import LLM, SamplingParams

# Select the NeuronX Distributed Inference backend for vLLM.
os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"

# Batch size and sequence length the model is compiled for.
bs = 1
seqlength = 1024

path = f"/home/ubuntu/qwen3/qwen3-8B-BS{bs}-SEQ{seqlength}"
# Save the sharded weights and compiler artifacts in the same folder.
os.environ['NEURON_COMPILED_ARTIFACTS'] = path
os.environ['BASE_COMPILE_WORK_DIR'] = path

llm = LLM(
    model="/home/ubuntu/models/Qwen3-8B",
    max_num_seqs=bs,
    max_model_len=seqlength,
    device="neuron",
    tensor_parallel_size=2,
    override_neuron_config={"save_sharded_checkpoint": True})

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Note that top_k must be set lower than the global_top_k defined in
# neuronx_distributed_inference.models.config.OnDeviceSamplingConfig.
sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
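
On later runs, the saved artifacts can be reused so vLLM loads the sharded checkpoint and compiled graphs from `NEURON_COMPILED_ARTIFACTS` instead of recompiling. A minimal sketch, assuming this repository has been downloaded to the same `path` used above and that the original Hugging Face model is still available locally for the tokenizer and config (both paths are assumptions; adjust them to your setup):

```python
import os

from vllm import LLM, SamplingParams

os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"

# Assumed download location of this repository's precompiled artifacts.
os.environ['NEURON_COMPILED_ARTIFACTS'] = "/home/ubuntu/qwen3/qwen3-8B-BS1-SEQ1024"

# These settings must match those used at compile time:
# batch size 1, sequence length 1024, tensor parallel degree 2.
llm = LLM(
    model="/home/ubuntu/models/Qwen3-8B",
    max_num_seqs=1,
    max_model_len=1024,
    device="neuron",
    tensor_parallel_size=2)

sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)
output = llm.generate(["The capital of France is"], sampling_params)[0]
print(output.outputs[0].text)
```

Requesting a different batch size, sequence length, or tensor parallel degree than the checkpoint was compiled with will trigger a fresh compilation rather than reusing these artifacts.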