import os

# Force the V1 engine; setting this before vLLM spins up ensures it is picked up.
os.environ["VLLM_USE_V1"] = "1"

import torch
from vllm import LLM, SamplingParams

model_path = "/model"

print("--- Final Blackwell NVFP4 Code Test ---")

# Near-greedy sampling for code generation
sampling_params = SamplingParams(
    temperature=0.01,  # Almost greedy, keeps code output close to deterministic
    top_p=0.95,
    max_tokens=512,
)

try:
    llm = LLM(
        model=model_path,
        quantization="modelopt",      # NVFP4 checkpoint exported with NVIDIA ModelOpt
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.6,
        max_model_len=16384,
        enforce_eager=True,           # Skip CUDA graph capture for easier debugging
    )

    # Alpaca-style prompt with a specific coding task
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n"
        "Write a Python function called `is_prime(n)` that uses a loop to check "
        "if a number is prime. Then test it with 29.\n\n"
        "### Response:\n"
    )
    print(f"Prompt: {prompt}\n")

    outputs = llm.generate([prompt], sampling_params)

    for output in outputs:
        generated_text = output.outputs[0].text
        print("--- Model Output ---")
        print(generated_text)
        print("--- End of Output ---")

except Exception as e:
    print(f"CRITICAL ERROR: {e}")