| | import os |
| | import time |
| | import torch |
| | from vllm import LLM, SamplingParams |
| |
|
| | |
# Opt in to the vLLM V1 engine; must be set before any LLM() is constructed,
# since the engine reads this environment variable at startup.
os.environ["VLLM_USE_V1"] = "1"

# Path to the locally mounted model weights.
model_path = "/model"
print("--- OPTIMIZED BLACKWELL BENCHMARK ---")
| |
|
def run_benchmark(enforce_eager=False):
    """Load the model and time one greedy generation, printing throughput.

    Args:
        enforce_eager: If True, run the model eagerly (no CUDA graph
            capture); if False (default), allow CUDA graph optimization.

    Prints the generated token count, wall-clock duration, and tokens per
    second (TPS). Any failure during load or generation is caught and
    reported rather than propagated, so one bad configuration does not
    abort the whole benchmark script.
    """
    print(f"\n[Config] Enforce Eager: {enforce_eager}")
    try:
        llm = LLM(
            model=model_path,
            quantization="modelopt",
            trust_remote_code=True,
            tensor_parallel_size=1,
            # 0.4 leaves GPU headroom for other processes / a second run.
            gpu_memory_utilization=0.4,
            max_model_len=4096,
            enforce_eager=enforce_eager,
        )

        prompt = (
            "Write a long, efficient Python script to solve the Traveling "
            "Salesperson Problem using dynamic programming."
        )
        # temperature=0.0 -> greedy decoding, so runs are deterministic
        # and comparable across configurations.
        sampling_params = SamplingParams(temperature=0.0, max_tokens=256)

        # perf_counter() is monotonic and high-resolution; time.time() is
        # wall-clock and can jump (NTP adjustments), skewing the interval.
        start_time = time.perf_counter()
        outputs = llm.generate([prompt], sampling_params)
        duration = time.perf_counter() - start_time

        total_tokens = sum(len(out.outputs[0].token_ids) for out in outputs)
        tps = total_tokens / duration if duration > 0 else 0

        print(f"Generated {total_tokens} tokens in {duration:.2f}s")
        print(f"Tokens per Second (TPS): {tps:.2f}")

        # Drop the engine and release cached CUDA memory so a subsequent
        # benchmark run can allocate the GPU cleanly.
        del llm
        torch.cuda.empty_cache()

    except Exception as e:
        # Best-effort boundary: report the failure and let the script
        # continue with any remaining configurations.
        print(f"Benchmark run failed: {e}")
| |
|
if __name__ == "__main__":
    # Single benchmark pass with CUDA graphs enabled (enforce_eager=False),
    # i.e. the optimized configuration.
    print("\n--- Starting CUDA Graphs run (Optimized) ---")
    run_benchmark(enforce_eager=False)
| |
|