import os
import time

import torch
from vllm import LLM, SamplingParams

# Optimization Flags
os.environ["VLLM_USE_V1"] = "1"  # Toggle this for comparison

model_path = "/model"

print("--- OPTIMIZED BLACKWELL BENCHMARK ---")


def run_benchmark(enforce_eager=False):
    print(f"\n[Config] Enforce Eager: {enforce_eager}")
    try:
        llm = LLM(
            model=model_path,
            quantization="modelopt",
            trust_remote_code=True,
            tensor_parallel_size=1,
            gpu_memory_utilization=0.4,  # Further reduced to ensure graph capture success
            max_model_len=4096,          # Reduced to save VRAM for graphs
            enforce_eager=enforce_eager,
        )

        prompt = "Write a long, efficient Python script to solve the Traveling Salesperson Problem using dynamic programming."
        sampling_params = SamplingParams(temperature=0.0, max_tokens=256)

        start_time = time.time()
        outputs = llm.generate([prompt], sampling_params)
        end_time = time.time()

        total_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
        duration = end_time - start_time
        tps = total_tokens / duration if duration > 0 else 0

        print(f"Generated {total_tokens} tokens in {duration:.2f}s")
        print(f"Tokens per Second (TPS): {tps:.2f}")

        # Cleanup to free VRAM for next run
        del llm
        torch.cuda.empty_cache()
    except Exception as e:
        print(f"Benchmark run failed: {e}")


if __name__ == "__main__":
    # Skipping Eager Mode (Baseline already known)
    # Test: Optimized Mode (CUDA Graphs)
    print("\n--- Starting CUDA Graphs run (Optimized) ---")
    run_benchmark(enforce_eager=False)