# Provenance: Hugging Face upload by Elias-Schwegler via huggingface_hub
# (commit f16feb8, verified).
import os
import time
import torch
from vllm import LLM, SamplingParams
# Optimization flags: select the vLLM V1 engine before any vLLM import-time
# engine selection happens. Toggle this for A/B comparison against V0.
os.environ["VLLM_USE_V1"] = "1"

# Path to the locally mounted, ModelOpt-quantized model directory.
model_path = "/model"

print("--- OPTIMIZED BLACKWELL BENCHMARK ---")
def run_benchmark(enforce_eager=False):
    """Load the quantized model with vLLM and measure generation throughput.

    Prints the token count, wall-clock duration, and tokens-per-second for a
    single greedy generation, then releases the engine so a subsequent run
    (e.g. with a different ``enforce_eager`` setting) has VRAM available.

    Args:
        enforce_eager: If True, run the model in eager mode; if False
            (default), let vLLM capture CUDA graphs for faster decoding.
    """
    print(f"\n[Config] Enforce Eager: {enforce_eager}")
    llm = None
    try:
        llm = LLM(
            model=model_path,
            quantization="modelopt",
            trust_remote_code=True,
            tensor_parallel_size=1,
            gpu_memory_utilization=0.4,  # Further reduced to ensure graph capture success
            max_model_len=4096,  # Reduced to save VRAM for graphs
            enforce_eager=enforce_eager,
        )
        prompt = "Write a long, efficient Python script to solve the Traveling Salesperson Problem using dynamic programming."
        # temperature=0.0 -> greedy decoding, so repeated runs are comparable.
        sampling_params = SamplingParams(temperature=0.0, max_tokens=256)

        # perf_counter is monotonic and high-resolution — the right clock for
        # measuring elapsed time (time.time() can jump with clock adjustments).
        start_time = time.perf_counter()
        outputs = llm.generate([prompt], sampling_params)
        duration = time.perf_counter() - start_time

        total_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
        tps = total_tokens / duration if duration > 0 else 0
        print(f"Generated {total_tokens} tokens in {duration:.2f}s")
        print(f"Tokens per Second (TPS): {tps:.2f}")
    except Exception as e:
        # Best-effort benchmark: report the failure and keep the script alive
        # so any remaining configurations can still run.
        print(f"Benchmark run failed: {e}")
    finally:
        # Cleanup runs even when generation fails, so VRAM is freed for the
        # next run (the original only cleaned up on the success path).
        if llm is not None:
            del llm
        torch.cuda.empty_cache()
if __name__ == "__main__":
    # Skipping Eager Mode (Baseline already known)
    # Test: Optimized Mode (CUDA Graphs)
    print("\n--- Starting CUDA Graphs run (Optimized) ---")
    run_benchmark(enforce_eager=False)