| | import os |
| | import time |
| | import torch |
| | from vllm import LLM, SamplingParams |
| |
|
| | |
# Opt in to the vLLM V1 engine; must be set before any LLM() is constructed,
# since the engine reads this environment variable at startup.
os.environ["VLLM_USE_V1"] = "1"

# Path to the locally mounted model weights.
model_path = "/model"
print("--- OPTIMIZED BLACKWELL BENCHMARK ---")
| |
|
def run_benchmark(enforce_eager=False):
    """Load the model and time one greedy generation, printing throughput.

    Args:
        enforce_eager: If True, run the model eagerly (no CUDA graph
            capture); if False (default), allow CUDA graph optimization.

    Prints the generated token count, wall-clock duration, and tokens per
    second (TPS). Any failure during load or generation is caught and
    reported rather than propagated, so one bad configuration does not
    abort the whole benchmark script.
    """
    print(f"\n[Config] Enforce Eager: {enforce_eager}")
    try:
        llm = LLM(
            model=model_path,
            quantization="modelopt",
            trust_remote_code=True,
            tensor_parallel_size=1,
            # 0.4 leaves GPU headroom for other processes / a second run.
            gpu_memory_utilization=0.4,
            max_model_len=4096,
            enforce_eager=enforce_eager,
        )

        prompt = (
            "Write a long, efficient Python script to solve the Traveling "
            "Salesperson Problem using dynamic programming."
        )
        # temperature=0.0 -> greedy decoding, so runs are deterministic
        # and comparable across configurations.
        sampling_params = SamplingParams(temperature=0.0, max_tokens=256)

        # perf_counter() is monotonic and high-resolution; time.time() is
        # wall-clock and can jump (NTP adjustments), skewing the interval.
        start_time = time.perf_counter()
        outputs = llm.generate([prompt], sampling_params)
        duration = time.perf_counter() - start_time

        total_tokens = sum(len(out.outputs[0].token_ids) for out in outputs)
        tps = total_tokens / duration if duration > 0 else 0

        print(f"Generated {total_tokens} tokens in {duration:.2f}s")
        print(f"Tokens per Second (TPS): {tps:.2f}")

        # Drop the engine and release cached CUDA memory so a subsequent
        # benchmark run can allocate the GPU cleanly.
        del llm
        torch.cuda.empty_cache()

    except Exception as e:
        # Best-effort boundary: report the failure and let the script
        # continue with any remaining configurations.
        print(f"Benchmark run failed: {e}")
| |
|
if __name__ == "__main__":
    # Single benchmark pass with CUDA graphs enabled (enforce_eager=False),
    # i.e. the optimized configuration.
    print("\n--- Starting CUDA Graphs run (Optimized) ---")
    run_benchmark(enforce_eager=False)
| |
|