Upload 4 files

fb35018 verified 3 months ago

6.25 kB

	import json
	import os
	import time
	import threading
	from dataclasses import dataclass, asdict

	import psutil
	import requests
	import torch
	from PIL import Image
	from transformers import AutoModelForImageTextToText, AutoProcessor


	# Model identifier handed to `from_pretrained` for both the processor and the model.
	MODEL_ID = "Dharunkumar9/SmolVLM-256M-Instruct-Agri"
	# Results file path: `benchmark_results.json` in the same directory as this script.
	OUT_JSON = os.path.join(os.path.dirname(__file__), "benchmark_results.json")
	# Image downloaded at runtime and reused for the warm-up and image-conditioned cases.
	SAMPLE_IMAGE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"


	@dataclass
	class CaseResult:
	    """Measurements recorded for one benchmark case.

	    Instances are produced by ``run_case`` and converted with ``asdict`` before
	    being written to the JSON report, so field names double as JSON keys.
	    """

	    # Label of the benchmark case (e.g. "image_short").
	    name: str
	    # Length of the tokenized prompt (second dimension of ``input_ids``).
	    input_tokens: int
	    # Number of tokens produced beyond the prompt.
	    generated_tokens: int
	    # Wall-clock time spent in generation, in seconds.
	    latency_s: float
	    # ``generated_tokens / latency_s``.
	    tokens_per_s: float
	    # Highest resident set size observed during generation, in MiB.
	    peak_rss_mb: float
	    # Truncated, single-line excerpt of the decoded output.
	    output_preview: str


	class MemoryMonitor:
	    """Context manager that records the peak RSS of a process while a block runs.

	    A daemon thread polls ``process.memory_info().rss`` every ``interval_s``
	    seconds between ``__enter__`` and ``__exit__``; the largest value seen is
	    exposed as ``max_rss`` (bytes, as reported by ``memory_info``).

	    Args:
	        process: Object exposing ``memory_info()`` whose result has an ``rss``
	            attribute (a ``psutil.Process`` in this script).
	        interval_s: Delay between two samples, in seconds.
	    """

	    # The annotation is quoted so it is not evaluated when the class is defined.
	    def __init__(self, process: "psutil.Process", interval_s: float = 0.01):
	        self.process = process
	        self.interval_s = interval_s
	        self._running = False
	        self._thread = None
	        self._stop = threading.Event()
	        self.max_rss = 0

	    def _sample(self):
	        """Read the current RSS and raise ``max_rss`` if it is a new maximum."""
	        rss = self.process.memory_info().rss
	        if rss > self.max_rss:
	            self.max_rss = rss

	    def _run(self):
	        """Sampling loop executed on the background thread."""
	        # Bind the event locally so a later __enter__ (which installs a fresh
	        # event) can never revive a sampler left over from a previous use.
	        stop = self._stop
	        while not stop.is_set():
	            self._sample()
	            # Event.wait returns as soon as the event is set, so shutdown does
	            # not have to sit out the remainder of the polling interval.
	            stop.wait(self.interval_s)

	    def __enter__(self):
	        """Reset the peak to the current RSS and start the sampling thread."""
	        self._stop = threading.Event()
	        self._running = True
	        self.max_rss = self.process.memory_info().rss
	        self._thread = threading.Thread(target=self._run, daemon=True)
	        self._thread.start()
	        return self

	    def __exit__(self, exc_type, exc, tb):
	        """Stop the sampling thread and take one last sample.

	        Exceptions raised inside the ``with`` body are not suppressed.
	        """
	        self._running = False
	        self._stop.set()
	        if self._thread is not None:
	            self._thread.join(timeout=1)
	        # Final reading: memory that grew after the last poll would otherwise
	        # be missing from the reported peak.
	        self._sample()


	def pick_device():
	    """Choose the compute device and matching dtype.

	    Returns:
	        A ``(device_name, torch_dtype)`` tuple. MPS is probed first, then CUDA;
	        CPU with float32 is the fallback.
	    """
	    # Ordered by preference: the first backend reporting availability wins.
	    backends = (
	        ("mps", torch.float16, torch.backends.mps.is_available),
	        ("cuda", torch.bfloat16, torch.cuda.is_available),
	    )
	    for device_name, dtype, is_available in backends:
	        if is_available():
	            return device_name, dtype
	    return "cpu", torch.float32


	def make_prompt(processor, text: str, with_image: bool):
	    """Render a single-turn user message through the processor's chat template.

	    Args:
	        processor: Object providing ``apply_chat_template``.
	        text: The user's text.
	        with_image: When true, an image placeholder entry is placed before the
	            text entry in the message content.

	    Returns:
	        Whatever ``processor.apply_chat_template`` returns for the conversation,
	        requested with ``add_generation_prompt=True``.
	    """
	    content = [{"type": "text", "text": text}]
	    if with_image:
	        # The image placeholder goes ahead of the text entry.
	        content.insert(0, {"type": "image"})
	    conversation = [{"role": "user", "content": content}]
	    return processor.apply_chat_template(conversation, add_generation_prompt=True)


	# The image annotation is a string so it is not evaluated at definition time;
	# this also keeps the ``X | None`` spelling valid on Python versions before 3.10.
	def prepare_inputs(processor, prompt: str, image: "Image.Image | None", device: str):
	    """Run the processor on a prompt (and optional image) and move tensors to *device*.

	    Args:
	        processor: Callable accepting ``text``, ``return_tensors`` and, when an
	            image is supplied, ``images``; its result must provide ``items()``.
	        prompt: Prompt text, already rendered through the chat template.
	        image: Image to attach, or ``None`` for a text-only prompt.
	        device: Target device passed to ``Tensor.to``.

	    Returns:
	        A plain ``dict`` of the processor outputs in which every tensor value has
	        been moved to *device*; non-tensor values are passed through unchanged.
	    """
	    kwargs = {"text": prompt, "return_tensors": "pt"}
	    if image is not None:
	        # The processor is given a list of images, even for a single one.
	        kwargs["images"] = [image]
	    inputs = processor(**kwargs)
	    return {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in inputs.items()}


	def _sync_device(device: str):
	    """Wait for queued accelerator work on *device* to finish (no-op on CPU)."""
	    if device.startswith("cuda"):
	        torch.cuda.synchronize()
	    elif device.startswith("mps") and hasattr(torch, "mps"):
	        torch.mps.synchronize()


	# The image annotation is a string so it is not evaluated at definition time.
	def run_case(model, processor, device: str, case_name: str, text: str, image: "Image.Image | None", max_new_tokens: int = 64):
	    """Generate once for a prompt and record latency, throughput and peak RSS.

	    Args:
	        model: Model exposing ``generate``.
	        processor: Processor used for templating, tokenizing and decoding.
	        device: Device the inputs are moved to.
	        case_name: Label stored in the result.
	        text: User text for the prompt.
	        image: Optional image; ``None`` runs a text-only prompt.
	        max_new_tokens: Generation budget passed to ``model.generate``.

	    Returns:
	        A ``CaseResult`` with timings rounded to three decimals, peak RSS in MiB
	        rounded to two decimals, and an output preview of at most 220 characters.
	    """
	    prompt = make_prompt(processor, text, with_image=image is not None)
	    inputs = prepare_inputs(processor, prompt, image, device)
	    input_tokens = int(inputs["input_ids"].shape[1])

	    proc = psutil.Process(os.getpid())
	    with MemoryMonitor(proc) as mon:
	        # Accelerator kernels may run asynchronously from the host; synchronize
	        # on both sides of the timer so it brackets only this generation.
	        _sync_device(device)
	        t0 = time.perf_counter()
	        with torch.inference_mode():
	            out = model.generate(
	                **inputs,
	                max_new_tokens=max_new_tokens,
	                do_sample=False,
	                use_cache=True,
	            )
	        _sync_device(device)
	        t1 = time.perf_counter()

	    latency = t1 - t0
	    # The returned sequences are sliced at the prompt length below, so
	    # everything past it is counted as newly generated.
	    generated_tokens = int(out.shape[1] - input_tokens)
	    tps = float(generated_tokens / latency) if latency > 0 else 0.0

	    decoded = processor.batch_decode(out[:, input_tokens:], skip_special_tokens=True)
	    # Flatten to a single line and cap the length to keep the report compact.
	    preview = (decoded[0] if decoded else "").strip().replace("\n", " ")[:220]

	    return CaseResult(
	        name=case_name,
	        input_tokens=input_tokens,
	        generated_tokens=generated_tokens,
	        latency_s=round(latency, 3),
	        tokens_per_s=round(tps, 3),
	        peak_rss_mb=round(mon.max_rss / (1024 * 1024), 2),
	        output_preview=preview,
	    )


	def main():
	    """Load the model, run the benchmark cases and write a JSON report.

	    Steps: pick a device, time the processor/model load, download the sample
	    image, run one warm-up generation, run each benchmark case, then write the
	    collected metrics to ``OUT_JSON`` and print them.

	    Raises:
	        requests.HTTPError: If the sample image download returns an error status.
	    """
	    # Imported here because they are only needed by this entry point.
	    import io

	    import transformers

	    process = psutil.Process(os.getpid())
	    rss_start_mb = process.memory_info().rss / (1024 * 1024)

	    device, dtype = pick_device()
	    print(f"Device: {device}, dtype: {dtype}")

	    t0 = time.perf_counter()
	    processor = AutoProcessor.from_pretrained(MODEL_ID)
	    model = AutoModelForImageTextToText.from_pretrained(
	        MODEL_ID,
	        torch_dtype=dtype,
	        low_cpu_mem_usage=True,
	        attn_implementation="eager",
	    ).to(device)
	    model.eval()
	    t1 = time.perf_counter()

	    load_time_s = t1 - t0
	    rss_after_load_mb = process.memory_info().rss / (1024 * 1024)

	    print("Downloading sample image...")
	    response = requests.get(SAMPLE_IMAGE_URL, timeout=30)
	    # Fail loudly on an HTTP error instead of handing an error page to PIL.
	    response.raise_for_status()
	    image = Image.open(io.BytesIO(response.content)).convert("RGB")

	    # Warm-up: one short generation whose result is discarded, so one-time
	    # start-up costs are not attributed to the first measured case.
	    run_case(
	        model,
	        processor,
	        device,
	        "warmup",
	        "Describe this image briefly.",
	        image,
	        max_new_tokens=16,
	    )

	    # Each case is (name, prompt text, image or None, max_new_tokens).
	    cases = [
	        ("text_only_short", "You are an agri assistant. Give 3 tips for identifying early leaf blight.", None, 64),
	        (
	            "image_short",
	            "What do you see in this image? Mention crop/plant clues if visible.",
	            image,
	            64,
	        ),
	        (
	            "image_long",
	            "Analyze this image for agriculture relevance. Return: 1) likely object/plant, 2) possible health indicators, 3) recommended next observation steps, 4) confidence from 0-1.",
	            image,
	            96,
	        ),
	    ]

	    results = []
	    for name, text, img, max_new_tokens in cases:
	        print(f"Running case: {name}")
	        results.append(asdict(run_case(model, processor, device, name, text, img, max_new_tokens=max_new_tokens)))

	    payload = {
	        "model_id": MODEL_ID,
	        "device": device,
	        "dtype": str(dtype),
	        "load_time_s": round(load_time_s, 3),
	        "rss_start_mb": round(rss_start_mb, 2),
	        "rss_after_load_mb": round(rss_after_load_mb, 2),
	        "model_num_parameters": int(model.num_parameters()),
	        "transformers_version": transformers.__version__,
	        "torch_version": torch.__version__,
	        "cases": results,
	    }

	    with open(OUT_JSON, "w", encoding="utf-8") as f:
	        json.dump(payload, f, indent=2)

	    print(f"Saved results to {OUT_JSON}")
	    print(json.dumps(payload, indent=2))


	# Run the benchmark only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()