Add optimization script

f5c4ba1 verified 25 days ago

25 kB

	#!/usr/bin/env python3
	"""
	BiRefNet Lite Optimization for Intel Xeon W-2145 (Skylake-SP)
	v2 — fixes: manual ONNX export, ONNX→OpenVINO conversion, INT8 quantization

	Target CPU: 8C/16T, AVX-512 (NO VNNI/BF16/AMX), 11MB L3
	"""

	import os, sys, time, json, warnings, gc
	from pathlib import Path
	import numpy as np
	import torch
	from PIL import Image
	from torchvision import transforms

	# Suppress every warning category so the benchmark log stays compact.
	warnings.filterwarnings("ignore")

	# --- Benchmark configuration -------------------------------------------
	MODEL_ID = "ZhengPeng7/BiRefNet_lite"
	# Each entry supplies the last two dimensions of the input tensor.
	RESOLUTIONS = [(1024, 1024), (512, 512)]
	WARMUP_RUNS = 3        # untimed calls before measuring
	BENCHMARK_RUNS = 10    # timed calls per configuration
	NUM_THREADS = 8
	OUTPUT_DIR = Path("/app/optimized_models")
	RESULTS_FILE = Path("/app/benchmark_results.json")

	# OpenMP / Intel OpenMP runtime tuning.
	# NOTE(review): these are exported after `torch` and `numpy` were imported;
	# some OpenMP runtimes read their settings when the library loads, so they
	# may only affect libraries initialised later -- confirm on the target host.
	os.environ.update({
	    "OMP_NUM_THREADS": str(NUM_THREADS),
	    "KMP_AFFINITY": "granularity=fine,compact,1,0",
	    "KMP_BLOCKTIME": "1",
	    "OMP_WAIT_POLICY": "ACTIVE",
	})

	OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

	def create_dummy_input(resolution, batch_size=1):
	    """Return a random tensor of shape (batch_size, 3, resolution[0], resolution[1])."""
	    dim_a, dim_b = resolution[0], resolution[1]
	    shape = (batch_size, 3, dim_a, dim_b)
	    return torch.randn(*shape)

	def benchmark_fn(fn, warmup=WARMUP_RUNS, runs=BENCHMARK_RUNS, label=""):
	    """Time repeated calls of ``fn`` and return summary latency statistics.

	    Args:
	        fn: Zero-argument callable to measure.
	        warmup: Number of untimed calls made before measuring.
	        runs: Number of timed calls; must be at least 1.
	        label: Name stored in the result and shown in the printed summary.

	    Returns:
	        dict with ``label``, ``mean_ms``, ``std_ms``, ``min_ms``, ``max_ms``,
	        ``median_ms`` and ``fps`` (each rounded to 2 decimals) plus ``runs``.

	    Raises:
	        ValueError: If ``runs`` is smaller than 1 (no statistics possible).
	    """
	    if runs < 1:
	        raise ValueError(f"runs must be >= 1, got {runs}")

	    for _ in range(warmup):
	        fn()

	    times = []
	    for _ in range(runs):
	        # Collect garbage before starting the clock so the collection itself
	        # is not part of the measured interval.
	        gc.collect()
	        t0 = time.perf_counter()
	        fn()
	        t1 = time.perf_counter()
	        times.append((t1 - t0) * 1000)  # seconds -> milliseconds

	    mean_ms = float(np.mean(times))
	    # A zero mean can only come from timer resolution; report infinite FPS
	    # rather than dividing by zero.
	    fps = 1000.0 / mean_ms if mean_ms > 0 else float("inf")

	    result = {
	        "label": label,
	        "mean_ms": round(mean_ms, 2),
	        "std_ms": round(float(np.std(times)), 2),
	        "min_ms": round(float(np.min(times)), 2),
	        "max_ms": round(float(np.max(times)), 2),
	        "median_ms": round(float(np.median(times)), 2),
	        "fps": round(fps, 2),
	        "runs": runs,
	    }
	    # Plain "|" separators: the previous "\|" was an invalid escape sequence
	    # that printed a literal backslash.
	    print(
	        f" [{label}] mean={result['mean_ms']:.1f}ms ± {result['std_ms']:.1f}ms | "
	        f"min={result['min_ms']:.1f}ms | fps={result['fps']:.2f}"
	    )
	    return result

	# Accumulates one entry per benchmarked backend/resolution pair.
	all_results = {
	    "model": MODEL_ID,
	    "target_cpu": "Intel Xeon W-2145 (Skylake-SP)",
	    "benchmarks": [],
	}

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 1: Load PyTorch model
	# ═══════════════════════════════════════════════════════════════════════════
	banner = "=" * 70
	print(banner)
	print("STEP 1: Loading BiRefNet Lite (PyTorch)")
	print(banner)

	from transformers import AutoModelForImageSegmentation

	# NOTE(review): trust_remote_code=True lets the model repository supply its
	# own Python code -- confirm MODEL_ID is a trusted source.
	model_pt = AutoModelForImageSegmentation.from_pretrained(MODEL_ID, trust_remote_code=True)
	model_pt.eval()
	model_pt = model_pt.float()

	# Count parameters and their storage size in a single pass.
	param_count = 0
	param_bytes = 0
	for p in model_pt.parameters():
	    n_elements = p.numel()
	    param_count += n_elements
	    param_bytes += n_elements * p.element_size()
	model_size_mb = param_bytes / 1024**2
	print(f" Parameters: {param_count:,}")
	print(f" Model size (FP32): {model_size_mb:.1f} MB")

	torch.set_num_threads(NUM_THREADS)

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 2: PyTorch Baseline
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 2: PyTorch FP32 Baseline")
	print("=" * 70)

	for res in RESOLUTIONS:
	    res_tag = f"{res[0]}x{res[1]}"
	    dummy = create_dummy_input(res)

	    # `d=dummy` binds the tensor at definition time, so the closure does not
	    # depend on the loop variable.
	    def pt_infer(d=dummy):
	        with torch.no_grad():
	            outputs = model_pt(d)
	            return outputs[-1].sigmoid()

	    result = benchmark_fn(pt_infer, label=f"PyTorch-FP32-{res_tag}")
	    result.update(resolution=res_tag, backend="pytorch_fp32")
	    all_results["benchmarks"].append(result)

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 3: Manual ONNX Export (static shapes for each resolution)
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 3: ONNX Export (static shape per resolution)")
	print("=" * 70)

	# Output directory for the exported .onnx files (the INT8 ONNX variants
	# produced later in the script are written here as well).
	onnx_dir = OUTPUT_DIR / "onnx"
	onnx_dir.mkdir(parents=True, exist_ok=True)

	# Adapter exposing only the final element of the wrapped model's output
	# (used as the segmentation mask by the export and conversion steps).
	class BiRefNetWrapper(torch.nn.Module):
	    """Wrap a model so ``forward`` returns only the last item of its output."""

	    def __init__(self, model):
	        super().__init__()
	        self.model = model

	    def forward(self, x):
	        outputs = self.model(x)
	        return outputs[-1]

	wrapper = BiRefNetWrapper(model_pt)
	wrapper.eval()

	# Maps a resolution tuple to the path (str) of its exported ONNX file.
	# Only resolutions whose export succeeded are present.
	onnx_models = {}
	for res in RESOLUTIONS:
	    onnx_path = onnx_dir / f"birefnet_lite_{res[0]}x{res[1]}.onnx"
	    print(f" Exporting ONNX for {res[0]}x{res[1]}...")
	    try:
	        dummy = create_dummy_input(res)
	        # No dynamic axes are declared, so the graph is traced for exactly
	        # this input shape.
	        torch.onnx.export(
	            wrapper,
	            dummy,
	            str(onnx_path),
	            input_names=["input_image"],
	            output_names=["output"],
	            opset_version=17,
	            do_constant_folding=True,
	        )
	        size_mb = onnx_path.stat().st_size / 1024**2
	        print(f" Exported: {onnx_path.name} ({size_mb:.1f} MB)")
	        onnx_models[res] = str(onnx_path)
	    except Exception as e:
	        print(f" ONNX export failed for {res[0]}x{res[1]}: {e}")
	        # Print the traceback like the other steps do, so export failures
	        # can actually be diagnosed.
	        import traceback
	        traceback.print_exc()
	        # Do not leave a truncated file behind that could be mistaken for a
	        # valid export.
	        if onnx_path.exists():
	            onnx_path.unlink()

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 3b: OpenVINO IR conversion (ONNX file if available, otherwise
	#          direct PyTorch conversion via ov.convert_model)
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 3b: OpenVINO Direct Conversion from PyTorch (ov.convert_model)")
	print("=" * 70)

	import openvino as ov
	core = ov.Core()

	ov_dir = OUTPUT_DIR / "openvino_fp32"
	ov_dir.mkdir(parents=True, exist_ok=True)
	# Maps a resolution tuple to the path (str) of its FP32 IR .xml file.
	ov_models = {}

	for res in RESOLUTIONS:
	    ir_file = ov_dir / f"birefnet_lite_{res[0]}x{res[1]}.xml"
	    ir_path = str(ir_file)
	    print(f" Converting PyTorch → OpenVINO IR for {res[0]}x{res[1]}...")
	    try:
	        # First try: ONNX → OpenVINO (if ONNX export succeeded)
	        if res in onnx_models:
	            print(f" Using ONNX path...")
	            ov_model = core.read_model(onnx_models[res])
	        else:
	            # Fallback: direct PyTorch → OpenVINO via ov.convert_model.
	            # The example tensor is only needed on this path.
	            print(f" Using direct PyTorch → OV conversion...")
	            dummy = create_dummy_input(res)
	            ov_model = ov.convert_model(wrapper, example_input=dummy)

	        # ov.save_model compresses weights to FP16 by default; disable that
	        # so this IR really is the FP32 baseline that the later FP32, FP16
	        # and INT8 steps start from.
	        ov.save_model(ov_model, ir_path, compress_to_fp16=False)
	        bin_size = ir_file.with_suffix(".bin").stat().st_size / 1024**2
	        print(f" IR saved: {bin_size:.1f} MB")
	        ov_models[res] = ir_path
	    except Exception as e:
	        print(f" Conversion failed for {res[0]}x{res[1]}: {e}")
	        import traceback; traceback.print_exc()

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 4: ONNX Runtime FP32 Benchmark
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 4: ONNX Runtime FP32 (Graph Optimized)")
	print("=" * 70)

	import onnxruntime as ort

	for res in RESOLUTIONS:
	    res_tag = f"{res[0]}x{res[1]}"
	    if res not in onnx_models:
	        print(f" Skipping {res_tag} — no ONNX model")
	        continue

	    try:
	        # One sequential session using NUM_THREADS intra-op threads.
	        sess_opts = ort.SessionOptions()
	        sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
	        sess_opts.inter_op_num_threads = 1
	        sess_opts.intra_op_num_threads = NUM_THREADS
	        sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

	        session = ort.InferenceSession(
	            onnx_models[res],
	            sess_opts,
	            providers=["CPUExecutionProvider"],
	        )
	        input_name = session.get_inputs()[0].name
	        dummy_np = create_dummy_input(res).numpy()

	        # Defaults bind the current session/input so the callable is
	        # self-contained.
	        def ort_infer(s=session, inp=input_name, d=dummy_np):
	            return s.run(None, {inp: d})

	        result = benchmark_fn(ort_infer, label=f"ONNX-RT-FP32-{res_tag}")
	        result["resolution"] = res_tag
	        result["backend"] = "onnxruntime_fp32"
	        all_results["benchmarks"].append(result)
	        del session
	    except Exception as e:
	        print(f" ONNX-RT FP32 {res_tag} failed: {e}")
	        import traceback
	        traceback.print_exc()

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 5: ONNX Runtime INT8 Dynamic Quantization
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 5: ONNX Runtime INT8 Dynamic Quantization")
	print("=" * 70)

	from onnxruntime.quantization import quantize_dynamic, QuantType

	# Maps a resolution tuple to the path (str) of its dynamically quantized
	# ONNX file. Only resolutions whose quantization succeeded are present.
	ort_int8_models = {}
	for res in RESOLUTIONS:
	    if res not in onnx_models:
	        continue

	    int8_path = onnx_dir / f"birefnet_lite_{res[0]}x{res[1]}_int8.onnx"
	    try:
	        print(f" Quantizing {res[0]}x{res[1]} to INT8 (dynamic)...")
	        quantize_dynamic(
	            model_input=onnx_models[res],
	            model_output=str(int8_path),
	            weight_type=QuantType.QInt8,
	            per_channel=True,
	            # The target CPU has AVX-512 but no VNNI. ONNX Runtime documents
	            # that 7-bit weights (reduce_range) may improve accuracy on
	            # non-VNNI machines, especially in per-channel mode, because the
	            # 8-bit multiply-add path can saturate there.
	            reduce_range=True,
	            extra_options={"DefaultTensorType": 1},  # FLOAT = 1, fixes shape inference
	        )
	        size_mb = int8_path.stat().st_size / 1024**2
	        print(f" INT8 model: {int8_path.name} ({size_mb:.1f} MB)")
	        ort_int8_models[res] = str(int8_path)
	    except Exception as e:
	        print(f" INT8 quantization failed for {res[0]}x{res[1]}: {e}")
	        import traceback; traceback.print_exc()

	# Benchmark the dynamically quantized INT8 ONNX models
	for res in RESOLUTIONS:
	    if res not in ort_int8_models:
	        continue
	    res_tag = f"{res[0]}x{res[1]}"
	    try:
	        # Same session settings as the FP32 ONNX Runtime benchmark.
	        sess_opts = ort.SessionOptions()
	        sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
	        sess_opts.inter_op_num_threads = 1
	        sess_opts.intra_op_num_threads = NUM_THREADS
	        sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

	        session = ort.InferenceSession(
	            ort_int8_models[res],
	            sess_opts,
	            providers=["CPUExecutionProvider"],
	        )
	        input_name = session.get_inputs()[0].name
	        dummy_np = create_dummy_input(res).numpy()

	        def ort_int8_infer(s=session, inp=input_name, d=dummy_np):
	            return s.run(None, {inp: d})

	        result = benchmark_fn(ort_int8_infer, label=f"ONNX-RT-INT8-{res_tag}")
	        result["resolution"] = res_tag
	        result["backend"] = "onnxruntime_int8_dynamic"
	        all_results["benchmarks"].append(result)
	        del session
	    except Exception as e:
	        print(f" ONNX-RT INT8 {res_tag} failed: {e}")

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 6: OpenVINO FP32 Benchmark (using pre-converted models)
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 6: OpenVINO FP32 Benchmark")
	print("=" * 70)

	for res in RESOLUTIONS:
	    res_tag = f"{res[0]}x{res[1]}"
	    if res not in ov_models:
	        print(f" Skipping {res_tag} — no OpenVINO model")
	        continue
	    try:
	        # Single-stream, latency-oriented CPU configuration.
	        ov_config = {
	            "PERFORMANCE_HINT": "LATENCY",
	            "NUM_STREAMS": "1",
	            "INFERENCE_NUM_THREADS": str(NUM_THREADS),
	        }
	        ov_model = core.read_model(ov_models[res])
	        compiled = core.compile_model(ov_model, "CPU", ov_config)
	        infer_req = compiled.create_infer_request()
	        dummy_np = create_dummy_input(res).numpy()

	        # The array is fed to the model input at index 0.
	        def ov_fp32_infer(req=infer_req, d=dummy_np):
	            return req.infer({0: d})

	        result = benchmark_fn(ov_fp32_infer, label=f"OpenVINO-FP32-{res_tag}")
	        result["resolution"] = res_tag
	        result["backend"] = "openvino_fp32"
	        all_results["benchmarks"].append(result)

	    except Exception as e:
	        print(f" OpenVINO FP32 {res_tag} failed: {e}")
	        import traceback
	        traceback.print_exc()

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 7: OpenVINO FP16 Weight Compression
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 7: OpenVINO FP16 Weight Compression")
	print("=" * 70)

	ov_fp16_dir = OUTPUT_DIR / "openvino_fp16"
	ov_fp16_dir.mkdir(parents=True, exist_ok=True)

	for res in RESOLUTIONS:
	    if res not in ov_models:
	        continue
	    res_tag = f"{res[0]}x{res[1]}"
	    try:
	        # Re-save the existing IR with FP16-compressed weights.
	        fp16_path = str(ov_fp16_dir / f"birefnet_lite_{res_tag}_fp16.xml")
	        ov_model = core.read_model(ov_models[res])
	        ov.save_model(ov_model, fp16_path, compress_to_fp16=True)

	        weights_file = Path(fp16_path.replace(".xml", ".bin"))
	        bin_size = weights_file.stat().st_size / 1024**2
	        print(f" {res_tag} FP16: {bin_size:.1f} MB")

	        ov_config = {
	            "PERFORMANCE_HINT": "LATENCY",
	            "NUM_STREAMS": "1",
	            "INFERENCE_NUM_THREADS": str(NUM_THREADS),
	        }
	        fp16_model = core.read_model(fp16_path)
	        compiled = core.compile_model(fp16_model, "CPU", ov_config)
	        infer_req = compiled.create_infer_request()
	        dummy_np = create_dummy_input(res).numpy()

	        def ov_fp16_infer(req=infer_req, d=dummy_np):
	            return req.infer({0: d})

	        result = benchmark_fn(ov_fp16_infer, label=f"OpenVINO-FP16-{res_tag}")
	        result["resolution"] = res_tag
	        result["backend"] = "openvino_fp16"
	        all_results["benchmarks"].append(result)
	    except Exception as e:
	        print(f" OpenVINO FP16 {res_tag} failed: {e}")

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 8: OpenVINO INT8 NNCF Post-Training Quantization
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 8: OpenVINO INT8 NNCF Post-Training Quantization")
	print("=" * 70)

	import nncf

	ov_int8_dir = OUTPUT_DIR / "openvino_int8"
	ov_int8_dir.mkdir(parents=True, exist_ok=True)

	# Number of synthetic calibration samples (also used as NNCF subset_size).
	NNCF_CALIBRATION_SAMPLES = 50


	def _make_calibration_batches(res, count):
	    """Return ``count`` synthetic calibration arrays for resolution ``res``.

	    Each array is a random uint8 image of size ``res`` pushed through
	    Resize -> ToTensor -> Normalize, with a leading batch dimension added.
	    NOTE(review): random noise is a weak stand-in for real calibration
	    images; the resulting activation ranges may not match real inputs.
	    """
	    height, width = res
	    preprocess = transforms.Compose([
	        transforms.Resize((height, width)),
	        transforms.ToTensor(),
	        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
	    ])
	    batches = []
	    for _ in range(count):
	        noise = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
	        batches.append(preprocess(Image.fromarray(noise)).unsqueeze(0).numpy())
	    return batches


	def _quantize_and_benchmark_int8(res):
	    """Quantize the FP32 IR for ``res`` with NNCF, save it and benchmark it.

	    Returns the benchmark result dict, tagged with resolution and backend.
	    Everything this needs is created locally, so one resolution never
	    depends on state left behind by another. The calibration arrays also
	    go out of scope on return instead of staying alive for the rest of
	    the script.
	    """
	    res_tag = f"{res[0]}x{res[1]}"
	    fp32_model = core.read_model(ov_models[res])

	    print(f" Generating calibration data ({NNCF_CALIBRATION_SAMPLES} synthetic images at {res_tag})...")
	    cal_data = _make_calibration_batches(res, NNCF_CALIBRATION_SAMPLES)
	    # Each calibration array is fed to the model input at index 0.
	    nncf_dataset = nncf.Dataset(cal_data, lambda data_item: {0: data_item})

	    print(f" Running NNCF INT8 quantization for {res_tag}...")
	    t0 = time.time()
	    quantized_model = nncf.quantize(
	        fp32_model,
	        nncf_dataset,
	        preset=nncf.QuantizationPreset.MIXED,
	        subset_size=NNCF_CALIBRATION_SAMPLES,
	        fast_bias_correction=True,
	    )
	    print(f" Quantization completed in {time.time() - t0:.1f}s")

	    int8_file = ov_int8_dir / f"birefnet_lite_{res_tag}_int8.xml"
	    ov.save_model(quantized_model, str(int8_file))
	    bin_size = int8_file.with_suffix(".bin").stat().st_size / 1024**2
	    print(f" INT8 model: {bin_size:.1f} MB")

	    # Release the calibration arrays before benchmarking.
	    del cal_data, nncf_dataset

	    ov_config = {
	        "PERFORMANCE_HINT": "LATENCY",
	        "NUM_STREAMS": "1",
	        "INFERENCE_NUM_THREADS": str(NUM_THREADS),
	    }
	    compiled_int8 = core.compile_model(core.read_model(str(int8_file)), "CPU", ov_config)
	    infer_req_int8 = compiled_int8.create_infer_request()
	    dummy_np = create_dummy_input(res).numpy()

	    def ov_int8_infer(req=infer_req_int8, d=dummy_np):
	        return req.infer({0: d})

	    result = benchmark_fn(ov_int8_infer, label=f"OpenVINO-INT8-{res_tag}")
	    result["resolution"] = res_tag
	    result["backend"] = "openvino_int8_nncf"
	    return result


	# Quantize and benchmark every resolution that has an FP32 IR; a failure at
	# one resolution does not prevent the others from running.
	for res in RESOLUTIONS:
	    if res not in ov_models:
	        continue
	    try:
	        all_results["benchmarks"].append(_quantize_and_benchmark_int8(res))
	    except Exception as e:
	        print(f" OpenVINO INT8 NNCF {res[0]}x{res[1]} failed: {e}")
	        import traceback; traceback.print_exc()

	# ═══════════════════════════════════════════════════════════════════════════
	# STEP 9: OpenVINO INT8 Weight-Only (no calibration needed)
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("STEP 9: OpenVINO INT8 Weight-Only Quantization")
	print("=" * 70)

	ov_int8wo_dir = OUTPUT_DIR / "openvino_int8wo"
	ov_int8wo_dir.mkdir(parents=True, exist_ok=True)

	for res in RESOLUTIONS:
	    if res not in ov_models:
	        continue
	    res_tag = f"{res[0]}x{res[1]}"
	    try:
	        # Compress weights of the FP32 IR with symmetric INT8.
	        ov_model = core.read_model(ov_models[res])
	        compressed = nncf.compress_weights(
	            ov_model,
	            mode=nncf.CompressWeightsMode.INT8_SYM,
	        )

	        wo_path = str(ov_int8wo_dir / f"birefnet_lite_{res_tag}_int8wo.xml")
	        ov.save_model(compressed, wo_path)
	        weights_file = Path(wo_path.replace(".xml", ".bin"))
	        bin_size = weights_file.stat().st_size / 1024**2
	        print(f" {res_tag} INT8-WO: {bin_size:.1f} MB")

	        wo_config = {
	            "PERFORMANCE_HINT": "LATENCY",
	            "NUM_STREAMS": "1",
	            "INFERENCE_NUM_THREADS": str(NUM_THREADS),
	        }
	        compiled = core.compile_model(core.read_model(wo_path), "CPU", wo_config)
	        infer_req = compiled.create_infer_request()
	        dummy_np = create_dummy_input(res).numpy()

	        def ov_int8wo_infer(req=infer_req, d=dummy_np):
	            return req.infer({0: d})

	        result = benchmark_fn(ov_int8wo_infer, label=f"OpenVINO-INT8wo-{res_tag}")
	        result["resolution"] = res_tag
	        result["backend"] = "openvino_int8_weight_only"
	        all_results["benchmarks"].append(result)
	    except Exception as e:
	        print(f" OpenVINO INT8-WO {res_tag} failed: {e}")

	# ═══════════════════════════════════════════════════════════════════════════
	# FINAL: Summary
	# ═══════════════════════════════════════════════════════════════════════════
	print("\n" + "=" * 70)
	print("FINAL RESULTS SUMMARY")
	print("=" * 70)

	# Persist the raw numbers before printing the table.
	with open(RESULTS_FILE, "w", encoding="utf-8") as f:
	    json.dump(all_results, f, indent=2)

	print(f"\n{'Backend':<40} {'Resolution':<12} {'Mean (ms)':<12} {'Min (ms)':<12} {'FPS':<10} {'Speedup':<10}")
	print("-" * 96)

	benchmarks = all_results["benchmarks"]

	# PyTorch FP32 mean latency per resolution; the reference for "Speedup".
	baselines = {
	    b["resolution"]: b["mean_ms"]
	    for b in benchmarks
	    if b["backend"] == "pytorch_fp32"
	}

	# Lowest positive mean latency per resolution, computed once up front
	# instead of rescanning every benchmark for every printed row.
	best_by_res = {}
	for b in benchmarks:
	    mean_ms = b["mean_ms"]
	    if mean_ms > 0 and mean_ms < best_by_res.get(b["resolution"], float("inf")):
	        best_by_res[b["resolution"]] = mean_ms

	for b in sorted(benchmarks, key=lambda x: (x["resolution"], x["mean_ms"])):
	    # Without a PyTorch baseline for this resolution the row is compared
	    # against itself (speedup 1.0).
	    baseline = baselines.get(b["resolution"], b["mean_ms"])
	    speedup = baseline / b["mean_ms"] if b["mean_ms"] > 0 else 0
	    best_for_res = best_by_res.get(b["resolution"], b["mean_ms"])
	    marker = " ★ BEST" if b["mean_ms"] == best_for_res else ""
	    print(f"{b['backend']:<40} {b['resolution']:<12} {b['mean_ms']:<12.1f} {b['min_ms']:<12.1f} {b['fps']:<10.2f} {speedup:<10.2f}{marker}")

	# Closing section: print a fixed deployment guide and the output locations.
	print("\n" + "=" * 70)
	print("OPTIMIZATION GUIDE FOR INTEL XEON W-2145")
	print("=" * 70)
	# NOTE(review): the guidance below is a static string; it is printed as-is
	# and is not derived from the benchmark numbers measured above.
	print("""
	CPU: Intel Xeon W-2145 (Skylake-SP)
	- 8 cores / 16 threads, 11 MB L3
	- AVX-512F/CD/BW/DQ/VL — NO VNNI, NO BF16, NO AMX
	- FP32 compute only; INT8 reduces memory bandwidth, not compute

	Recommended deployment:
	1. OpenVINO INT8 (NNCF) — best latency/throughput ratio
	2. Static input shape — eliminates dynamic dispatch overhead
	3. OMP_NUM_THREADS=8 (physical cores, avoid HT contention)
	4. KMP_AFFINITY=granularity=fine,compact,1,0
	5. NUM_STREAMS=1 for single-request latency optimization
	6. 512x512 resolution when quality allows (~4x faster than 1024x1024)

	Upgrade path for additional gains:
	- Cascade Lake (W-3200): VNNI → 2x more INT8 throughput
	- Sapphire Rapids (W-2400): AMX → 4-8x INT8/BF16 throughput
	""")

	# Tell the user where the generated artifacts were written.
	print(f"\nAll optimized models in: {OUTPUT_DIR}")
	print(f"Benchmark results in: {RESULTS_FILE}")
	print("Done!")