"""
CLI for ternary quantization of HuggingFace models.
"""
from __future__ import annotations
import argparse
import gc
import json
import math
import sys
import time
from pathlib import Path
import torch
def cmd_catalog(args):
"""List the repo's known-good and known-probe model entries."""
from ternary_quant.toolkit import known_models_to_dict, list_known_models
entries = list_known_models(status=args.status, family=args.family)
if args.json:
payload = {
"status_filter": args.status,
"family_filter": args.family,
"models": known_models_to_dict(entries),
}
text = json.dumps(payload, indent=2)
if args.output:
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(text + "\n")
print(f"Wrote catalog to {output_path}")
return
print(text)
return
if not entries:
print("No models matched the requested filters.")
return
grouped: dict[str, list] = {}
for entry in entries:
grouped.setdefault(entry.status, []).append(entry)
for status, status_entries in grouped.items():
print(status.replace("_", " ").title())
for entry in status_entries:
print(
f" {entry.model_id:<40} family={entry.family:<18} "
f"path={entry.path:<8} runtime={entry.recommended_runtime}"
)
print(f" note: {entry.note}")
print(f" artifact: {entry.artifact}")
if args.show_commands and entry.quickstart_command:
print(f" quickstart: {entry.quickstart_command}")
print("")
def cmd_doctor(args):
"""Report environment readiness and runtime recommendations."""
from ternary_quant.toolkit import build_doctor_report, doctor_report_to_text
report = build_doctor_report()
if args.json:
text = json.dumps(report, indent=2)
if args.output:
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(text + "\n")
print(f"Wrote doctor report to {output_path}")
return
print(text)
return
print(doctor_report_to_text(report))
if args.output:
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(json.dumps(report, indent=2) + "\n")
print(f"\nWrote doctor report to {output_path}")
def cmd_quantize(args):
"""Quantize a HuggingFace model with the legacy full-ternary pipeline."""
from ternary_quant.pipeline import QuantizationConfig, quantize_model
from ternary_quant.storage import save_quantized_model
config = QuantizationConfig(
n_iter=args.n_iter,
use_activation_aware=not args.no_activation_aware,
block_size=args.block_size,
n_samples=args.n_samples,
seq_len=args.seq_len,
dataset=args.dataset,
dataset_config=args.dataset_config,
seed=args.seed,
)
if args.skip_modules:
config.skip_modules = args.skip_modules
result = quantize_model(
model_name_or_path=args.model,
config=config,
device=args.device,
dtype=_parse_dtype(args.dtype),
)
save_quantized_model(
ternary_params=result.ternary_params,
model_name=result.model_name,
model_config=result.model_config,
quant_config=result.config,
output_dir=args.output,
stats=result.stats,
)
if args.eval:
print("\nRunning perplexity evaluation...")
from ternary_quant.eval import evaluate_perplexity
from ternary_quant.inference import load_ternary_model
model, tokenizer = load_ternary_model(
args.output,
device=args.device,
runtime_mode=getattr(args, "runtime_mode", "packed"),
)
ppl = evaluate_perplexity(model, tokenizer, max_samples=args.eval_samples)
print(f"Ternary model perplexity: {ppl:.2f}")
def cmd_quantize_small(args):
"""Quantize a small model with the role-aware sparse asymmetric ternary path."""
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from ternary_quant.data import get_calibration_data
from ternary_quant.eval import (
evaluate_perplexity,
evaluate_prompt_bank,
get_default_prompt_bank,
)
from ternary_quant.inference import generate_text, load_ternary_model
from ternary_quant.quantizer_small import (
SmallModelQuantizationConfig,
build_sensitivity_only_plan,
build_role_aware_plan,
config_to_dict,
plan_to_dict,
quantize_small_model_inplace,
summarize_small_model_quantization,
tune_low_rank_residuals_inplace,
)
from ternary_quant.storage import save_quantized_model
device = _resolve_device(args.device)
dtype = _parse_dtype(args.dtype)
tokenizer = AutoTokenizer.from_pretrained(args.model)
model_config = AutoConfig.from_pretrained(args.model)
calibration_data = get_calibration_data(
args.model,
tokenizer=tokenizer,
n_samples=args.n_samples,
seq_len=args.seq_len,
dataset_name=args.dataset,
dataset_config=args.dataset_config,
seed=args.seed,
).to(device)
def load_base_model():
model = AutoModelForCausalLM.from_pretrained(
args.model,
torch_dtype=dtype,
low_cpu_mem_usage=True,
).to(device)
model.eval()
return model
def build_behavior_sequences(prompt_bank: dict) -> list[torch.Tensor]:
sequences = []
for sample in prompt_bank.get("samples", []):
prompt_ids = tokenizer(
sample["prompt"],
return_tensors="pt",
truncation=False,
)["input_ids"][0]
generated_ids = torch.tensor(
sample.get("generated_token_ids", []),
dtype=torch.long,
)
full_sequence = torch.cat([prompt_ids, generated_ids], dim=0).unsqueeze(0)
sequences.append(full_sequence)
return sequences
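    # The two cache builders below snapshot the FP16 teacher (hidden states /
    # top-k logits) and then free it, so low-rank tuning of the quantized
    # student never needs teacher and student resident at the same time.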
@torch.no_grad()
def build_hidden_cache(sequences):
if sequences is None:
return None
if isinstance(sequences, torch.Tensor):
model = load_base_model()
outputs = []
for start in range(0, sequences.shape[0], args.calibration_tune_batch_size):
batch = sequences[start : start + args.calibration_tune_batch_size]
hidden = model(batch, output_hidden_states=True).hidden_states[-1]
outputs.append(hidden.detach().cpu().to(torch.float16))
del model
_cleanup_device(device)
return torch.cat(outputs, dim=0)
outputs = []
model = load_base_model()
for seq in sequences:
hidden = model(seq.to(device), output_hidden_states=True).hidden_states[-1]
outputs.append(hidden.detach().cpu().to(torch.float16))
del model
_cleanup_device(device)
return outputs
@torch.no_grad()
def build_topk_logit_cache(sequences, top_k: int):
if sequences is None or top_k <= 0:
return None
top_k = max(1, int(top_k))
if isinstance(sequences, torch.Tensor):
model = load_base_model()
indices_out = []
logits_out = []
entropy_out = []
for start in range(0, sequences.shape[0], args.calibration_tune_batch_size):
batch = sequences[start : start + args.calibration_tune_batch_size]
logits = model(batch).logits[:, :-1, :].float()
values, indices = torch.topk(logits, k=min(top_k, logits.shape[-1]), dim=-1)
log_probs = torch.log_softmax(logits, dim=-1)
probs = log_probs.exp()
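                # Shannon entropy of each next-token distribution, normalized
                # to [0, 1] by log(vocab_size); max(..., 2) guards the
                # degenerate one-token vocabulary where log(1) == 0.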
entropy = -(probs * log_probs).sum(dim=-1) / math.log(max(logits.shape[-1], 2))
indices_out.append(indices.detach().cpu().to(torch.int32))
logits_out.append(values.detach().cpu().to(torch.float16))
entropy_out.append(entropy.detach().cpu().to(torch.float16))
del model
_cleanup_device(device)
return {
"indices": torch.cat(indices_out, dim=0),
"logits": torch.cat(logits_out, dim=0),
"entropy": torch.cat(entropy_out, dim=0),
}
outputs = []
model = load_base_model()
for seq in sequences:
logits = model(seq.to(device)).logits[:, :-1, :].float()
values, indices = torch.topk(logits, k=min(top_k, logits.shape[-1]), dim=-1)
log_probs = torch.log_softmax(logits, dim=-1)
probs = log_probs.exp()
entropy = -(probs * log_probs).sum(dim=-1) / math.log(max(logits.shape[-1], 2))
outputs.append(
{
"indices": indices.detach().cpu().to(torch.int32),
"logits": values.detach().cpu().to(torch.float16),
"entropy": entropy.detach().cpu().to(torch.float16),
}
)
del model
_cleanup_device(device)
return outputs
def make_config(
planner: str,
) -> SmallModelQuantizationConfig:
target_average_bits = args.target_average_bits
adaptive_salient = args.adaptive_salient
role_cost_weights = None
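        # Planner presets: "budgeted" and "sensitivity_budget" fall back to a
        # 10.5-bit full-model budget with adaptive salient fractions when no
        # explicit --target-average-bits is given; "practical" ignores budgets.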
if planner == "budgeted" and target_average_bits is None:
target_average_bits = 10.5
adaptive_salient = True
elif planner == "sensitivity_budget":
if target_average_bits is None:
target_average_bits = 10.5
adaptive_salient = True
role_cost_weights = _uniform_role_weights()
elif planner == "practical":
target_average_bits = None
config = SmallModelQuantizationConfig(
group_size=args.group_size,
n_iter=args.n_iter,
salient_fraction=args.salient_fraction,
min_salient_fraction=args.min_salient_fraction,
max_salient_fraction=args.max_salient_fraction,
adaptive_salient=adaptive_salient,
low_rank_rank=args.low_rank_rank,
adaptive_low_rank=args.adaptive_low_rank,
low_rank_chunk_rank=args.low_rank_chunk_rank,
low_rank_target_average_bits=args.low_rank_target_average_bits,
low_rank_fit_mode=args.low_rank_fit_mode,
low_rank_ridge=args.low_rank_ridge,
low_rank_max_samples=args.low_rank_max_samples,
n_boundary_layers=args.boundary_layers,
calibration_batch_size=args.calibration_batch_size,
quantize_attention_output=args.quantize_attention_output,
quantize_mlp_output=args.quantize_mlp_output,
target_average_bits=target_average_bits,
importance_threshold_scale=getattr(args, "importance_threshold_scale", 0.0),
role_cost_weights=role_cost_weights
if role_cost_weights is not None
else SmallModelQuantizationConfig().role_cost_weights,
)
config.base_config.n_samples = args.n_samples
config.base_config.seq_len = args.seq_len
config.base_config.dataset = args.dataset
config.base_config.dataset_config = args.dataset_config
config.base_config.seed = args.seed
return config
def build_plan(model, config: SmallModelQuantizationConfig, planner: str):
if planner == "sensitivity_budget":
return build_sensitivity_only_plan(model, calibration_data, config)
return build_role_aware_plan(model, calibration_data, config)
behavior_sequences = None
calibration_hidden_states = None
behavior_hidden_states = None
calibration_logit_targets = None
behavior_logit_targets = None
if args.calibration_tune_steps > 0 and args.behavior_tune_weight > 0.0:
behavior_prompt_bank = get_default_prompt_bank(
primary_prompt=args.prompt,
max_prompts=args.behavior_tune_prompt_count,
)
print("Building prompt-bank behavior tuning data...")
behavior_model = load_base_model()
behavior_reference = evaluate_prompt_bank(
behavior_model,
tokenizer,
prompts=behavior_prompt_bank,
max_new_tokens=args.behavior_tune_max_tokens,
)
behavior_sequences = build_behavior_sequences(behavior_reference)
del behavior_model
_cleanup_device(device)
if args.calibration_tune_steps > 0 and (
args.distill_weight > 0.0 or args.behavior_hidden_weight > 0.0
):
print("Building teacher hidden-state caches...")
calibration_hidden_states = build_hidden_cache(calibration_data)
if behavior_sequences is not None:
behavior_hidden_states = build_hidden_cache(behavior_sequences)
if args.calibration_tune_steps > 0 and (
args.logit_distill_weight > 0.0
or args.behavior_logit_weight > 0.0
or args.entropy_distill_weight > 0.0
or args.behavior_entropy_weight > 0.0
):
print("Building teacher top-k logit caches...")
calibration_logit_targets = build_topk_logit_cache(
calibration_data,
args.logit_distill_topk,
)
if behavior_sequences is not None:
behavior_logit_targets = build_topk_logit_cache(
behavior_sequences,
args.logit_distill_topk,
)
selection = None
auto_tuned = False
if args.planner in {"auto", "collapse_auto"}:
candidate_planners = ["practical", "sensitivity_budget"]
best = None
total_quant_time = 0.0
selection_metric = (
"collapse_aware" if args.planner == "collapse_auto" else "ppl"
)
selection = {
"selection_metric": selection_metric,
"candidate_scores": {},
}
selection_prompt_bank = get_default_prompt_bank(
primary_prompt=args.prompt,
max_prompts=args.selection_prompt_count,
)
reference_behavior = None
if selection_metric == "collapse_aware":
print("Measuring FP16 prompt-bank behavior...")
reference_model = load_base_model()
reference_behavior = evaluate_prompt_bank(
reference_model,
tokenizer,
prompts=selection_prompt_bank,
max_new_tokens=args.selection_max_tokens,
)
selection["reference_behavior"] = {
"avg_collapse_score": reference_behavior["avg_collapse_score"],
"worst_collapse_score": reference_behavior["worst_collapse_score"],
"avg_distinct_2": reference_behavior["avg_distinct_2"],
"avg_repeated_3gram_ratio": reference_behavior[
"avg_repeated_3gram_ratio"
],
}
del reference_model
_cleanup_device(device)
for planner in candidate_planners:
print(f"Evaluating planner candidate: {planner}")
model = load_base_model()
config = make_config(planner)
t0 = time.time()
plan = build_plan(model, config, planner)
result = quantize_small_model_inplace(
model,
calibration_data=calibration_data,
config=config,
plan=plan,
)
total_quant_time += time.time() - t0
summary = summarize_small_model_quantization(result, model)
tune_stats = None
if args.calibration_tune_steps > 0:
tune_stats = tune_low_rank_residuals_inplace(
model,
result,
calibration_data=calibration_data,
n_steps=args.calibration_tune_steps,
lr=args.calibration_tune_lr,
batch_size=args.calibration_tune_batch_size,
max_seq_len=args.seq_len,
behavior_sequences=behavior_sequences,
behavior_weight=args.behavior_tune_weight,
calibration_hidden_states=calibration_hidden_states,
behavior_hidden_states=behavior_hidden_states,
calibration_logit_targets=calibration_logit_targets,
behavior_logit_targets=behavior_logit_targets,
distill_weight=args.distill_weight,
behavior_hidden_weight=args.behavior_hidden_weight,
logit_distill_weight=args.logit_distill_weight,
behavior_logit_weight=args.behavior_logit_weight,
entropy_distill_weight=args.entropy_distill_weight,
behavior_entropy_weight=args.behavior_entropy_weight,
logit_distill_temperature=args.logit_distill_temperature,
seed=args.seed,
)
summary = summarize_small_model_quantization(result, model)
selection_ppl = evaluate_perplexity(
model,
tokenizer,
seq_len=args.seq_len,
max_samples=args.selection_eval_samples,
)
selection_score = float(selection_ppl)
selection_behavior = None
if selection_metric == "collapse_aware":
selection_behavior = evaluate_prompt_bank(
model,
tokenizer,
prompts=selection_prompt_bank,
max_new_tokens=args.selection_max_tokens,
)
reference_avg = (
0.0 if reference_behavior is None else reference_behavior["avg_collapse_score"]
)
reference_worst = (
reference_avg
if reference_behavior is None
else reference_behavior["worst_collapse_score"]
)
collapse_excess = max(
selection_behavior["avg_collapse_score"] - reference_avg,
0.0,
)
worst_excess = max(
selection_behavior["worst_collapse_score"] - reference_worst,
0.0,
)
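                # Collapse-aware selection: inflate the held-out PPL by how far
                # the candidate's prompt-bank collapse exceeds the FP16
                # reference.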
selection_score = selection_ppl * (
1.0
+ args.selection_collapse_weight * collapse_excess
+ args.selection_worst_weight * worst_excess
)
selection["candidate_scores"][planner] = {
"selection_ppl": selection_ppl,
"selection_score": selection_score,
"predicted_average_bits": plan.predicted_average_bits,
"full_model_effective_bits": summary["full_model_effective_bits"],
}
if selection_behavior is not None:
selection["candidate_scores"][planner]["selection_behavior"] = {
"avg_collapse_score": selection_behavior["avg_collapse_score"],
"worst_collapse_score": selection_behavior["worst_collapse_score"],
"avg_distinct_2": selection_behavior["avg_distinct_2"],
"avg_repeated_3gram_ratio": selection_behavior[
"avg_repeated_3gram_ratio"
],
}
if tune_stats is not None:
selection["candidate_scores"][planner]["calibration_tune"] = tune_stats
if best is None or selection_score < best["selection_score"]:
if best is not None:
del best["model"]
_cleanup_device(device)
best = {
"model": model,
"config": config,
"plan": plan,
"result": result,
"summary": summary,
"selection_ppl": selection_ppl,
"selection_score": selection_score,
"selection_behavior": selection_behavior,
"planner": planner,
}
else:
del model
_cleanup_device(device)
if best is None:
raise RuntimeError("Auto planner failed to select a candidate.")
model = best["model"]
config = best["config"]
plan = best["plan"]
result = best["result"]
summary = best["summary"]
quant_time = total_quant_time
selected_name = "RAST-collapse-auto" if args.planner == "collapse_auto" else "RAST-auto"
result.plan.method_name = selected_name
summary["method_name"] = selected_name
auto_tuned = args.calibration_tune_steps > 0
selection.update(
{
"selected_planner": best["planner"],
"selection_ppl": best["selection_ppl"],
"selection_score": best["selection_score"],
}
)
if best["selection_behavior"] is not None:
selection["selected_behavior"] = {
"avg_collapse_score": best["selection_behavior"]["avg_collapse_score"],
"worst_collapse_score": best["selection_behavior"]["worst_collapse_score"],
"avg_distinct_2": best["selection_behavior"]["avg_distinct_2"],
"avg_repeated_3gram_ratio": best["selection_behavior"][
"avg_repeated_3gram_ratio"
],
}
print(
f"Selected planner: {best['planner']} | "
f"held-out score {best['selection_score']:.2f} | "
f"PPL {best['selection_ppl']:.2f} | "
f"full-model bits {summary['full_model_effective_bits']:.2f}"
)
else:
model = load_base_model()
config = make_config(args.planner)
print("Building role-aware plan...")
t0 = time.time()
plan = build_plan(model, config, args.planner)
print(
f"Plan ready in {time.time() - t0:.1f}s | "
f"Predicted average bits: {plan.predicted_average_bits:.2f}"
)
print("Applying role-aware quantization...")
t1 = time.time()
result = quantize_small_model_inplace(
model,
calibration_data=calibration_data,
config=config,
plan=plan,
)
quant_time = time.time() - t1
summary = summarize_small_model_quantization(result, model)
if args.calibration_tune_steps > 0 and not auto_tuned:
print("Calibrating low-rank residuals...")
t2 = time.time()
tune_stats = tune_low_rank_residuals_inplace(
model,
result,
calibration_data=calibration_data,
n_steps=args.calibration_tune_steps,
lr=args.calibration_tune_lr,
batch_size=args.calibration_tune_batch_size,
max_seq_len=args.seq_len,
behavior_sequences=behavior_sequences,
behavior_weight=args.behavior_tune_weight,
calibration_hidden_states=calibration_hidden_states,
behavior_hidden_states=behavior_hidden_states,
calibration_logit_targets=calibration_logit_targets,
behavior_logit_targets=behavior_logit_targets,
distill_weight=args.distill_weight,
behavior_hidden_weight=args.behavior_hidden_weight,
logit_distill_weight=args.logit_distill_weight,
behavior_logit_weight=args.behavior_logit_weight,
entropy_distill_weight=args.entropy_distill_weight,
behavior_entropy_weight=args.behavior_entropy_weight,
logit_distill_temperature=args.logit_distill_temperature,
seed=args.seed,
)
quant_time += time.time() - t2
summary = summarize_small_model_quantization(result, model)
print(
f"Calibration tune complete | "
f"final loss {tune_stats.get('final_loss', float('nan')):.4f} | "
f"wrapped modules {tune_stats['n_wrapped_modules']}"
)
save_quantized_model(
ternary_params=result.quantized_params,
model_name=args.model,
model_config=model_config,
quant_config=config,
output_dir=args.output,
stats=result.stats,
summary=summary,
plan=result.plan,
method_name=result.plan.method_name,
)
report = {
"method": result.plan.method_name,
"model": args.model,
"quant_time_sec": quant_time,
"summary": summary,
"plan": plan_to_dict(result.plan),
"config": config_to_dict(config),
}
if selection is not None:
report["selection"] = selection
report_path = Path(args.output) / "role_aware_report.json"
with open(report_path, "w") as f:
json.dump(report, f, indent=2)
print(f"Wrote role-aware report to {report_path}")
if args.eval:
print("\nRunning validation on saved model...")
quantized_model, tokenizer = load_ternary_model(
args.output,
device=device,
runtime_mode=getattr(args, "runtime_mode", "packed"),
)
ppl = evaluate_perplexity(
quantized_model,
tokenizer,
seq_len=args.seq_len,
max_samples=args.eval_samples,
)
print(f"Role-aware quantized perplexity: {ppl:.2f}")
if args.prompt:
text = generate_text(
quantized_model,
tokenizer,
prompt=args.prompt,
max_new_tokens=args.max_tokens,
do_sample=False,
)
print(f"Prompt: {args.prompt}")
print(f"Output: {text}")
def cmd_quantize_ptq(args):
"""Quantize a small model via a ternary PTQ family or controller."""
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from ternary_quant.data import get_calibration_data
from ternary_quant.eval import (
evaluate_perplexity,
evaluate_prompt_bank,
get_default_prompt_bank,
)
from ternary_quant.inference import generate_text, load_ternary_model
from ternary_quant.ptq_families import (
build_family_config,
family_config_to_dict,
get_default_family_candidates,
quantize_family_inplace,
summarize_family_quantization,
)
from ternary_quant.storage import save_quantized_model
device = _resolve_device(args.device)
dtype = _parse_dtype(args.dtype)
tokenizer = AutoTokenizer.from_pretrained(args.model)
model_config = AutoConfig.from_pretrained(args.model)
calibration_data = get_calibration_data(
args.model,
tokenizer=tokenizer,
n_samples=args.n_samples,
seq_len=args.seq_len,
dataset_name=args.dataset,
dataset_config=args.dataset_config,
seed=args.seed,
).to(device)
def load_base_model():
model = AutoModelForCausalLM.from_pretrained(
args.model,
torch_dtype=dtype,
low_cpu_mem_usage=True,
).to(device)
model.eval()
return model
def build_config(family_name: str):
return build_family_config(
family_name,
target_average_bits=args.target_average_bits,
group_size=args.group_size,
n_iter=args.n_iter,
n_boundary_layers=args.boundary_layers,
calibration_batch_size=args.calibration_batch_size,
quantize_attention_output=args.quantize_attention_output,
quantize_mlp_output=args.quantize_mlp_output,
)
selection = None
if args.family == "controller":
candidate_names = (
args.candidate_families
if args.candidate_families
else get_default_family_candidates()
)
selection_metric = args.selection_metric
selection_prompt_bank = get_default_prompt_bank(
primary_prompt=args.prompt,
max_prompts=args.selection_prompt_count,
)
selection = {
"selection_metric": selection_metric,
"candidate_scores": {},
}
reference_behavior = None
if selection_metric == "collapse":
print("Measuring FP16 prompt-bank behavior for controller selection...")
reference_model = load_base_model()
reference_behavior = evaluate_prompt_bank(
reference_model,
tokenizer,
prompts=selection_prompt_bank,
max_new_tokens=args.selection_max_tokens,
)
selection["reference_behavior"] = {
"avg_collapse_score": reference_behavior["avg_collapse_score"],
"worst_collapse_score": reference_behavior["worst_collapse_score"],
"avg_distinct_2": reference_behavior["avg_distinct_2"],
"avg_repeated_3gram_ratio": reference_behavior[
"avg_repeated_3gram_ratio"
],
}
del reference_model
_cleanup_device(device)
best = None
total_quant_time = 0.0
for family_name in candidate_names:
print(f"Evaluating ternary PTQ family candidate: {family_name}")
family_config = build_config(family_name)
model = load_base_model()
t0 = time.time()
result = quantize_family_inplace(
model,
calibration_data=calibration_data,
config=family_config,
)
total_quant_time += time.time() - t0
summary = summarize_family_quantization(result)
selection_ppl = evaluate_perplexity(
model,
tokenizer,
seq_len=args.seq_len,
max_samples=args.selection_eval_samples,
)
selection_score = float(selection_ppl)
selection_behavior = None
if selection_metric == "collapse":
selection_behavior = evaluate_prompt_bank(
model,
tokenizer,
prompts=selection_prompt_bank,
max_new_tokens=args.selection_max_tokens,
)
reference_avg = (
0.0
if reference_behavior is None
else reference_behavior["avg_collapse_score"]
)
reference_worst = (
reference_avg
if reference_behavior is None
else reference_behavior["worst_collapse_score"]
)
collapse_excess = max(
selection_behavior["avg_collapse_score"] - reference_avg,
0.0,
)
worst_excess = max(
selection_behavior["worst_collapse_score"] - reference_worst,
0.0,
)
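                # Same collapse-aware score as quantize-small's auto planner:
                # held-out PPL inflated by collapse excess over the FP16
                # reference; a bit-budget overshoot penalty follows below.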
selection_score = selection_ppl * (
1.0
+ args.selection_collapse_weight * collapse_excess
+ args.selection_worst_weight * worst_excess
)
if args.target_average_bits is not None:
bits_excess = max(
summary["full_model_effective_bits"] - args.target_average_bits,
0.0,
)
selection_score *= (
1.0
+ args.selection_bits_weight
* bits_excess
/ max(args.target_average_bits, 1e-6)
)
selection["candidate_scores"][family_name] = {
"label": family_config.label,
"selection_ppl": selection_ppl,
"selection_score": selection_score,
"full_model_effective_bits": summary["full_model_effective_bits"],
"quantized_fraction": summary["quantized_fraction"],
}
if selection_behavior is not None:
selection["candidate_scores"][family_name]["selection_behavior"] = {
"avg_collapse_score": selection_behavior["avg_collapse_score"],
"worst_collapse_score": selection_behavior["worst_collapse_score"],
"avg_distinct_2": selection_behavior["avg_distinct_2"],
"avg_repeated_3gram_ratio": selection_behavior[
"avg_repeated_3gram_ratio"
],
}
if best is None or selection_score < best["selection_score"]:
if best is not None:
del best["model"]
_cleanup_device(device)
best = {
"model": model,
"family_config": family_config,
"result": result,
"summary": summary,
"selection_ppl": selection_ppl,
"selection_score": selection_score,
"selection_behavior": selection_behavior,
"family_name": family_name,
}
else:
del model
_cleanup_device(device)
if best is None:
raise RuntimeError("Controller failed to select a ternary PTQ family.")
model = best["model"]
family_config = best["family_config"]
result = best["result"]
summary = best["summary"]
quant_time = total_quant_time
result.plan.method_name = "Ternary-PTQ-auto"
summary["method_name"] = "Ternary-PTQ-auto"
summary["selected_family_preset"] = best["family_name"]
selection.update(
{
"selected_family_preset": best["family_name"],
"selected_family_label": family_config.label,
"selection_ppl": best["selection_ppl"],
"selection_score": best["selection_score"],
}
)
if best["selection_behavior"] is not None:
selection["selected_behavior"] = {
"avg_collapse_score": best["selection_behavior"]["avg_collapse_score"],
"worst_collapse_score": best["selection_behavior"]["worst_collapse_score"],
"avg_distinct_2": best["selection_behavior"]["avg_distinct_2"],
"avg_repeated_3gram_ratio": best["selection_behavior"][
"avg_repeated_3gram_ratio"
],
}
print(
f"Selected family: {best['family_name']} | "
f"held-out score {best['selection_score']:.2f} | "
f"PPL {best['selection_ppl']:.2f} | "
f"full-model bits {summary['full_model_effective_bits']:.2f}"
)
else:
family_config = build_config(args.family)
print(f"Applying ternary PTQ family: {family_config.label}")
model = load_base_model()
t0 = time.time()
result = quantize_family_inplace(
model,
calibration_data=calibration_data,
config=family_config,
)
quant_time = time.time() - t0
summary = summarize_family_quantization(result)
save_quantized_model(
ternary_params=result.quantized_params,
model_name=args.model,
model_config=model_config,
quant_config=family_config,
output_dir=args.output,
stats=result.stats,
summary=summary,
plan=result.plan,
method_name=result.plan.method_name,
)
report = {
"method": result.plan.method_name,
"model": args.model,
"quant_time_sec": quant_time,
"summary": summary,
"family_config": family_config_to_dict(family_config),
}
if selection is not None:
report["selection"] = selection
report_path = Path(args.output) / "ternary_ptq_report.json"
with open(report_path, "w") as f:
json.dump(report, f, indent=2)
print(f"Wrote ternary PTQ report to {report_path}")
if args.eval:
print("\nRunning validation on saved model...")
quantized_model, tokenizer = load_ternary_model(
args.output,
device=device,
runtime_mode=getattr(args, "runtime_mode", "packed"),
)
ppl = evaluate_perplexity(
quantized_model,
tokenizer,
seq_len=args.seq_len,
max_samples=args.eval_samples,
)
print(f"Ternary PTQ perplexity: {ppl:.2f}")
if args.prompt:
text = generate_text(
quantized_model,
tokenizer,
prompt=args.prompt,
max_new_tokens=args.max_tokens,
do_sample=False,
)
print(f"Prompt: {args.prompt}")
print(f"Output: {text}")
def cmd_eval(args):
"""Evaluate perplexity of a saved quantized model."""
from ternary_quant.eval import evaluate_perplexity
from ternary_quant.inference import load_ternary_model
model, tokenizer = load_ternary_model(
args.model_dir,
device=args.device,
runtime_mode=getattr(args, "runtime_mode", "packed"),
)
ppl = evaluate_perplexity(
model,
tokenizer,
seq_len=args.seq_len,
max_samples=args.max_samples,
)
print(f"\nPerplexity: {ppl:.2f}")
def cmd_compare(args):
"""Compare original and saved quantized model."""
from ternary_quant.eval import compare_models
compare_models(
original_model_name=args.original,
ternary_model_dir=args.ternary,
device=args.device,
seq_len=args.seq_len,
max_samples=args.max_samples,
)
def cmd_generate(args):
"""Generate text with a saved quantized model."""
import numpy as np
from ternary_quant.generative_adapters import inspect_generative_model
from ternary_quant.inference import (
generate_generative_output,
generate_text,
load_ternary_model,
)
model, asset = load_ternary_model(
args.model_dir,
device=args.device,
runtime_mode=getattr(args, "runtime_mode", "packed"),
)
model_info = inspect_generative_model(
model,
model_name=str(getattr(model, "name_or_path", "loaded-model")),
)
image = None
if args.image_path:
try:
from PIL import Image
except Exception as exc:
raise RuntimeError(
"Reading --image-path requires Pillow. Install pillow or omit the image."
) from exc
image = np.array(Image.open(args.image_path).convert("RGB"))
if model_info.model_family == "image_text_to_text":
output = generate_generative_output(
model,
asset,
prompt=args.prompt,
max_new_tokens=args.max_tokens,
image=image,
)
else:
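        # Text-only path; temperature 0 falls back to greedy decoding because
        # do_sample is derived from the temperature below.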
output = generate_text(
model,
asset,
prompt=args.prompt,
max_new_tokens=args.max_tokens,
temperature=args.temperature,
do_sample=args.temperature > 0,
)
print(f"\nPrompt: {args.prompt}")
print(f"Output: {output}")
def cmd_inspect_generative(args):
"""Inspect a generative model and list its quantizable components."""
from ternary_quant.generative_adapters import (
generative_model_info_to_dict,
load_generative_model,
)
device = _resolve_device(args.device)
dtype = _parse_dtype(args.dtype)
model, _, model_info = load_generative_model(
args.model,
device=device,
dtype=dtype,
)
print(f"Model: {model_info.model_name}")
print(f"Family: {model_info.model_family}")
print(f"Model type: {model_info.model_type}")
print(f"Architectures: {', '.join(model_info.architectures) or 'unknown'}")
print(f"Default broad components: {', '.join(model_info.default_quantization_components)}")
print("\nComponents:")
for component in model_info.components:
sample = ", ".join(component.sample_linear_like_names[:4]) or "(no linear modules)"
print(
f" {component.name:<22} path={component.path:<32} "
f"linears={component.linear_like_count:<4} params={component.parameter_count:<12}"
)
print(f" sample: {sample}")
if args.output:
payload = generative_model_info_to_dict(model_info)
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w") as f:
json.dump(payload, f, indent=2)
print(f"\nWrote component inventory to {output_path}")
del model
_cleanup_device(device)
def cmd_quantize_broad(args):
"""Quantize selected components of a broad generative model."""
from ternary_quant.generative_adapters import (
BroadQuantizationConfig,
broad_quant_config_to_dict,
build_calibration_batches,
evaluate_broad_prompt_bank,
generative_model_info_to_dict,
load_generative_model,
make_demo_image,
quantize_components_inplace,
)
from ternary_quant.inference import load_ternary_model
from ternary_quant.storage import save_quantized_model
device = _resolve_device(args.device)
dtype = _parse_dtype(args.dtype)
model, asset, model_info = load_generative_model(
args.model,
device=device,
dtype=dtype,
)
components = (
args.components if args.components else model_info.default_quantization_components
)
prompts = [args.prompt] if args.prompt else None
broad_config = BroadQuantizationConfig(
components=list(components),
scheme=args.scheme,
group_size=args.group_size,
n_iter=args.n_iter,
salient_fraction=args.salient_fraction,
rescue_fraction=args.rescue_fraction,
n_planes=3 if args.scheme == "tritplane3" else 2,
allow_all_linear=args.allow_all_linear,
max_length=args.seq_len,
calibration_batch_size=args.calibration_batch_size,
calibration_prompts=list(prompts) if prompts is not None else None,
)
demo_image = make_demo_image()
calibration_batches = build_calibration_batches(
asset,
model_info,
max_length=args.seq_len,
batch_size=args.calibration_batch_size,
prompts=prompts,
demo_images=[demo_image],
)
result = quantize_components_inplace(
model,
model_info=model_info,
calibration_batches=calibration_batches,
config=broad_config,
)
save_quantized_model(
ternary_params=result.quantized_params,
model_name=args.model,
model_config=model.config,
quant_config=broad_config,
output_dir=args.output,
stats=result.stats,
summary=result.summary,
method_name=result.summary["method_name"],
model_family=model_info.model_family,
)
report = {
"method": result.summary["method_name"],
"model": args.model,
"model_info": generative_model_info_to_dict(model_info),
"config": broad_quant_config_to_dict(broad_config),
"summary": result.summary,
}
if args.eval:
quantized_model, quantized_asset = load_ternary_model(
args.output,
device=device,
runtime_mode=getattr(args, "runtime_mode", "packed"),
)
        eval_prompts = prompts or (
            ["Describe the image in one short sentence."]
            if model_info.model_family == "image_text_to_text"
            else [
                "The capital of France is",
                "Answer briefly: What is 2 + 2?",
            ]
        )
        validation = evaluate_broad_prompt_bank(
            quantized_model,
            quantized_asset,
            model_info,
            prompts=eval_prompts,
            max_new_tokens=args.max_tokens,
            demo_image=demo_image,
        )
report["validation"] = validation
print("\nValidation:")
print(f" Avg collapse: {validation['avg_collapse_score']:.3f}")
print(f" Primary output: {validation['primary_text']}")
del quantized_model
_cleanup_device(device)
report_path = Path(args.output) / "broad_generative_report.json"
with open(report_path, "w") as f:
json.dump(report, f, indent=2)
print(f"Wrote broad generative report to {report_path}")
# Print a compact summary
s = result.summary
print(f"\nQuantization summary:")
print(f" Layers quantized: {s['quantized_modules']}")
print(f" Full-model effective bits: {s['full_model_effective_bits']:.2f}")
print(f" Compression ratio: {s['compression_ratio']:.2f}×")
print(f" Avg reconstruction error: {s['avg_relative_error']:.4f}")
if getattr(args, "push_to_hub", None):
_push_to_hub(args.output, args.push_to_hub, args.model, result.summary, broad_config)
def _push_to_hub(output_dir: str, hub_repo: str, source_model: str, summary: dict, config) -> None:
"""Push a quantized model directory to HuggingFace Hub."""
try:
from huggingface_hub import HfApi
except ImportError:
print("huggingface_hub not installed. Run: pip install huggingface_hub")
return
output_path = Path(output_dir)
# Write a model card
model_card = f"""---
tags:
- ternary-quant
- quantization
- ternary
base_model: {source_model}
---
# {hub_repo}
Ternary-quantized version of [{source_model}](https://huggingface.co/{source_model})
produced with [ternary-quant](https://github.com/Asad-Ismail/ternary-quant).
## Quantization details
- **Scheme**: {getattr(config, 'scheme', 'unknown')}
- **Components**: {', '.join(getattr(config, 'components', []))}
- **Full-model effective bits**: {summary.get('full_model_effective_bits', float('nan')):.2f}
- **Compression ratio**: {summary.get('compression_ratio', float('nan')):.2f}×
- **Avg reconstruction error**: {summary.get('avg_relative_error', float('nan')):.4f}
## Usage
```python
from ternary_quant.inference import load_ternary_model
model, tokenizer = load_ternary_model("{hub_repo}", runtime_mode="cached")
inputs = tokenizer("Hello, world!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
Or via CLI:
```bash
pip install ternary-quant
ternary-quant generate {hub_repo} --prompt "Hello" --runtime-mode cached
```
"""
card_path = output_path / "README.md"
card_path.write_text(model_card)
    api = HfApi()
    # Ensure the target repo exists before uploading (no-op if it already does).
    api.create_repo(repo_id=hub_repo, repo_type="model", exist_ok=True)
    print(f"Pushing to {hub_repo}...")
api.upload_folder(
folder_path=str(output_path),
repo_id=hub_repo,
repo_type="model",
)
print(f"Pushed to https://huggingface.co/{hub_repo}")
def cmd_check(args):
"""Quick compatibility check using only the model config (no weights downloaded)."""
from ternary_quant.generative_adapters import (
VLM_MODEL_TYPES,
_default_components_for_family,
detect_model_family_from_config,
)
from transformers import AutoConfig
print(f"Checking: {args.model}")
try:
config = AutoConfig.from_pretrained(args.model)
except Exception as exc:
print(f" Could not load config: {exc}")
print(" → Model may be gated (requires HF token) or not found.")
return
model_type = getattr(config, "model_type", "unknown")
architectures = list(getattr(config, "architectures", None) or [])
family = detect_model_family_from_config(config)
default_components = _default_components_for_family(family)
print(f" model_type: {model_type}")
print(f" architectures: {', '.join(architectures) or 'unknown'}")
print(f" family: {family}")
print(f" default components to quantize: {', '.join(default_components)}")
is_vlm = model_type in VLM_MODEL_TYPES
has_encoder_decoder = bool(getattr(config, "is_encoder_decoder", False))
if is_vlm:
print(" → VLM: quantize text_backbone + multimodal_connector")
print(f" ternary-quant quantize-broad {args.model} \\")
print(f" --output ./$(basename {args.model})-ternary \\")
print(f" --components text_backbone multimodal_connector \\")
print(f" --scheme tritplane3 --dtype float16")
elif has_encoder_decoder:
print(" → Seq2seq / audio: quantize decoder")
print(f" ternary-quant quantize-broad {args.model} \\")
print(f" --output ./$(basename {args.model})-ternary \\")
print(f" --components decoder --scheme tritplane3")
else:
print(" → Causal LM: quantize text_backbone")
print(f" ternary-quant quantize-broad {args.model} \\")
print(f" --output ./$(basename {args.model})-ternary \\")
print(f" --components text_backbone --scheme tritplane3")
print()
print(" If quantization fails with 'No quantizable linear modules',")
print(" add --allow-all-linear to quantize all nn.Linear layers.")
def cmd_info(args):
"""Show info about a saved quantized model."""
model_dir = Path(args.model_dir)
meta_path = model_dir / "metadata.json"
if not meta_path.exists():
print(f"No quantized model found at {model_dir}")
sys.exit(1)
with open(meta_path) as f:
metadata = json.load(f)
print(f"Model: {metadata['model_name']}")
print(f"Model family: {metadata.get('model_family', 'causal_lm')}")
print(f"Method: {metadata.get('method_name', 'unknown')}")
print(f"Format family: {metadata.get('format_family', 'legacy')}")
print(f"Format version: {metadata['format_version']}")
print(f"Layers quantized: {len(metadata['layer_info'])}")
print(f"Packed size: {metadata['total_packed_bytes'] / 1e6:.1f} MB")
print(f"FP16 size: {metadata['total_fp16_bytes'] / 1e6:.1f} MB")
print(f"Compression: {metadata['compression_ratio']:.1f}x")
qc = metadata["quant_config"]
print("\nQuantization config:")
for key, value in qc.items():
if key == "base_config":
continue
print(f" {key}: {value}")
if metadata.get("summary"):
summary = metadata["summary"]
print("\nSummary:")
for key in [
"quantized_fraction",
"avg_relative_error",
"avg_effective_bits",
"full_model_effective_bits",
"total_sparse_nnz",
]:
if key in summary:
value = summary[key]
if isinstance(value, float):
if "fraction" in key:
print(f" {key}: {value:.1%}")
else:
print(f" {key}: {value:.4f}")
else:
print(f" {key}: {value}")
if metadata.get("plan"):
plan = metadata["plan"]
print("\nPlan:")
print(f" Method: {plan.get('method_name', 'unknown')}")
print(f" Target average bits: {plan.get('target_average_bits')}")
print(f" Predicted average bits: {plan.get('predicted_average_bits'):.2f}")
def _parse_dtype(s: str) -> torch.dtype:
return {
"float16": torch.float16,
"bfloat16": torch.bfloat16,
"float32": torch.float32,
}[s]
def _resolve_device(device: str) -> str:
if device != "auto":
return device
if torch.cuda.is_available():
return "cuda"
if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
return "mps"
return "cpu"
def _add_runtime_mode_arg(parser: argparse.ArgumentParser, *, default: str = "cached") -> None:
parser.add_argument(
"--runtime-mode",
default=default,
choices=["packed", "cached", "native", "metal", "triton", "gemlite"],
help=(
"Inference runtime path for saved quantized layers. "
"'cached': dequantize once at load, fastest on GPU/CPU (recommended). "
"'native': replace layers with nn.Linear, ~1.0× vs FP16. "
"'packed': re-dequantize every forward, minimal live VRAM. "
"'gemlite': NVIDIA GPU only — keeps weights 2-bit packed, good batch throughput. "
"'triton': NVIDIA GPU only — custom Triton kernel, slightly faster than gemlite at batch=1. "
"'metal': Apple Silicon adaptive — Metal kernel with cached fallback."
),
)
def _cleanup_device(device: str) -> None:
gc.collect()
if device == "cuda":
torch.cuda.empty_cache()
if device == "mps":
torch.mps.empty_cache()
def _uniform_role_weights() -> dict[str, float]:
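    """Equal role-cost weights so the sensitivity_budget planner allocates
    bits by measured sensitivity alone, without role priors."""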
return {
"attention_inputs": 1.0,
"attention_output": 1.0,
"mlp_inputs": 1.0,
"mlp_output": 1.0,
}
def main():
from ternary_quant.ptq_families import FAMILY_PRESETS, get_default_family_candidates
parser = argparse.ArgumentParser(
prog="ternary-quant",
description="Post-training ternary quantization for HuggingFace generative models",
)
subparsers = parser.add_subparsers(dest="command", required=True)
p_catalog = subparsers.add_parser(
"catalog",
help="List validated, probe-only, and special-handling model entries",
)
p_catalog.add_argument(
"--status",
default="all",
choices=[
"all",
"validated",
"component_validated",
"research_validated",
"probe_only",
"special_handling",
],
)
p_catalog.add_argument(
"--family",
default="all",
choices=["all", "causal_lm", "seq2seq_lm", "image_text_to_text"],
)
p_catalog.add_argument("--show-commands", action="store_true")
p_catalog.add_argument("--json", action="store_true")
p_catalog.add_argument("--output", default=None)
p_catalog.set_defaults(func=cmd_catalog)
p_doctor = subparsers.add_parser(
"doctor",
help="Check environment readiness and runtime recommendations",
)
p_doctor.add_argument("--json", action="store_true")
p_doctor.add_argument("--output", default=None)
p_doctor.set_defaults(func=cmd_doctor)
p_quant = subparsers.add_parser("quantize", help="Quantize a model to ternary")
p_quant.add_argument("model", help="HuggingFace model ID or local path")
p_quant.add_argument("--output", "-o", required=True, help="Output directory")
p_quant.add_argument("--device", default="auto", help="Device (auto/cuda/cpu/mps)")
p_quant.add_argument(
"--dtype",
default="float16",
choices=["float16", "bfloat16", "float32"],
)
p_quant.add_argument("--n-iter", type=int, default=10, help="ITF iterations")
p_quant.add_argument(
"--no-activation-aware",
action="store_true",
help="Disable activation-aware quantization",
)
p_quant.add_argument("--block-size", type=int, default=0, help="Column block size")
p_quant.add_argument("--n-samples", type=int, default=128)
p_quant.add_argument("--seq-len", type=int, default=2048)
p_quant.add_argument("--dataset", default="wikitext")
p_quant.add_argument("--dataset-config", default="wikitext-2-raw-v1")
p_quant.add_argument("--seed", type=int, default=42)
p_quant.add_argument("--skip-modules", nargs="+", default=None)
p_quant.add_argument("--eval", action="store_true")
p_quant.add_argument("--eval-samples", type=int, default=40)
_add_runtime_mode_arg(p_quant)
p_quant.set_defaults(func=cmd_quantize)
p_small = subparsers.add_parser(
"quantize-small",
help="Role-aware sparse asymmetric ternarization for small models",
)
p_small.add_argument("model", help="HuggingFace model ID or local path")
p_small.add_argument("--output", "-o", required=True, help="Output directory")
p_small.add_argument("--device", default="auto")
p_small.add_argument(
"--dtype",
default="float16",
choices=["float16", "bfloat16", "float32"],
)
p_small.add_argument("--n-samples", type=int, default=16)
p_small.add_argument("--seq-len", type=int, default=256)
p_small.add_argument("--dataset", default="wikitext")
p_small.add_argument("--dataset-config", default="wikitext-2-raw-v1")
p_small.add_argument("--seed", type=int, default=42)
p_small.add_argument("--group-size", type=int, default=32)
p_small.add_argument("--n-iter", type=int, default=10)
p_small.add_argument(
"--planner",
default="practical",
choices=["practical", "budgeted", "sensitivity_budget", "auto", "collapse_auto"],
help=(
"Planner variant: fixed role-aware recipe, role-aware bit-budgeted recipe, "
"sensitivity-only matched-bit baseline, held-out PPL selection, or "
"held-out prompt-bank collapse-aware selection."
),
)
p_small.add_argument("--salient-fraction", type=float, default=0.01)
p_small.add_argument("--min-salient-fraction", type=float, default=0.0025)
p_small.add_argument("--max-salient-fraction", type=float, default=0.01)
p_small.add_argument(
"--low-rank-rank",
type=int,
default=0,
help="Optional per-module low-rank residual rank for quantized modules.",
)
p_small.add_argument(
"--adaptive-low-rank",
action="store_true",
help="Allocate low-rank rank adaptively per module using residual spectra.",
)
p_small.add_argument(
"--low-rank-chunk-rank",
type=int,
default=16,
help="Rank chunk used by adaptive low-rank allocation.",
)
p_small.add_argument(
"--low-rank-target-average-bits",
type=float,
default=None,
help="Optional full-model bit target for adaptive low-rank allocation.",
)
p_small.add_argument(
"--low-rank-fit-mode",
default="activation_regression",
choices=["weight_svd", "activation_regression"],
help="How to fit optional low-rank residuals for quantized modules.",
)
p_small.add_argument(
"--low-rank-ridge",
type=float,
default=1e-4,
help="Ridge penalty used for activation-regressed low-rank fitting.",
)
p_small.add_argument(
"--low-rank-max-samples",
type=int,
default=4096,
help="Maximum captured tokens per module when fitting low-rank residuals.",
)
p_small.add_argument(
"--calibration-tune-steps",
type=int,
default=0,
help="Optional number of calibration-only LM fine-tune steps for low-rank residuals.",
)
p_small.add_argument(
"--calibration-tune-lr",
type=float,
default=5e-5,
help="Learning rate for optional low-rank calibration tuning.",
)
p_small.add_argument(
"--calibration-tune-batch-size",
type=int,
default=2,
help="Batch size for optional low-rank calibration tuning.",
)
p_small.add_argument(
"--behavior-tune-weight",
type=float,
default=0.0,
help=(
"Optional weight for prompt-bank teacher-sequence tuning during low-rank "
"calibration. Requires --calibration-tune-steps > 0."
),
)
p_small.add_argument(
"--behavior-tune-prompt-count",
type=int,
default=4,
help="Number of prompts to use when building behavior-tuning teacher sequences.",
)
p_small.add_argument(
"--behavior-tune-max-tokens",
type=int,
default=48,
help="Max generated tokens per prompt when building behavior-tuning teacher sequences.",
)
p_small.add_argument(
"--distill-weight",
type=float,
default=0.0,
help="Optional teacher hidden-state distillation weight for calibration tuning.",
)
p_small.add_argument(
"--behavior-hidden-weight",
type=float,
default=0.0,
help="Optional teacher hidden-state distillation weight on prompt-bank sequences.",
)
p_small.add_argument(
"--logit-distill-weight",
type=float,
default=0.0,
help="Optional top-k teacher logit distillation weight for calibration tuning.",
)
p_small.add_argument(
"--behavior-logit-weight",
type=float,
default=0.0,
help="Optional top-k teacher logit distillation weight on prompt-bank sequences.",
)
p_small.add_argument(
"--entropy-distill-weight",
type=float,
default=0.0,
help="Optional teacher entropy-floor regularization weight for calibration tuning.",
)
p_small.add_argument(
"--behavior-entropy-weight",
type=float,
default=0.0,
help="Optional teacher entropy-floor regularization weight on prompt-bank sequences.",
)
p_small.add_argument(
"--logit-distill-topk",
type=int,
default=32,
help="Teacher top-k to cache for logit distillation.",
)
p_small.add_argument(
"--logit-distill-temperature",
type=float,
default=2.0,
help="Temperature for top-k teacher logit distillation.",
)
p_small.add_argument(
"--importance-threshold-scale",
type=float,
default=0.0,
help=(
"AWQ-inspired per-channel importance thresholding. When > 0 and activations "
"are used, input channels with high activation magnitude get a lower ternary "
"threshold (fewer zeros = more signal preserved). 0.0 = uniform (default). "
"Typical range: 0.25–0.5."
),
)
p_small.add_argument("--adaptive-salient", action="store_true")
p_small.add_argument("--boundary-layers", type=int, default=2)
p_small.add_argument("--calibration-batch-size", type=int, default=4)
p_small.add_argument("--quantize-attention-output", action="store_true")
p_small.add_argument("--quantize-mlp-output", action="store_true")
p_small.add_argument(
"--target-average-bits",
type=float,
default=None,
help="Optional full-model bit budget for the role-aware allocator.",
)
p_small.add_argument("--eval", action="store_true")
p_small.add_argument("--eval-samples", type=int, default=8)
p_small.add_argument("--selection-eval-samples", type=int, default=2)
p_small.add_argument("--selection-prompt-count", type=int, default=4)
p_small.add_argument("--selection-max-tokens", type=int, default=48)
p_small.add_argument("--selection-collapse-weight", type=float, default=2.0)
p_small.add_argument("--selection-worst-weight", type=float, default=1.0)
p_small.add_argument("--prompt", default=None)
p_small.add_argument("--max-tokens", type=int, default=80)
_add_runtime_mode_arg(p_small)
p_small.set_defaults(func=cmd_quantize_small)
p_ptq = subparsers.add_parser(
"quantize-ptq",
help="Compare or apply broader ternary PTQ families for small models",
)
p_ptq.add_argument("model", help="HuggingFace model ID or local path")
p_ptq.add_argument("--output", "-o", required=True, help="Output directory")
p_ptq.add_argument("--device", default="auto")
p_ptq.add_argument(
"--dtype",
default="float16",
choices=["float16", "bfloat16", "float32"],
)
p_ptq.add_argument("--n-samples", type=int, default=16)
p_ptq.add_argument("--seq-len", type=int, default=256)
p_ptq.add_argument("--dataset", default="wikitext")
p_ptq.add_argument("--dataset-config", default="wikitext-2-raw-v1")
p_ptq.add_argument("--seed", type=int, default=42)
p_ptq.add_argument("--group-size", type=int, default=32)
p_ptq.add_argument("--n-iter", type=int, default=10)
p_ptq.add_argument(
"--family",
default="controller",
choices=["controller", *sorted(FAMILY_PRESETS)],
help="PTQ family preset to apply, or controller to select across families.",
)
p_ptq.add_argument(
"--candidate-families",
nargs="*",
default=list(get_default_family_candidates()),
help="Candidate families considered by the controller.",
)
p_ptq.add_argument("--boundary-layers", type=int, default=2)
p_ptq.add_argument("--calibration-batch-size", type=int, default=4)
p_ptq.add_argument("--quantize-attention-output", action="store_true")
p_ptq.add_argument("--quantize-mlp-output", action="store_true")
p_ptq.add_argument(
"--target-average-bits",
type=float,
default=None,
help="Optional full-model bit target used by budget-aware family presets and selection.",
)
p_ptq.add_argument(
"--selection-metric",
default="ppl",
choices=["ppl", "collapse"],
help="Controller selection objective.",
)
p_ptq.add_argument("--selection-eval-samples", type=int, default=2)
p_ptq.add_argument("--selection-prompt-count", type=int, default=4)
p_ptq.add_argument("--selection-max-tokens", type=int, default=48)
p_ptq.add_argument("--selection-collapse-weight", type=float, default=2.0)
p_ptq.add_argument("--selection-worst-weight", type=float, default=1.0)
p_ptq.add_argument("--selection-bits-weight", type=float, default=0.25)
p_ptq.add_argument("--eval", action="store_true")
p_ptq.add_argument("--eval-samples", type=int, default=8)
p_ptq.add_argument("--prompt", default=None)
p_ptq.add_argument("--max-tokens", type=int, default=80)
_add_runtime_mode_arg(p_ptq)
p_ptq.set_defaults(func=cmd_quantize_ptq)
p_broad = subparsers.add_parser(
"quantize-broad",
help="Quantize selected components of a broader generative model family",
)
p_broad.add_argument("model", help="HuggingFace model ID or local path")
p_broad.add_argument("--output", "-o", required=True, help="Output directory")
p_broad.add_argument("--device", default="auto")
p_broad.add_argument(
"--dtype",
default="float32",
choices=["float16", "bfloat16", "float32"],
)
p_broad.add_argument(
"--components",
nargs="*",
default=None,
help="Component names to quantize. Defaults to the family-specific broad preset.",
)
p_broad.add_argument(
"--scheme",
default="groupwise",
choices=["groupwise", "tritplane2", "tritplane3"],
help="Broad quantization scheme.",
)
p_broad.add_argument("--group-size", type=int, default=32)
p_broad.add_argument("--n-iter", type=int, default=10)
p_broad.add_argument("--salient-fraction", type=float, default=0.0)
p_broad.add_argument("--rescue-fraction", type=float, default=0.0)
p_broad.add_argument("--allow-all-linear", action="store_true")
p_broad.add_argument("--seq-len", type=int, default=160)
p_broad.add_argument("--calibration-batch-size", type=int, default=2)
p_broad.add_argument("--prompt", default=None)
p_broad.add_argument("--max-tokens", type=int, default=64)
p_broad.add_argument("--eval", action="store_true")
p_broad.add_argument(
"--push-to-hub",
default=None,
metavar="REPO_ID",
help="Push the quantized model to HuggingFace Hub (e.g. username/my-model-ternary).",
)
_add_runtime_mode_arg(p_broad)
p_broad.set_defaults(
func=cmd_quantize_broad,
n_planes=2,
)
p_inspect = subparsers.add_parser(
"inspect-generative",
help="Inspect the generative-family components of a model",
)
p_inspect.add_argument("model", help="HuggingFace model ID or local path")
p_inspect.add_argument("--device", default="auto")
p_inspect.add_argument(
"--dtype",
default="float32",
choices=["float16", "bfloat16", "float32"],
)
p_inspect.add_argument(
"--output",
default=None,
help="Optional JSON output path for the component inventory.",
)
p_inspect.set_defaults(func=cmd_inspect_generative)
p_check = subparsers.add_parser(
"check",
help="Quick compatibility check for a model (no weights downloaded)",
)
p_check.add_argument("model", help="HuggingFace model ID")
p_check.set_defaults(func=cmd_check)
p_eval = subparsers.add_parser("eval", help="Evaluate saved model perplexity")
p_eval.add_argument("model_dir")
p_eval.add_argument("--device", default="auto")
p_eval.add_argument("--seq-len", type=int, default=2048)
p_eval.add_argument("--max-samples", type=int, default=None)
_add_runtime_mode_arg(p_eval)
p_eval.set_defaults(func=cmd_eval)
p_cmp = subparsers.add_parser("compare", help="Compare original vs quantized")
p_cmp.add_argument("original")
p_cmp.add_argument("ternary")
p_cmp.add_argument("--device", default="auto")
p_cmp.add_argument("--seq-len", type=int, default=2048)
p_cmp.add_argument("--max-samples", type=int, default=40)
p_cmp.set_defaults(func=cmd_compare)
p_gen = subparsers.add_parser("generate", help="Generate text with saved model")
p_gen.add_argument("model_dir")
p_gen.add_argument("--prompt", "-p", required=True)
p_gen.add_argument("--max-tokens", type=int, default=256)
p_gen.add_argument("--temperature", type=float, default=0.7)
p_gen.add_argument("--device", default="auto")
p_gen.add_argument(
"--image-path",
default=None,
help="Optional image path for image-text-to-text models. If omitted, a demo image is used.",
)
_add_runtime_mode_arg(p_gen)
p_gen.set_defaults(func=cmd_generate)
p_info = subparsers.add_parser("info", help="Show info about a saved model")
p_info.add_argument("model_dir")
p_info.set_defaults(func=cmd_info)
args = parser.parse_args()
args.func(args)
if __name__ == "__main__":
main()