Upload scripts/convert_translategemma_android.py with huggingface_hub

7f4c0e9 verified 1 day ago

21.6 kB

	#!/usr/bin/env python3
	import argparse
	import importlib
	import inspect
	import json
	import os
	import subprocess
	import sys
	import traceback
	from pathlib import Path

	os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")
	os.environ.setdefault("PYTHONUNBUFFERED", "1")


	def log(msg): print(f"[+] {msg}", flush=True)
	def warn(msg): print(f"[!] {msg}", flush=True)
	def die(msg):
	print(f"[x] {msg}", file=sys.stderr, flush=True)
	sys.exit(1)


	SUPPORTED_QUANT = {
	"none",
	"dynamic_int8",
	"float16",
	"int8",
	"int4",
	}


	def normalize_quantize(q: str) -> str:
	q = (q or "dynamic_int8").strip().lower()
	aliases = {
	"fp32": "none",
	"no": "none",
	"off": "none",
	"fp16": "float16",
	"f16": "float16",
	"i8": "int8",
	"q8": "int8",
	"i4": "int4",
	"q4": "int4",
	}
	q = aliases.get(q, q)
	if q not in SUPPORTED_QUANT:
	die(f"Unsupported --quantize '{q}'. Supported: {sorted(SUPPORTED_QUANT)}")
	return q


	def load_config(model_dir: Path):
	cfg_path = model_dir / "config.json"
	if not cfg_path.exists():
	die(f"Missing config: {cfg_path}")
	cfg = json.loads(cfg_path.read_text())
	text_cfg = cfg.get("text_config", {})
	return cfg, text_cfg


	def inspect_arch(model_dir: Path):
	cfg, text_cfg = load_config(model_dir)
	info = {
	"model_type": cfg.get("model_type", "unknown"),
	"architecture": (cfg.get("architectures") or ["unknown"])[0],
	"vocab_size": cfg.get("vocab_size", text_cfg.get("vocab_size", 262144)),
	}
	log(f"ARCH: {info}")
	return info


	def ensure_model_downloaded(model_id: str, model_dir: Path, hf_token: str):
	if (model_dir / "config.json").exists():
	log(f"Using existing model dir: {model_dir}")
	return

	log(f"Downloading {model_id} -> {model_dir}")
	try:
	from huggingface_hub import snapshot_download
	snapshot_download(
	repo_id=model_id,
	local_dir=str(model_dir),
	token=hf_token if hf_token else None,
	local_dir_use_symlinks=False,
	)
	except Exception as e:
	die(f"Model download failed: {e}")


	def try_builders(mod, builder_names, model_dir: Path):
	for fn_name in builder_names:
	fn = getattr(mod, fn_name, None)
	if fn is None:
	continue
	log(f"Trying {mod.__name__}.{fn_name} ...")
	try:
	m = fn(str(model_dir))
	if m is None:
	warn(" returned None")
	continue
	if isinstance(m, (tuple, list)) and len(m) > 0:
	m = m[0]
	if not hasattr(m, "eval"):
	warn(f" unsupported return type: {type(m)}")
	continue
	m.eval()
	log(f" success with {fn_name}")
	return m
	except Exception as e:
	warn(f" failed: {e}")
	return None


	def build_translategemma_4b(checkpoint_path: str):
	"""
	Custom builder for TranslateGemma 4B IT (Gemma3 multimodal decoder).
	Strips 'language_model.' prefix from safetensors keys so the standard
	TENSOR_NAMES_SEP_QKV mapping works.

	Architecture (from config.json / verified weight shapes):
	34 layers, embedding_dim=2560, 8 heads, head_dim=256, 4 KV heads,
	intermediate=10240, sliding_window=1024, global every 6th layer.
	"""
	import safetensors.torch as st_lib
	import json as json_lib
	from litert_torch.generative.utilities import model_builder, loader as loading_utils
	from litert_torch.generative.layers import kv_cache as kv_utils
	from litert_torch.generative.examples.gemma3 import decoder as gemma3_decoder
	import litert_torch.generative.layers.model_config as cfg_mod

	norm = cfg_mod.NormalizationConfig(
	type=cfg_mod.NormalizationType.RMS_NORM, epsilon=1e-6, zero_centered=True
	)
	ff = cfg_mod.FeedForwardConfig(
	type=cfg_mod.FeedForwardType.GATED,
	activation=cfg_mod.ActivationConfig(cfg_mod.ActivationType.GELU_TANH),
	intermediate_size=10240,
	pre_ff_norm_config=norm, post_ff_norm_config=norm,
	)

	def blk(idx):
	attn = cfg_mod.AttentionConfig(
	num_heads=8, head_dim=256, num_query_groups=4,
	rotary_base=1_000_000 if (idx + 1) % 6 == 0 else 10_000,
	rotary_percentage=1.0, qkv_transpose_before_split=True,
	qkv_fused_interleaved=False,
	query_norm_config=norm, key_norm_config=norm, logit_softcap=None,
	sliding_window_size=1024,
	attn_type=cfg_mod.AttentionType.GLOBAL if (idx + 1) % 6 == 0
	else cfg_mod.AttentionType.LOCAL_SLIDING,
	)
	return cfg_mod.TransformerBlockConfig(
	attn_config=attn, ff_config=ff,
	pre_attention_norm_config=norm, post_attention_norm_config=norm,
	)

	model_config = cfg_mod.ModelConfig(
	vocab_size=262208, num_layers=34, max_seq_len=8192,
	embedding_dim=2560, embedding_scale=2560 ** 0.5,
	block_configs=[blk(i) for i in range(34)],
	final_norm_config=norm, lm_head_use_bias=False, final_logit_softcap=None,
	)

	tensor_names = loading_utils.ModelLoader.TensorNames(
	ff_up_proj="model.layers.{}.mlp.up_proj",
	ff_down_proj="model.layers.{}.mlp.down_proj",
	ff_gate_proj="model.layers.{}.mlp.gate_proj",
	attn_query_proj="model.layers.{}.self_attn.q_proj",
	attn_key_proj="model.layers.{}.self_attn.k_proj",
	attn_value_proj="model.layers.{}.self_attn.v_proj",
	attn_output_proj="model.layers.{}.self_attn.o_proj",
	attn_query_norm="model.layers.{}.self_attn.q_norm",
	attn_key_norm="model.layers.{}.self_attn.k_norm",
	pre_attn_norm="model.layers.{}.input_layernorm",
	post_attn_norm="model.layers.{}.post_attention_layernorm",
	pre_ff_norm="model.layers.{}.pre_feedforward_layernorm",
	post_ff_norm="model.layers.{}.post_feedforward_layernorm",
	embedding="model.embed_tokens",
	final_norm="model.norm",
	lm_head=None,
	)

	def custom_loader(path: str):
	idx = json_lib.loads((Path(path) / "model.safetensors.index.json").read_text())
	out = {}
	for fname in set(idx["weight_map"].values()):
	for k, v in st_lib.load_file(str(Path(path) / fname)).items():
	out[k[len("language_model."):] if k.startswith("language_model.") else k] = v
	return out

	return model_builder.build_decoder_only_model(
	checkpoint_path=checkpoint_path,
	config=model_config,
	tensor_names=tensor_names,
	model_class=gemma3_decoder.Decoder,
	custom_loader=custom_loader,
	)


	def strategy1_litert_native(model_dir: Path, out_dir: Path, model_type: str, quantize: str, prefill: int, kvcache: int):
	"""
	Native LiteRT conversion with explicit quantization support.
	"""
	log("Strategy 1: litert-torch native")

	from litert_torch.generative.utilities import converter
	from litert_torch.generative.utilities.export_config import ExportConfig
	from litert_torch.generative.layers import kv_cache

	export_config = ExportConfig()
	export_config.kvcache_layout = kv_cache.KV_LAYOUT_TRANSPOSED
	export_config.mask_as_input = True

	# Map our quantize flags to converter's QuantizationName values
	QUANT_MAP = {
	"none": "none",
	"dynamic_int8": "dynamic_int8",
	"int8": "weight_only_int8",
	"float16": "fp16",
	"int4": "dynamic_int4_block128",
	}
	quant_for_converter = QUANT_MAP.get(quantize)
	if quant_for_converter is None:
	warn(f"No converter mapping for '{quantize}', falling back to Strategy 2")
	return None

	model = None

	if model_type == "gemma3":
	# Try custom 4B builder first (handles TranslateGemma 4B multimodal weight prefix)
	log("Trying custom build_translategemma_4b ...")
	try:
	model = build_translategemma_4b(str(model_dir))
	if model is not None:
	model.eval()
	log(" build_translategemma_4b success")
	except Exception as e:
	warn(f" build_translategemma_4b failed: {e}")
	model = None

	if model is None:
	mod = importlib.import_module("litert_torch.generative.examples.gemma3.gemma3")
	available = [n for n in dir(mod) if n.startswith("build_model")]
	log(f"Gemma3 builders available: {available}")
	preferred = ["build_model_4b", "build_model_2b", "build_model_1b", "build_model_270m", "build_model"]
	ordered = [n for n in preferred if n in available] + [n for n in available if n not in preferred]
	model = try_builders(mod, ordered, model_dir)

	elif model_type in ("gemma", "gemma2"):
	mod = importlib.import_module("litert_torch.generative.examples.gemma2.gemma2")
	available = [n for n in dir(mod) if n.startswith("build_model")]
	log(f"Gemma2 builders available: {available}")
	preferred = ["build_model_4b", "build_model_2b", "build_model"]
	ordered = [n for n in preferred if n in available] + [n for n in available if n not in preferred]
	model = try_builders(mod, ordered, model_dir)

	else:
	warn(f"Model type '{model_type}' not handled by native strategy")
	return None

	if model is None:
	warn("Strategy 1 did not find a compatible builder")
	return None

	converter.convert_to_tflite(
	model,
	output_path=str(out_dir),
	output_name_prefix=f"translategemma-4b-it-{quantize}",
	prefill_seq_len=prefill,
	kv_cache_max_len=kvcache,
	quantize=quant_for_converter,
	export_config=export_config,
	)

	produced = sorted(out_dir.glob(f"{quantize}.tflite")) or sorted(out_dir.glob("*.tflite"))
	return produced[0] if produced else None


	def strategy2_generic(model_dir: Path, out_dir: Path, prefill: int, quantize: str):
	"""
	Generic fallback conversion (logits-only). Always exports float32; use
	strategy3_post_tflite_quantize() afterwards for real int4/int8 compression.
	"""
	log("Strategy 2: ai_edge_torch generic (wrapped logits-only)")

	import torch
	try:
	import litert_torch as ai_edge_torch
	except Exception:
	import ai_edge_torch # deprecated fallback

	from transformers import AutoConfig, AutoModelForCausalLM

	dtype = torch.float32
	if quantize == "float16":
	dtype = torch.float16

	cfg = AutoConfig.from_pretrained(str(model_dir), trust_remote_code=True)
	vocab = getattr(cfg, "vocab_size", None)
	if vocab is None and hasattr(cfg, "text_config"):
	vocab = getattr(cfg.text_config, "vocab_size", None)
	vocab = int(vocab or 262144)

	log(f"Loading HF model on CPU with dtype={dtype} ...")
	base_model = AutoModelForCausalLM.from_pretrained(
	str(model_dir),
	trust_remote_code=True,
	torch_dtype=dtype,
	)
	base_model.eval()

	# TFLite embedding_lookup does not support f16 weights — keep embedding in float32
	if dtype == torch.float16:
	log("Casting embedding and lm_head to float32 for TFLite compatibility")
	if hasattr(base_model, "model") and hasattr(base_model.model, "embed_tokens"):
	base_model.model.embed_tokens = base_model.model.embed_tokens.to(torch.float32)
	if hasattr(base_model, "lm_head"):
	base_model.lm_head = base_model.lm_head.to(torch.float32)

	if hasattr(base_model, "config") and hasattr(base_model.config, "use_cache"):
	base_model.config.use_cache = False

	class LogitsOnlyWrapper(torch.nn.Module):
	def __init__(self, model):
	super().__init__()
	self.model = model

	def forward(self, input_ids):
	out = self.model(
	input_ids=input_ids,
	use_cache=False,
	return_dict=False,
	)
	logits = out[0] if isinstance(out, (tuple, list)) else out.logits
	return logits

	wrapped = LogitsOnlyWrapper(base_model).eval()
	sample_ids = torch.randint(0, vocab, (1, min(prefill, 128)), dtype=torch.int64)

	# Always export float32 base; int4/int8 quantization applied post-export via Strategy 3
	out_file = out_dir / f"translategemma-4b-it-generic-none.tflite"
	edge_model = ai_edge_torch.convert(wrapped, (sample_ids,))
	edge_model.export(str(out_file))

	return out_file if out_file.exists() else None


	def strategy3_post_tflite_quantize(tflite_in: Path, out_dir: Path, quantize: str):
	"""
	Post-conversion weight quantization applied directly to a TFLite flatbuffer
	using ai_edge_quantizer (bundled with litert_torch).

	Supported modes and their recipes (per get_supported_layer_schemes()):
	int4 -> INT4 DYNAMIC_RANGE BLOCKWISE_128 (~2 GB for 4B model)
	int8 -> INT8 WEIGHT_ONLY CHANNELWISE (~4 GB)
	dynamic_int8 -> INT8 DYNAMIC_RANGE CHANNELWISE (~4 GB)
	float16 -> FP16 WEIGHT_ONLY FLOAT_CAST (~8 GB)
	"""
	log(f"Strategy 3: post-TFLite quantization ({quantize}) on {tflite_in.name}")

	from litert_torch.generative.quantize import quant_attrs as qa, quant_recipe, quant_recipe_utils
	from litert_torch.quantize import translate_recipe

	if quantize == "int4":
	layer = quant_recipe_utils.create_layer_quant_dynamic(qa.Dtype.INT4, qa.Granularity.BLOCKWISE_128)
	elif quantize == "int8":
	layer = quant_recipe_utils.create_layer_quant_weight_only(qa.Dtype.INT8, qa.Granularity.CHANNELWISE)
	elif quantize == "dynamic_int8":
	layer = quant_recipe_utils.create_layer_quant_dynamic(qa.Dtype.INT8, qa.Granularity.CHANNELWISE)
	elif quantize == "float16":
	layer = quant_recipe_utils.create_layer_quant_fp16()
	else:
	warn(f"Strategy 3: no post-TFLite recipe for '{quantize}', skipping")
	return None

	gen_recipe = quant_recipe.GenerativeQuantRecipe(default=layer)
	ai_recipe = translate_recipe.translate_to_ai_edge_recipe(gen_recipe)

	model_bytes = tflite_in.read_bytes()
	log(f" Input: {len(model_bytes) / 1024**3:.2f} GB — quantizing ...")
	quantized_bytes = translate_recipe.quantize_model(model_bytes, ai_recipe)

	out_file = out_dir / f"translategemma-4b-it-{quantize}.tflite"
	out_file.write_bytes(quantized_bytes)
	log(f" Output: {len(quantized_bytes) / 1024**3:.2f} GB -> {out_file}")
	return out_file


	def ensure_tokenizer_model(model_dir: Path):
	tok_model = model_dir / "tokenizer.model"
	if tok_model.exists():
	return tok_model

	tok_json = model_dir / "tokenizer.json"
	if not tok_json.exists():
	die(f"Missing tokenizer files: neither {tok_model} nor {tok_json} exists")

	log("Converting tokenizer.json -> tokenizer.model")
	cmd = [
	sys.executable,
	"-m",
	"litert_torch.generative.tools.tokenizer_to_sentencepiece",
	f"--checkpoint={model_dir}",
	f"--output_path={tok_model}",
	]
	res = subprocess.run(cmd, text=True, capture_output=True)
	if res.stdout:
	print(res.stdout, flush=True)
	if res.returncode != 0:
	if res.stderr:
	print(res.stderr, file=sys.stderr, flush=True)
	die("Tokenizer conversion failed")

	if not tok_model.exists():
	die("Tokenizer conversion reported success but tokenizer.model is missing")

	return tok_model


	def bundle_task(tflite_file: Path, tokenizer_model: Path, task_file: Path):
	log(f"Bundling .task -> {task_file}")
	from mediapipe.tasks.python.genai import bundler

	task_file.parent.mkdir(parents=True, exist_ok=True)

	sig = inspect.signature(bundler.BundleConfig)
	params = sig.parameters
	log(f"BundleConfig params: {list(params.keys())}")

	kwargs = {
	"tflite_model": str(tflite_file),
	"tokenizer_model": str(tokenizer_model),
	"output_filename": str(task_file),
	}

	if "start_token" in params:
	kwargs["start_token"] = "<bos>"
	elif "start_tokens" in params:
	kwargs["start_tokens"] = ["<bos>"]

	if "stop_tokens" in params:
	kwargs["stop_tokens"] = ["<eos>"]

	if "prompt_prefix" in params:
	kwargs["prompt_prefix"] = ""
	if "prompt_suffix" in params:
	kwargs["prompt_suffix"] = ""

	kwargs = {k: v for k, v in kwargs.items() if k in params}
	cfg = bundler.BundleConfig(**kwargs)
	bundler.create_bundle(cfg)


	def main():
	ap = argparse.ArgumentParser(description="TranslateGemma -> Android .task converter")

	ap.add_argument("--model-id", default="google/translategemma-4b-it")
	ap.add_argument("--model-dir", default="./translategemma-4b-it")
	ap.add_argument("--tflite-dir", default="./tflite_output")
	ap.add_argument("--output-dir", default="./output")
	ap.add_argument("--task-file", default="./output/translategemma-4b-it-android.task")

	ap.add_argument("--quantize", default="dynamic_int8", help=f"One of: {sorted(SUPPORTED_QUANT)}")

	ap.add_argument("--prefill-seq-len", "--prefill", dest="prefill_seq_len", type=int, default=1024)
	ap.add_argument("--kv-cache-max-len", "--kvcache", dest="kv_cache_max_len", type=int, default=1024)

	ap.add_argument("--skip-strategy1", action="store_true")
	ap.add_argument("--bundle-only", action="store_true", help="Skip conversion; only bundle existing TFLite")
	ap.add_argument("--existing-tflite", default="", help="Path to an existing .tflite to bundle")
	ap.add_argument("--allow-no-token", action="store_true", help="Allow model download from public repo without HF_TOKEN")

	args = ap.parse_args()
	q = normalize_quantize(args.quantize)

	hf_token = os.environ.get("HF_TOKEN", "").strip()
	if not hf_token and not args.allow_no_token:
	warn("HF_TOKEN is not set. If model is public, you can pass --allow-no-token.")

	model_dir = Path(args.model_dir)
	tflite_dir = Path(args.tflite_dir)
	output_dir = Path(args.output_dir)
	task_file = Path(args.task_file)

	tflite_dir.mkdir(parents=True, exist_ok=True)
	output_dir.mkdir(parents=True, exist_ok=True)

	tflite_file = None

	if args.bundle_only:
	if not args.existing_tflite:
	die("--bundle-only requires --existing-tflite /path/to/model.tflite")
	tflite_file = Path(args.existing_tflite)
	if not tflite_file.exists():
	die(f"Existing tflite not found: {tflite_file}")
	log(f"Bundle-only mode using: {tflite_file}")
	# Apply post-TFLite quantization if requested
	if q != "none":
	try:
	quantized = strategy3_post_tflite_quantize(tflite_file, tflite_dir, q)
	if quantized:
	tflite_file = quantized
	log(f"Strategy 3 success: {tflite_file}")
	else:
	warn("Strategy 3 skipped; bundling input as-is")
	except Exception as e:
	warn(f"Strategy 3 failed: {e}")
	traceback.print_exc()
	else:
	ensure_model_downloaded(args.model_id, model_dir, hf_token if hf_token else "")
	arch = inspect_arch(model_dir)
	model_type = arch["model_type"]

	strategy1_succeeded = False
	if not args.skip_strategy1:
	try:
	tflite_file = strategy1_litert_native(
	model_dir=model_dir,
	out_dir=tflite_dir,
	model_type=model_type,
	quantize=q,
	prefill=args.prefill_seq_len,
	kvcache=args.kv_cache_max_len,
	)
	if tflite_file:
	log(f"Strategy 1 success: {tflite_file}")
	strategy1_succeeded = True
	except Exception as e:
	warn(f"Strategy 1 failed: {e}")
	traceback.print_exc()

	if not tflite_file:
	try:
	tflite_file = strategy2_generic(
	model_dir=model_dir,
	out_dir=tflite_dir,
	prefill=args.prefill_seq_len,
	quantize=q,
	)
	if tflite_file:
	log(f"Strategy 2 success: {tflite_file}")
	warn("Generic TFLite may not have MediaPipe LLM prefill/decode signatures.")
	except Exception as e:
	warn(f"Strategy 2 failed: {e}")
	traceback.print_exc()

	# Strategy 3: post-TFLite quantization — only when Strategy 2 was used (Strategy 1
	# already applies quantization natively via the converter).
	if tflite_file and not strategy1_succeeded and q != "none":
	try:
	quantized = strategy3_post_tflite_quantize(tflite_file, tflite_dir, q)
	if quantized:
	tflite_file = quantized
	log(f"Strategy 3 success: {tflite_file}")
	else:
	warn("Strategy 3 skipped; bundling unquantized model")
	except Exception as e:
	warn(f"Strategy 3 failed: {e}")
	traceback.print_exc()

	if not tflite_file:
	die("All conversion strategies failed")

	tokenizer_model = ensure_tokenizer_model(model_dir)
	bundle_task(tflite_file, tokenizer_model, task_file)

	log(f"DONE: {task_file}")
	if task_file.exists():
	log(f"Size: {task_file.stat().st_size / (1024 * 1024):.2f} MB")


	if __name__ == "__main__":
	main()