Text Generation
LiteRT-LM
English
custom
hermes-edge
mobile-ai
on-device
ios
iphone-16
apple-neural-engine
deepseek
dspark
speculative-decoding
hermes-agent
tool-calling
raven-ecosystem
Instructions to use bclermo/hermes-edge with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- LiteRT-LM
How to use bclermo/hermes-edge with LiteRT-LM:
# LiteRT-LM runs on various platforms (Android, iOS, Windows, Linux, macOS, IoT, Web/WASM) # and supports many APIs (C++, Python, Kotlin, Swift, JavaScript, Flutter). # For platform-specific integration guides, please refer to the official developer website: # https://ai.google.dev/edge/litert-lm # To try LiteRT-LM, the easiest way is to use our CLI tool. # 1. Install the LiteRT-LM CLI tool: pip install litert-lm # 2. Download and run this model locally: # See: https://ai.google.dev/edge/litert-lm/cli litert-lm run \ --from-huggingface-repo=bclermo/hermes-edge \ model.litertlm \ --prompt="Write me a poem"
- Notebooks
- Google Colab
- Kaggle
| """Post-training quantization (PTQ) analysis + fake-quant utilities. | |
| These helpers are deliberately **standalone** — they have no ``ai_edge_torch`` | |
| dependency. They serve two purposes: | |
| 1. **Pre-conversion analysis.** :func:`collect_calibration_stats` and | |
| :func:`quantization_error_report` let you measure activation ranges and the | |
| weight/perplexity error a given bit-width would introduce, *before* you spend | |
| minutes lowering the model through the LiteRT stack. Use them to sanity-check | |
| that INT4 is viable for a checkpoint, or to pick which layers are sensitive. | |
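| 2. **Fake quantization for QAT experiments.** :func:`apply_weight_only_int4` and | |
| :func:`apply_weight_only_int8` overwrite each ``nn.Linear`` weight in place with | |
| its quantized-then-dequantized value. The overwrite runs under | |
| ``torch.no_grad()``, so it is a one-shot projection onto the integer grid and is | |
| not itself recorded by autograd; ``_STEFakeQuant`` is the straight-through | |
| estimator (STE) to use when fake-quant has to stay differentiable inside a | |
| forward pass. This supports the quantization-aware-training (QAT) workflow: fine | |
| tune with fake-quant on to recover accuracy the real INT4 graph would lose. | |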
| Relationship to ``scripts/convert_to_litertlm.py`` | |
| -------------------------------------------------- | |
| The *real* mobile INT4 graph is produced by ``convert_to_litertlm.py`` via | |
| ``ai_edge_torch``'s ``full_int4_dynamic_recipe`` — that is what actually ships in | |
| the ``.litertlm`` bundle. The functions here do **not** replace that conversion: | |
| they approximate the same symmetric per-group INT4 scheme in pure PyTorch so you | |
| can (a) estimate the error offline and (b) QAT-finetune to minimize it. Numbers | |
| from here are guidance; the converter's output is ground truth. | |
| """ | |
| from __future__ import annotations | |
| import math | |
| from typing import Dict, Iterable, Optional | |
| import torch | |
| import torch.nn as nn | |
| # --------------------------------------------------------------------------- # | |
| # Symmetric per-group quantization core | |
| # --------------------------------------------------------------------------- # | |
def _quant_levels(bits: int) -> tuple[int, int]:
    """Return ``(qmin, qmax)`` for a signed ``bits``-bit integer.

    For example ``bits=4`` gives ``(-8, 7)`` and ``bits=8`` gives ``(-128, 127)``.
    """
    half_range = 2 ** (bits - 1)
    return -half_range, half_range - 1


def fake_quantize_per_group(
    weight: torch.Tensor, bits: int, group_size: int
) -> torch.Tensor:
    """Symmetric per-group fake quantization of a 2-D weight matrix.

    The weight ``[out_features, in_features]`` is split along ``in_features`` into
    groups of ``group_size``; each group gets its own scale ``max(|w|) / qmax``
    (floored at ``1e-8`` so an all-zero group does not divide by zero). Values are
    rounded onto the integer grid, clamped to ``[qmin, qmax]`` and multiplied back
    by the scale, so the result has the same dtype/shape as ``weight`` but only
    takes representable values. Used by both the analysis and STE paths.

    Args:
        weight: 2-D tensor to quantize.
        bits: Signed integer bit-width; must be at least 2.
        group_size: Number of input channels sharing one scale. A value ``<= 0``
            means one scale per output row. If ``in_features`` is not a multiple
            of ``group_size`` the last group is zero-padded for the computation
            and the padding is trimmed from the result.

    Returns:
        A new tensor with the dtype and shape of ``weight``.

    Raises:
        ValueError: If ``weight`` is not 2-D or ``bits < 2``.
    """
    if weight.dim() != 2:
        raise ValueError(
            f"fake_quantize_per_group expects a 2-D weight, got shape {tuple(weight.shape)}."
        )
    if bits < 2:
        # A 1-bit signed grid has qmax == 0, which makes every scale degenerate.
        raise ValueError(f"bits must be >= 2 for symmetric quantization, got {bits}.")

    qmin, qmax = _quant_levels(bits)
    out_features, in_features = weight.shape
    if weight.numel() == 0:
        # Nothing to quantize; also avoids an ambiguous reshape on empty input.
        return weight.clone()

    gs = group_size if group_size > 0 else in_features

    # The 1e-8 scale floor rounds to 0 in float16, which would turn all-zero
    # groups into 0/0 = NaN, and bfloat16 lacks the mantissa bits for an exact
    # round-to-grid. Do the arithmetic for those dtypes in float32 and cast back
    # at the end; float32/float64 inputs are computed in their own dtype.
    if weight.dtype in (torch.float16, torch.bfloat16):
        compute_dtype = torch.float32
    else:
        compute_dtype = weight.dtype
    w = weight.to(compute_dtype)

    pad = (-in_features) % gs
    if pad:
        # Zero padding never raises a group's max-abs, so scales are unaffected.
        w = torch.nn.functional.pad(w, (0, pad))
    groups = w.reshape(out_features, -1, gs)

    max_abs = groups.abs().amax(dim=-1, keepdim=True)
    scale = (max_abs / qmax).clamp(min=1e-8)
    q = torch.clamp(torch.round(groups / scale), qmin, qmax)
    deq = (q * scale).reshape(out_features, -1)
    if pad:
        deq = deq[:, :in_features]
    return deq.to(weight.dtype)
class _STEFakeQuant(torch.autograd.Function):
    """Straight-through estimator: quantize on forward, identity on backward.

    ``forward`` returns the fake-quantized weight; ``backward`` passes the
    incoming gradient through unchanged to ``weight`` and returns ``None`` for the
    two integer hyper-parameters. Both are declared ``@staticmethod`` as the
    ``torch.autograd.Function`` API requires.
    """

    @staticmethod
    def forward(ctx, weight: torch.Tensor, bits: int, group_size: int) -> torch.Tensor:  # type: ignore[override]
        return fake_quantize_per_group(weight, bits, group_size)

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):  # type: ignore[override]
        # Identity gradient w.r.t. the weight; None for the int hyper-params.
        return grad_output, None, None
def _apply_weight_only(model: nn.Module, bits: int, group_size: int) -> nn.Module:
    """Overwrite every ``nn.Linear`` weight in ``model`` with its fake-quantized value.

    The overwrite happens under ``torch.no_grad()`` via ``Tensor.copy_``, so it is
    a one-shot projection of the stored parameters onto the quantization grid:
    autograd does not record it, and later optimizer steps are free to move the
    weights off the grid again. Because nothing is recorded, the weights are
    quantized with :func:`fake_quantize_per_group` directly; ``_STEFakeQuant``
    only matters when fake-quant is applied inside a differentiable forward pass.

    Args:
        model: Module whose ``nn.Linear`` sub-modules are rewritten.
        bits: Signed integer bit-width of the grid.
        group_size: Input channels per scale (``<= 0`` → one scale per row).

    Returns:
        The same ``model`` object, mutated in place.
    """
    with torch.no_grad():
        for module in model.modules():
            if not isinstance(module, nn.Linear):
                continue
            module.weight.copy_(
                fake_quantize_per_group(module.weight, bits, group_size)
            )
    return model
def apply_weight_only_int4(model: nn.Module, group_size: int = 128) -> nn.Module:
    """Fake-quantize all ``nn.Linear`` weights to symmetric per-group INT4.

    Thin wrapper over ``_apply_weight_only`` with ``bits=4``: every linear weight
    is rounded onto the signed 4-bit grid (clamp range ``[-8, 7]``), one scale per
    group of ``group_size`` input channels, and the dequantized values are written
    back into the parameter. ``_apply_weight_only`` performs the write under
    ``torch.no_grad()``, so this call is a one-shot overwrite rather than an
    operation recorded by autograd.

    Per the module docstring this is meant to approximate the per-group INT4
    scheme of ``ai_edge_torch``'s ``full_int4_dynamic_recipe`` used by
    ``scripts/convert_to_litertlm.py``; the converter, not this function,
    produces the shipped graph.

    Args:
        model: Module to rewrite.
        group_size: Input channels sharing one scale (default 128).

    Returns:
        The same model (mutated in place).
    """
    int4_bits = 4
    return _apply_weight_only(model, int4_bits, group_size)
def apply_weight_only_int8(model: nn.Module, group_size: int = 0) -> nn.Module:
    """Fake-quantize all ``nn.Linear`` weights to symmetric INT8 (``[-128, 127]``).

    Thin wrapper over ``_apply_weight_only`` with ``bits=8``. With the default
    ``group_size=0`` each output row gets a single scale (per-channel); a positive
    ``group_size`` switches to per-group scales along the input dimension. As with
    :func:`apply_weight_only_int4`, the weights are overwritten in place under
    ``torch.no_grad()``. Intended as the higher-quality fallback when INT4
    degrades a sensitive checkpoint too much.

    Args:
        model: Module to rewrite.
        group_size: Input channels sharing one scale; ``0`` means per-row.

    Returns:
        The same model (mutated in place).
    """
    int8_bits = 8
    return _apply_weight_only(model, int8_bits, group_size)
| # --------------------------------------------------------------------------- # | |
| # Calibration + error analysis | |
| # --------------------------------------------------------------------------- # | |
def collect_calibration_stats(
    model: nn.Module,
    dataloader: Iterable,
    num_batches: int = 64,
) -> Dict[str, Dict[str, float]]:
    """Run forward passes and collect per-layer activation statistics.

    A forward hook on every ``nn.Linear`` records statistics of that layer's
    *output* across up to ``num_batches`` batches. The forward passes run under
    ``torch.no_grad()`` because only the activation values are needed. Hooks are
    always removed before returning, even if a forward pass raises.

    Args:
        model: The model to profile. It is switched to eval mode and left there.
        dataloader: Yields either tensors of ``input_ids`` or ``(inputs, _)``
            tuples / dicts with an ``input_ids`` key.
        num_batches: Max number of batches to run.

    Returns:
        ``{layer_name: {"min", "max", "abs_max", "p99", "mean", "num_samples"}}``.
        ``min``/``max``/``abs_max`` are running extrema, ``mean`` is the running
        mean over all recorded elements, ``num_samples`` the element count, and
        ``p99`` is the largest per-batch 99th percentile of ``|activation|``
        (estimated on a random subsample of at most 16384 elements per batch, so
        it can vary slightly between runs). Layers that never produce a
        non-empty floating-point tensor are absent from the result.
    """
    max_quantile_samples = 16384

    model.eval()
    stats: Dict[str, Dict[str, float]] = {}
    handles = []

    def make_hook(name: str):
        def hook(_module, _inp, out):
            # Only plain floating-point tensors carry meaningful ranges.
            if not isinstance(out, torch.Tensor) or not torch.is_floating_point(out):
                return
            flat = out.detach().float().reshape(-1)
            n_new = flat.numel()
            if n_new == 0:
                # min()/max()/quantile() are undefined on empty tensors.
                return

            entry = stats.setdefault(
                name,
                {
                    "min": math.inf,
                    "max": -math.inf,
                    "abs_max": 0.0,
                    "p99": 0.0,
                    "mean": 0.0,
                    "num_samples": 0.0,
                },
            )
            entry["min"] = min(entry["min"], float(flat.min()))
            entry["max"] = max(entry["max"], float(flat.max()))
            entry["abs_max"] = max(entry["abs_max"], float(flat.abs().max()))

            # Running mean over every element seen so far.
            n_prev = entry["num_samples"]
            entry["mean"] = (entry["mean"] * n_prev + float(flat.sum())) / (
                n_prev + n_new
            )

            # Cheap quantile: subsample large activations before torch.quantile.
            if n_new <= max_quantile_samples:
                sample = flat
            else:
                picks = torch.randint(
                    0, n_new, (max_quantile_samples,), device=flat.device
                )
                sample = flat[picks]
            entry["p99"] = max(
                entry["p99"], float(torch.quantile(sample.abs(), 0.99))
            )
            entry["num_samples"] = n_prev + n_new

        return hook

    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            handles.append(module.register_forward_hook(make_hook(name)))

    try:
        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                if i >= num_batches:
                    break
                input_ids = _extract_input_ids(batch)
                model(input_ids)
    finally:
        for h in handles:
            h.remove()
    return stats
def _extract_input_ids(batch) -> torch.Tensor:
    """Return the ``input_ids`` entry of a dataloader batch.

    Supported batch shapes: a bare tensor (returned as-is), a ``dict`` (its
    ``"input_ids"`` value), or a tuple/list (its first element).

    Raises:
        TypeError: If ``batch`` is none of the supported shapes.
    """
    if isinstance(batch, torch.Tensor):
        return batch
    if isinstance(batch, dict):
        return batch["input_ids"]
    if not isinstance(batch, (tuple, list)):
        raise TypeError(f"Cannot extract input_ids from batch of type {type(batch)}.")
    first_item = batch[0]
    return first_item
def _perplexity(model: nn.Module, dataloader: Iterable, num_batches: int) -> float:
    """Perplexity estimate over up to ``num_batches`` batches (labels == inputs).

    The model is called as ``model(input_ids, labels=input_ids)`` and is expected
    to return either a dict with a ``"loss"`` entry or the loss itself. The result
    is ``exp`` of the *unweighted mean of per-batch losses*; batches whose loss is
    ``None`` are skipped. Forward passes run under ``torch.no_grad()`` since only
    the scalar loss is read.

    Args:
        model: Model to evaluate. It is switched to eval mode and left there.
        dataloader: Iterable of batches understood by ``_extract_input_ids``.
        num_batches: Max number of batches to evaluate.

    Returns:
        The perplexity, ``nan`` if no batch produced a loss, or ``inf`` if the
        mean loss is too large for ``math.exp``.
    """
    model.eval()
    total_loss = 0.0
    count = 0
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            if i >= num_batches:
                break
            input_ids = _extract_input_ids(batch)
            out = model(input_ids, labels=input_ids)
            loss = out["loss"] if isinstance(out, dict) else out
            if loss is None:
                continue
            total_loss += float(loss)
            count += 1
    if count == 0:
        return float("nan")
    try:
        return math.exp(total_loss / count)
    except OverflowError:
        # A diverged model should be reported as infinitely bad, not crash.
        return math.inf
def quantization_error_report(
    original_model: nn.Module,
    quantized_model: nn.Module,
    dataloader: Iterable,
    num_batches: int = 8,
) -> Dict[str, object]:
    """Compare a model against its quantized copy.

    Computes, per ``nn.Linear`` layer present under the same name in both models,
    the relative L2 error between the original and quantized weights, and the
    model-level perplexity delta on ``dataloader``.

    The first ``num_batches`` batches are read from ``dataloader`` once and held
    in memory, then reused for both models. This keeps the comparison on
    identical data and also works when ``dataloader`` is a one-shot iterator or
    shuffles between passes.

    Args:
        original_model: Reference (unquantized) model.
        quantized_model: Model with quantized weights and matching layer names.
        dataloader: Iterable of batches understood by ``_extract_input_ids``.
        num_batches: Max number of batches used for the perplexity estimate.

    Returns:
        ``{"per_layer_l2": {name: rel_l2}, "max_layer_l2": float,
        "perplexity_original": float, "perplexity_quantized": float,
        "perplexity_delta": float}``.
    """
    orig_linears = dict(_named_linears(original_model))
    quant_linears = dict(_named_linears(quantized_model))

    per_layer: Dict[str, float] = {}
    with torch.no_grad():
        for name, orig in orig_linears.items():
            quant = quant_linears.get(name)
            if quant is None:
                continue
            # Compare in float32 on the original weight's device so that
            # low-precision or differently-placed copies are still comparable.
            ref = orig.weight.float()
            other = quant.weight.to(ref.device).float()
            denom = ref.norm().clamp(min=1e-8)
            per_layer[name] = float((ref - other).norm() / denom)

    # Materialize the evaluation batches once; iterating ``dataloader`` twice
    # would leave nothing for the second model if it is a plain iterator.
    batches = []
    for i, batch in enumerate(dataloader):
        if i >= num_batches:
            break
        batches.append(batch)

    ppl_orig = _perplexity(original_model, batches, num_batches)
    ppl_quant = _perplexity(quantized_model, batches, num_batches)
    return {
        "per_layer_l2": per_layer,
        "max_layer_l2": max(per_layer.values()) if per_layer else 0.0,
        "perplexity_original": ppl_orig,
        "perplexity_quantized": ppl_quant,
        "perplexity_delta": ppl_quant - ppl_orig,
    }
def _named_linears(model: nn.Module):
    """Lazily yield ``(qualified_name, module)`` for each ``nn.Linear`` in ``model``.

    Names are those reported by ``model.named_modules()``; the root module itself
    is included (under the empty name) when it is an ``nn.Linear``.
    """
    yield from (
        (qualified_name, submodule)
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, nn.Linear)
    )
if __name__ == "__main__":  # pragma: no cover - manual smoke check
    import copy

    from hermes.config import HermesConfig
    from hermes.model import build_model

    # Tiny throw-away config: just large enough to exercise the INT4 path and
    # the error report end to end on random token ids.
    tiny_cfg = HermesConfig(
        vocab_size=128,
        hidden_size=64,
        intermediate_size=128,
        num_layers=2,
        num_heads=4,
        num_kv_heads=2,
        head_dim=16,
        max_seq_len=32,
    )
    baseline = build_model(tiny_cfg)
    # Quantize a deep copy so the baseline keeps its full-precision weights.
    int4_copy = apply_weight_only_int4(copy.deepcopy(baseline))
    batches = [torch.randint(0, tiny_cfg.vocab_size, (1, 8)) for _ in range(4)]
    summary = quantization_error_report(baseline, int4_copy, batches, num_batches=4)
    for label, key in (
        ("max layer L2 error:", "max_layer_l2"),
        ("perplexity delta:", "perplexity_delta"),
    ):
        print(label, round(summary[key], 4))