Text Generation
LiteRT-LM
English
custom
hermes-edge
mobile-ai
on-device
ios
iphone-16
apple-neural-engine
deepseek
dspark
speculative-decoding
hermes-agent
tool-calling
raven-ecosystem
Instructions to use bclermo/hermes-edge with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- LiteRT-LM
How to use bclermo/hermes-edge with LiteRT-LM:
# LiteRT-LM runs on various platforms (Android, iOS, Windows, Linux, macOS, IoT, Web/WASM)
# and supports many APIs (C++, Python, Kotlin, Swift, JavaScript, Flutter).
# For platform-specific integration guides, please refer to the official developer website:
# https://ai.google.dev/edge/litert-lm

# To try LiteRT-LM, the easiest way is to use our CLI tool.
# 1. Install the LiteRT-LM CLI tool:
pip install litert-lm

# 2. Download and run this model locally:
# See: https://ai.google.dev/edge/litert-lm/cli
litert-lm run \
  --from-huggingface-repo=bclermo/hermes-edge \
  model.litertlm \
  --prompt="Write me a poem"
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """Lightweight evaluation harness for the Hermes model. | |
| Runs two cheap, CI-friendly evals against tiny bundled datasets and prints a | |
| summary table: | |
| * **Perplexity** — token-level cross-entropy perplexity over ``data/eval.jsonl`` | |
| (10 diverse chat conversations rendered through the Hermes ChatML template). | |
| * **Tool-call accuracy** — over ``data/tool_eval.jsonl`` (10 prompts whose | |
| expected reply is a ``<tool_call>``), the fraction for which the model emits a | |
| parseable tool call whose ``name`` matches the expected tool. | |
| The harness runs with **randomly initialized weights** when ``--checkpoint`` is | |
| omitted (perplexity will be ~vocab-size and tool accuracy ~0), which keeps it | |
| usable as a smoke test in CI. With a trained checkpoint + SentencePiece | |
| tokenizer the numbers become meaningful. | |
| Example:: | |
| python scripts/eval.py --preset hermes-270m \ | |
| --checkpoint checkpoints/hermes-270m.pt --tokenizer tokenizer/hermes.model | |
| Writes ``eval_results.json``. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import math | |
| import os | |
| import sys | |
| import time | |
| from typing import Any, Dict, List, Optional | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| import torch # noqa: E402 | |
| from hermes.chat_template import Message, build_prompt, parse_tool_call # noqa: E402 | |
| from hermes.config import get_config # noqa: E402 | |
| from hermes.inference import HermesInference # noqa: E402 | |
# Directory two levels above this file's absolute path; used as the base for
# the default ``data/eval.jsonl`` and ``data/tool_eval.jsonl`` locations.
_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
class ByteTokenizer:
    """Dependency-free byte-level tokenizer used as a fallback.

    Stands in for SentencePiece when no model file is supplied, so the harness
    can still run in CI. Every UTF-8 byte of the input becomes one id, shifted
    up by ``offset`` so that the lowest ids stay reserved for special tokens.
    """

    def __init__(self, vocab_size: int) -> None:
        # Ids below ``offset`` are kept free for pad/bos/eos/tool sentinels.
        self.offset = 5
        self.vocab_size = vocab_size

    def encode(self, text: str) -> List[int]:
        """Return one id per UTF-8 byte of ``text``; ``[1]`` for empty text."""
        shift, modulus = self.offset, self.vocab_size
        token_ids: List[int] = []
        for byte in text.encode("utf-8"):
            # Wrap around so an id never reaches ``vocab_size``.
            token_ids.append((byte + shift) % modulus)
        if not token_ids:
            return [1]
        return token_ids

    def decode(self, ids: List[int]) -> str:
        """Map ids back to text, dropping ids below ``offset``.

        Ids are reduced modulo 256 after removing the offset, and byte
        sequences that are not valid UTF-8 decode to replacement characters.
        """
        shift = self.offset
        raw = bytearray()
        for token_id in ids:
            if token_id < shift:
                continue  # reserved id: carries no byte
            raw.append((token_id - shift) % 256)
        return bytes(raw).decode("utf-8", errors="replace")
def load_tokenizer(path: Optional[str], vocab_size: int):
    """Load a SentencePiece tokenizer if available, else the byte fallback.

    Args:
        path: Path to a SentencePiece model file, or ``None``/empty to go
            straight to the byte-level fallback.
        vocab_size: Vocabulary size handed to ``ByteTokenizer`` when the
            fallback is used.

    Returns:
        An object exposing ``encode(text) -> List[int]`` and
        ``decode(ids) -> str``: either a thin SentencePiece wrapper or a
        ``ByteTokenizer``.

    A warning is printed whenever a path was supplied but could not be used
    (missing file, missing ``sentencepiece`` package, unreadable model), so a
    mistyped ``--tokenizer`` does not silently produce byte-level metrics.
    """
    if path:
        if os.path.exists(path):
            try:
                import sentencepiece as spm

                sp = spm.SentencePieceProcessor(model_file=path)
            except Exception as exc:  # noqa: BLE001 - fall back gracefully
                print(f"[warn] could not load SentencePiece tokenizer ({exc}); using bytes.")
            else:

                class _SP:
                    """Adapter giving the processor the harness's encode/decode shape."""

                    def encode(self, text: str) -> List[int]:
                        return sp.encode(text, out_type=int)

                    def decode(self, ids: List[int]) -> str:
                        return sp.decode(ids)

                return _SP()
        else:
            # Previously this case fell through without any message.
            print(f"[warn] tokenizer file not found ({path}); using bytes.")
    return ByteTokenizer(vocab_size)
def _messages_from(obj: Dict[str, Any]) -> List[Message]:
    """Turn the ``messages`` entries of a dataset record into ``Message`` objects."""
    converted: List[Message] = []
    for entry in obj["messages"]:
        role = entry["role"]
        content = entry["content"]
        converted.append(Message(role, content))
    return converted
| def _read_jsonl(path: str) -> List[Dict[str, Any]]: | |
| with open(path, "r", encoding="utf-8") as f: | |
| return [json.loads(line) for line in f if line.strip()] | |
def eval_perplexity(engine: HermesInference, path: str) -> float:
    """Perplexity of the model over the conversations stored in ``path``.

    Each JSONL record is rendered with ``build_prompt`` (without a generation
    prompt), tokenized, truncated to ``engine.config.max_seq_len`` and scored
    by calling the model with ``labels=input_ids``. The result is ``exp`` of
    the unweighted mean of the per-conversation losses, so every conversation
    counts equally regardless of its length.

    Args:
        engine: Inference wrapper providing ``tokenizer``, ``config``,
            ``device`` and ``model``.
        path: JSONL file whose records carry a ``messages`` list.

    Returns:
        The perplexity as a float; ``nan`` if no conversation produced a
        finite loss, ``inf`` if the exponential overflows.
    """
    loss_sum = 0.0
    scored = 0
    # Evaluation only: without no_grad every forward pass would record an
    # autograd graph that is never used, wasting memory and time.
    with torch.no_grad():
        for ex in _read_jsonl(path):
            prompt = build_prompt(_messages_from(ex), add_generation_prompt=False)
            ids = engine.tokenizer.encode(prompt)[: engine.config.max_seq_len]
            if len(ids) < 2:
                # Fewer than two tokens: skipped, as there is no next token to predict.
                continue
            input_ids = torch.tensor([ids], dtype=torch.long, device=engine.device)
            loss = engine.model(input_ids, labels=input_ids)["loss"]
            if loss is None:
                continue
            loss_value = float(loss)  # convert once; may synchronise with the device
            if math.isfinite(loss_value):
                loss_sum += loss_value
                scored += 1
    if scored == 0:
        return float("nan")
    try:
        return math.exp(loss_sum / scored)
    except OverflowError:
        return float("inf")
def eval_tool_calls(engine: HermesInference, path: str, max_new_tokens: int) -> Dict[str, float]:
    """Fraction of prompts where the model emits the expected tool call name.

    For every record in ``path`` the model is asked for a reply via
    ``engine.chat`` at temperature 0.0, and the reply is passed to
    ``parse_tool_call``.

    Args:
        engine: Inference wrapper providing ``chat``.
        path: JSONL file; each record has ``messages`` and may have ``tools``
            and ``expected`` (a mapping with a ``name`` key).
        max_new_tokens: Generation budget forwarded to ``engine.chat``.

    Returns:
        A dict with ``tool_call_accuracy`` (parsed name equals the expected
        name), ``parseable_rate`` (any tool call parsed), ``avg_latency_ms``
        (mean wall-clock time of ``engine.chat``) and ``num_examples``. Rates
        are relative to the number of records; an empty file yields zeros.
    """
    examples = _read_jsonl(path)
    correct = 0
    parseable = 0
    latencies: List[float] = []
    for ex in examples:
        msgs = _messages_from(ex)
        tools = ex.get("tools")
        # ``or {}`` also covers an explicit ``"expected": null`` in the data.
        expected_name = (ex.get("expected") or {}).get("name")
        t0 = time.perf_counter()
        reply = engine.chat(msgs, tools=tools, max_new_tokens=max_new_tokens, temperature=0.0)
        latencies.append((time.perf_counter() - t0) * 1000.0)
        call = parse_tool_call(reply)
        if call is None:
            continue
        parseable += 1
        # A record without an expected name can never be "correct"; otherwise
        # a nameless call would match it through ``None == None``.
        if expected_name is not None and call.get("name") == expected_name:
            correct += 1
    n = max(len(examples), 1)  # avoid dividing by zero on an empty dataset
    return {
        "tool_call_accuracy": correct / n,
        "parseable_rate": parseable / n,
        "avg_latency_ms": sum(latencies) / max(len(latencies), 1),
        "num_examples": len(examples),
    }
def run(args: argparse.Namespace) -> int:
    """Run both evals, print a summary table and write the results as JSON.

    Builds the engine from the preset, optional checkpoint and tokenizer,
    evaluates perplexity and tool-call metrics on the configured (or default)
    datasets, prints a Markdown table and dumps everything to ``args.output``.
    Returns ``0``.
    """
    preset = args.preset
    checkpoint = args.checkpoint

    config = get_config(preset)
    tokenizer = load_tokenizer(args.tokenizer, config.vocab_size)
    engine = HermesInference.from_checkpoint(
        config,
        checkpoint,
        tokenizer,
        device=args.device,
        preset_name=preset,
    )
    print(engine)
    if checkpoint is None:
        print("[info] No checkpoint supplied — evaluating randomly initialized weights (CI mode).")

    # Explicit overrides win; otherwise use the datasets under the repo's data/ directory.
    eval_path = args.eval_data
    if not eval_path:
        eval_path = os.path.join(_REPO_ROOT, "data", "eval.jsonl")
    tool_path = args.tool_data
    if not tool_path:
        tool_path = os.path.join(_REPO_ROOT, "data", "tool_eval.jsonl")

    perplexity = eval_perplexity(engine, eval_path)
    tool_metrics = eval_tool_calls(engine, tool_path, args.max_new_tokens)

    results = {"preset": preset, "checkpoint": checkpoint, "perplexity": perplexity}
    results.update(tool_metrics)

    print("\n| metric | value |")
    print("|---|---|")
    print(f"| perplexity | {perplexity:.2f} |")
    print(f"| tool_call_accuracy | {tool_metrics['tool_call_accuracy']:.2%} |")
    print(f"| parseable_rate | {tool_metrics['parseable_rate']:.2%} |")
    print(f"| avg_latency_ms | {tool_metrics['avg_latency_ms']:.1f} |")
    print()

    output_path = args.output
    with open(output_path, "w", encoding="utf-8") as sink:
        json.dump(results, sink, indent=2)
    print(f"Saved {output_path}")
    return 0
def parse_args(argv=None) -> argparse.Namespace:
    """Build the command-line parser and parse ``argv``.

    ``argv`` is handed to ``ArgumentParser.parse_args`` unchanged, so ``None``
    means the process arguments are used.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate Hermes perplexity + tool-call accuracy.",
    )
    parser.add_argument(
        "--preset",
        choices=["hermes-1b", "hermes-500m", "hermes-270m"],
        default="hermes-270m",
    )
    parser.add_argument(
        "--checkpoint",
        default=None,
        help="Optional .pt checkpoint (random init if omitted).",
    )
    parser.add_argument(
        "--tokenizer",
        default=None,
        help="Optional SentencePiece model (byte fallback if omitted).",
    )
    parser.add_argument(
        "--eval-data",
        default=None,
        help="Override path to perplexity JSONL.",
    )
    parser.add_argument(
        "--tool-data",
        default=None,
        help="Override path to tool-call JSONL.",
    )
    parser.add_argument("--max-new-tokens", default=64, type=int)
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--output", default="eval_results.json")
    namespace = parser.parse_args(argv)
    return namespace
# Script entry point: the value returned by run() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(run(parse_args()))