Text Generation
LiteRT-LM
English
custom
hermes-edge
mobile-ai
on-device
ios
iphone-16
apple-neural-engine
deepseek
dspark
speculative-decoding
hermes-agent
tool-calling
raven-ecosystem
Instructions to use bclermo/hermes-edge with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- LiteRT-LM
How to use bclermo/hermes-edge with LiteRT-LM:
# LiteRT-LM runs on various platforms (Android, iOS, Windows, Linux, macOS, IoT, Web/WASM)
# and supports many APIs (C++, Python, Kotlin, Swift, JavaScript, Flutter).
# For platform-specific integration guides, please refer to the official developer website:
# https://ai.google.dev/edge/litert-lm

# To try LiteRT-LM, the easiest way is to use our CLI tool.
# 1. Install the LiteRT-LM CLI tool:
pip install litert-lm

# 2. Download and run this model locally:
# See: https://ai.google.dev/edge/litert-lm/cli
litert-lm run \
  --from-huggingface-repo=bclermo/hermes-edge \
  model.litertlm \
  --prompt="Write me a poem"
- Notebooks
- Google Colab
- Kaggle
| """ | |
| DSpark-Inspired Speculative Decoding for On-Device Inference | |
| DeepSeek's DSpark framework uses a small "draft" model to predict multiple | |
| future tokens, which the main model then verifies in parallel. This gives | |
| 60-85% speedup with identical output quality (lossless). | |
| This implementation adapts the DSpark approach for LiteRT-LM on mobile: | |
| - Draft model: ultra-light (~30M params) n-gram + small transformer hybrid | |
| - Verification: greedy acceptance (draft tokens kept if main model agrees) | |
| - Falls back gracefully when draft is wrong | |
| Key insight from DSpark paper (DeepSeek, 2026): | |
| "Confidence-scheduled speculative decoding with semi-autoregressive generation" | |
| - Draft model predicts K=4 tokens at once | |
| - Main model verifies all K in a single forward pass | |
| - Acceptance rate: ~85% for K=4 | |
| Usage: | |
| from dspark_draft import DSparkDraftEngine | |
| engine = DSparkDraftEngine(main_model, draft_model) | |
| result = engine.speculative_generate(prompt_ids, max_tokens=128)  # prompt_ids: list[int] of token IDs; generated IDs are in result.tokens | |
| """ | |
| import logging | |
| from dataclasses import dataclass, field | |
| log = logging.getLogger(__name__) | |
@dataclass
class DSparkConfig:
    """Configuration for DSpark speculative decoding.

    Declared as a dataclass so individual settings can be overridden at
    construction time, e.g. ``DSparkConfig(draft_k=2)``; calling it with no
    arguments yields the defaults below.
    """

    draft_k: int = 4
    """Number of draft tokens to speculate (DSpark default: 4)."""

    temperature: float = 0.7
    """Sampling temperature."""

    top_k: int = 40
    """Top-K sampling threshold."""

    top_p: float = 0.9
    """Top-P (nucleus) sampling threshold."""

    max_ngram_order: int = 3
    """N-gram order for draft model fallback."""
@dataclass
class GenerationResult:
    """Outcome and statistics of one speculative-decoding run.

    Must be a dataclass: ``field(default_factory=list)`` only produces a fresh
    per-instance list when the class is processed by ``@dataclass``; without
    the decorator ``tokens`` would be a shared ``Field`` object on the class.
    """

    # Token IDs; the engine seeds this with the prompt and appends to it.
    tokens: list[int] = field(default_factory=list)
    # Decoded text of the generated part, when a tokenizer was supplied.
    text: str = ""
    # accepted_speculations / total_speculations (0.0 when nothing was drafted).
    accepted_draft_rate: float = 0.0
    # Number of draft tokens proposed across all steps.
    total_speculations: int = 0
    # Number of draft tokens that were accepted and kept.
    accepted_speculations: int = 0
    # Number of tokens appended after the prompt.
    tokens_generated: int = 0
    # Number of draft/verify iterations executed.
    steps_taken: int = 0
class NGramDraftModel:
    """
    Lightweight n-gram draft model as a stand-in for a learned draft module.

    In production, this would be a trained 30M-param transformer
    (DeepSeek DSpark style). This fallback uses:
    - N-gram statistics for short-range patterns
    - A fixed uniform guess for contexts it has never seen

    The n-gram table is built from observed token sequences during inference,
    making it adaptive without requiring separate training.
    """

    def __init__(self, vocab_size: int, max_order: int = 3):
        """
        Args:
            vocab_size: Size of the token vocabulary; used only for the
                uniform fallback in :meth:`predict`.
            max_order: Longest context length (in tokens) that is recorded
                and looked up.
        """
        self.vocab_size = vocab_size
        self.max_order = max_order
        # Maps a context tuple (1..max_order tokens) to the tokens seen right
        # after it. Duplicates are kept on purpose: they act as counts.
        self.ngrams: dict[tuple[int, ...], list[int]] = {}

    def observe(self, sequence: list[int]) -> None:
        """Record observed n-grams for future draft predictions.

        For every context length ``order`` in ``1..max_order`` and every
        position where a full context plus a following token exists, the
        following token is stored under that context. Context lengths match
        what :meth:`predict` looks up, and the final token of ``sequence`` is
        included as a successor.

        Args:
            sequence: Token IDs in the order they occurred.
        """
        for order in range(1, self.max_order + 1):
            # i + order is the index of the successor, so the last valid i is
            # len(sequence) - order - 1.
            for i in range(len(sequence) - order):
                context = tuple(sequence[i : i + order])
                followers = self.ngrams.setdefault(context, [])
                # Cap per-context storage at 10 entries to bound memory.
                if len(followers) < 10:
                    followers.append(sequence[i + order])

    def predict(self, context: list[int]) -> list[tuple[int, float]]:
        """Predict next tokens with confidence scores from n-gram model.

        Args:
            context: Most recent token IDs; only the last ``max_order`` are
                consulted.

        Returns:
            ``(token, score)`` pairs whose scores sum to 1 when at least one
            stored context matches. Otherwise the first ``min(10, vocab_size)``
            token IDs, each scored ``1 / vocab_size``.
        """
        candidates: dict[int, float] = {}
        for order in range(min(self.max_order, len(context)), 0, -1):
            ctx = tuple(context[-order:])
            if ctx in self.ngrams:
                for token in self.ngrams[ctx]:
                    # NOTE(review): 1/order gives shorter contexts the larger
                    # vote; confirm this weighting is intended.
                    candidates[token] = candidates.get(token, 0) + 1.0 / order
        total = sum(candidates.values())
        if total > 0:
            return [(t, c / total) for t, c in candidates.items()]
        return [(i, 1.0 / self.vocab_size) for i in range(min(10, self.vocab_size))]
class DSparkDraftEngine:
    """
    DSpark-style speculative decoding engine.

    Each step a draft model proposes up to ``config.draft_k`` tokens, the main
    model checks them greedily, and the longest agreeing prefix is kept. When a
    draft token is rejected, one token from the main model is appended instead.

    The main model is duck-typed: if it exposes ``predict_next_token(context)``
    that method supplies the greedy next token; otherwise a placeholder that
    repeats the last context token is used.
    """

    def __init__(
        self,
        main_model,
        draft_model: NGramDraftModel | None = None,
        config: DSparkConfig | None = None,
    ):
        """
        Args:
            main_model: Object consulted for the authoritative next token.
            draft_model: Optional draft predictor; when ``None`` the draft is
                simply the last context token repeated.
            config: Decoding settings; defaults to ``DSparkConfig()``.
        """
        self.main = main_model
        self.draft = draft_model
        self.config = config or DSparkConfig()

    def speculative_generate(
        self,
        prompt_ids: list[int],
        max_tokens: int = 256,
        tokenizer=None,
    ) -> GenerationResult:
        """
        Generate tokens with speculative decoding.

        For each step:
        1. Draft predicts K candidate tokens from a short trailing window
        2. Main model verifies the candidates against the full sequence
        3. Accepted tokens are kept; on rejection the main model supplies
           one token, conditioned on the sequence *including* the tokens
           just accepted
        4. The draft model (if any) observes the latest tokens

        Args:
            prompt_ids: Prompt token IDs; not modified.
            max_tokens: Upper bound on newly generated tokens. Never exceeded,
                even when a step accepts several draft tokens at once.
            tokenizer: Optional object with ``decode(list[int])``; used to
                fill ``result.text``.

        Returns:
            A ``GenerationResult`` whose ``tokens`` holds prompt + generated
            IDs, together with acceptance statistics.
        """
        result = GenerationResult()
        result.tokens = list(prompt_ids)
        prompt_len = len(prompt_ids)
        limit = prompt_len + max_tokens
        window = self.config.max_ngram_order * 2
        steps = 0

        while len(result.tokens) < limit and steps < max_tokens:
            steps += 1
            context = result.tokens[-window:]
            draft_tokens = self._draft_predict(context)
            verified = self._verify_tokens(result.tokens, draft_tokens)

            # Never keep more draft tokens than the remaining budget allows.
            remaining = limit - len(result.tokens)
            n_accepted = min(self._count_accepted(verified), remaining)
            if n_accepted > 0:
                result.tokens.extend(draft_tokens[:n_accepted])
                result.accepted_speculations += n_accepted
            result.total_speculations += len(draft_tokens)

            # A rejection (or an empty draft) is resolved by the main model.
            # It must see the sequence as it is now, not the window captured
            # before the accepted tokens were appended.
            needs_fallback = n_accepted < len(draft_tokens) or n_accepted == 0
            if needs_fallback and len(result.tokens) < limit:
                result.tokens.append(self._fallback_sample(result.tokens))

            result.steps_taken = steps
            if self.draft is not None:
                self.draft.observe(result.tokens[-10:])

        result.tokens_generated = len(result.tokens) - prompt_len
        result.accepted_draft_rate = (
            result.accepted_speculations / result.total_speculations
            if result.total_speculations > 0
            else 0.0
        )
        if tokenizer is not None:
            try:
                result.text = tokenizer.decode(result.tokens[prompt_len:])
            except Exception:
                # Decoding is best-effort: keep the token IDs, record why the
                # text is missing, and report only the generated count.
                log.warning("Tokenizer failed to decode generated tokens", exc_info=True)
                result.text = f"[{result.tokens_generated} tokens generated]"
        return result

    def _draft_predict(self, context: list[int]) -> list[int]:
        """Draft model predicts K candidate tokens.

        Greedily extends ``context`` with the draft model's top-scoring token
        ``draft_k`` times. If no draft model is set, or it yields fewer than
        ``draft_k`` tokens, the last context token (or 0) is repeated instead.
        """
        k = self.config.draft_k
        if self.draft is not None:
            tokens = []
            working_ctx = list(context)
            for _ in range(k):
                candidates = self.draft.predict(working_ctx)
                if not candidates:
                    break
                next_tok = max(candidates, key=lambda x: x[1])[0]
                tokens.append(next_tok)
                working_ctx.append(next_tok)
            if len(tokens) == k:
                return tokens
        # Fallback: repeat last token (simple baseline)
        return [context[-1] if context else 0] * k

    def _verify_tokens(self, sequence: list[int], draft: list[int]) -> list[bool]:
        """Verify draft tokens against main model (greedy).

        Returns one flag per draft token. Checking stops at the first
        disagreement: later positions are conditioned on a rejected prefix, so
        they are reported as ``False`` without calling the main model again.
        """
        verified = [False] * len(draft)
        context = list(sequence)
        for i, tok in enumerate(draft):
            if tok != self._main_predict_next(context):
                break
            verified[i] = True
            context.append(tok)
        return verified

    def _main_predict_next(self, context: list[int]) -> int:
        """Get the main model's most likely next token.

        Uses ``main.predict_next_token`` when available; otherwise echoes the
        last context token (0 for an empty context) as a placeholder.
        """
        if hasattr(self.main, "predict_next_token"):
            return self.main.predict_next_token(context)
        return context[-1] if context else 0

    def _count_accepted(self, verified: list[bool]) -> int:
        """Count consecutive accepted draft tokens from the start."""
        count = 0
        for v in verified:
            if not v:
                break
            count += 1
        return count

    def _fallback_sample(self, context: list[int]) -> int:
        """Fallback: main model single-token decode."""
        return self._main_predict_next(context)