Text Generation
LiteRT-LM
English
custom
hermes-edge
mobile-ai
on-device
ios
iphone-16
apple-neural-engine
deepseek
dspark
speculative-decoding
hermes-agent
tool-calling
raven-ecosystem
Instructions to use bclermo/hermes-edge with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- LiteRT-LM
How to use bclermo/hermes-edge with LiteRT-LM:
# LiteRT-LM runs on various platforms (Android, iOS, Windows, Linux, macOS, IoT, Web/WASM) # and supports many APIs (C++, Python, Kotlin, Swift, JavaScript, Flutter). # For platform-specific integration guides, please refer to the official developer website: # https://ai.google.dev/edge/litert-lm # To try LiteRT-LM, the easiest way is to use our CLI tool. # 1. Install the LiteRT-LM CLI tool: pip install litert-lm # 2. Download and run this model locally: # See: https://ai.google.dev/edge/litert-lm/cli litert-lm run \ --from-huggingface-repo=bclermo/hermes-edge \ model.litertlm \ --prompt="Write me a poem"
- Notebooks
- Google Colab
- Kaggle
File size: 8,025 Bytes
696c8be | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | """
DSpark-Inspired Speculative Decoding for On-Device Inference
DeepSeek's DSpark framework uses a small "draft" model to predict multiple
future tokens, which the main model then verifies in parallel. This gives
60-85% speedup with identical output quality (lossless).
This implementation adapts the DSpark approach for LiteRT-LM on mobile:
- Draft model: ultra-light (~30M params) n-gram + small transformer hybrid
- Verification: greedy acceptance (draft tokens kept if main model agrees)
- Falls back gracefully when draft is wrong
Key insight from DSpark paper (DeepSeek, 2026):
"Confidence-scheduled speculative decoding with semi-autoregressive generation"
- Draft model predicts K=4 tokens at once
- Main model verifies all K in a single forward pass
- Acceptance rate: ~85% for K=4
Usage:
from dspark_draft import DSparkDraftEngine
engine = DSparkDraftEngine(main_model, draft_model)
tokens = engine.generate("Hello, how are you?", max_tokens=128)
"""
import logging
from dataclasses import dataclass, field
log = logging.getLogger(__name__)
@dataclass
class DSparkConfig:
"""Configuration for DSpark speculative decoding."""
draft_k: int = 4
"""Number of draft tokens to speculate (DSpark default: 4)."""
temperature: float = 0.7
"""Sampling temperature."""
top_k: int = 40
"""Top-K sampling threshold."""
top_p: float = 0.9
"""Top-P (nucleus) sampling threshold."""
max_ngram_order: int = 3
"""N-gram order for draft model fallback."""
@dataclass
class GenerationResult:
tokens: list[int] = field(default_factory=list)
text: str = ""
accepted_draft_rate: float = 0.0
total_speculations: int = 0
accepted_speculations: int = 0
tokens_generated: int = 0
steps_taken: int = 0
class NGramDraftModel:
"""
Lightweight n-gram draft model as a stand-in for a learned draft module.
In production, this would be a trained 30M-param transformer
(DeepSeek DSpark style). This fallback uses:
- N-gram statistics for short-range patterns
- Uniform sampling for novel contexts
The n-gram table is built from observed token sequences during inference,
making it adaptive without requiring separate training.
"""
def __init__(self, vocab_size: int, max_order: int = 3):
self.vocab_size = vocab_size
self.max_order = max_order
self.ngrams: dict[tuple[int, ...], list[int]] = {}
def observe(self, sequence: list[int]) -> None:
"""Record observed n-grams for future draft predictions."""
for order in range(1, self.max_order + 1):
for i in range(len(sequence) - order):
context = tuple(sequence[i : i + order - 1])
next_token = sequence[i + order - 1]
if context not in self.ngrams:
self.ngrams[context] = []
if len(self.ngrams[context]) < 10:
self.ngrams[context].append(next_token)
def predict(self, context: list[int]) -> list[tuple[int, float]]:
"""Predict next tokens with confidence scores from n-gram model."""
candidates: dict[int, float] = {}
for order in range(min(self.max_order, len(context)), 0, -1):
ctx = tuple(context[-order:])
if ctx in self.ngrams:
for token in self.ngrams[ctx]:
candidates[token] = candidates.get(token, 0) + 1.0 / order
total = sum(candidates.values())
if total > 0:
return [(t, c / total) for t, c in candidates.items()]
return [(i, 1.0 / self.vocab_size) for i in range(min(10, self.vocab_size))]
class DSparkDraftEngine:
"""
DSpark-style speculative decoding engine.
Runs a small draft model ahead of the main model, then verifies
draft tokens in parallel. Accepts verified tokens for free,
rolls back on disagreements.
"""
def __init__(
self,
main_model,
draft_model: NGramDraftModel | None = None,
config: DSparkConfig | None = None,
):
self.main = main_model
self.draft = draft_model
self.config = config or DSparkConfig()
def speculative_generate(
self,
prompt_ids: list[int],
max_tokens: int = 256,
tokenizer=None,
) -> GenerationResult:
"""
Generate tokens with speculative decoding.
For each step:
1. Draft predicts K candidate tokens from context
2. Main model verifies candidates in one forward pass
3. Accepted tokens are kept; on first rejection, fall back
4. Update n-gram model with accepted sequence
"""
result = GenerationResult()
result.tokens = list(prompt_ids)
steps = 0
while len(result.tokens) < len(prompt_ids) + max_tokens and steps < max_tokens:
steps += 1
context = result.tokens[-(self.config.max_ngram_order * 2) :]
draft_tokens = self._draft_predict(context)
verified = self._verify_tokens(result.tokens, draft_tokens)
n_accepted = self._count_accepted(verified)
if n_accepted > 0:
result.tokens.extend(draft_tokens[:n_accepted])
result.accepted_speculations += n_accepted
result.total_speculations += len(draft_tokens)
if n_accepted < len(draft_tokens) or n_accepted == 0:
next_token = self._fallback_sample(context)
result.tokens.append(next_token)
result.steps_taken = steps
if self.draft:
self.draft.observe(result.tokens[-10:])
result.tokens_generated = len(result.tokens) - len(prompt_ids)
result.accepted_draft_rate = (
result.accepted_speculations / result.total_speculations
if result.total_speculations > 0
else 0.0
)
if tokenizer:
try:
result.text = tokenizer.decode(result.tokens[len(prompt_ids) :])
except Exception:
result.text = f"[{len(result.tokens)} tokens generated]"
return result
def _draft_predict(self, context: list[int]) -> list[int]:
"""Draft model predicts K candidate tokens."""
if self.draft:
tokens = []
working_ctx = list(context)
for _ in range(self.config.draft_k):
candidates = self.draft.predict(working_ctx)
if not candidates:
break
next_tok = max(candidates, key=lambda x: x[1])[0]
tokens.append(next_tok)
working_ctx.append(next_tok)
if len(tokens) == self.config.draft_k:
return tokens
# Fallback: repeat last token (simple baseline)
return [context[-1] if context else 0] * self.config.draft_k
def _verify_tokens(self, sequence: list[int], draft: list[int]) -> list[bool]:
"""Verify draft tokens against main model (greedy)."""
verified = []
for i, tok in enumerate(draft):
context = sequence + draft[:i]
expected = self._main_predict_next(context)
verified.append(tok == expected)
return verified
def _main_predict_next(self, context: list[int]) -> int:
"""Get the main model's most likely next token."""
if hasattr(self.main, "predict_next_token"):
return self.main.predict_next_token(context)
return context[-1] if context else 0
def _count_accepted(self, verified: list[bool]) -> int:
"""Count consecutive accepted draft tokens from the start."""
count = 0
for v in verified:
if v:
count += 1
else:
break
return count
def _fallback_sample(self, context: list[int]) -> int:
"""Fallback: main model single-token decode."""
return self._main_predict_next(context)
|