Bump CoreML-LLM references to v1.4 (3-chunk decode, chunk pipelining)
Browse files- models.json +6 -6
models.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"manifest_version": 1,
|
| 3 |
-
"updated_at": "2026-04-
|
| 4 |
"min_app_version": "1.0",
|
| 5 |
"categories": [
|
| 6 |
{
|
|
@@ -88,7 +88,7 @@
|
|
| 88 |
"name": "Gemma 4 E2B",
|
| 89 |
"subtitle": "Google DeepMind, 2025",
|
| 90 |
"category_id": "llm",
|
| 91 |
-
"description_md": "Google's on-device multimodal LLM (Gemma 3n E2B, 2.3B effective params with Per-Layer Embeddings). **Text + image + audio + video** input, streaming text output.
|
| 92 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
|
| 93 |
"demo": {
|
| 94 |
"template": "chat",
|
|
@@ -130,7 +130,7 @@
|
|
| 130 |
"name": "Gemma 4 E4B",
|
| 131 |
"subtitle": "Google DeepMind, 2025",
|
| 132 |
"category_id": "llm",
|
| 133 |
-
"description_md": "Larger Gemma 4 variant: 42-layer text decoder, ~4B effective params, 100% ANE-resident. Text-only (no vision/audio). ~14 tok/s decode on iPhone 17 Pro at 2048-token context. Use when you want maximum text quality and have the storage budget. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.
|
| 134 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
|
| 135 |
"demo": {
|
| 136 |
"template": "chat",
|
|
@@ -172,7 +172,7 @@
|
|
| 172 |
"name": "Qwen3.5 2B",
|
| 173 |
"subtitle": "Alibaba Qwen, 2025",
|
| 174 |
"category_id": "llm",
|
| 175 |
-
"description_md": "Hybrid Gated-DeltaNet SSM + attention architecture, shipped as 4 INT8 chunks to fit the iPhone ANE single-mlprogram budget. ~17 tok/s decode on iPhone 17 Pro with ~200 MB RSS — exceptional memory efficiency for a 2B-param model. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.
|
| 176 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
|
| 177 |
"demo": {
|
| 178 |
"template": "chat",
|
|
@@ -214,7 +214,7 @@
|
|
| 214 |
"name": "Qwen3.5 0.8B",
|
| 215 |
"subtitle": "Alibaba Qwen, 2025",
|
| 216 |
"category_id": "llm",
|
| 217 |
-
"description_md": "Compact hybrid SSM+attention model, INT8 palettized (same semantic precision as fp16 — 100% top-3 parity vs fp32 oracle). ~20 tok/s decode on iPhone 17 Pro. Smallest and fastest option in the CoreML-LLM lineup. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.
|
| 218 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
|
| 219 |
"demo": {
|
| 220 |
"template": "chat",
|
|
@@ -256,7 +256,7 @@
|
|
| 256 |
"name": "Qwen3-VL 2B",
|
| 257 |
"subtitle": "Alibaba Qwen, 2025",
|
| 258 |
"category_id": "llm",
|
| 259 |
-
"description_md": "Qwen3-VL multimodal model — **text + image** input (DeepStack injection at L0/1/2, interleaved mRoPE for image tokens, 196 image tokens). 28-layer GQA text backbone, 6 INT8 body chunks + fp16 embed sidecar. ~7.5 tok/s decode on iPhone 17 Pro. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.
|
| 260 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_vl.jpg",
|
| 261 |
"demo": {
|
| 262 |
"template": "chat",
|
|
|
|
| 1 |
{
|
| 2 |
"manifest_version": 1,
|
| 3 |
+
"updated_at": "2026-04-24T15:44:51Z",
|
| 4 |
"min_app_version": "1.0",
|
| 5 |
"categories": [
|
| 6 |
{
|
|
|
|
| 88 |
"name": "Gemma 4 E2B",
|
| 89 |
"subtitle": "Google DeepMind, 2025",
|
| 90 |
"category_id": "llm",
|
| 91 |
+
"description_md": "Google's on-device multimodal LLM (Gemma 3n E2B, 2.3B effective params with Per-Layer Embeddings). **Text + image + audio + video** input, streaming text output. On Apple Neural Engine: ~31 tok/s (4-chunk default) / ~34 tok/s (3-chunk, `LLM_3CHUNK=1`) on iPhone 17 Pro. Native 384x384 vision encoder (64 tokens/frame) handles video; 12-layer Conformer encoder handles audio. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
|
| 92 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
|
| 93 |
"demo": {
|
| 94 |
"template": "chat",
|
|
|
|
| 130 |
"name": "Gemma 4 E4B",
|
| 131 |
"subtitle": "Google DeepMind, 2025",
|
| 132 |
"category_id": "llm",
|
| 133 |
+
"description_md": "Larger Gemma 4 variant: 42-layer text decoder, ~4B effective params, 100% ANE-resident. Text-only (no vision/audio). ~14 tok/s decode on iPhone 17 Pro at 2048-token context. Use when you want maximum text quality and have the storage budget. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
|
| 134 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
|
| 135 |
"demo": {
|
| 136 |
"template": "chat",
|
|
|
|
| 172 |
"name": "Qwen3.5 2B",
|
| 173 |
"subtitle": "Alibaba Qwen, 2025",
|
| 174 |
"category_id": "llm",
|
| 175 |
+
"description_md": "Hybrid Gated-DeltaNet SSM + attention architecture, shipped as 4 INT8 chunks to fit the iPhone ANE single-mlprogram budget. ~17 tok/s decode on iPhone 17 Pro with ~200 MB RSS — exceptional memory efficiency for a 2B-param model. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
|
| 176 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
|
| 177 |
"demo": {
|
| 178 |
"template": "chat",
|
|
|
|
| 214 |
"name": "Qwen3.5 0.8B",
|
| 215 |
"subtitle": "Alibaba Qwen, 2025",
|
| 216 |
"category_id": "llm",
|
| 217 |
+
"description_md": "Compact hybrid SSM+attention model, INT8 palettized (same semantic precision as fp16 — 100% top-3 parity vs fp32 oracle). ~20 tok/s decode on iPhone 17 Pro. Smallest and fastest option in the CoreML-LLM lineup. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
|
| 218 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
|
| 219 |
"demo": {
|
| 220 |
"template": "chat",
|
|
|
|
| 256 |
"name": "Qwen3-VL 2B",
|
| 257 |
"subtitle": "Alibaba Qwen, 2025",
|
| 258 |
"category_id": "llm",
|
| 259 |
+
"description_md": "Qwen3-VL multimodal model — **text + image** input (DeepStack injection at L0/1/2, interleaved mRoPE for image tokens, 196 image tokens). 28-layer GQA text backbone, 6 INT8 body chunks + fp16 embed sidecar. ~7.5 tok/s decode on iPhone 17 Pro. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
|
| 260 |
"thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_vl.jpg",
|
| 261 |
"demo": {
|
| 262 |
"template": "chat",
|