mlboydaisuke committed on
Commit
780af30
·
verified ·
1 Parent(s): 2196468

Bump CoreML-LLM references to v1.4 (3-chunk decode, chunk pipelining)

Browse files
Files changed (1) hide show
  1. models.json +6 -6
models.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "manifest_version": 1,
3
- "updated_at": "2026-04-24T10:44:41Z",
4
  "min_app_version": "1.0",
5
  "categories": [
6
  {
@@ -88,7 +88,7 @@
88
  "name": "Gemma 4 E2B",
89
  "subtitle": "Google DeepMind, 2025",
90
  "category_id": "llm",
91
- "description_md": "Google's on-device multimodal LLM (Gemma 3n E2B, 2.3B effective params with Per-Layer Embeddings). **Text + image + audio + video** input, streaming text output. Runs on Apple Neural Engine at ~31 tok/s decode on iPhone 17 Pro (34 tok/s in 3-chunk mode). Native 384x384 vision encoder (64 tokens/frame) handles video; 12-layer Conformer encoder handles audio. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
92
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
93
  "demo": {
94
  "template": "chat",
@@ -130,7 +130,7 @@
130
  "name": "Gemma 4 E4B",
131
  "subtitle": "Google DeepMind, 2025",
132
  "category_id": "llm",
133
- "description_md": "Larger Gemma 4 variant: 42-layer text decoder, ~4B effective params, 100% ANE-resident. Text-only (no vision/audio). ~14 tok/s decode on iPhone 17 Pro at 2048-token context. Use when you want maximum text quality and have the storage budget. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
134
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
135
  "demo": {
136
  "template": "chat",
@@ -172,7 +172,7 @@
172
  "name": "Qwen3.5 2B",
173
  "subtitle": "Alibaba Qwen, 2025",
174
  "category_id": "llm",
175
- "description_md": "Hybrid Gated-DeltaNet SSM + attention architecture, shipped as 4 INT8 chunks to fit the iPhone ANE single-mlprogram budget. ~17 tok/s decode on iPhone 17 Pro with ~200 MB RSS — exceptional memory efficiency for a 2B-param model. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
176
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
177
  "demo": {
178
  "template": "chat",
@@ -214,7 +214,7 @@
214
  "name": "Qwen3.5 0.8B",
215
  "subtitle": "Alibaba Qwen, 2025",
216
  "category_id": "llm",
217
- "description_md": "Compact hybrid SSM+attention model, INT8 palettized (same semantic precision as fp16 — 100% top-3 parity vs fp32 oracle). ~20 tok/s decode on iPhone 17 Pro. Smallest and fastest option in the CoreML-LLM lineup. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
218
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
219
  "demo": {
220
  "template": "chat",
@@ -256,7 +256,7 @@
256
  "name": "Qwen3-VL 2B",
257
  "subtitle": "Alibaba Qwen, 2025",
258
  "category_id": "llm",
259
- "description_md": "Qwen3-VL multimodal model — **text + image** input (DeepStack injection at L0/1/2, interleaved mRoPE for image tokens, 196 image tokens). 28-layer GQA text backbone, 6 INT8 body chunks + fp16 embed sidecar. ~7.5 tok/s decode on iPhone 17 Pro. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
260
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_vl.jpg",
261
  "demo": {
262
  "template": "chat",
 
1
  {
2
  "manifest_version": 1,
3
+ "updated_at": "2026-04-24T15:44:51Z",
4
  "min_app_version": "1.0",
5
  "categories": [
6
  {
 
88
  "name": "Gemma 4 E2B",
89
  "subtitle": "Google DeepMind, 2025",
90
  "category_id": "llm",
91
+ "description_md": "Google's on-device multimodal LLM (Gemma 3n E2B, 2.3B effective params with Per-Layer Embeddings). **Text + image + audio + video** input, streaming text output. On Apple Neural Engine: ~31 tok/s (4-chunk default) / ~34 tok/s (3-chunk, `LLM_3CHUNK=1`) on iPhone 17 Pro. Native 384x384 vision encoder (64 tokens/frame) handles video; 12-layer Conformer encoder handles audio. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
92
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
93
  "demo": {
94
  "template": "chat",
 
130
  "name": "Gemma 4 E4B",
131
  "subtitle": "Google DeepMind, 2025",
132
  "category_id": "llm",
133
+ "description_md": "Larger Gemma 4 variant: 42-layer text decoder, ~4B effective params, 100% ANE-resident. Text-only (no vision/audio). ~14 tok/s decode on iPhone 17 Pro at 2048-token context. Use when you want maximum text quality and have the storage budget. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
134
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
135
  "demo": {
136
  "template": "chat",
 
172
  "name": "Qwen3.5 2B",
173
  "subtitle": "Alibaba Qwen, 2025",
174
  "category_id": "llm",
175
+ "description_md": "Hybrid Gated-DeltaNet SSM + attention architecture, shipped as 4 INT8 chunks to fit the iPhone ANE single-mlprogram budget. ~17 tok/s decode on iPhone 17 Pro with ~200 MB RSS — exceptional memory efficiency for a 2B-param model. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
176
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
177
  "demo": {
178
  "template": "chat",
 
214
  "name": "Qwen3.5 0.8B",
215
  "subtitle": "Alibaba Qwen, 2025",
216
  "category_id": "llm",
217
+ "description_md": "Compact hybrid SSM+attention model, INT8 palettized (same semantic precision as fp16 — 100% top-3 parity vs fp32 oracle). ~20 tok/s decode on iPhone 17 Pro. Smallest and fastest option in the CoreML-LLM lineup. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
218
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
219
  "demo": {
220
  "template": "chat",
 
256
  "name": "Qwen3-VL 2B",
257
  "subtitle": "Alibaba Qwen, 2025",
258
  "category_id": "llm",
259
+ "description_md": "Qwen3-VL multimodal model — **text + image** input (DeepStack injection at L0/1/2, interleaved mRoPE for image tokens, 196 image tokens). 28-layer GQA text backbone, 6 INT8 body chunks + fp16 embed sidecar. ~7.5 tok/s decode on iPhone 17 Pro. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
260
  "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_vl.jpg",
261
  "demo": {
262
  "template": "chat",