mlboydaisuke
/

coreml-zoo

Model card · Files and versions

xet

Community

mlboydaisuke committed on Apr 24

Commit

780af30

verified ·

1 Parent(s): 2196468

Bump CoreML-LLM references to v1.4 (3-chunk decode, chunk pipelining)

Browse files

Files changed (1) hide show

models.json +6 -6

models.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "manifest_version": 1,
-  "updated_at": "2026-04-24T10:44:41Z",
   "min_app_version": "1.0",
   "categories": [
     {
@@ -88,7 +88,7 @@
       "name": "Gemma 4 E2B",
       "subtitle": "Google DeepMind, 2025",
       "category_id": "llm",
-      "description_md": "Google's on-device multimodal LLM (Gemma 3n E2B, 2.3B effective params with Per-Layer Embeddings). **Text + image + audio + video** input, streaming text output. Runs on Apple Neural Engine at ~31 tok/s decode on iPhone 17 Pro (34 tok/s in 3-chunk mode). Native 384x384 vision encoder (64 tokens/frame) handles video; 12-layer Conformer encoder handles audio. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
       "demo": {
         "template": "chat",
@@ -130,7 +130,7 @@
       "name": "Gemma 4 E4B",
       "subtitle": "Google DeepMind, 2025",
       "category_id": "llm",
-      "description_md": "Larger Gemma 4 variant: 42-layer text decoder, ~4B effective params, 100% ANE-resident. Text-only (no vision/audio). ~14 tok/s decode on iPhone 17 Pro at 2048-token context. Use when you want maximum text quality and have the storage budget. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
       "demo": {
         "template": "chat",
@@ -172,7 +172,7 @@
       "name": "Qwen3.5 2B",
       "subtitle": "Alibaba Qwen, 2025",
       "category_id": "llm",
-      "description_md": "Hybrid Gated-DeltaNet SSM + attention architecture, shipped as 4 INT8 chunks to fit the iPhone ANE single-mlprogram budget. ~17 tok/s decode on iPhone 17 Pro with ~200 MB RSS — exceptional memory efficiency for a 2B-param model. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
       "demo": {
         "template": "chat",
@@ -214,7 +214,7 @@
       "name": "Qwen3.5 0.8B",
       "subtitle": "Alibaba Qwen, 2025",
       "category_id": "llm",
-      "description_md": "Compact hybrid SSM+attention model, INT8 palettized (same semantic precision as fp16 — 100% top-3 parity vs fp32 oracle). ~20 tok/s decode on iPhone 17 Pro. Smallest and fastest option in the CoreML-LLM lineup. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
       "demo": {
         "template": "chat",
@@ -256,7 +256,7 @@
       "name": "Qwen3-VL 2B",
       "subtitle": "Alibaba Qwen, 2025",
       "category_id": "llm",
-      "description_md": "Qwen3-VL multimodal model — **text + image** input (DeepStack injection at L0/1/2, interleaved mRoPE for image tokens, 196 image tokens). 28-layer GQA text backbone, 6 INT8 body chunks + fp16 embed sidecar. ~7.5 tok/s decode on iPhone 17 Pro. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.3.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_vl.jpg",
       "demo": {
         "template": "chat",

 {
   "manifest_version": 1,
+  "updated_at": "2026-04-24T15:44:51Z",
   "min_app_version": "1.0",
   "categories": [
     {
       "name": "Gemma 4 E2B",
       "subtitle": "Google DeepMind, 2025",
       "category_id": "llm",
+      "description_md": "Google's on-device multimodal LLM (Gemma 3n E2B, 2.3B effective params with Per-Layer Embeddings). **Text + image + audio + video** input, streaming text output. On Apple Neural Engine: ~31 tok/s (4-chunk default) / ~34 tok/s (3-chunk, `LLM_3CHUNK=1`) on iPhone 17 Pro. Native 384x384 vision encoder (64 tokens/frame) handles video; 12-layer Conformer encoder handles audio. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
       "demo": {
         "template": "chat",
       "name": "Gemma 4 E4B",
       "subtitle": "Google DeepMind, 2025",
       "category_id": "llm",
+      "description_md": "Larger Gemma 4 variant: 42-layer text decoder, ~4B effective params, 100% ANE-resident. Text-only (no vision/audio). ~14 tok/s decode on iPhone 17 Pro at 2048-token context. Use when you want maximum text quality and have the storage budget. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
       "demo": {
         "template": "chat",
       "name": "Qwen3.5 2B",
       "subtitle": "Alibaba Qwen, 2025",
       "category_id": "llm",
+      "description_md": "Hybrid Gated-DeltaNet SSM + attention architecture, shipped as 4 INT8 chunks to fit the iPhone ANE single-mlprogram budget. ~17 tok/s decode on iPhone 17 Pro with ~200 MB RSS — exceptional memory efficiency for a 2B-param model. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
       "demo": {
         "template": "chat",
       "name": "Qwen3.5 0.8B",
       "subtitle": "Alibaba Qwen, 2025",
       "category_id": "llm",
+      "description_md": "Compact hybrid SSM+attention model, INT8 palettized (same semantic precision as fp16 — 100% top-3 parity vs fp32 oracle). ~20 tok/s decode on iPhone 17 Pro. Smallest and fastest option in the CoreML-LLM lineup. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg",
       "demo": {
         "template": "chat",
       "name": "Qwen3-VL 2B",
       "subtitle": "Alibaba Qwen, 2025",
       "category_id": "llm",
+      "description_md": "Qwen3-VL multimodal model — **text + image** input (DeepStack injection at L0/1/2, interleaved mRoPE for image tokens, 196 image tokens). 28-layer GQA text backbone, 6 INT8 body chunks + fp16 embed sidecar. ~7.5 tok/s decode on iPhone 17 Pro. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.",
       "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_vl.jpg",
       "demo": {
         "template": "chat",