mlboydaisuke
/

coreml-zoo

Model card Files and versions

xet

Community

mlboydaisuke committed on Apr 16

Commit

f5c93e9

verified ·

1 Parent(s): 5585de7

Upload models.json with huggingface_hub

Browse files

Files changed (1) hide show

models.json +173 -161

models.json CHANGED Viewed

@@ -3,6 +3,12 @@
   "updated_at": "2026-04-10",
   "min_app_version": "1.0",
   "categories": [
     {
       "id": "segmentation",
       "name": "Segmentation",
@@ -77,12 +83,54 @@
     }
   ],
   "models": [
     {
       "id": "rmbg_1_4",
       "name": "RMBG-1.4",
       "subtitle": "BRIA AI, 2023",
       "category_id": "segmentation",
-      "description_md": "High-quality background removal. Outputs foreground with alpha mask. INT8 quantized U-Net, 1024×1024 input.",
       "demo": {
         "template": "image_in_out",
         "config": {
@@ -97,7 +145,7 @@
           "archive": "zip",
           "size_bytes": 38771210,
           "sha256": "a80dbb5f04c922a8fa698c38592e4e52af4e62471d70bc7c59c28a3355a1da95",
-          "compute_units": "cpuAndGPU",
           "kind": "model"
         }
       ],
@@ -117,10 +165,10 @@
     },
     {
       "id": "ddcolor",
-      "name": "DDColor",
       "subtitle": "Image Colorization, 2023",
       "category_id": "enhancement",
-      "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input, processes in LAB color space.",
       "demo": {
         "template": "image_in_out",
         "config": {
@@ -158,12 +206,12 @@
       "name": "SinSR",
       "subtitle": "Single-Step Super-Resolution, 2024",
       "category_id": "enhancement",
-      "description_md": "4× super-resolution via single-step diffusion. 256×256 input → 1024×1024 output. Swin Transformer denoiser (FP32 required).",
       "demo": {
         "template": "image_in_out",
         "config": {
           "input_size": 256,
-          "output_type": "image"
         }
       },
       "files": [
@@ -209,50 +257,12 @@
         "year": 2024
       }
     },
-    {
-      "id": "efficientad",
-      "name": "EfficientAD",
-      "subtitle": "Anomaly Detection, 2023",
-      "category_id": "segmentation",
-      "description_md": "Lightweight unsupervised anomaly detection. 256×256 input → anomaly heatmap + score. Industrial quality inspection.",
-      "demo": {
-        "template": "image_in_out",
-        "config": {
-          "input_size": 256,
-          "output_type": "image"
-        }
-      },
-      "files": [
-        {
-          "name": "EfficientAD.mlpackage.zip",
-          "url": "TODO",
-          "archive": "zip",
-          "size_bytes": 8000000,
-          "sha256": "TODO",
-          "compute_units": "all",
-          "kind": "model"
-        }
-      ],
-      "requirements": {
-        "min_ios": "17.0",
-        "min_ram_mb": 200
-      },
-      "license": {
-        "name": "MIT",
-        "url": "https://github.com/nelson1425/EfficientAD"
-      },
-      "upstream": {
-        "name": "nelson1425/EfficientAD",
-        "url": "https://github.com/nelson1425/EfficientAD",
-        "year": 2023
-      }
-    },
     {
       "id": "yolo26s",
       "name": "YOLO26s",
       "subtitle": "NMS-Free Detection, 2026",
       "category_id": "detection",
-      "description_md": "NMS-free object detection. 640×640 input, output [1,300,6]: x1,y1,x2,y2,confidence,class_id. 80 COCO classes.",
       "demo": {
         "template": "image_detection",
         "config": {
@@ -286,11 +296,11 @@
       }
     },
     {
-      "id": "yolov9s",
-      "name": "YOLOv9s",
       "subtitle": "Object Detection, 2024",
       "category_id": "detection",
-      "description_md": "YOLOv9 small with Vision framework NMS. 640×640 input. PGI + GELAN architecture.",
       "demo": {
         "template": "image_detection",
         "config": {
@@ -315,11 +325,11 @@
       },
       "license": {
         "name": "AGPL-3.0",
-        "url": "https://github.com/WongKinYiu/yolov9"
       },
       "upstream": {
-        "name": "WongKinYiu/yolov9",
-        "url": "https://github.com/WongKinYiu/yolov9",
         "year": 2024
       }
     },
@@ -328,7 +338,7 @@
       "name": "YOLOv10n",
       "subtitle": "Object Detection, 2024",
       "category_id": "detection",
-      "description_md": "YOLOv10 nano with Vision framework NMS. 640×640 input. Dual-assignment strategy.",
       "demo": {
         "template": "image_detection",
         "config": {
@@ -361,12 +371,58 @@
         "year": 2024
       }
     },
     {
       "id": "moge2_vitb_normal_504",
       "name": "MoGe-2 ViT-B (504×504)",
       "subtitle": "Microsoft, CVPR 2025",
       "category_id": "depth",
-      "description_md": "Monocular geometry from a single image. Predicts metric depth, surface normals, and a confidence mask in one forward pass. DINOv2 ViT-B/14 backbone.",
       "demo": {
         "template": "depth_visualization",
         "config": {
@@ -410,7 +466,7 @@
       "name": "SigLIP",
       "subtitle": "Zero-Shot Classification, 2023",
       "category_id": "vision_language",
-      "description_md": "Zero-shot image classification. Dual encoder (image + text) with sigmoid loss. 224×224 input. Type any class names to classify.",
       "demo": {
         "template": "zero_shot_classify",
         "config": {
@@ -468,7 +524,7 @@
       "name": "Florence-2",
       "subtitle": "Microsoft, 2024",
       "category_id": "vision_language",
-      "description_md": "Vision-language captioning, OCR, and visual QA. Three-stage encoder-decoder. 768×768 input, autoregressive text output.",
       "demo": {
         "template": "image_to_text",
         "config": {
@@ -563,26 +619,24 @@
       }
     },
     {
-      "id": "adaface",
-      "name": "AdaFace",
-      "subtitle": "Face Recognition, 2022",
       "category_id": "face",
-      "description_md": "Face recognition via 512-dim embeddings. IR-18 backbone, 112×112 face crop input. Compare faces by cosine similarity.",
       "demo": {
-        "template": "face_compare",
         "config": {
-          "input_size": 112,
-          "embedding_dim": 512,
-          "match_threshold": 0.6
         }
       },
       "files": [
         {
-          "name": "AdaFace_IR18_CASIA.mlpackage.zip",
-          "url": "TODO",
           "archive": "zip",
-          "size_bytes": 32000000,
-          "sha256": "TODO",
           "compute_units": "all",
           "kind": "model"
         }
@@ -593,12 +647,12 @@
       },
       "license": {
         "name": "MIT",
-        "url": "https://github.com/mk-minchul/AdaFace"
       },
       "upstream": {
-        "name": "mk-minchul/AdaFace",
-        "url": "https://github.com/mk-minchul/AdaFace",
-        "year": 2022
       }
     },
     {
@@ -606,7 +660,7 @@
       "name": "Hyper-SD (1-Step)",
       "subtitle": "ByteDance, 2024",
       "category_id": "generation",
-      "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512 output. Chunked UNet (6-bit palettized) + TCD scheduler.",
       "demo": {
         "template": "text_to_image",
         "config": {
@@ -630,7 +684,7 @@
           "archive": "zip",
           "size_bytes": 226397794,
           "sha256": "201b0fcc3573811aac6a4e8545c695bc4fb2f7710ea0d60c227919d87b37687e",
-          "compute_units": "cpuAndNeuralEngine",
           "kind": "model"
         },
         {
@@ -657,7 +711,7 @@
           "archive": "zip",
           "size_bytes": 91282754,
           "sha256": "1260371542d845a2261ed2de36c5fe3e9ccb740a6ceb59b1990705d125e8cf66",
-          "compute_units": "cpuAndNeuralEngine",
           "kind": "model"
         },
         {
@@ -694,16 +748,16 @@
       "name": "MatAnyone",
       "subtitle": "Video Matting, 2025",
       "category_id": "video",
-      "description_md": "Temporally consistent video matting with memory propagation. 5-model pipeline: encoder, mask encoder, read first, read, decoder. 768×432 landscape input.",
       "demo": {
         "template": "video_matting",
         "config": {
           "frame_size": 512,
-          "encoder": "MatAnyone_Encoder.mlpackage.zip",
-          "mask_encoder": "MatAnyone_MaskEncoder.mlpackage.zip",
-          "read_first": "MatAnyone_ReadFirst.mlpackage.zip",
-          "read": "MatAnyone_Read.mlpackage.zip",
-          "decoder": "MatAnyone_Decoder.mlpackage.zip"
         }
       },
       "files": [
@@ -713,7 +767,7 @@
           "archive": "zip",
           "size_bytes": 17306121,
           "sha256": "97ffd6bc4611f9a3351dc890fc00954ba48171e517e66a39f7a5f1f38110dfda",
-          "compute_units": "all",
           "kind": "model"
         },
         {
@@ -722,7 +776,7 @@
           "archive": "zip",
           "size_bytes": 16819866,
           "sha256": "ba67559188ffc64d8e46418c051c6a55815d4482def17519fa518daac7d5a911",
-          "compute_units": "all",
           "kind": "model"
         },
         {
@@ -749,7 +803,7 @@
           "archive": "zip",
           "size_bytes": 8807630,
           "sha256": "67136aa67000e604838fe9aa7de151c514ef84f0b83f1da0f043cf70652d28eb",
-          "compute_units": "all",
           "kind": "model"
         }
       ],
@@ -772,7 +826,7 @@
       "name": "HTDemucs",
       "subtitle": "Audio Source Separation",
       "category_id": "audio",
-      "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, overlap-add for full tracks. FP32 model.",
       "demo": {
         "template": "audio_in_out",
         "config": {
@@ -816,7 +870,7 @@
       "name": "Kokoro-82M",
       "subtitle": "Multilingual TTS",
       "category_id": "speech",
-      "description_md": "English + Japanese text-to-speech. 24 kHz mono. On-device G2P. StyleTTS2 + iSTFTNet vocoder. 10 voices, bucketed decoder (128/256/512).",
       "demo": {
         "template": "text_to_audio",
         "config": {
@@ -896,7 +950,7 @@
       "name": "Stable Audio Open",
       "subtitle": "Text-to-Music, 2024",
       "category_id": "speech",
-      "description_md": "Text-to-music generation. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 encoder + Oobleck VAE decoder.",
       "demo": {
         "template": "text_to_audio",
         "config": {
@@ -965,45 +1019,52 @@
       }
     },
     {
-      "id": "basicpitch",
-      "name": "Basic Pitch",
-      "subtitle": "Spotify, Music Transcription",
       "category_id": "audio",
-      "description_md": "Polyphonic music transcription: audio → MIDI notes. Tiny 17K-param model (272 KB). Windowed inference at 22.05 kHz.",
       "demo": {
-        "template": "audio_to_score",
         "config": {
           "sample_rate": 22050,
-          "window_size": 43844,
-          "hop_size": 256,
-          "n_bins": 88,
-          "onset_threshold": 0.5,
-          "note_threshold": 0.5
         }
       },
       "files": [
         {
-          "name": "nmp.mlpackage.zip",
-          "url": "TODO",
           "archive": "zip",
-          "size_bytes": 272000,
-          "sha256": "TODO",
-          "compute_units": "all",
           "kind": "model"
         }
       ],
       "requirements": {
         "min_ios": "17.0",
-        "min_ram_mb": 200
       },
       "license": {
-        "name": "Apache-2.0",
-        "url": "https://github.com/spotify/basic-pitch"
       },
       "upstream": {
-        "name": "spotify/basic-pitch",
-        "url": "https://github.com/spotify/basic-pitch",
-        "year": 2022
       }
     },
     {
@@ -1011,7 +1072,7 @@
       "name": "Pyannote Diarization",
       "subtitle": "Speaker Identification",
       "category_id": "audio",
-      "description_md": "Speaker diarization: who spoke when. 16 kHz mono input, 10s segments. Outputs per-frame speaker logits.",
       "demo": {
         "template": "audio_in_out",
         "config": {
@@ -1046,55 +1107,6 @@
         "year": 2021
       }
     },
-    {
-      "id": "openvoice",
-      "name": "OpenVoice V2",
-      "subtitle": "Voice Cloning",
-      "category_id": "audio",
-      "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio. Speaker encoder + voice converter.",
-      "demo": {
-        "template": "audio_in_out",
-        "config": {
-          "sample_rate": 22050,
-          "output_stems": [
-            "converted"
-          ]
-        }
-      },
-      "files": [
-        {
-          "name": "OpenVoice_SpeakerEncoder.mlpackage.zip",
-          "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_SpeakerEncoder.mlpackage.zip",
-          "archive": "zip",
-          "size_bytes": 1519880,
-          "sha256": "c3f2a96aaf5ecb5c5afc62b3d3dfbd47dc7ae64bc3edb7aa68befb54aef74459",
-          "compute_units": "cpuAndGPU",
-          "kind": "model"
-        },
-        {
-          "name": "OpenVoice_VoiceConverter.mlpackage.zip",
-          "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_VoiceConverter.mlpackage.zip",
-          "archive": "zip",
-          "size_bytes": 59799630,
-          "sha256": "ef3ce8a2d1564aefa13830d7d0ca43f85e0aa62d5f59622c8bc456c307ab5e05",
-          "compute_units": "cpuAndGPU",
-          "kind": "model"
-        }
-      ],
-      "requirements": {
-        "min_ios": "17.0",
-        "min_ram_mb": 500
-      },
-      "license": {
-        "name": "MIT",
-        "url": "https://github.com/myshell-ai/OpenVoice"
-      },
-      "upstream": {
-        "name": "myshell-ai/OpenVoice",
-        "url": "https://github.com/myshell-ai/OpenVoice",
-        "year": 2023
-      }
-    },
     {
       "id": "realesrgan",
       "name": "Real-ESRGAN 4x",
@@ -1176,7 +1188,7 @@
       "name": "RF-DETR Nano",
       "subtitle": "Object Detection, 2025",
       "category_id": "detection",
-      "description_md": "End-to-end transformer detector. 384×384 input. 300 queries, 91 classes (COCO + background). No NMS needed. Output: confidence [300,91] + coordinates [300,4] in normalized cxcywh.",
       "demo": {
         "template": "image_detection",
         "config": {
@@ -1256,12 +1268,12 @@
       "name": "MobileSAM",
       "subtitle": "Segment Anything, 2023",
       "category_id": "segmentation",
-      "description_md": "Lightweight Segment Anything. Tap any point to generate a segmentation mask. ViT-Tiny encoder (13 MB) + lightweight decoder (9.8 MB). ~60× smaller than SAM.",
       "demo": {
         "template": "segment_anything",
         "config": {
-          "encoder": "MobileSAM_Encoder.mlpackage.zip",
-          "decoder": "MobileSAM_Decoder.mlpackage.zip",
           "input_size": 1024
         }
       },

   "updated_at": "2026-04-10",
   "min_app_version": "1.0",
   "categories": [
+    {
+      "id": "llm",
+      "name": "Large Language Models",
+      "icon": "bubble.left.and.text.bubble.right",
+      "order": 0
+    },
     {
       "id": "segmentation",
       "name": "Segmentation",
     }
   ],
   "models": [
+    {
+      "id": "gemma4_e2b",
+      "name": "Gemma 4 E2B",
+      "subtitle": "Google DeepMind, 2025",
+      "category_id": "llm",
+      "description_md": "Google's latest on-device multimodal LLM. 2.3B effective parameters with Per-Layer Embeddings. Text + image input, streaming text output. Runs on Apple Neural Engine at ~31 tok/s decode. Supports multi-turn conversations, image understanding, and reasoning.",
+      "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
+      "demo": {
+        "template": "chat",
+        "config": {
+          "max_tokens": 1024,
+          "multimodal": true
+        }
+      },
+      "files": [
+        {
+          "name": "gemma4-e2b-coreml.zip",
+          "url": "https://huggingface.co/mlboydaisuke/gemma-4-E2B-coreml/resolve/main/gemma4-e2b-coreml.zip",
+          "archive": "zip",
+          "size_bytes": 2700000000,
+          "sha256": "TODO",
+          "compute_units": "cpuAndNeuralEngine",
+          "kind": "model"
+        }
+      ],
+      "requirements": {
+        "min_ios": "18.0",
+        "min_ram_mb": 1500,
+        "device_capabilities": [
+          "arm64"
+        ]
+      },
+      "license": {
+        "name": "Gemma",
+        "url": "https://ai.google.dev/gemma/terms"
+      },
+      "upstream": {
+        "name": "google/gemma-4-e2b",
+        "url": "https://huggingface.co/google/gemma-4-e2b",
+        "year": 2025
+      }
+    },
     {
       "id": "rmbg_1_4",
       "name": "RMBG-1.4",
       "subtitle": "BRIA AI, 2023",
       "category_id": "segmentation",
+      "description_md": "High-quality background removal. Outputs foreground with alpha mask. 1024×1024 input.",
       "demo": {
         "template": "image_in_out",
         "config": {
           "archive": "zip",
           "size_bytes": 38771210,
           "sha256": "a80dbb5f04c922a8fa698c38592e4e52af4e62471d70bc7c59c28a3355a1da95",
+          "compute_units": "cpuOnly",
           "kind": "model"
         }
       ],
     },
     {
       "id": "ddcolor",
+      "name": "DDColor Tiny",
       "subtitle": "Image Colorization, 2023",
       "category_id": "enhancement",
+      "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input.",
       "demo": {
         "template": "image_in_out",
         "config": {
       "name": "SinSR",
       "subtitle": "Single-Step Super-Resolution, 2024",
       "category_id": "enhancement",
+      "description_md": "4× super-resolution via single-step diffusion. 256→1024. Swin Transformer denoiser (FP32).",
       "demo": {
         "template": "image_in_out",
         "config": {
           "input_size": 256,
+          "output_type": "sinsr"
         }
       },
       "files": [
         "year": 2024
       }
     },
     {
       "id": "yolo26s",
       "name": "YOLO26s",
       "subtitle": "NMS-Free Detection, 2026",
       "category_id": "detection",
+      "description_md": "NMS-free object detection. 640×640 input, 80 COCO classes.",
       "demo": {
         "template": "image_detection",
         "config": {
       }
     },
     {
+      "id": "yolo11s",
+      "name": "YOLO11s",
       "subtitle": "Object Detection, 2024",
       "category_id": "detection",
+      "description_md": "YOLO11 small detection with Vision framework NMS. 640×640 input.",
       "demo": {
         "template": "image_detection",
         "config": {
       },
       "license": {
         "name": "AGPL-3.0",
+        "url": "https://github.com/ultralytics/ultralytics"
       },
       "upstream": {
+        "name": "ultralytics/ultralytics",
+        "url": "https://github.com/ultralytics/ultralytics",
         "year": 2024
       }
     },
       "name": "YOLOv10n",
       "subtitle": "Object Detection, 2024",
       "category_id": "detection",
+      "description_md": "YOLOv10 nano. 640×640 input. Dual-assignment strategy.",
       "demo": {
         "template": "image_detection",
         "config": {
         "year": 2024
       }
     },
+    {
+      "id": "yoloworld",
+      "name": "YOLO-World",
+      "subtitle": "Open-Vocabulary Detection, 2024",
+      "category_id": "detection",
+      "description_md": "Open-vocabulary detection. Type any text query. YOLO-World V2-S + CLIP ViT-B/32.",
+      "demo": {
+        "template": "open_vocab_detection",
+        "config": {
+          "input_size": 640
+        }
+      },
+      "files": [
+        {
+          "name": "yoloworld_detector.mlpackage.zip",
+          "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/yoloworld_detector.mlpackage.zip",
+          "archive": "zip",
+          "size_bytes": 23710620,
+          "sha256": "611d299ae74c83f90a5cc9f4585709859d5db735baa8ade721e0c2d99cd5af92",
+          "compute_units": "all",
+          "kind": "model"
+        },
+        {
+          "name": "clip_text_encoder.mlpackage.zip",
+          "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/clip_text_encoder.mlpackage.zip",
+          "archive": "zip",
+          "size_bytes": 116681932,
+          "sha256": "45770a743297e8c2a57cc330d4f5c80f47734263680895b33b593b50dd2c382b",
+          "compute_units": "cpuOnly",
+          "kind": "model"
+        }
+      ],
+      "requirements": {
+        "min_ios": "17.0",
+        "min_ram_mb": 600
+      },
+      "license": {
+        "name": "GPL-3.0",
+        "url": "https://github.com/AILab-CVC/YOLO-World"
+      },
+      "upstream": {
+        "name": "AILab-CVC/YOLO-World",
+        "url": "https://github.com/AILab-CVC/YOLO-World",
+        "year": 2024
+      }
+    },
     {
       "id": "moge2_vitb_normal_504",
       "name": "MoGe-2 ViT-B (504×504)",
       "subtitle": "Microsoft, CVPR 2025",
       "category_id": "depth",
+      "description_md": "Monocular geometry from a single image. Metric depth, surface normals, confidence mask. DINOv2 ViT-B/14 backbone.",
       "demo": {
         "template": "depth_visualization",
         "config": {
       "name": "SigLIP",
       "subtitle": "Zero-Shot Classification, 2023",
       "category_id": "vision_language",
+      "description_md": "Zero-shot image classification. Dual encoder (image + text). 224×224 input.",
       "demo": {
         "template": "zero_shot_classify",
         "config": {
       "name": "Florence-2",
       "subtitle": "Microsoft, 2024",
       "category_id": "vision_language",
+      "description_md": "Vision-language captioning, OCR, and VQA. Three-stage encoder-decoder. 768×768 input.",
       "demo": {
         "template": "image_to_text",
         "config": {
       }
     },
     {
+      "id": "face3d",
+      "name": "3DDFA V2",
+      "subtitle": "3D Face Reconstruction, 2020",
       "category_id": "face",
+      "description_md": "Single-image 3D face reconstruction. Predicts 6 DoF pose + expression parameters.",
       "demo": {
+        "template": "face_3d",
         "config": {
+          "input_size": 120
         }
       },
       "files": [
         {
+          "name": "3DDFA_V2.mlpackage.zip",
+          "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/face3d/3DDFA_V2.mlpackage.zip",
           "archive": "zip",
+          "size_bytes": 6083375,
+          "sha256": "0f715dc220c046f558e3b8fc65246df9a2eec77182830a16628783430cdacdc8",
           "compute_units": "all",
           "kind": "model"
         }
       },
       "license": {
         "name": "MIT",
+        "url": "https://github.com/cleardusk/3DDFA_V2"
       },
       "upstream": {
+        "name": "cleardusk/3DDFA_V2",
+        "url": "https://github.com/cleardusk/3DDFA_V2",
+        "year": 2020
       }
     },
     {
       "name": "Hyper-SD (1-Step)",
       "subtitle": "ByteDance, 2024",
       "category_id": "generation",
+      "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512. Chunked UNet (6-bit).",
       "demo": {
         "template": "text_to_image",
         "config": {
           "archive": "zip",
           "size_bytes": 226397794,
           "sha256": "201b0fcc3573811aac6a4e8545c695bc4fb2f7710ea0d60c227919d87b37687e",
+          "compute_units": "cpuAndGPU",
           "kind": "model"
         },
         {
           "archive": "zip",
           "size_bytes": 91282754,
           "sha256": "1260371542d845a2261ed2de36c5fe3e9ccb740a6ceb59b1990705d125e8cf66",
+          "compute_units": "cpuAndGPU",
           "kind": "model"
         },
         {
       "name": "MatAnyone",
       "subtitle": "Video Matting, 2025",
       "category_id": "video",
+      "description_md": "Temporally consistent video matting. 5-model pipeline with memory propagation.",
       "demo": {
         "template": "video_matting",
         "config": {
           "frame_size": 512,
+          "encoder": "MatAnyone_encoder.mlpackage.zip",
+          "mask_encoder": "MatAnyone_mask_encoder.mlpackage.zip",
+          "read_first": "MatAnyone_read_first.mlpackage.zip",
+          "read": "MatAnyone_read.mlpackage.zip",
+          "decoder": "MatAnyone_decoder.mlpackage.zip"
         }
       },
       "files": [
           "archive": "zip",
           "size_bytes": 17306121,
           "sha256": "97ffd6bc4611f9a3351dc890fc00954ba48171e517e66a39f7a5f1f38110dfda",
+          "compute_units": "cpuAndGPU",
           "kind": "model"
         },
         {
           "archive": "zip",
           "size_bytes": 16819866,
           "sha256": "ba67559188ffc64d8e46418c051c6a55815d4482def17519fa518daac7d5a911",
+          "compute_units": "cpuAndGPU",
           "kind": "model"
         },
         {
           "archive": "zip",
           "size_bytes": 8807630,
           "sha256": "67136aa67000e604838fe9aa7de151c514ef84f0b83f1da0f043cf70652d28eb",
+          "compute_units": "cpuAndGPU",
           "kind": "model"
         }
       ],
       "name": "HTDemucs",
       "subtitle": "Audio Source Separation",
       "category_id": "audio",
+      "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, FP32.",
       "demo": {
         "template": "audio_in_out",
         "config": {
       "name": "Kokoro-82M",
       "subtitle": "Multilingual TTS",
       "category_id": "speech",
+      "description_md": "English + Japanese text-to-speech. 24 kHz. StyleTTS2 + iSTFTNet vocoder. Multiple voices.",
       "demo": {
         "template": "text_to_audio",
         "config": {
       "name": "Stable Audio Open",
       "subtitle": "Text-to-Music, 2024",
       "category_id": "speech",
+      "description_md": "Text-to-music. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 + Oobleck VAE.",
       "demo": {
         "template": "text_to_audio",
         "config": {
       }
     },
     {
+      "id": "openvoice",
+      "name": "OpenVoice V2",
+      "subtitle": "Voice Cloning",
       "category_id": "audio",
+      "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio.",
       "demo": {
+        "template": "audio_in_out",
         "config": {
           "sample_rate": 22050,
+          "output_stems": [
+            "converted"
+          ]
         }
       },
       "files": [
         {
+          "name": "OpenVoice_SpeakerEncoder.mlpackage.zip",
+          "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_SpeakerEncoder.mlpackage.zip",
           "archive": "zip",
+          "size_bytes": 1519880,
+          "sha256": "c3f2a96aaf5ecb5c5afc62b3d3dfbd47dc7ae64bc3edb7aa68befb54aef74459",
+          "compute_units": "cpuAndGPU",
+          "kind": "model"
+        },
+        {
+          "name": "OpenVoice_VoiceConverter.mlpackage.zip",
+          "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_VoiceConverter.mlpackage.zip",
+          "archive": "zip",
+          "size_bytes": 59799630,
+          "sha256": "ef3ce8a2d1564aefa13830d7d0ca43f85e0aa62d5f59622c8bc456c307ab5e05",
+          "compute_units": "cpuAndGPU",
           "kind": "model"
         }
       ],
       "requirements": {
         "min_ios": "17.0",
+        "min_ram_mb": 500
       },
       "license": {
+        "name": "MIT",
+        "url": "https://github.com/myshell-ai/OpenVoice"
       },
       "upstream": {
+        "name": "myshell-ai/OpenVoice",
+        "url": "https://github.com/myshell-ai/OpenVoice",
+        "year": 2023
       }
     },
     {
       "name": "Pyannote Diarization",
       "subtitle": "Speaker Identification",
       "category_id": "audio",
+      "description_md": "Speaker diarization: who spoke when. 16 kHz mono, 10s segments.",
       "demo": {
         "template": "audio_in_out",
         "config": {
         "year": 2021
       }
     },
     {
       "id": "realesrgan",
       "name": "Real-ESRGAN 4x",
       "name": "RF-DETR Nano",
       "subtitle": "Object Detection, 2025",
       "category_id": "detection",
+      "description_md": "End-to-end transformer detector. 384×384 input. 300 queries, 91 classes (COCO + background). No NMS needed.",
       "demo": {
         "template": "image_detection",
         "config": {
       "name": "MobileSAM",
       "subtitle": "Segment Anything, 2023",
       "category_id": "segmentation",
+      "description_md": "Lightweight Segment Anything. Tap any point to generate a segmentation mask. ViT-Tiny encoder + lightweight decoder. ~60× smaller than SAM.",
       "demo": {
         "template": "segment_anything",
         "config": {
+          "encoder": "MobileSAM.zip",
+          "decoder": "MobileSAM.zip",
           "input_size": 1024
         }
       },