mlboydaisuke committed on
Commit
f5c93e9
·
verified ·
1 Parent(s): 5585de7

Upload models.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. models.json +173 -161
models.json CHANGED
@@ -3,6 +3,12 @@
3
  "updated_at": "2026-04-10",
4
  "min_app_version": "1.0",
5
  "categories": [
 
 
 
 
 
 
6
  {
7
  "id": "segmentation",
8
  "name": "Segmentation",
@@ -77,12 +83,54 @@
77
  }
78
  ],
79
  "models": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  {
81
  "id": "rmbg_1_4",
82
  "name": "RMBG-1.4",
83
  "subtitle": "BRIA AI, 2023",
84
  "category_id": "segmentation",
85
- "description_md": "High-quality background removal. Outputs foreground with alpha mask. INT8 quantized U-Net, 1024×1024 input.",
86
  "demo": {
87
  "template": "image_in_out",
88
  "config": {
@@ -97,7 +145,7 @@
97
  "archive": "zip",
98
  "size_bytes": 38771210,
99
  "sha256": "a80dbb5f04c922a8fa698c38592e4e52af4e62471d70bc7c59c28a3355a1da95",
100
- "compute_units": "cpuAndGPU",
101
  "kind": "model"
102
  }
103
  ],
@@ -117,10 +165,10 @@
117
  },
118
  {
119
  "id": "ddcolor",
120
- "name": "DDColor",
121
  "subtitle": "Image Colorization, 2023",
122
  "category_id": "enhancement",
123
- "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input, processes in LAB color space.",
124
  "demo": {
125
  "template": "image_in_out",
126
  "config": {
@@ -158,12 +206,12 @@
158
  "name": "SinSR",
159
  "subtitle": "Single-Step Super-Resolution, 2024",
160
  "category_id": "enhancement",
161
- "description_md": "4× super-resolution via single-step diffusion. 256×256 input 1024×1024 output. Swin Transformer denoiser (FP32 required).",
162
  "demo": {
163
  "template": "image_in_out",
164
  "config": {
165
  "input_size": 256,
166
- "output_type": "image"
167
  }
168
  },
169
  "files": [
@@ -209,50 +257,12 @@
209
  "year": 2024
210
  }
211
  },
212
- {
213
- "id": "efficientad",
214
- "name": "EfficientAD",
215
- "subtitle": "Anomaly Detection, 2023",
216
- "category_id": "segmentation",
217
- "description_md": "Lightweight unsupervised anomaly detection. 256×256 input → anomaly heatmap + score. Industrial quality inspection.",
218
- "demo": {
219
- "template": "image_in_out",
220
- "config": {
221
- "input_size": 256,
222
- "output_type": "image"
223
- }
224
- },
225
- "files": [
226
- {
227
- "name": "EfficientAD.mlpackage.zip",
228
- "url": "TODO",
229
- "archive": "zip",
230
- "size_bytes": 8000000,
231
- "sha256": "TODO",
232
- "compute_units": "all",
233
- "kind": "model"
234
- }
235
- ],
236
- "requirements": {
237
- "min_ios": "17.0",
238
- "min_ram_mb": 200
239
- },
240
- "license": {
241
- "name": "MIT",
242
- "url": "https://github.com/nelson1425/EfficientAD"
243
- },
244
- "upstream": {
245
- "name": "nelson1425/EfficientAD",
246
- "url": "https://github.com/nelson1425/EfficientAD",
247
- "year": 2023
248
- }
249
- },
250
  {
251
  "id": "yolo26s",
252
  "name": "YOLO26s",
253
  "subtitle": "NMS-Free Detection, 2026",
254
  "category_id": "detection",
255
- "description_md": "NMS-free object detection. 640×640 input, output [1,300,6]: x1,y1,x2,y2,confidence,class_id. 80 COCO classes.",
256
  "demo": {
257
  "template": "image_detection",
258
  "config": {
@@ -286,11 +296,11 @@
286
  }
287
  },
288
  {
289
- "id": "yolov9s",
290
- "name": "YOLOv9s",
291
  "subtitle": "Object Detection, 2024",
292
  "category_id": "detection",
293
- "description_md": "YOLOv9 small with Vision framework NMS. 640×640 input. PGI + GELAN architecture.",
294
  "demo": {
295
  "template": "image_detection",
296
  "config": {
@@ -315,11 +325,11 @@
315
  },
316
  "license": {
317
  "name": "AGPL-3.0",
318
- "url": "https://github.com/WongKinYiu/yolov9"
319
  },
320
  "upstream": {
321
- "name": "WongKinYiu/yolov9",
322
- "url": "https://github.com/WongKinYiu/yolov9",
323
  "year": 2024
324
  }
325
  },
@@ -328,7 +338,7 @@
328
  "name": "YOLOv10n",
329
  "subtitle": "Object Detection, 2024",
330
  "category_id": "detection",
331
- "description_md": "YOLOv10 nano with Vision framework NMS. 640×640 input. Dual-assignment strategy.",
332
  "demo": {
333
  "template": "image_detection",
334
  "config": {
@@ -361,12 +371,58 @@
361
  "year": 2024
362
  }
363
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  {
365
  "id": "moge2_vitb_normal_504",
366
  "name": "MoGe-2 ViT-B (504×504)",
367
  "subtitle": "Microsoft, CVPR 2025",
368
  "category_id": "depth",
369
- "description_md": "Monocular geometry from a single image. Predicts metric depth, surface normals, and a confidence mask in one forward pass. DINOv2 ViT-B/14 backbone.",
370
  "demo": {
371
  "template": "depth_visualization",
372
  "config": {
@@ -410,7 +466,7 @@
410
  "name": "SigLIP",
411
  "subtitle": "Zero-Shot Classification, 2023",
412
  "category_id": "vision_language",
413
- "description_md": "Zero-shot image classification. Dual encoder (image + text) with sigmoid loss. 224×224 input. Type any class names to classify.",
414
  "demo": {
415
  "template": "zero_shot_classify",
416
  "config": {
@@ -468,7 +524,7 @@
468
  "name": "Florence-2",
469
  "subtitle": "Microsoft, 2024",
470
  "category_id": "vision_language",
471
- "description_md": "Vision-language captioning, OCR, and visual QA. Three-stage encoder-decoder. 768×768 input, autoregressive text output.",
472
  "demo": {
473
  "template": "image_to_text",
474
  "config": {
@@ -563,26 +619,24 @@
563
  }
564
  },
565
  {
566
- "id": "adaface",
567
- "name": "AdaFace",
568
- "subtitle": "Face Recognition, 2022",
569
  "category_id": "face",
570
- "description_md": "Face recognition via 512-dim embeddings. IR-18 backbone, 112×112 face crop input. Compare faces by cosine similarity.",
571
  "demo": {
572
- "template": "face_compare",
573
  "config": {
574
- "input_size": 112,
575
- "embedding_dim": 512,
576
- "match_threshold": 0.6
577
  }
578
  },
579
  "files": [
580
  {
581
- "name": "AdaFace_IR18_CASIA.mlpackage.zip",
582
- "url": "TODO",
583
  "archive": "zip",
584
- "size_bytes": 32000000,
585
- "sha256": "TODO",
586
  "compute_units": "all",
587
  "kind": "model"
588
  }
@@ -593,12 +647,12 @@
593
  },
594
  "license": {
595
  "name": "MIT",
596
- "url": "https://github.com/mk-minchul/AdaFace"
597
  },
598
  "upstream": {
599
- "name": "mk-minchul/AdaFace",
600
- "url": "https://github.com/mk-minchul/AdaFace",
601
- "year": 2022
602
  }
603
  },
604
  {
@@ -606,7 +660,7 @@
606
  "name": "Hyper-SD (1-Step)",
607
  "subtitle": "ByteDance, 2024",
608
  "category_id": "generation",
609
- "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512 output. Chunked UNet (6-bit palettized) + TCD scheduler.",
610
  "demo": {
611
  "template": "text_to_image",
612
  "config": {
@@ -630,7 +684,7 @@
630
  "archive": "zip",
631
  "size_bytes": 226397794,
632
  "sha256": "201b0fcc3573811aac6a4e8545c695bc4fb2f7710ea0d60c227919d87b37687e",
633
- "compute_units": "cpuAndNeuralEngine",
634
  "kind": "model"
635
  },
636
  {
@@ -657,7 +711,7 @@
657
  "archive": "zip",
658
  "size_bytes": 91282754,
659
  "sha256": "1260371542d845a2261ed2de36c5fe3e9ccb740a6ceb59b1990705d125e8cf66",
660
- "compute_units": "cpuAndNeuralEngine",
661
  "kind": "model"
662
  },
663
  {
@@ -694,16 +748,16 @@
694
  "name": "MatAnyone",
695
  "subtitle": "Video Matting, 2025",
696
  "category_id": "video",
697
- "description_md": "Temporally consistent video matting with memory propagation. 5-model pipeline: encoder, mask encoder, read first, read, decoder. 768×432 landscape input.",
698
  "demo": {
699
  "template": "video_matting",
700
  "config": {
701
  "frame_size": 512,
702
- "encoder": "MatAnyone_Encoder.mlpackage.zip",
703
- "mask_encoder": "MatAnyone_MaskEncoder.mlpackage.zip",
704
- "read_first": "MatAnyone_ReadFirst.mlpackage.zip",
705
- "read": "MatAnyone_Read.mlpackage.zip",
706
- "decoder": "MatAnyone_Decoder.mlpackage.zip"
707
  }
708
  },
709
  "files": [
@@ -713,7 +767,7 @@
713
  "archive": "zip",
714
  "size_bytes": 17306121,
715
  "sha256": "97ffd6bc4611f9a3351dc890fc00954ba48171e517e66a39f7a5f1f38110dfda",
716
- "compute_units": "all",
717
  "kind": "model"
718
  },
719
  {
@@ -722,7 +776,7 @@
722
  "archive": "zip",
723
  "size_bytes": 16819866,
724
  "sha256": "ba67559188ffc64d8e46418c051c6a55815d4482def17519fa518daac7d5a911",
725
- "compute_units": "all",
726
  "kind": "model"
727
  },
728
  {
@@ -749,7 +803,7 @@
749
  "archive": "zip",
750
  "size_bytes": 8807630,
751
  "sha256": "67136aa67000e604838fe9aa7de151c514ef84f0b83f1da0f043cf70652d28eb",
752
- "compute_units": "all",
753
  "kind": "model"
754
  }
755
  ],
@@ -772,7 +826,7 @@
772
  "name": "HTDemucs",
773
  "subtitle": "Audio Source Separation",
774
  "category_id": "audio",
775
- "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, overlap-add for full tracks. FP32 model.",
776
  "demo": {
777
  "template": "audio_in_out",
778
  "config": {
@@ -816,7 +870,7 @@
816
  "name": "Kokoro-82M",
817
  "subtitle": "Multilingual TTS",
818
  "category_id": "speech",
819
- "description_md": "English + Japanese text-to-speech. 24 kHz mono. On-device G2P. StyleTTS2 + iSTFTNet vocoder. 10 voices, bucketed decoder (128/256/512).",
820
  "demo": {
821
  "template": "text_to_audio",
822
  "config": {
@@ -896,7 +950,7 @@
896
  "name": "Stable Audio Open",
897
  "subtitle": "Text-to-Music, 2024",
898
  "category_id": "speech",
899
- "description_md": "Text-to-music generation. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 encoder + Oobleck VAE decoder.",
900
  "demo": {
901
  "template": "text_to_audio",
902
  "config": {
@@ -965,45 +1019,52 @@
965
  }
966
  },
967
  {
968
- "id": "basicpitch",
969
- "name": "Basic Pitch",
970
- "subtitle": "Spotify, Music Transcription",
971
  "category_id": "audio",
972
- "description_md": "Polyphonic music transcription: audio → MIDI notes. Tiny 17K-param model (272 KB). Windowed inference at 22.05 kHz.",
973
  "demo": {
974
- "template": "audio_to_score",
975
  "config": {
976
  "sample_rate": 22050,
977
- "window_size": 43844,
978
- "hop_size": 256,
979
- "n_bins": 88,
980
- "onset_threshold": 0.5,
981
- "note_threshold": 0.5
982
  }
983
  },
984
  "files": [
985
  {
986
- "name": "nmp.mlpackage.zip",
987
- "url": "TODO",
988
  "archive": "zip",
989
- "size_bytes": 272000,
990
- "sha256": "TODO",
991
- "compute_units": "all",
 
 
 
 
 
 
 
 
 
992
  "kind": "model"
993
  }
994
  ],
995
  "requirements": {
996
  "min_ios": "17.0",
997
- "min_ram_mb": 200
998
  },
999
  "license": {
1000
- "name": "Apache-2.0",
1001
- "url": "https://github.com/spotify/basic-pitch"
1002
  },
1003
  "upstream": {
1004
- "name": "spotify/basic-pitch",
1005
- "url": "https://github.com/spotify/basic-pitch",
1006
- "year": 2022
1007
  }
1008
  },
1009
  {
@@ -1011,7 +1072,7 @@
1011
  "name": "Pyannote Diarization",
1012
  "subtitle": "Speaker Identification",
1013
  "category_id": "audio",
1014
- "description_md": "Speaker diarization: who spoke when. 16 kHz mono input, 10s segments. Outputs per-frame speaker logits.",
1015
  "demo": {
1016
  "template": "audio_in_out",
1017
  "config": {
@@ -1046,55 +1107,6 @@
1046
  "year": 2021
1047
  }
1048
  },
1049
- {
1050
- "id": "openvoice",
1051
- "name": "OpenVoice V2",
1052
- "subtitle": "Voice Cloning",
1053
- "category_id": "audio",
1054
- "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio. Speaker encoder + voice converter.",
1055
- "demo": {
1056
- "template": "audio_in_out",
1057
- "config": {
1058
- "sample_rate": 22050,
1059
- "output_stems": [
1060
- "converted"
1061
- ]
1062
- }
1063
- },
1064
- "files": [
1065
- {
1066
- "name": "OpenVoice_SpeakerEncoder.mlpackage.zip",
1067
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_SpeakerEncoder.mlpackage.zip",
1068
- "archive": "zip",
1069
- "size_bytes": 1519880,
1070
- "sha256": "c3f2a96aaf5ecb5c5afc62b3d3dfbd47dc7ae64bc3edb7aa68befb54aef74459",
1071
- "compute_units": "cpuAndGPU",
1072
- "kind": "model"
1073
- },
1074
- {
1075
- "name": "OpenVoice_VoiceConverter.mlpackage.zip",
1076
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_VoiceConverter.mlpackage.zip",
1077
- "archive": "zip",
1078
- "size_bytes": 59799630,
1079
- "sha256": "ef3ce8a2d1564aefa13830d7d0ca43f85e0aa62d5f59622c8bc456c307ab5e05",
1080
- "compute_units": "cpuAndGPU",
1081
- "kind": "model"
1082
- }
1083
- ],
1084
- "requirements": {
1085
- "min_ios": "17.0",
1086
- "min_ram_mb": 500
1087
- },
1088
- "license": {
1089
- "name": "MIT",
1090
- "url": "https://github.com/myshell-ai/OpenVoice"
1091
- },
1092
- "upstream": {
1093
- "name": "myshell-ai/OpenVoice",
1094
- "url": "https://github.com/myshell-ai/OpenVoice",
1095
- "year": 2023
1096
- }
1097
- },
1098
  {
1099
  "id": "realesrgan",
1100
  "name": "Real-ESRGAN 4x",
@@ -1176,7 +1188,7 @@
1176
  "name": "RF-DETR Nano",
1177
  "subtitle": "Object Detection, 2025",
1178
  "category_id": "detection",
1179
- "description_md": "End-to-end transformer detector. 384×384 input. 300 queries, 91 classes (COCO + background). No NMS needed. Output: confidence [300,91] + coordinates [300,4] in normalized cxcywh.",
1180
  "demo": {
1181
  "template": "image_detection",
1182
  "config": {
@@ -1256,12 +1268,12 @@
1256
  "name": "MobileSAM",
1257
  "subtitle": "Segment Anything, 2023",
1258
  "category_id": "segmentation",
1259
- "description_md": "Lightweight Segment Anything. Tap any point to generate a segmentation mask. ViT-Tiny encoder (13 MB) + lightweight decoder (9.8 MB). ~60× smaller than SAM.",
1260
  "demo": {
1261
  "template": "segment_anything",
1262
  "config": {
1263
- "encoder": "MobileSAM_Encoder.mlpackage.zip",
1264
- "decoder": "MobileSAM_Decoder.mlpackage.zip",
1265
  "input_size": 1024
1266
  }
1267
  },
 
3
  "updated_at": "2026-04-10",
4
  "min_app_version": "1.0",
5
  "categories": [
6
+ {
7
+ "id": "llm",
8
+ "name": "Large Language Models",
9
+ "icon": "bubble.left.and.text.bubble.right",
10
+ "order": 0
11
+ },
12
  {
13
  "id": "segmentation",
14
  "name": "Segmentation",
 
83
  }
84
  ],
85
  "models": [
86
+ {
87
+ "id": "gemma4_e2b",
88
+ "name": "Gemma 4 E2B",
89
+ "subtitle": "Google DeepMind, 2025",
90
+ "category_id": "llm",
91
+ "description_md": "Google's latest on-device multimodal LLM. 2.3B effective parameters with Per-Layer Embeddings. Text + image input, streaming text output. Runs on Apple Neural Engine at ~31 tok/s decode. Supports multi-turn conversations, image understanding, and reasoning.",
92
+ "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
93
+ "demo": {
94
+ "template": "chat",
95
+ "config": {
96
+ "max_tokens": 1024,
97
+ "multimodal": true
98
+ }
99
+ },
100
+ "files": [
101
+ {
102
+ "name": "gemma4-e2b-coreml.zip",
103
+ "url": "https://huggingface.co/mlboydaisuke/gemma-4-E2B-coreml/resolve/main/gemma4-e2b-coreml.zip",
104
+ "archive": "zip",
105
+ "size_bytes": 2700000000,
106
+ "sha256": "TODO",
107
+ "compute_units": "cpuAndNeuralEngine",
108
+ "kind": "model"
109
+ }
110
+ ],
111
+ "requirements": {
112
+ "min_ios": "18.0",
113
+ "min_ram_mb": 1500,
114
+ "device_capabilities": [
115
+ "arm64"
116
+ ]
117
+ },
118
+ "license": {
119
+ "name": "Gemma",
120
+ "url": "https://ai.google.dev/gemma/terms"
121
+ },
122
+ "upstream": {
123
+ "name": "google/gemma-4-e2b",
124
+ "url": "https://huggingface.co/google/gemma-4-e2b",
125
+ "year": 2025
126
+ }
127
+ },
128
  {
129
  "id": "rmbg_1_4",
130
  "name": "RMBG-1.4",
131
  "subtitle": "BRIA AI, 2023",
132
  "category_id": "segmentation",
133
+ "description_md": "High-quality background removal. Outputs foreground with alpha mask. 1024×1024 input.",
134
  "demo": {
135
  "template": "image_in_out",
136
  "config": {
 
145
  "archive": "zip",
146
  "size_bytes": 38771210,
147
  "sha256": "a80dbb5f04c922a8fa698c38592e4e52af4e62471d70bc7c59c28a3355a1da95",
148
+ "compute_units": "cpuOnly",
149
  "kind": "model"
150
  }
151
  ],
 
165
  },
166
  {
167
  "id": "ddcolor",
168
+ "name": "DDColor Tiny",
169
  "subtitle": "Image Colorization, 2023",
170
  "category_id": "enhancement",
171
+ "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input.",
172
  "demo": {
173
  "template": "image_in_out",
174
  "config": {
 
206
  "name": "SinSR",
207
  "subtitle": "Single-Step Super-Resolution, 2024",
208
  "category_id": "enhancement",
209
+ "description_md": "4× super-resolution via single-step diffusion. 256→1024. Swin Transformer denoiser (FP32).",
210
  "demo": {
211
  "template": "image_in_out",
212
  "config": {
213
  "input_size": 256,
214
+ "output_type": "sinsr"
215
  }
216
  },
217
  "files": [
 
257
  "year": 2024
258
  }
259
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  {
261
  "id": "yolo26s",
262
  "name": "YOLO26s",
263
  "subtitle": "NMS-Free Detection, 2026",
264
  "category_id": "detection",
265
+ "description_md": "NMS-free object detection. 640×640 input, 80 COCO classes.",
266
  "demo": {
267
  "template": "image_detection",
268
  "config": {
 
296
  }
297
  },
298
  {
299
+ "id": "yolo11s",
300
+ "name": "YOLO11s",
301
  "subtitle": "Object Detection, 2024",
302
  "category_id": "detection",
303
+ "description_md": "YOLO11 small detection with Vision framework NMS. 640×640 input.",
304
  "demo": {
305
  "template": "image_detection",
306
  "config": {
 
325
  },
326
  "license": {
327
  "name": "AGPL-3.0",
328
+ "url": "https://github.com/ultralytics/ultralytics"
329
  },
330
  "upstream": {
331
+ "name": "ultralytics/ultralytics",
332
+ "url": "https://github.com/ultralytics/ultralytics",
333
  "year": 2024
334
  }
335
  },
 
338
  "name": "YOLOv10n",
339
  "subtitle": "Object Detection, 2024",
340
  "category_id": "detection",
341
+ "description_md": "YOLOv10 nano. 640×640 input. Dual-assignment strategy.",
342
  "demo": {
343
  "template": "image_detection",
344
  "config": {
 
371
  "year": 2024
372
  }
373
  },
374
+ {
375
+ "id": "yoloworld",
376
+ "name": "YOLO-World",
377
+ "subtitle": "Open-Vocabulary Detection, 2024",
378
+ "category_id": "detection",
379
+ "description_md": "Open-vocabulary detection. Type any text query. YOLO-World V2-S + CLIP ViT-B/32.",
380
+ "demo": {
381
+ "template": "open_vocab_detection",
382
+ "config": {
383
+ "input_size": 640
384
+ }
385
+ },
386
+ "files": [
387
+ {
388
+ "name": "yoloworld_detector.mlpackage.zip",
389
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/yoloworld_detector.mlpackage.zip",
390
+ "archive": "zip",
391
+ "size_bytes": 23710620,
392
+ "sha256": "611d299ae74c83f90a5cc9f4585709859d5db735baa8ade721e0c2d99cd5af92",
393
+ "compute_units": "all",
394
+ "kind": "model"
395
+ },
396
+ {
397
+ "name": "clip_text_encoder.mlpackage.zip",
398
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/clip_text_encoder.mlpackage.zip",
399
+ "archive": "zip",
400
+ "size_bytes": 116681932,
401
+ "sha256": "45770a743297e8c2a57cc330d4f5c80f47734263680895b33b593b50dd2c382b",
402
+ "compute_units": "cpuOnly",
403
+ "kind": "model"
404
+ }
405
+ ],
406
+ "requirements": {
407
+ "min_ios": "17.0",
408
+ "min_ram_mb": 600
409
+ },
410
+ "license": {
411
+ "name": "GPL-3.0",
412
+ "url": "https://github.com/AILab-CVC/YOLO-World"
413
+ },
414
+ "upstream": {
415
+ "name": "AILab-CVC/YOLO-World",
416
+ "url": "https://github.com/AILab-CVC/YOLO-World",
417
+ "year": 2024
418
+ }
419
+ },
420
  {
421
  "id": "moge2_vitb_normal_504",
422
  "name": "MoGe-2 ViT-B (504×504)",
423
  "subtitle": "Microsoft, CVPR 2025",
424
  "category_id": "depth",
425
+ "description_md": "Monocular geometry from a single image. Metric depth, surface normals, confidence mask. DINOv2 ViT-B/14 backbone.",
426
  "demo": {
427
  "template": "depth_visualization",
428
  "config": {
 
466
  "name": "SigLIP",
467
  "subtitle": "Zero-Shot Classification, 2023",
468
  "category_id": "vision_language",
469
+ "description_md": "Zero-shot image classification. Dual encoder (image + text). 224×224 input.",
470
  "demo": {
471
  "template": "zero_shot_classify",
472
  "config": {
 
524
  "name": "Florence-2",
525
  "subtitle": "Microsoft, 2024",
526
  "category_id": "vision_language",
527
+ "description_md": "Vision-language captioning, OCR, and VQA. Three-stage encoder-decoder. 768×768 input.",
528
  "demo": {
529
  "template": "image_to_text",
530
  "config": {
 
619
  }
620
  },
621
  {
622
+ "id": "face3d",
623
+ "name": "3DDFA V2",
624
+ "subtitle": "3D Face Reconstruction, 2020",
625
  "category_id": "face",
626
+ "description_md": "Single-image 3D face reconstruction. Predicts 6 DoF pose + expression parameters.",
627
  "demo": {
628
+ "template": "face_3d",
629
  "config": {
630
+ "input_size": 120
 
 
631
  }
632
  },
633
  "files": [
634
  {
635
+ "name": "3DDFA_V2.mlpackage.zip",
636
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/face3d/3DDFA_V2.mlpackage.zip",
637
  "archive": "zip",
638
+ "size_bytes": 6083375,
639
+ "sha256": "0f715dc220c046f558e3b8fc65246df9a2eec77182830a16628783430cdacdc8",
640
  "compute_units": "all",
641
  "kind": "model"
642
  }
 
647
  },
648
  "license": {
649
  "name": "MIT",
650
+ "url": "https://github.com/cleardusk/3DDFA_V2"
651
  },
652
  "upstream": {
653
+ "name": "cleardusk/3DDFA_V2",
654
+ "url": "https://github.com/cleardusk/3DDFA_V2",
655
+ "year": 2020
656
  }
657
  },
658
  {
 
660
  "name": "Hyper-SD (1-Step)",
661
  "subtitle": "ByteDance, 2024",
662
  "category_id": "generation",
663
+ "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512. Chunked UNet (6-bit).",
664
  "demo": {
665
  "template": "text_to_image",
666
  "config": {
 
684
  "archive": "zip",
685
  "size_bytes": 226397794,
686
  "sha256": "201b0fcc3573811aac6a4e8545c695bc4fb2f7710ea0d60c227919d87b37687e",
687
+ "compute_units": "cpuAndGPU",
688
  "kind": "model"
689
  },
690
  {
 
711
  "archive": "zip",
712
  "size_bytes": 91282754,
713
  "sha256": "1260371542d845a2261ed2de36c5fe3e9ccb740a6ceb59b1990705d125e8cf66",
714
+ "compute_units": "cpuAndGPU",
715
  "kind": "model"
716
  },
717
  {
 
748
  "name": "MatAnyone",
749
  "subtitle": "Video Matting, 2025",
750
  "category_id": "video",
751
+ "description_md": "Temporally consistent video matting. 5-model pipeline with memory propagation.",
752
  "demo": {
753
  "template": "video_matting",
754
  "config": {
755
  "frame_size": 512,
756
+ "encoder": "MatAnyone_encoder.mlpackage.zip",
757
+ "mask_encoder": "MatAnyone_mask_encoder.mlpackage.zip",
758
+ "read_first": "MatAnyone_read_first.mlpackage.zip",
759
+ "read": "MatAnyone_read.mlpackage.zip",
760
+ "decoder": "MatAnyone_decoder.mlpackage.zip"
761
  }
762
  },
763
  "files": [
 
767
  "archive": "zip",
768
  "size_bytes": 17306121,
769
  "sha256": "97ffd6bc4611f9a3351dc890fc00954ba48171e517e66a39f7a5f1f38110dfda",
770
+ "compute_units": "cpuAndGPU",
771
  "kind": "model"
772
  },
773
  {
 
776
  "archive": "zip",
777
  "size_bytes": 16819866,
778
  "sha256": "ba67559188ffc64d8e46418c051c6a55815d4482def17519fa518daac7d5a911",
779
+ "compute_units": "cpuAndGPU",
780
  "kind": "model"
781
  },
782
  {
 
803
  "archive": "zip",
804
  "size_bytes": 8807630,
805
  "sha256": "67136aa67000e604838fe9aa7de151c514ef84f0b83f1da0f043cf70652d28eb",
806
+ "compute_units": "cpuAndGPU",
807
  "kind": "model"
808
  }
809
  ],
 
826
  "name": "HTDemucs",
827
  "subtitle": "Audio Source Separation",
828
  "category_id": "audio",
829
+ "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, FP32.",
830
  "demo": {
831
  "template": "audio_in_out",
832
  "config": {
 
870
  "name": "Kokoro-82M",
871
  "subtitle": "Multilingual TTS",
872
  "category_id": "speech",
873
+ "description_md": "English + Japanese text-to-speech. 24 kHz. StyleTTS2 + iSTFTNet vocoder. Multiple voices.",
874
  "demo": {
875
  "template": "text_to_audio",
876
  "config": {
 
950
  "name": "Stable Audio Open",
951
  "subtitle": "Text-to-Music, 2024",
952
  "category_id": "speech",
953
+ "description_md": "Text-to-music. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 + Oobleck VAE.",
954
  "demo": {
955
  "template": "text_to_audio",
956
  "config": {
 
1019
  }
1020
  },
1021
  {
1022
+ "id": "openvoice",
1023
+ "name": "OpenVoice V2",
1024
+ "subtitle": "Voice Cloning",
1025
  "category_id": "audio",
1026
+ "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio.",
1027
  "demo": {
1028
+ "template": "audio_in_out",
1029
  "config": {
1030
  "sample_rate": 22050,
1031
+ "output_stems": [
1032
+ "converted"
1033
+ ]
 
 
1034
  }
1035
  },
1036
  "files": [
1037
  {
1038
+ "name": "OpenVoice_SpeakerEncoder.mlpackage.zip",
1039
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_SpeakerEncoder.mlpackage.zip",
1040
  "archive": "zip",
1041
+ "size_bytes": 1519880,
1042
+ "sha256": "c3f2a96aaf5ecb5c5afc62b3d3dfbd47dc7ae64bc3edb7aa68befb54aef74459",
1043
+ "compute_units": "cpuAndGPU",
1044
+ "kind": "model"
1045
+ },
1046
+ {
1047
+ "name": "OpenVoice_VoiceConverter.mlpackage.zip",
1048
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_VoiceConverter.mlpackage.zip",
1049
+ "archive": "zip",
1050
+ "size_bytes": 59799630,
1051
+ "sha256": "ef3ce8a2d1564aefa13830d7d0ca43f85e0aa62d5f59622c8bc456c307ab5e05",
1052
+ "compute_units": "cpuAndGPU",
1053
  "kind": "model"
1054
  }
1055
  ],
1056
  "requirements": {
1057
  "min_ios": "17.0",
1058
+ "min_ram_mb": 500
1059
  },
1060
  "license": {
1061
+ "name": "MIT",
1062
+ "url": "https://github.com/myshell-ai/OpenVoice"
1063
  },
1064
  "upstream": {
1065
+ "name": "myshell-ai/OpenVoice",
1066
+ "url": "https://github.com/myshell-ai/OpenVoice",
1067
+ "year": 2023
1068
  }
1069
  },
1070
  {
 
1072
  "name": "Pyannote Diarization",
1073
  "subtitle": "Speaker Identification",
1074
  "category_id": "audio",
1075
+ "description_md": "Speaker diarization: who spoke when. 16 kHz mono, 10s segments.",
1076
  "demo": {
1077
  "template": "audio_in_out",
1078
  "config": {
 
1107
  "year": 2021
1108
  }
1109
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1110
  {
1111
  "id": "realesrgan",
1112
  "name": "Real-ESRGAN 4x",
 
1188
  "name": "RF-DETR Nano",
1189
  "subtitle": "Object Detection, 2025",
1190
  "category_id": "detection",
1191
+ "description_md": "End-to-end transformer detector. 384×384 input. 300 queries, 91 classes (COCO + background). No NMS needed.",
1192
  "demo": {
1193
  "template": "image_detection",
1194
  "config": {
 
1268
  "name": "MobileSAM",
1269
  "subtitle": "Segment Anything, 2023",
1270
  "category_id": "segmentation",
1271
+ "description_md": "Lightweight Segment Anything. Tap any point to generate a segmentation mask. ViT-Tiny encoder + lightweight decoder. ~60× smaller than SAM.",
1272
  "demo": {
1273
  "template": "segment_anything",
1274
  "config": {
1275
+ "encoder": "MobileSAM.zip",
1276
+ "decoder": "MobileSAM.zip",
1277
  "input_size": 1024
1278
  }
1279
  },