mlboydaisuke commited on
Commit
7f94c79
·
verified ·
1 Parent(s): faa0b7b

Upload models.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. models.json +465 -627
models.json CHANGED
@@ -3,458 +3,248 @@
3
  "updated_at": "2026-04-10",
4
  "min_app_version": "1.0",
5
  "categories": [
6
- {
7
- "id": "llm",
8
- "name": "Large Language Models",
9
- "icon": "bubble.left.and.text.bubble.right",
10
- "order": 0
11
- },
12
- {
13
- "id": "segmentation",
14
- "name": "Segmentation",
15
- "icon": "person.and.background.dotted",
16
- "order": 1
17
- },
18
- {
19
- "id": "enhancement",
20
- "name": "Image Enhancement",
21
- "icon": "wand.and.stars",
22
- "order": 2
23
- },
24
- {
25
- "id": "detection",
26
- "name": "Object Detection",
27
- "icon": "viewfinder",
28
- "order": 3
29
- },
30
- {
31
- "id": "depth",
32
- "name": "Depth & Geometry",
33
- "icon": "cube.transparent",
34
- "order": 4
35
- },
36
- {
37
- "id": "vision_language",
38
- "name": "Vision-Language",
39
- "icon": "text.viewfinder",
40
- "order": 5
41
- },
42
- {
43
- "id": "face",
44
- "name": "Face Processing",
45
- "icon": "face.smiling",
46
- "order": 6
47
- },
48
- {
49
- "id": "generation",
50
- "name": "Image Generation",
51
- "icon": "sparkles",
52
- "order": 7
53
- },
54
- {
55
- "id": "video",
56
- "name": "Video Processing",
57
- "icon": "film",
58
- "order": 8
59
- },
60
- {
61
- "id": "audio",
62
- "name": "Audio Processing",
63
- "icon": "waveform.circle",
64
- "order": 9
65
- },
66
- {
67
- "id": "speech",
68
- "name": "Speech & Music",
69
- "icon": "music.note",
70
- "order": 10
71
- }
72
  ],
73
  "models": [
74
- {
75
- "id": "gemma4_e2b",
76
- "name": "Gemma 4 E2B",
77
- "subtitle": "Google DeepMind, 2025",
78
- "category_id": "llm",
79
- "description_md": "Google's latest on-device multimodal LLM. 2.3B effective parameters with Per-Layer Embeddings. Text + image input, streaming text output. Runs on Apple Neural Engine at ~31 tok/s decode. Supports multi-turn conversations, image understanding, and reasoning.",
80
- "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg",
81
- "demo": {
82
- "template": "chat",
83
- "config": {
84
- "max_tokens": 1024,
85
- "multimodal": true
86
- }
87
- },
88
- "files": [
89
- {
90
- "name": "gemma4-e2b-coreml.zip",
91
- "url": "https://huggingface.co/mlboydaisuke/gemma-4-E2B-coreml/resolve/main/gemma4-e2b-coreml.zip",
92
- "archive": "zip",
93
- "size_bytes": 2700000000,
94
- "sha256": "TODO",
95
- "compute_units": "cpuAndNeuralEngine",
96
- "kind": "model"
97
- }
98
- ],
99
- "requirements": {
100
- "min_ios": "18.0",
101
- "min_ram_mb": 1500,
102
- "device_capabilities": [
103
- "arm64"
104
- ]
105
- },
106
- "license": {
107
- "name": "Gemma",
108
- "url": "https://ai.google.dev/gemma/terms"
109
- },
110
- "upstream": {
111
- "name": "google/gemma-4-e2b",
112
- "url": "https://huggingface.co/google/gemma-4-e2b",
113
- "year": 2025
114
- }
115
- },
116
  {
117
  "id": "rmbg_1_4",
118
  "name": "RMBG-1.4",
119
  "subtitle": "BRIA AI, 2023",
120
  "category_id": "segmentation",
121
- "description_md": "High-quality background removal. Outputs foreground with alpha mask. 1024×1024 input.",
122
  "demo": {
123
  "template": "image_in_out",
124
- "config": {
125
- "input_size": 1024,
126
- "output_type": "mask"
127
- }
128
  },
129
  "files": [
130
  {
131
  "name": "RMBG_1_4.mlpackage.zip",
132
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/rmbg/RMBG_1_4.mlpackage.zip",
133
  "archive": "zip",
134
- "size_bytes": 38771210,
135
- "sha256": "a80dbb5f04c922a8fa698c38592e4e52af4e62471d70bc7c59c28a3355a1da95",
136
- "compute_units": "cpuOnly",
137
  "kind": "model"
138
  }
139
  ],
140
- "requirements": {
141
- "min_ios": "17.0",
142
- "min_ram_mb": 300
143
- },
144
- "license": {
145
- "name": "Apache-2.0",
146
- "url": "https://huggingface.co/briaai/RMBG-1.4"
147
- },
148
- "upstream": {
149
- "name": "briaai/RMBG-1.4",
150
- "url": "https://huggingface.co/briaai/RMBG-1.4",
151
- "year": 2023
152
- }
153
  },
154
  {
155
  "id": "ddcolor",
156
- "name": "DDColor Tiny",
157
  "subtitle": "Image Colorization, 2023",
158
  "category_id": "enhancement",
159
- "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input.",
160
  "demo": {
161
  "template": "image_in_out",
162
- "config": {
163
- "input_size": 512,
164
- "output_type": "lab_ab"
165
- }
166
  },
167
  "files": [
168
  {
169
- "name": "DDColor_Tiny.mlpackage.zip",
170
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/ddcolor/DDColor_Tiny.mlpackage.zip",
171
  "archive": "zip",
172
- "size_bytes": 212344570,
173
- "sha256": "bfecea37d66005f602efe13978360b8e4707923234c3d1d00beeb4e36cb1b02c",
174
  "compute_units": "all",
175
  "kind": "model"
176
  }
177
  ],
178
- "requirements": {
179
- "min_ios": "17.0",
180
- "min_ram_mb": 400
181
- },
182
- "license": {
183
- "name": "Apache-2.0",
184
- "url": "https://github.com/piddnad/DDColor"
185
- },
186
- "upstream": {
187
- "name": "piddnad/DDColor",
188
- "url": "https://github.com/piddnad/DDColor",
189
- "year": 2023
190
- }
191
  },
192
  {
193
  "id": "sinsr",
194
  "name": "SinSR",
195
  "subtitle": "Single-Step Super-Resolution, 2024",
196
  "category_id": "enhancement",
197
- "description_md": "4× super-resolution via single-step diffusion. 256→1024. Swin Transformer denoiser (FP32).",
198
  "demo": {
199
  "template": "image_in_out",
200
- "config": {
201
- "input_size": 256,
202
- "output_type": "sinsr"
203
- }
204
  },
205
  "files": [
206
  {
207
  "name": "SinSR_Encoder.mlpackage.zip",
208
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Encoder.mlpackage.zip",
209
  "archive": "zip",
210
- "size_bytes": 41246338,
211
- "sha256": "fdec09d17561ec1bb5a2e829683d48c2b45e76b876285619a6e29a3523b8b7e2",
212
  "compute_units": "cpuAndGPU",
213
  "kind": "model"
214
  },
215
  {
216
  "name": "SinSR_Denoiser.mlpackage.zip",
217
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Denoiser.mlpackage.zip",
218
  "archive": "zip",
219
- "size_bytes": 440014511,
220
- "sha256": "b31374c2d539b2cdd81499d6062c801ca00e405f5a67507cd609d14e2d6d4beb",
221
  "compute_units": "cpuOnly",
222
  "kind": "model"
223
  },
224
  {
225
  "name": "SinSR_Decoder.mlpackage.zip",
226
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Decoder.mlpackage.zip",
227
  "archive": "zip",
228
- "size_bytes": 60880285,
229
- "sha256": "b8b9a7b52d6b240cf9fb3352b286ea83eb984fd73f5dd81c9f034f0016a5cb8c",
230
  "compute_units": "cpuAndGPU",
231
  "kind": "model"
232
  }
233
  ],
234
- "requirements": {
235
- "min_ios": "17.0",
236
- "min_ram_mb": 600
237
- },
238
- "license": {
239
- "name": "Apache-2.0",
240
- "url": "https://github.com/wyf0912/SinSR"
241
- },
242
- "upstream": {
243
- "name": "wyf0912/SinSR",
244
- "url": "https://github.com/wyf0912/SinSR",
245
- "year": 2024
246
- }
247
  },
248
  {
249
- "id": "yolo26s",
250
- "name": "YOLO26s",
251
- "subtitle": "NMS-Free Detection, 2026",
252
- "category_id": "detection",
253
- "description_md": "NMS-free object detection. 640×640 input, 80 COCO classes.",
254
  "demo": {
255
- "template": "image_detection",
256
- "config": {
257
- "input_size": 640,
258
- "confidence_threshold": 0.25
259
- }
260
  },
261
  "files": [
262
  {
263
- "name": "yolo26s.mlpackage.zip",
264
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolo26/yolo26s.mlpackage.zip",
265
  "archive": "zip",
266
- "size_bytes": 17697581,
267
- "sha256": "0ec02fb0cf2dbd6e09601cbbc00a9734156ea4c2a52b0da23a984337074c6fd4",
268
  "compute_units": "all",
269
  "kind": "model"
270
  }
271
  ],
272
- "requirements": {
273
- "min_ios": "17.0",
274
- "min_ram_mb": 300
275
- },
276
- "license": {
277
- "name": "AGPL-3.0",
278
- "url": "https://github.com/ultralytics/ultralytics"
279
- },
280
- "upstream": {
281
- "name": "ultralytics/ultralytics",
282
- "url": "https://github.com/ultralytics/ultralytics",
283
- "year": 2026
284
- }
285
  },
286
  {
287
- "id": "yolo11s",
288
- "name": "YOLO11s",
289
- "subtitle": "Object Detection, 2024",
290
  "category_id": "detection",
291
- "description_md": "YOLO11 small detection with Vision framework NMS. 640×640 input.",
292
  "demo": {
293
  "template": "image_detection",
294
- "config": {
295
- "input_size": 640,
296
- "confidence_threshold": 0.25
297
- }
298
  },
299
  "files": [
300
  {
301
- "name": "yolo11s.mlpackage.zip",
302
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolov9/yolo11s.mlpackage.zip",
303
  "archive": "zip",
304
- "size_bytes": 17580204,
305
- "sha256": "79e82aacc3ad20fc1eb990df6979fae9b927d4b06f33bd20ec0e1c0dcb7d1f6b",
306
  "compute_units": "all",
307
  "kind": "model"
308
  }
309
  ],
310
- "requirements": {
311
- "min_ios": "17.0",
312
- "min_ram_mb": 300
313
- },
314
- "license": {
315
- "name": "AGPL-3.0",
316
- "url": "https://github.com/ultralytics/ultralytics"
317
- },
318
- "upstream": {
319
- "name": "ultralytics/ultralytics",
320
- "url": "https://github.com/ultralytics/ultralytics",
321
- "year": 2024
322
- }
323
  },
324
  {
325
- "id": "yolov10n",
326
- "name": "YOLOv10n",
327
  "subtitle": "Object Detection, 2024",
328
  "category_id": "detection",
329
- "description_md": "YOLOv10 nano. 640×640 input. Dual-assignment strategy.",
330
  "demo": {
331
  "template": "image_detection",
332
- "config": {
333
- "input_size": 640,
334
- "confidence_threshold": 0.25
335
- }
336
  },
337
  "files": [
338
  {
339
- "name": "YOLOv10N.mlpackage.zip",
340
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolov10/YOLOv10N.mlpackage.zip",
341
  "archive": "zip",
342
- "size_bytes": 4309168,
343
- "sha256": "9a687144a6b0b764f508c8f544fe46b6674629b8f09a1e99d8ca69b0be899891",
344
  "compute_units": "all",
345
  "kind": "model"
346
  }
347
  ],
348
- "requirements": {
349
- "min_ios": "17.0",
350
- "min_ram_mb": 300
351
- },
352
- "license": {
353
- "name": "AGPL-3.0",
354
- "url": "https://github.com/THU-MIG/yolov10"
355
- },
356
- "upstream": {
357
- "name": "THU-MIG/yolov10",
358
- "url": "https://github.com/THU-MIG/yolov10",
359
- "year": 2024
360
- }
361
  },
362
  {
363
- "id": "yoloworld",
364
- "name": "YOLO-World",
365
- "subtitle": "Open-Vocabulary Detection, 2024",
366
  "category_id": "detection",
367
- "description_md": "Open-vocabulary detection. Type any text query. YOLO-World V2-S + CLIP ViT-B/32.",
368
  "demo": {
369
- "template": "open_vocab_detection",
370
- "config": {
371
- "input_size": 640
372
- }
373
  },
374
  "files": [
375
  {
376
- "name": "yoloworld_detector.mlpackage.zip",
377
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/yoloworld_detector.mlpackage.zip",
378
  "archive": "zip",
379
- "size_bytes": 23710620,
380
- "sha256": "611d299ae74c83f90a5cc9f4585709859d5db735baa8ade721e0c2d99cd5af92",
381
  "compute_units": "all",
382
  "kind": "model"
383
- },
384
- {
385
- "name": "clip_text_encoder.mlpackage.zip",
386
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/clip_text_encoder.mlpackage.zip",
387
- "archive": "zip",
388
- "size_bytes": 116681932,
389
- "sha256": "45770a743297e8c2a57cc330d4f5c80f47734263680895b33b593b50dd2c382b",
390
- "compute_units": "cpuOnly",
391
- "kind": "model"
392
  }
393
  ],
394
- "requirements": {
395
- "min_ios": "17.0",
396
- "min_ram_mb": 600
397
- },
398
- "license": {
399
- "name": "GPL-3.0",
400
- "url": "https://github.com/AILab-CVC/YOLO-World"
401
- },
402
- "upstream": {
403
- "name": "AILab-CVC/YOLO-World",
404
- "url": "https://github.com/AILab-CVC/YOLO-World",
405
- "year": 2024
406
- }
407
  },
408
  {
409
  "id": "moge2_vitb_normal_504",
410
  "name": "MoGe-2 ViT-B (504×504)",
411
  "subtitle": "Microsoft, CVPR 2025",
412
  "category_id": "depth",
413
- "description_md": "Monocular geometry from a single image. Metric depth, surface normals, confidence mask. DINOv2 ViT-B/14 backbone.",
414
  "demo": {
415
  "template": "depth_visualization",
416
  "config": {
417
  "input_size": 504,
418
- "output_keys": [
419
- "depth",
420
- "normal",
421
- "mask",
422
- "metric_scale"
423
- ],
424
  "depth_unit": "meters"
425
  }
426
  },
427
  "files": [
428
  {
429
  "name": "MoGe2_ViTB_Normal_504.mlpackage.zip",
430
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/moge2/MoGe2_ViTB_Normal_504.mlpackage.zip",
431
  "archive": "zip",
432
- "size_bytes": 193312088,
433
- "sha256": "f60cfb4804707a489d99e24453188cd31ddcabb299bbf6da4507edc9cecbf9e7",
434
  "compute_units": "all",
435
  "kind": "model"
436
  }
437
  ],
438
- "requirements": {
439
- "min_ios": "17.0",
440
- "min_ram_mb": 600
441
- },
442
- "license": {
443
- "name": "MIT",
444
- "url": "https://github.com/microsoft/MoGe/blob/main/LICENSE"
445
- },
446
- "upstream": {
447
- "name": "microsoft/MoGe",
448
- "url": "https://github.com/microsoft/MoGe",
449
- "year": 2025
450
- }
451
  },
452
  {
453
  "id": "siglip",
454
  "name": "SigLIP",
455
  "subtitle": "Zero-Shot Classification, 2023",
456
  "category_id": "vision_language",
457
- "description_md": "Zero-shot image classification. Dual encoder (image + text). 224×224 input.",
458
  "demo": {
459
  "template": "zero_shot_classify",
460
  "config": {
@@ -469,50 +259,40 @@
469
  "files": [
470
  {
471
  "name": "SigLIP_ImageEncoder.mlpackage.zip",
472
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/SigLIP_ImageEncoder.mlpackage.zip",
473
  "archive": "zip",
474
- "size_bytes": 170352400,
475
- "sha256": "98f6abf5f4aa145199f4ae22305f9c1d5929eee6b126daad84783b2b2090ee24",
476
  "compute_units": "cpuOnly",
477
  "kind": "model"
478
  },
479
  {
480
  "name": "SigLIP_TextEncoder.mlpackage.zip",
481
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/SigLIP_TextEncoder.mlpackage.zip",
482
  "archive": "zip",
483
- "size_bytes": 203975769,
484
- "sha256": "9dead2d58705838aef7ad83c3bf4036698c78d872ca1cdd04f2c4a6272009ccf",
485
  "compute_units": "cpuOnly",
486
  "kind": "model"
487
  },
488
  {
489
  "name": "siglip_vocab.json",
490
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/siglip_vocab.json",
491
- "size_bytes": 673754,
492
- "sha256": "b94b3a58e04f619936b3890804dff7c478522c07515ff748cf127c5443ee5229",
493
  "kind": "vocab"
494
  }
495
  ],
496
- "requirements": {
497
- "min_ios": "17.0",
498
- "min_ram_mb": 800
499
- },
500
- "license": {
501
- "name": "Apache-2.0",
502
- "url": "https://github.com/google-research/big_vision"
503
- },
504
- "upstream": {
505
- "name": "google-research/big_vision",
506
- "url": "https://github.com/google-research/big_vision",
507
- "year": 2023
508
- }
509
  },
510
  {
511
  "id": "florence2",
512
  "name": "Florence-2",
513
  "subtitle": "Microsoft, 2024",
514
  "category_id": "vision_language",
515
- "description_md": "Vision-language captioning, OCR, and VQA. Three-stage encoder-decoder. 768×768 input.",
516
  "demo": {
517
  "template": "image_to_text",
518
  "config": {
@@ -523,132 +303,83 @@
523
  "decoder": "Florence2Decoder.mlpackage.zip",
524
  "vocab_file": "florence2_vocab.json",
525
  "tasks": {
526
- "caption": [
527
- 0,
528
- 2264,
529
- 473,
530
- 5,
531
- 2274,
532
- 6190,
533
- 116,
534
- 2
535
- ],
536
- "detailed_caption": [
537
- 0,
538
- 2264,
539
- 473,
540
- 5,
541
- 31962,
542
- 2274,
543
- 6190,
544
- 116,
545
- 2
546
- ],
547
- "ocr": [
548
- 0,
549
- 2264,
550
- 473,
551
- 5,
552
- 71307,
553
- 116,
554
- 2
555
- ]
556
  }
557
  }
558
  },
559
  "files": [
560
  {
561
  "name": "Florence2VisionEncoder.mlpackage.zip",
562
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2VisionEncoder.mlpackage.zip",
563
  "archive": "zip",
564
- "size_bytes": 81198683,
565
- "sha256": "9422f189c21220a0f9966eb9d780856772feb55597dcc579fc4e3c88990d0046",
566
  "compute_units": "cpuOnly",
567
  "kind": "model"
568
  },
569
  {
570
  "name": "Florence2TextEncoder.mlpackage.zip",
571
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2TextEncoder.mlpackage.zip",
572
  "archive": "zip",
573
- "size_bytes": 72742890,
574
- "sha256": "f985deeef0408ea8aac33ac4f5c6d9635cd9c64c98b53f85031db6e27f3bfd92",
575
  "compute_units": "cpuOnly",
576
  "kind": "model"
577
  },
578
  {
579
  "name": "Florence2Decoder.mlpackage.zip",
580
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2Decoder.mlpackage.zip",
581
  "archive": "zip",
582
- "size_bytes": 85329746,
583
- "sha256": "fe85a6faab5281272bcd79dabfbf87d60ba1a78dd9455e2bf71c67a134d61dc5",
584
  "compute_units": "cpuOnly",
585
  "kind": "model"
586
  },
587
  {
588
  "name": "florence2_vocab.json",
589
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/florence2_vocab.json",
590
- "size_bytes": 999352,
591
- "sha256": "861fee9af5520403f6dbb4940d6af6627f1481b71cdc4a870f1f61344e57e645",
592
  "kind": "vocab"
593
  }
594
  ],
595
- "requirements": {
596
- "min_ios": "17.0",
597
- "min_ram_mb": 1200
598
- },
599
- "license": {
600
- "name": "MIT",
601
- "url": "https://huggingface.co/microsoft/Florence-2-base"
602
- },
603
- "upstream": {
604
- "name": "microsoft/Florence-2",
605
- "url": "https://huggingface.co/microsoft/Florence-2-base",
606
- "year": 2024
607
- }
608
  },
609
  {
610
- "id": "face3d",
611
- "name": "3DDFA V2",
612
- "subtitle": "3D Face Reconstruction, 2020",
613
  "category_id": "face",
614
- "description_md": "Single-image 3D face reconstruction. Predicts 6 DoF pose + expression parameters.",
615
  "demo": {
616
- "template": "face_3d",
617
- "config": {
618
- "input_size": 120
619
- }
620
  },
621
  "files": [
622
  {
623
- "name": "3DDFA_V2.mlpackage.zip",
624
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/face3d/3DDFA_V2.mlpackage.zip",
625
  "archive": "zip",
626
- "size_bytes": 6083375,
627
- "sha256": "0f715dc220c046f558e3b8fc65246df9a2eec77182830a16628783430cdacdc8",
628
  "compute_units": "all",
629
  "kind": "model"
630
  }
631
  ],
632
- "requirements": {
633
- "min_ios": "17.0",
634
- "min_ram_mb": 200
635
- },
636
- "license": {
637
- "name": "MIT",
638
- "url": "https://github.com/cleardusk/3DDFA_V2"
639
- },
640
- "upstream": {
641
- "name": "cleardusk/3DDFA_V2",
642
- "url": "https://github.com/cleardusk/3DDFA_V2",
643
- "year": 2020
644
- }
645
  },
646
  {
647
  "id": "hypersd",
648
  "name": "Hyper-SD (1-Step)",
649
  "subtitle": "ByteDance, 2024",
650
  "category_id": "generation",
651
- "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512. Chunked UNet (6-bit).",
652
  "demo": {
653
  "template": "text_to_image",
654
  "config": {
@@ -668,277 +399,226 @@
668
  "files": [
669
  {
670
  "name": "HyperSDTextEncoder.mlpackage.zip",
671
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDTextEncoder.mlpackage.zip",
672
  "archive": "zip",
673
- "size_bytes": 226397794,
674
- "sha256": "201b0fcc3573811aac6a4e8545c695bc4fb2f7710ea0d60c227919d87b37687e",
675
- "compute_units": "cpuAndGPU",
676
  "kind": "model"
677
  },
678
  {
679
  "name": "HyperSDUnetChunk1.mlpackage.zip",
680
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDUnetChunk1.mlpackage.zip",
681
  "archive": "zip",
682
- "size_bytes": 324819653,
683
- "sha256": "279da11b8231aeeb9045f6ceabebb3a68c20a1b86ecc81aa6914b77ce76d5203",
684
  "compute_units": "cpuAndNeuralEngine",
685
  "kind": "model"
686
  },
687
  {
688
  "name": "HyperSDUnetChunk2.mlpackage.zip",
689
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDUnetChunk2.mlpackage.zip",
690
  "archive": "zip",
691
- "size_bytes": 304530429,
692
- "sha256": "0a700d11a105da589bb3e5666e38b9c72fa283149951b253fc11722e70e72faa",
693
  "compute_units": "cpuAndNeuralEngine",
694
  "kind": "model"
695
  },
696
  {
697
  "name": "HyperSDVAEDecoder.mlpackage.zip",
698
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDVAEDecoder.mlpackage.zip",
699
  "archive": "zip",
700
- "size_bytes": 91282754,
701
- "sha256": "1260371542d845a2261ed2de36c5fe3e9ccb740a6ceb59b1990705d125e8cf66",
702
- "compute_units": "cpuAndGPU",
703
  "kind": "model"
704
  },
705
  {
706
  "name": "vocab.json",
707
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/vocab.json",
708
- "size_bytes": 1059962,
709
- "sha256": "e089ad92ba36837a0d31433e555c8f45fe601ab5c221d4f607ded32d9f7a4349",
710
  "kind": "vocab"
711
  },
712
  {
713
  "name": "merges.txt",
714
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/merges.txt",
715
- "size_bytes": 524619,
716
- "sha256": "9fd691f7c8039210e0fced15865466c65820d09b63988b0174bfe25de299051a",
717
  "kind": "vocab"
718
  }
719
  ],
720
- "requirements": {
721
- "min_ios": "17.0",
722
- "min_ram_mb": 1000
723
- },
724
- "license": {
725
- "name": "OpenRAIL-M",
726
- "url": "https://huggingface.co/ByteDance/Hyper-SD"
727
- },
728
- "upstream": {
729
- "name": "ByteDance/Hyper-SD",
730
- "url": "https://huggingface.co/ByteDance/Hyper-SD",
731
- "year": 2024
732
- }
733
  },
734
  {
735
  "id": "matanyone",
736
  "name": "MatAnyone",
737
  "subtitle": "Video Matting, 2025",
738
  "category_id": "video",
739
- "description_md": "Temporally consistent video matting. 5-model pipeline with memory propagation.",
740
  "demo": {
741
  "template": "video_matting",
742
  "config": {
743
  "frame_size": 512,
744
- "encoder": "MatAnyone_encoder.mlpackage.zip",
745
- "mask_encoder": "MatAnyone_mask_encoder.mlpackage.zip",
746
- "read_first": "MatAnyone_read_first.mlpackage.zip",
747
- "read": "MatAnyone_read.mlpackage.zip",
748
- "decoder": "MatAnyone_decoder.mlpackage.zip"
749
  }
750
  },
751
  "files": [
752
  {
753
- "name": "MatAnyone_encoder.mlpackage.zip",
754
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_encoder.mlpackage.zip",
755
  "archive": "zip",
756
- "size_bytes": 17306121,
757
- "sha256": "97ffd6bc4611f9a3351dc890fc00954ba48171e517e66a39f7a5f1f38110dfda",
758
- "compute_units": "cpuAndGPU",
759
  "kind": "model"
760
  },
761
  {
762
- "name": "MatAnyone_mask_encoder.mlpackage.zip",
763
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_mask_encoder.mlpackage.zip",
764
  "archive": "zip",
765
- "size_bytes": 16819866,
766
- "sha256": "ba67559188ffc64d8e46418c051c6a55815d4482def17519fa518daac7d5a911",
767
- "compute_units": "cpuAndGPU",
768
  "kind": "model"
769
  },
770
  {
771
- "name": "MatAnyone_read_first.mlpackage.zip",
772
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_read_first.mlpackage.zip",
773
  "archive": "zip",
774
- "size_bytes": 21991849,
775
- "sha256": "34daf7227dbcec7373a3fef175259fa7ec631ed8cb91d5595ca57ee9b22df7bb",
776
  "compute_units": "cpuOnly",
777
  "kind": "model"
778
  },
779
  {
780
- "name": "MatAnyone_read.mlpackage.zip",
781
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_read.mlpackage.zip",
782
  "archive": "zip",
783
- "size_bytes": 22135429,
784
- "sha256": "052e52c0ffb7ff9ede448128950cd4c1c9a96589b6900c82b5104d99addb7fa5",
785
  "compute_units": "cpuOnly",
786
  "kind": "model"
787
  },
788
  {
789
- "name": "MatAnyone_decoder.mlpackage.zip",
790
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_decoder.mlpackage.zip",
791
  "archive": "zip",
792
- "size_bytes": 8807630,
793
- "sha256": "67136aa67000e604838fe9aa7de151c514ef84f0b83f1da0f043cf70652d28eb",
794
- "compute_units": "cpuAndGPU",
795
  "kind": "model"
796
  }
797
  ],
798
- "requirements": {
799
- "min_ios": "17.0",
800
- "min_ram_mb": 800
801
- },
802
- "license": {
803
- "name": "MIT",
804
- "url": "https://github.com/pq-yang/MatAnyone"
805
- },
806
- "upstream": {
807
- "name": "pq-yang/MatAnyone",
808
- "url": "https://github.com/pq-yang/MatAnyone",
809
- "year": 2025
810
- }
811
  },
812
  {
813
  "id": "demucs",
814
  "name": "HTDemucs",
815
  "subtitle": "Audio Source Separation",
816
  "category_id": "audio",
817
- "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, FP32.",
818
  "demo": {
819
  "template": "audio_in_out",
820
  "config": {
821
  "sample_rate": 44100,
822
  "segment_length": 343980,
823
- "output_stems": [
824
- "drums",
825
- "bass",
826
- "vocals",
827
- "other"
828
- ]
829
  }
830
  },
831
  "files": [
832
  {
833
  "name": "HTDemucs_SourceSeparation_F32.mlpackage.zip",
834
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/demucs/HTDemucs_SourceSeparation_F32.mlpackage.zip",
835
  "archive": "zip",
836
- "size_bytes": 79076395,
837
- "sha256": "0fbb941e15a5b2fa425d14fe630ed4c14b6dee72780c1f5b2b05f58803bce5f7",
838
  "compute_units": "cpuOnly",
839
  "kind": "model"
840
  }
841
  ],
842
- "requirements": {
843
- "min_ios": "17.0",
844
- "min_ram_mb": 1000
845
- },
846
- "license": {
847
- "name": "MIT",
848
- "url": "https://github.com/adefossez/demucs"
849
- },
850
- "upstream": {
851
- "name": "adefossez/demucs",
852
- "url": "https://github.com/adefossez/demucs",
853
- "year": 2021
854
- }
855
  },
856
  {
857
  "id": "kokoro",
858
  "name": "Kokoro-82M",
859
  "subtitle": "Multilingual TTS",
860
  "category_id": "speech",
861
- "description_md": "English + Japanese text-to-speech. 24 kHz. StyleTTS2 + iSTFTNet vocoder. Multiple voices.",
862
  "demo": {
863
  "template": "text_to_audio",
864
  "config": {
865
  "mode": "tts",
866
  "sample_rate": 24000,
867
  "vocab_file": "kokoro_vocab.json",
868
- "voices": [
869
- "af_heart",
870
- "af_bella",
871
- "am_michael",
872
- "bf_emma",
873
- "bm_george"
874
- ]
875
  }
876
  },
877
  "files": [
878
  {
879
  "name": "Kokoro_Predictor.mlpackage.zip",
880
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Predictor.mlpackage.zip",
881
  "archive": "zip",
882
- "size_bytes": 72191470,
883
- "sha256": "af1d55dc842980c32b5591a70f603941f11ab60a435bed0c13a107a8ef467bed",
884
  "compute_units": "cpuAndGPU",
885
  "kind": "model"
886
  },
887
  {
888
  "name": "Kokoro_Decoder_128.mlpackage.zip",
889
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_128.mlpackage.zip",
890
  "archive": "zip",
891
- "size_bytes": 229120589,
892
- "sha256": "cece0d072f5ba6aa3f729cf4c76b4de51823bcc65a26ab363c10441c3cd8b306",
893
  "compute_units": "all",
894
  "kind": "model"
895
  },
896
  {
897
  "name": "Kokoro_Decoder_256.mlpackage.zip",
898
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_256.mlpackage.zip",
899
  "archive": "zip",
900
- "size_bytes": 229123438,
901
- "sha256": "36d5e16d5c5ccb500fc96f1b07a1d5ac57b791f8e09e61b78319d76949003efe",
902
  "compute_units": "all",
903
  "kind": "model"
904
  },
905
  {
906
  "name": "Kokoro_Decoder_512.mlpackage.zip",
907
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_512.mlpackage.zip",
908
  "archive": "zip",
909
- "size_bytes": 229128735,
910
- "sha256": "0a44484c327e4fe8443b0bcf104d6964fe3f30d628c9e78aee3f31af7f2475dc",
911
  "compute_units": "all",
912
  "kind": "model"
913
  },
914
  {
915
  "name": "kokoro_vocab.json",
916
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/kokoro_vocab.json",
917
- "size_bytes": 1144,
918
- "sha256": "70abefbe8a1c8865e43e0a43bbdc25b91a33e4aa053479d443ccf23e20a59e5d",
919
  "kind": "vocab"
920
  }
921
  ],
922
- "requirements": {
923
- "min_ios": "17.0",
924
- "min_ram_mb": 1000
925
- },
926
- "license": {
927
- "name": "Apache-2.0",
928
- "url": "https://huggingface.co/hexgrad/Kokoro-82M"
929
- },
930
- "upstream": {
931
- "name": "hexgrad/Kokoro-82M",
932
- "url": "https://huggingface.co/hexgrad/Kokoro-82M",
933
- "year": 2024
934
- }
935
  },
936
  {
937
  "id": "stable_audio",
938
  "name": "Stable Audio Open",
939
  "subtitle": "Text-to-Music, 2024",
940
  "category_id": "speech",
941
- "description_md": "Text-to-music. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 + Oobleck VAE.",
942
  "demo": {
943
  "template": "text_to_audio",
944
  "config": {
@@ -950,150 +630,308 @@
950
  "files": [
951
  {
952
  "name": "StableAudioT5Encoder.mlpackage.zip",
953
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioT5Encoder.mlpackage.zip",
954
  "archive": "zip",
955
- "size_bytes": 98538259,
956
- "sha256": "319a8ba775d309240253ced68a03a3923d0aec9a79f608044f9403bdcfe4b741",
957
  "compute_units": "cpuOnly",
958
  "kind": "model"
959
  },
960
  {
961
  "name": "StableAudioNumberEmbedder.mlpackage.zip",
962
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioNumberEmbedder.mlpackage.zip",
963
  "archive": "zip",
964
- "size_bytes": 376018,
965
- "sha256": "04bdc5de00a2cf1c4a18f80c94f0d74ecfab41f3ad99f2fb7a031d6ff5af75da",
966
  "compute_units": "cpuOnly",
967
  "kind": "model"
968
  },
969
  {
970
  "name": "StableAudioDiT.mlpackage.zip",
971
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioDiT.mlpackage.zip",
972
  "archive": "zip",
973
- "size_bytes": 1265748504,
974
- "sha256": "b17da4fc4df857821d39dbdf7d3bfe7062a2272ab3e5df1284d545afb54047e4",
975
  "compute_units": "cpuOnly",
976
  "kind": "model"
977
  },
978
  {
979
  "name": "StableAudioVAEDecoder.mlpackage.zip",
980
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioVAEDecoder.mlpackage.zip",
981
  "archive": "zip",
982
- "size_bytes": 144960275,
983
- "sha256": "7207544cca9799cc1d6803c5e81badd0bb4527b2d3a64d5cab5700a5f19a9374",
984
  "compute_units": "cpuAndGPU",
985
  "kind": "model"
986
  },
987
  {
988
  "name": "t5_vocab.json",
989
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/t5_vocab.json",
990
- "size_bytes": 749757,
991
- "sha256": "7c9ff3ac1b3dbcaa617ee659f2df68688cfd44f1a5eb3be3fa0a2f56c749d56a",
992
  "kind": "vocab"
993
  }
994
  ],
995
- "requirements": {
996
- "min_ios": "17.0",
997
- "min_ram_mb": 1200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
998
  },
999
- "license": {
1000
- "name": "custom",
1001
- "url": "https://huggingface.co/stabilityai/stable-audio-open-small"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
  },
1003
- "upstream": {
1004
- "name": "stabilityai/stable-audio-open-small",
1005
- "url": "https://huggingface.co/stabilityai/stable-audio-open-small",
1006
- "year": 2024
1007
- }
 
 
 
 
 
 
 
 
 
1008
  },
1009
  {
1010
  "id": "openvoice",
1011
  "name": "OpenVoice V2",
1012
  "subtitle": "Voice Cloning",
1013
  "category_id": "audio",
1014
- "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio.",
1015
  "demo": {
1016
  "template": "audio_in_out",
1017
  "config": {
1018
  "sample_rate": 22050,
1019
- "output_stems": [
1020
- "converted"
1021
- ]
1022
  }
1023
  },
1024
  "files": [
1025
  {
1026
  "name": "OpenVoice_SpeakerEncoder.mlpackage.zip",
1027
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_SpeakerEncoder.mlpackage.zip",
1028
  "archive": "zip",
1029
- "size_bytes": 1519880,
1030
- "sha256": "c3f2a96aaf5ecb5c5afc62b3d3dfbd47dc7ae64bc3edb7aa68befb54aef74459",
1031
  "compute_units": "cpuAndGPU",
1032
  "kind": "model"
1033
  },
1034
  {
1035
  "name": "OpenVoice_VoiceConverter.mlpackage.zip",
1036
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_VoiceConverter.mlpackage.zip",
1037
  "archive": "zip",
1038
- "size_bytes": 59799630,
1039
- "sha256": "ef3ce8a2d1564aefa13830d7d0ca43f85e0aa62d5f59622c8bc456c307ab5e05",
1040
  "compute_units": "cpuAndGPU",
1041
  "kind": "model"
1042
  }
1043
  ],
1044
- "requirements": {
1045
- "min_ios": "17.0",
1046
- "min_ram_mb": 500
 
 
 
 
 
 
 
 
 
 
1047
  },
1048
- "license": {
1049
- "name": "MIT",
1050
- "url": "https://github.com/myshell-ai/OpenVoice"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1051
  },
1052
- "upstream": {
1053
- "name": "myshell-ai/OpenVoice",
1054
- "url": "https://github.com/myshell-ai/OpenVoice",
1055
- "year": 2023
1056
- }
 
 
 
 
 
 
 
 
 
1057
  },
1058
  {
1059
- "id": "diarization",
1060
- "name": "Pyannote Diarization",
1061
- "subtitle": "Speaker Identification",
1062
- "category_id": "audio",
1063
- "description_md": "Speaker diarization: who spoke when. 16 kHz mono, 10s segments.",
1064
  "demo": {
1065
- "template": "audio_in_out",
1066
  "config": {
1067
- "sample_rate": 16000,
1068
- "output_stems": [
1069
- "speaker_timeline"
1070
- ]
 
1071
  }
1072
  },
1073
  "files": [
1074
  {
1075
- "name": "SpeakerSegmentation.mlpackage.zip",
1076
- "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/diarization/SpeakerSegmentation.mlpackage.zip",
1077
  "archive": "zip",
1078
- "size_bytes": 5327137,
1079
- "sha256": "dcfa2b98900f2b99029abfb593644b70418186a6ec2e94c9a79c2b3d7a84378a",
1080
- "compute_units": "cpuAndGPU",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1081
  "kind": "model"
1082
  }
1083
  ],
1084
- "requirements": {
1085
- "min_ios": "17.0",
1086
- "min_ram_mb": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1087
  },
1088
- "license": {
1089
- "name": "MIT",
1090
- "url": "https://github.com/pyannote/pyannote-audio"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1091
  },
1092
- "upstream": {
1093
- "name": "pyannote/pyannote-audio",
1094
- "url": "https://github.com/pyannote/pyannote-audio",
1095
- "year": 2021
1096
- }
 
 
 
 
 
 
 
 
 
1097
  }
1098
  ]
1099
- }
 
3
  "updated_at": "2026-04-10",
4
  "min_app_version": "1.0",
5
  "categories": [
6
+ { "id": "segmentation", "name": "Segmentation", "icon": "person.and.background.dotted", "order": 1 },
7
+ { "id": "enhancement", "name": "Image Enhancement", "icon": "wand.and.stars", "order": 2 },
8
+ { "id": "detection", "name": "Object Detection", "icon": "viewfinder", "order": 3 },
9
+ { "id": "depth", "name": "Depth & Geometry", "icon": "cube.transparent", "order": 4 },
10
+ { "id": "vision_language", "name": "Vision-Language", "icon": "text.viewfinder", "order": 5 },
11
+ { "id": "face", "name": "Face Processing", "icon": "face.smiling", "order": 6 },
12
+ { "id": "generation", "name": "Image Generation", "icon": "sparkles", "order": 7 },
13
+ { "id": "video", "name": "Video Processing", "icon": "film", "order": 8 },
14
+ { "id": "audio", "name": "Audio Processing", "icon": "waveform.circle", "order": 9 },
15
+ { "id": "speech", "name": "Speech & Music", "icon": "music.note", "order": 10 },
16
+ { "id": "inpainting", "name": "Inpainting", "icon": "eraser", "order": 11 },
17
+ { "id": "restoration", "name": "Face Restoration", "icon": "face.smiling.inverse", "order": 12 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ],
19
  "models": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  {
21
  "id": "rmbg_1_4",
22
  "name": "RMBG-1.4",
23
  "subtitle": "BRIA AI, 2023",
24
  "category_id": "segmentation",
25
+ "description_md": "High-quality background removal. Outputs foreground with alpha mask. INT8 quantized U-Net, 1024×1024 input.",
26
  "demo": {
27
  "template": "image_in_out",
28
+ "config": { "input_size": 1024, "output_type": "mask" }
 
 
 
29
  },
30
  "files": [
31
  {
32
  "name": "RMBG_1_4.mlpackage.zip",
33
+ "url": "TODO",
34
  "archive": "zip",
35
+ "size_bytes": 50000000,
36
+ "sha256": "TODO",
37
+ "compute_units": "cpuAndGPU",
38
  "kind": "model"
39
  }
40
  ],
41
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 300 },
42
+ "license": { "name": "Apache-2.0", "url": "https://huggingface.co/briaai/RMBG-1.4" },
43
+ "upstream": { "name": "briaai/RMBG-1.4", "url": "https://huggingface.co/briaai/RMBG-1.4", "year": 2023 }
 
 
 
 
 
 
 
 
 
 
44
  },
45
  {
46
  "id": "ddcolor",
47
+ "name": "DDColor",
48
  "subtitle": "Image Colorization, 2023",
49
  "category_id": "enhancement",
50
+ "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input, processes in LAB color space.",
51
  "demo": {
52
  "template": "image_in_out",
53
+ "config": { "input_size": 512, "output_type": "image" }
 
 
 
54
  },
55
  "files": [
56
  {
57
+ "name": "DDColor.mlpackage.zip",
58
+ "url": "TODO",
59
  "archive": "zip",
60
+ "size_bytes": 35000000,
61
+ "sha256": "TODO",
62
  "compute_units": "all",
63
  "kind": "model"
64
  }
65
  ],
66
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 400 },
67
+ "license": { "name": "Apache-2.0", "url": "https://github.com/piddnad/DDColor" },
68
+ "upstream": { "name": "piddnad/DDColor", "url": "https://github.com/piddnad/DDColor", "year": 2023 }
 
 
 
 
 
 
 
 
 
 
69
  },
70
  {
71
  "id": "sinsr",
72
  "name": "SinSR",
73
  "subtitle": "Single-Step Super-Resolution, 2024",
74
  "category_id": "enhancement",
75
+ "description_md": "4× super-resolution via single-step diffusion. 256×256 input → 1024×1024 output. Swin Transformer denoiser (FP32 required).",
76
  "demo": {
77
  "template": "image_in_out",
78
+ "config": { "input_size": 256, "output_type": "image" }
 
 
 
79
  },
80
  "files": [
81
  {
82
  "name": "SinSR_Encoder.mlpackage.zip",
83
+ "url": "TODO",
84
  "archive": "zip",
85
+ "size_bytes": 40000000,
86
+ "sha256": "TODO",
87
  "compute_units": "cpuAndGPU",
88
  "kind": "model"
89
  },
90
  {
91
  "name": "SinSR_Denoiser.mlpackage.zip",
92
+ "url": "TODO",
93
  "archive": "zip",
94
+ "size_bytes": 440000000,
95
+ "sha256": "TODO",
96
  "compute_units": "cpuOnly",
97
  "kind": "model"
98
  },
99
  {
100
  "name": "SinSR_Decoder.mlpackage.zip",
101
+ "url": "TODO",
102
  "archive": "zip",
103
+ "size_bytes": 60000000,
104
+ "sha256": "TODO",
105
  "compute_units": "cpuAndGPU",
106
  "kind": "model"
107
  }
108
  ],
109
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 600 },
110
+ "license": { "name": "Apache-2.0", "url": "https://github.com/wyf0912/SinSR" },
111
+ "upstream": { "name": "wyf0912/SinSR", "url": "https://github.com/wyf0912/SinSR", "year": 2024 }
 
 
 
 
 
 
 
 
 
 
112
  },
113
  {
114
+ "id": "efficientad",
115
+ "name": "EfficientAD",
116
+ "subtitle": "Anomaly Detection, 2023",
117
+ "category_id": "segmentation",
118
+ "description_md": "Lightweight unsupervised anomaly detection. 256×256 input → anomaly heatmap + score. Industrial quality inspection.",
119
  "demo": {
120
+ "template": "image_in_out",
121
+ "config": { "input_size": 256, "output_type": "image" }
 
 
 
122
  },
123
  "files": [
124
  {
125
+ "name": "EfficientAD.mlpackage.zip",
126
+ "url": "TODO",
127
  "archive": "zip",
128
+ "size_bytes": 8000000,
129
+ "sha256": "TODO",
130
  "compute_units": "all",
131
  "kind": "model"
132
  }
133
  ],
134
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 200 },
135
+ "license": { "name": "MIT", "url": "https://github.com/nelson1425/EfficientAD" },
136
+ "upstream": { "name": "nelson1425/EfficientAD", "url": "https://github.com/nelson1425/EfficientAD", "year": 2023 }
 
 
 
 
 
 
 
 
 
 
137
  },
138
  {
139
+ "id": "yolo26s",
140
+ "name": "YOLO26s",
141
+ "subtitle": "NMS-Free Detection, 2026",
142
  "category_id": "detection",
143
+ "description_md": "NMS-free object detection. 640×640 input, output [1,300,6]: x1,y1,x2,y2,confidence,class_id. 80 COCO classes.",
144
  "demo": {
145
  "template": "image_detection",
146
+ "config": { "input_size": 640, "confidence_threshold": 0.25 }
 
 
 
147
  },
148
  "files": [
149
  {
150
+ "name": "yolo26s.mlpackage.zip",
151
+ "url": "TODO",
152
  "archive": "zip",
153
+ "size_bytes": 18000000,
154
+ "sha256": "TODO",
155
  "compute_units": "all",
156
  "kind": "model"
157
  }
158
  ],
159
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 300 },
160
+ "license": { "name": "AGPL-3.0", "url": "https://github.com/ultralytics/ultralytics" },
161
+ "upstream": { "name": "ultralytics/ultralytics", "url": "https://github.com/ultralytics/ultralytics", "year": 2026 }
 
 
 
 
 
 
 
 
 
 
162
  },
163
  {
164
+ "id": "yolov9s",
165
+ "name": "YOLOv9s",
166
  "subtitle": "Object Detection, 2024",
167
  "category_id": "detection",
168
+ "description_md": "YOLOv9 small with Vision framework NMS. 640×640 input. PGI + GELAN architecture.",
169
  "demo": {
170
  "template": "image_detection",
171
+ "config": { "input_size": 640, "confidence_threshold": 0.25 }
 
 
 
172
  },
173
  "files": [
174
  {
175
+ "name": "yolov9s.mlpackage.zip",
176
+ "url": "TODO",
177
  "archive": "zip",
178
+ "size_bytes": 14000000,
179
+ "sha256": "TODO",
180
  "compute_units": "all",
181
  "kind": "model"
182
  }
183
  ],
184
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 300 },
185
+ "license": { "name": "AGPL-3.0", "url": "https://github.com/WongKinYiu/yolov9" },
186
+ "upstream": { "name": "WongKinYiu/yolov9", "url": "https://github.com/WongKinYiu/yolov9", "year": 2024 }
 
 
 
 
 
 
 
 
 
 
187
  },
188
  {
189
+ "id": "yolov10n",
190
+ "name": "YOLOv10n",
191
+ "subtitle": "Object Detection, 2024",
192
  "category_id": "detection",
193
+ "description_md": "YOLOv10 nano with Vision framework NMS. 640×640 input. Dual-assignment strategy.",
194
  "demo": {
195
+ "template": "image_detection",
196
+ "config": { "input_size": 640, "confidence_threshold": 0.25 }
 
 
197
  },
198
  "files": [
199
  {
200
+ "name": "yolov10n.mlpackage.zip",
201
+ "url": "TODO",
202
  "archive": "zip",
203
+ "size_bytes": 14000000,
204
+ "sha256": "TODO",
205
  "compute_units": "all",
206
  "kind": "model"
 
 
 
 
 
 
 
 
 
207
  }
208
  ],
209
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 300 },
210
+ "license": { "name": "AGPL-3.0", "url": "https://github.com/THU-MIG/yolov10" },
211
+ "upstream": { "name": "THU-MIG/yolov10", "url": "https://github.com/THU-MIG/yolov10", "year": 2024 }
 
 
 
 
 
 
 
 
 
 
212
  },
213
  {
214
  "id": "moge2_vitb_normal_504",
215
  "name": "MoGe-2 ViT-B (504×504)",
216
  "subtitle": "Microsoft, CVPR 2025",
217
  "category_id": "depth",
218
+ "description_md": "Monocular geometry from a single image. Predicts metric depth, surface normals, and a confidence mask in one forward pass. DINOv2 ViT-B/14 backbone.",
219
  "demo": {
220
  "template": "depth_visualization",
221
  "config": {
222
  "input_size": 504,
223
+ "output_keys": ["depth", "normal", "mask", "metric_scale"],
 
 
 
 
 
224
  "depth_unit": "meters"
225
  }
226
  },
227
  "files": [
228
  {
229
  "name": "MoGe2_ViTB_Normal_504.mlpackage.zip",
230
+ "url": "https://github.com/john-rocky/CoreML-Models/releases/download/moge2-v1/MoGe2_ViTB_Normal_504.mlpackage.zip",
231
  "archive": "zip",
232
+ "size_bytes": 209715200,
233
+ "sha256": "TODO",
234
  "compute_units": "all",
235
  "kind": "model"
236
  }
237
  ],
238
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 600 },
239
+ "license": { "name": "MIT", "url": "https://github.com/microsoft/MoGe/blob/main/LICENSE" },
240
+ "upstream": { "name": "microsoft/MoGe", "url": "https://github.com/microsoft/MoGe", "year": 2025 }
 
 
 
 
 
 
 
 
 
 
241
  },
242
  {
243
  "id": "siglip",
244
  "name": "SigLIP",
245
  "subtitle": "Zero-Shot Classification, 2023",
246
  "category_id": "vision_language",
247
+ "description_md": "Zero-shot image classification. Dual encoder (image + text) with sigmoid loss. 224×224 input. Type any class names to classify.",
248
  "demo": {
249
  "template": "zero_shot_classify",
250
  "config": {
 
259
  "files": [
260
  {
261
  "name": "SigLIP_ImageEncoder.mlpackage.zip",
262
+ "url": "TODO",
263
  "archive": "zip",
264
+ "size_bytes": 350000000,
265
+ "sha256": "TODO",
266
  "compute_units": "cpuOnly",
267
  "kind": "model"
268
  },
269
  {
270
  "name": "SigLIP_TextEncoder.mlpackage.zip",
271
+ "url": "TODO",
272
  "archive": "zip",
273
+ "size_bytes": 350000000,
274
+ "sha256": "TODO",
275
  "compute_units": "cpuOnly",
276
  "kind": "model"
277
  },
278
  {
279
  "name": "siglip_vocab.json",
280
+ "url": "TODO",
281
+ "size_bytes": 1000000,
282
+ "sha256": "TODO",
283
  "kind": "vocab"
284
  }
285
  ],
286
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 800 },
287
+ "license": { "name": "Apache-2.0", "url": "https://github.com/google-research/big_vision" },
288
+ "upstream": { "name": "google-research/big_vision", "url": "https://github.com/google-research/big_vision", "year": 2023 }
 
 
 
 
 
 
 
 
 
 
289
  },
290
  {
291
  "id": "florence2",
292
  "name": "Florence-2",
293
  "subtitle": "Microsoft, 2024",
294
  "category_id": "vision_language",
295
+ "description_md": "Vision-language captioning, OCR, and visual QA. Three-stage encoder-decoder. 768×768 input, autoregressive text output.",
296
  "demo": {
297
  "template": "image_to_text",
298
  "config": {
 
303
  "decoder": "Florence2Decoder.mlpackage.zip",
304
  "vocab_file": "florence2_vocab.json",
305
  "tasks": {
306
+ "caption": [0, 2264, 473, 5, 2274, 6190, 116, 2],
307
+ "detailed_caption": [0, 2264, 473, 5, 31962, 2274, 6190, 116, 2],
308
+ "ocr": [0, 2264, 473, 5, 71307, 116, 2]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  }
310
  }
311
  },
312
  "files": [
313
  {
314
  "name": "Florence2VisionEncoder.mlpackage.zip",
315
+ "url": "TODO",
316
  "archive": "zip",
317
+ "size_bytes": 400000000,
318
+ "sha256": "TODO",
319
  "compute_units": "cpuOnly",
320
  "kind": "model"
321
  },
322
  {
323
  "name": "Florence2TextEncoder.mlpackage.zip",
324
+ "url": "TODO",
325
  "archive": "zip",
326
+ "size_bytes": 450000000,
327
+ "sha256": "TODO",
328
  "compute_units": "cpuOnly",
329
  "kind": "model"
330
  },
331
  {
332
  "name": "Florence2Decoder.mlpackage.zip",
333
+ "url": "TODO",
334
  "archive": "zip",
335
+ "size_bytes": 1400000000,
336
+ "sha256": "TODO",
337
  "compute_units": "cpuOnly",
338
  "kind": "model"
339
  },
340
  {
341
  "name": "florence2_vocab.json",
342
+ "url": "TODO",
343
+ "size_bytes": 500000,
344
+ "sha256": "TODO",
345
  "kind": "vocab"
346
  }
347
  ],
348
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 1200 },
349
+ "license": { "name": "MIT", "url": "https://huggingface.co/microsoft/Florence-2-base" },
350
+ "upstream": { "name": "microsoft/Florence-2", "url": "https://huggingface.co/microsoft/Florence-2-base", "year": 2024 }
 
 
 
 
 
 
 
 
 
 
351
  },
352
  {
353
+ "id": "adaface",
354
+ "name": "AdaFace",
355
+ "subtitle": "Face Recognition, 2022",
356
  "category_id": "face",
357
+ "description_md": "Face recognition via 512-dim embeddings. IR-18 backbone, 112×112 face crop input. Compare faces by cosine similarity.",
358
  "demo": {
359
+ "template": "face_compare",
360
+ "config": { "input_size": 112, "embedding_dim": 512, "match_threshold": 0.6 }
 
 
361
  },
362
  "files": [
363
  {
364
+ "name": "AdaFace_IR18_CASIA.mlpackage.zip",
365
+ "url": "TODO",
366
  "archive": "zip",
367
+ "size_bytes": 32000000,
368
+ "sha256": "TODO",
369
  "compute_units": "all",
370
  "kind": "model"
371
  }
372
  ],
373
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 200 },
374
+ "license": { "name": "MIT", "url": "https://github.com/mk-minchul/AdaFace" },
375
+ "upstream": { "name": "mk-minchul/AdaFace", "url": "https://github.com/mk-minchul/AdaFace", "year": 2022 }
 
 
 
 
 
 
 
 
 
 
376
  },
377
  {
378
  "id": "hypersd",
379
  "name": "Hyper-SD (1-Step)",
380
  "subtitle": "ByteDance, 2024",
381
  "category_id": "generation",
382
+ "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512 output. Chunked UNet (6-bit palettized) + TCD scheduler.",
383
  "demo": {
384
  "template": "text_to_image",
385
  "config": {
 
399
  "files": [
400
  {
401
  "name": "HyperSDTextEncoder.mlpackage.zip",
402
+ "url": "https://github.com/john-rocky/CoreML-Models/releases/download/hypersd-v1/HyperSDTextEncoder.mlpackage.zip",
403
  "archive": "zip",
404
+ "size_bytes": 235000000,
405
+ "sha256": "TODO",
406
+ "compute_units": "cpuAndNeuralEngine",
407
  "kind": "model"
408
  },
409
  {
410
  "name": "HyperSDUnetChunk1.mlpackage.zip",
411
+ "url": "https://github.com/john-rocky/CoreML-Models/releases/download/hypersd-v1/HyperSDUnetChunk1.mlpackage.zip",
412
  "archive": "zip",
413
+ "size_bytes": 318000000,
414
+ "sha256": "TODO",
415
  "compute_units": "cpuAndNeuralEngine",
416
  "kind": "model"
417
  },
418
  {
419
  "name": "HyperSDUnetChunk2.mlpackage.zip",
420
+ "url": "https://github.com/john-rocky/CoreML-Models/releases/download/hypersd-v1/HyperSDUnetChunk2.mlpackage.zip",
421
  "archive": "zip",
422
+ "size_bytes": 299000000,
423
+ "sha256": "TODO",
424
  "compute_units": "cpuAndNeuralEngine",
425
  "kind": "model"
426
  },
427
  {
428
  "name": "HyperSDVAEDecoder.mlpackage.zip",
429
+ "url": "https://github.com/john-rocky/CoreML-Models/releases/download/hypersd-v1/HyperSDVAEDecoder.mlpackage.zip",
430
  "archive": "zip",
431
+ "size_bytes": 95000000,
432
+ "sha256": "TODO",
433
+ "compute_units": "cpuAndNeuralEngine",
434
  "kind": "model"
435
  },
436
  {
437
  "name": "vocab.json",
438
+ "url": "https://github.com/john-rocky/CoreML-Models/releases/download/hypersd-v1/vocab.json",
439
+ "size_bytes": 1600000,
440
+ "sha256": "TODO",
441
  "kind": "vocab"
442
  },
443
  {
444
  "name": "merges.txt",
445
+ "url": "https://github.com/john-rocky/CoreML-Models/releases/download/hypersd-v1/merges.txt",
446
+ "size_bytes": 525000,
447
+ "sha256": "TODO",
448
  "kind": "vocab"
449
  }
450
  ],
451
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 1000 },
452
+ "license": { "name": "OpenRAIL-M", "url": "https://huggingface.co/ByteDance/Hyper-SD" },
453
+ "upstream": { "name": "ByteDance/Hyper-SD", "url": "https://huggingface.co/ByteDance/Hyper-SD", "year": 2024 }
 
 
 
 
 
 
 
 
 
 
454
  },
455
  {
456
  "id": "matanyone",
457
  "name": "MatAnyone",
458
  "subtitle": "Video Matting, 2025",
459
  "category_id": "video",
460
+ "description_md": "Temporally consistent video matting with memory propagation. 5-model pipeline: encoder, mask encoder, read first, read, decoder. 768×432 landscape input.",
461
  "demo": {
462
  "template": "video_matting",
463
  "config": {
464
  "frame_size": 512,
465
+ "encoder": "MatAnyone_Encoder.mlpackage.zip",
466
+ "mask_encoder": "MatAnyone_MaskEncoder.mlpackage.zip",
467
+ "read_first": "MatAnyone_ReadFirst.mlpackage.zip",
468
+ "read": "MatAnyone_Read.mlpackage.zip",
469
+ "decoder": "MatAnyone_Decoder.mlpackage.zip"
470
  }
471
  },
472
  "files": [
473
  {
474
+ "name": "MatAnyone_Encoder.mlpackage.zip",
475
+ "url": "TODO",
476
  "archive": "zip",
477
+ "size_bytes": 20000000,
478
+ "sha256": "TODO",
479
+ "compute_units": "all",
480
  "kind": "model"
481
  },
482
  {
483
+ "name": "MatAnyone_MaskEncoder.mlpackage.zip",
484
+ "url": "TODO",
485
  "archive": "zip",
486
+ "size_bytes": 10000000,
487
+ "sha256": "TODO",
488
+ "compute_units": "all",
489
  "kind": "model"
490
  },
491
  {
492
+ "name": "MatAnyone_ReadFirst.mlpackage.zip",
493
+ "url": "TODO",
494
  "archive": "zip",
495
+ "size_bytes": 15000000,
496
+ "sha256": "TODO",
497
  "compute_units": "cpuOnly",
498
  "kind": "model"
499
  },
500
  {
501
+ "name": "MatAnyone_Read.mlpackage.zip",
502
+ "url": "TODO",
503
  "archive": "zip",
504
+ "size_bytes": 20000000,
505
+ "sha256": "TODO",
506
  "compute_units": "cpuOnly",
507
  "kind": "model"
508
  },
509
  {
510
+ "name": "MatAnyone_Decoder.mlpackage.zip",
511
+ "url": "TODO",
512
  "archive": "zip",
513
+ "size_bytes": 35000000,
514
+ "sha256": "TODO",
515
+ "compute_units": "all",
516
  "kind": "model"
517
  }
518
  ],
519
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 800 },
520
+ "license": { "name": "MIT", "url": "https://github.com/pq-yang/MatAnyone" },
521
+ "upstream": { "name": "pq-yang/MatAnyone", "url": "https://github.com/pq-yang/MatAnyone", "year": 2025 }
 
 
 
 
 
 
 
 
 
 
522
  },
523
  {
524
  "id": "demucs",
525
  "name": "HTDemucs",
526
  "subtitle": "Audio Source Separation",
527
  "category_id": "audio",
528
+ "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, overlap-add for full tracks. FP32 model.",
529
  "demo": {
530
  "template": "audio_in_out",
531
  "config": {
532
  "sample_rate": 44100,
533
  "segment_length": 343980,
534
+ "output_stems": ["drums", "bass", "vocals", "other"]
 
 
 
 
 
535
  }
536
  },
537
  "files": [
538
  {
539
  "name": "HTDemucs_SourceSeparation_F32.mlpackage.zip",
540
+ "url": "TODO",
541
  "archive": "zip",
542
+ "size_bytes": 360000000,
543
+ "sha256": "TODO",
544
  "compute_units": "cpuOnly",
545
  "kind": "model"
546
  }
547
  ],
548
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 1000 },
549
+ "license": { "name": "MIT", "url": "https://github.com/adefossez/demucs" },
550
+ "upstream": { "name": "adefossez/demucs", "url": "https://github.com/adefossez/demucs", "year": 2021 }
 
 
 
 
 
 
 
 
 
 
551
  },
552
  {
553
  "id": "kokoro",
554
  "name": "Kokoro-82M",
555
  "subtitle": "Multilingual TTS",
556
  "category_id": "speech",
557
+ "description_md": "English + Japanese text-to-speech. 24 kHz mono. On-device G2P. StyleTTS2 + iSTFTNet vocoder. 10 voices, bucketed decoder (128/256/512).",
558
  "demo": {
559
  "template": "text_to_audio",
560
  "config": {
561
  "mode": "tts",
562
  "sample_rate": 24000,
563
  "vocab_file": "kokoro_vocab.json",
564
+ "voices": ["af_heart", "af_bella", "am_michael", "bf_emma", "bm_george"]
 
 
 
 
 
 
565
  }
566
  },
567
  "files": [
568
  {
569
  "name": "Kokoro_Predictor.mlpackage.zip",
570
+ "url": "TODO",
571
  "archive": "zip",
572
+ "size_bytes": 75000000,
573
+ "sha256": "TODO",
574
  "compute_units": "cpuAndGPU",
575
  "kind": "model"
576
  },
577
  {
578
  "name": "Kokoro_Decoder_128.mlpackage.zip",
579
+ "url": "TODO",
580
  "archive": "zip",
581
+ "size_bytes": 238000000,
582
+ "sha256": "TODO",
583
  "compute_units": "all",
584
  "kind": "model"
585
  },
586
  {
587
  "name": "Kokoro_Decoder_256.mlpackage.zip",
588
+ "url": "TODO",
589
  "archive": "zip",
590
+ "size_bytes": 241000000,
591
+ "sha256": "TODO",
592
  "compute_units": "all",
593
  "kind": "model"
594
  },
595
  {
596
  "name": "Kokoro_Decoder_512.mlpackage.zip",
597
+ "url": "TODO",
598
  "archive": "zip",
599
+ "size_bytes": 246000000,
600
+ "sha256": "TODO",
601
  "compute_units": "all",
602
  "kind": "model"
603
  },
604
  {
605
  "name": "kokoro_vocab.json",
606
+ "url": "TODO",
607
+ "size_bytes": 5000,
608
+ "sha256": "TODO",
609
  "kind": "vocab"
610
  }
611
  ],
612
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 1000 },
613
+ "license": { "name": "Apache-2.0", "url": "https://huggingface.co/hexgrad/Kokoro-82M" },
614
+ "upstream": { "name": "hexgrad/Kokoro-82M", "url": "https://huggingface.co/hexgrad/Kokoro-82M", "year": 2024 }
 
 
 
 
 
 
 
 
 
 
615
  },
616
  {
617
  "id": "stable_audio",
618
  "name": "Stable Audio Open",
619
  "subtitle": "Text-to-Music, 2024",
620
  "category_id": "speech",
621
+ "description_md": "Text-to-music generation. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 encoder + Oobleck VAE decoder.",
622
  "demo": {
623
  "template": "text_to_audio",
624
  "config": {
 
630
  "files": [
631
  {
632
  "name": "StableAudioT5Encoder.mlpackage.zip",
633
+ "url": "TODO",
634
  "archive": "zip",
635
+ "size_bytes": 105000000,
636
+ "sha256": "TODO",
637
  "compute_units": "cpuOnly",
638
  "kind": "model"
639
  },
640
  {
641
  "name": "StableAudioNumberEmbedder.mlpackage.zip",
642
+ "url": "TODO",
643
  "archive": "zip",
644
+ "size_bytes": 400000,
645
+ "sha256": "TODO",
646
  "compute_units": "cpuOnly",
647
  "kind": "model"
648
  },
649
  {
650
  "name": "StableAudioDiT.mlpackage.zip",
651
+ "url": "TODO",
652
  "archive": "zip",
653
+ "size_bytes": 326000000,
654
+ "sha256": "TODO",
655
  "compute_units": "cpuOnly",
656
  "kind": "model"
657
  },
658
  {
659
  "name": "StableAudioVAEDecoder.mlpackage.zip",
660
+ "url": "TODO",
661
  "archive": "zip",
662
+ "size_bytes": 149000000,
663
+ "sha256": "TODO",
664
  "compute_units": "cpuAndGPU",
665
  "kind": "model"
666
  },
667
  {
668
  "name": "t5_vocab.json",
669
+ "url": "TODO",
670
+ "size_bytes": 800000,
671
+ "sha256": "TODO",
672
  "kind": "vocab"
673
  }
674
  ],
675
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 1200 },
676
+ "license": { "name": "custom", "url": "https://huggingface.co/stabilityai/stable-audio-open-small" },
677
+ "upstream": { "name": "stabilityai/stable-audio-open-small", "url": "https://huggingface.co/stabilityai/stable-audio-open-small", "year": 2024 }
678
+ },
679
+ {
680
+ "id": "basicpitch",
681
+ "name": "Basic Pitch",
682
+ "subtitle": "Spotify, Music Transcription",
683
+ "category_id": "audio",
684
+ "description_md": "Polyphonic music transcription: audio → MIDI notes. Tiny 17K-param model (272 KB). Windowed inference at 22.05 kHz.",
685
+ "demo": {
686
+ "template": "audio_to_score",
687
+ "config": {
688
+ "sample_rate": 22050,
689
+ "window_size": 43844,
690
+ "hop_size": 256,
691
+ "n_bins": 88,
692
+ "onset_threshold": 0.5,
693
+ "note_threshold": 0.5
694
+ }
695
  },
696
+ "files": [
697
+ {
698
+ "name": "nmp.mlpackage.zip",
699
+ "url": "TODO",
700
+ "archive": "zip",
701
+ "size_bytes": 272000,
702
+ "sha256": "TODO",
703
+ "compute_units": "all",
704
+ "kind": "model"
705
+ }
706
+ ],
707
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 200 },
708
+ "license": { "name": "Apache-2.0", "url": "https://github.com/spotify/basic-pitch" },
709
+ "upstream": { "name": "spotify/basic-pitch", "url": "https://github.com/spotify/basic-pitch", "year": 2022 }
710
+ },
711
+ {
712
+ "id": "diarization",
713
+ "name": "Pyannote Diarization",
714
+ "subtitle": "Speaker Identification",
715
+ "category_id": "audio",
716
+ "description_md": "Speaker diarization: who spoke when. 16 kHz mono input, 10s segments. Outputs per-frame speaker logits.",
717
+ "demo": {
718
+ "template": "audio_in_out",
719
+ "config": {
720
+ "sample_rate": 16000,
721
+ "output_stems": ["speaker_timeline"]
722
+ }
723
  },
724
+ "files": [
725
+ {
726
+ "name": "Pyannote_Segmentation3_0.mlpackage.zip",
727
+ "url": "TODO",
728
+ "archive": "zip",
729
+ "size_bytes": 25000000,
730
+ "sha256": "TODO",
731
+ "compute_units": "cpuAndGPU",
732
+ "kind": "model"
733
+ }
734
+ ],
735
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 200 },
736
+ "license": { "name": "MIT", "url": "https://github.com/pyannote/pyannote-audio" },
737
+ "upstream": { "name": "pyannote/pyannote-audio", "url": "https://github.com/pyannote/pyannote-audio", "year": 2021 }
738
  },
739
  {
740
  "id": "openvoice",
741
  "name": "OpenVoice V2",
742
  "subtitle": "Voice Cloning",
743
  "category_id": "audio",
744
+ "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio. Speaker encoder + voice converter.",
745
  "demo": {
746
  "template": "audio_in_out",
747
  "config": {
748
  "sample_rate": 22050,
749
+ "output_stems": ["converted"]
 
 
750
  }
751
  },
752
  "files": [
753
  {
754
  "name": "OpenVoice_SpeakerEncoder.mlpackage.zip",
755
+ "url": "TODO",
756
  "archive": "zip",
757
+ "size_bytes": 35000000,
758
+ "sha256": "TODO",
759
  "compute_units": "cpuAndGPU",
760
  "kind": "model"
761
  },
762
  {
763
  "name": "OpenVoice_VoiceConverter.mlpackage.zip",
764
+ "url": "TODO",
765
  "archive": "zip",
766
+ "size_bytes": 100000000,
767
+ "sha256": "TODO",
768
  "compute_units": "cpuAndGPU",
769
  "kind": "model"
770
  }
771
  ],
772
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 500 },
773
+ "license": { "name": "MIT", "url": "https://github.com/myshell-ai/OpenVoice" },
774
+ "upstream": { "name": "myshell-ai/OpenVoice", "url": "https://github.com/myshell-ai/OpenVoice", "year": 2023 }
775
+ },
776
+ {
777
+ "id": "realesrgan",
778
+ "name": "Real-ESRGAN 4x",
779
+ "subtitle": "Super Resolution, 2021",
780
+ "category_id": "enhancement",
781
+ "description_md": "Real-world blind super-resolution. 4× upscale from any input. Handles noise, blur, and JPEG artifacts. 512×512 input → 2048×2048 output.",
782
+ "demo": {
783
+ "template": "image_in_out",
784
+ "config": { "input_size": 512, "output_type": "image" }
785
  },
786
+ "files": [
787
+ {
788
+ "name": "RealESRGAN_x4.mlpackage.zip",
789
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/realesrgan/RealESRGAN_x4.mlpackage.zip",
790
+ "archive": "zip",
791
+ "size_bytes": 66857221,
792
+ "sha256": "6107dc417de87bf974e5b225a2632e2c78f2849265dc897981f482e922050ec9",
793
+ "compute_units": "all",
794
+ "kind": "model"
795
+ }
796
+ ],
797
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 500 },
798
+ "license": { "name": "BSD-3-Clause", "url": "https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE" },
799
+ "upstream": { "name": "xinntao/Real-ESRGAN", "url": "https://github.com/xinntao/Real-ESRGAN", "year": 2021 }
800
+ },
801
+ {
802
+ "id": "gfpgan",
803
+ "name": "GFPGAN",
804
+ "subtitle": "Face Restoration, 2021",
805
+ "category_id": "restoration",
806
+ "description_md": "Blind face restoration with generative facial prior. Restores degraded face photos to high quality. 512×512 input/output.",
807
+ "demo": {
808
+ "template": "image_in_out",
809
+ "config": { "input_size": 512, "output_type": "image" }
810
  },
811
+ "files": [
812
+ {
813
+ "name": "GFPGAN.mlpackage.zip",
814
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/gfpgan/GFPGAN.mlpackage.zip",
815
+ "archive": "zip",
816
+ "size_bytes": 337392296,
817
+ "sha256": "218a39c226adecb2ccbc1e358023b80a5cf2510be85dfc3ab0da698fad51391a",
818
+ "compute_units": "all",
819
+ "kind": "model"
820
+ }
821
+ ],
822
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 600 },
823
+ "license": { "name": "Apache-2.0", "url": "https://github.com/TencentARC/GFPGAN/blob/master/LICENSE" },
824
+ "upstream": { "name": "TencentARC/GFPGAN", "url": "https://github.com/TencentARC/GFPGAN", "year": 2021 }
825
  },
826
  {
827
+ "id": "rfdetr_n",
828
+ "name": "RF-DETR Nano",
829
+ "subtitle": "Object Detection, 2025",
830
+ "category_id": "detection",
831
+ "description_md": "End-to-end transformer detector. 384×384 input. 300 queries, 91 classes (COCO + background). No NMS needed. Output: confidence [300,91] + coordinates [300,4] in normalized cxcywh.",
832
  "demo": {
833
+ "template": "image_detection",
834
  "config": {
835
+ "input_size": 384,
836
+ "confidence_threshold": 0.5,
837
+ "output_format": "detr",
838
+ "num_classes": 91,
839
+ "background_class": 0
840
  }
841
  },
842
  "files": [
843
  {
844
+ "name": "rfdetr_n_coco.mlpackage.zip",
845
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/rfdetr/rfdetr_n_coco.mlpackage.zip",
846
  "archive": "zip",
847
+ "size_bytes": 99819094,
848
+ "sha256": "3cac3793b97aa88d5f79290afee24ba86e30da65e884933e3f8b0ba077ec48b4",
849
+ "compute_units": "all",
850
+ "kind": "model"
851
+ }
852
+ ],
853
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 400 },
854
+ "license": { "name": "Apache-2.0", "url": "https://github.com/roboflow/rf-detr/blob/main/LICENSE" },
855
+ "upstream": { "name": "roboflow/rf-detr", "url": "https://github.com/roboflow/rf-detr", "year": 2025 }
856
+ },
857
+ {
858
+ "id": "face_parsing",
859
+ "name": "Face Parsing",
860
+ "subtitle": "Facial Segmentation, 2019",
861
+ "category_id": "segmentation",
862
+ "description_md": "Semantic face parsing into 19 classes (left/right eyes, eyebrows, and ears and upper/lower lips are separate classes): background, skin, nose, eyes, eyebrows, ears, mouth, lips, hair, hat, eyeglasses, earring, necklace, neck, cloth. 512×512 input.",
863
+ "demo": {
864
+ "template": "image_in_out",
865
+ "config": { "input_size": 512, "output_type": "segmap", "num_classes": 19 }
866
+ },
867
+ "files": [
868
+ {
869
+ "name": "FaceParsing.mlpackage.zip",
870
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/faceparsing/FaceParsing.mlpackage.zip",
871
+ "archive": "zip",
872
+ "size_bytes": 53182369,
873
+ "sha256": "e7ebd6cc3f53486becc0dbf3b74027bc045aa4158402936ea09c3625682be6bb",
874
+ "compute_units": "all",
875
  "kind": "model"
876
  }
877
  ],
878
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 300 },
879
+ "license": { "name": "MIT", "url": "https://github.com/zllrunning/face-parsing.PyTorch/blob/master/LICENSE" },
880
+ "upstream": { "name": "zllrunning/face-parsing.PyTorch", "url": "https://github.com/zllrunning/face-parsing.PyTorch", "year": 2019 }
881
+ },
882
+ {
883
+ "id": "mobilesam",
884
+ "name": "MobileSAM",
885
+ "subtitle": "Segment Anything, 2023",
886
+ "category_id": "segmentation",
887
+ "description_md": "Lightweight Segment Anything. Tap any point to generate a segmentation mask. TinyViT encoder (13 MB) + lightweight decoder (9.8 MB). ~60× smaller than SAM.",
888
+ "demo": {
889
+ "template": "segment_anything",
890
+ "config": {
891
+ "encoder": "MobileSAM_Encoder.mlpackage.zip",
892
+ "decoder": "MobileSAM_Decoder.mlpackage.zip",
893
+ "input_size": 1024
894
+ }
895
  },
896
+ "files": [
897
+ {
898
+ "name": "MobileSAM.zip",
899
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/mobilesam/MobileSAM.zip",
900
+ "archive": "zip",
901
+ "size_bytes": 20143994,
902
+ "sha256": "0d8d48cb90a48cd860cc3105f54fdeca2a3cb75876a7c936e7243221e3f24681",
903
+ "compute_units": "all",
904
+ "kind": "model"
905
+ }
906
+ ],
907
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 300 },
908
+ "license": { "name": "Apache-2.0", "url": "https://github.com/ChaoningZhang/MobileSAM/blob/master/LICENSE" },
909
+ "upstream": { "name": "ChaoningZhang/MobileSAM", "url": "https://github.com/ChaoningZhang/MobileSAM", "year": 2023 }
910
+ },
911
+ {
912
+ "id": "lama",
913
+ "name": "LaMa",
914
+ "subtitle": "Image Inpainting, 2022",
915
+ "category_id": "inpainting",
916
+ "description_md": "Resolution-robust large mask inpainting. Draw over unwanted objects to remove them. Fast Fourier convolutions for global context. 800×800 input.",
917
+ "demo": {
918
+ "template": "inpainting",
919
+ "config": { "input_size": 800 }
920
  },
921
+ "files": [
922
+ {
923
+ "name": "LaMa.mlpackage.zip",
924
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/lama/LaMa.mlpackage.zip",
925
+ "archive": "zip",
926
+ "size_bytes": 196237256,
927
+ "sha256": "b57b8451a1a86c00aea52d75230fb5f49d3076eec67403192758c9d2b59c0e69",
928
+ "compute_units": "all",
929
+ "kind": "model"
930
+ }
931
+ ],
932
+ "requirements": { "min_ios": "17.0", "min_ram_mb": 600 },
933
+ "license": { "name": "Apache-2.0", "url": "https://github.com/advimman/lama/blob/main/LICENSE" },
934
+ "upstream": { "name": "advimman/lama", "url": "https://github.com/advimman/lama", "year": 2022 }
935
  }
936
  ]
937
+ }