mlboydaisuke committed on
Commit
213ab71
·
verified ·
1 Parent(s): e979512

Upload models.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. models.json +1004 -7
models.json CHANGED
@@ -3,26 +3,376 @@
3
  "updated_at": "2026-04-10",
4
  "min_app_version": "1.0",
5
  "categories": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  {
7
  "id": "depth",
8
- "name": "Monocular Depth Estimation",
9
  "icon": "cube.transparent",
10
- "order": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  }
12
  ],
13
  "models": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  {
15
  "id": "moge2_vitb_normal_504",
16
  "name": "MoGe-2 ViT-B (504×504)",
17
  "subtitle": "Microsoft, CVPR 2025",
18
  "category_id": "depth",
19
- "description_md": "Monocular geometry from a single image. Predicts metric depth, surface normals, and a confidence mask in one forward pass.\n\nBased on a DINOv2 ViT-B/14 backbone with three task heads. The successor to MiDaS-style relative depth: depth comes out in real meters.",
20
- "thumbnail_url": null,
21
  "demo": {
22
  "template": "depth_visualization",
23
  "config": {
24
  "input_size": 504,
25
- "output_keys": ["depth", "normal", "mask", "metric_scale"],
 
 
 
 
 
26
  "depth_unit": "meters"
27
  }
28
  },
@@ -33,7 +383,8 @@
33
  "archive": "zip",
34
  "size_bytes": 193312088,
35
  "sha256": "f60cfb4804707a489d99e24453188cd31ddcabb299bbf6da4507edc9cecbf9e7",
36
- "compute_units": "all"
 
37
  }
38
  ],
39
  "requirements": {
@@ -49,6 +400,652 @@
49
  "url": "https://github.com/microsoft/MoGe",
50
  "year": 2025
51
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
  ]
54
- }
 
3
  "updated_at": "2026-04-10",
4
  "min_app_version": "1.0",
5
  "categories": [
6
+ {
7
+ "id": "segmentation",
8
+ "name": "Segmentation",
9
+ "icon": "person.and.background.dotted",
10
+ "order": 1
11
+ },
12
+ {
13
+ "id": "enhancement",
14
+ "name": "Image Enhancement",
15
+ "icon": "wand.and.stars",
16
+ "order": 2
17
+ },
18
+ {
19
+ "id": "detection",
20
+ "name": "Object Detection",
21
+ "icon": "viewfinder",
22
+ "order": 3
23
+ },
24
  {
25
  "id": "depth",
26
+ "name": "Depth & Geometry",
27
  "icon": "cube.transparent",
28
+ "order": 4
29
+ },
30
+ {
31
+ "id": "vision_language",
32
+ "name": "Vision-Language",
33
+ "icon": "text.viewfinder",
34
+ "order": 5
35
+ },
36
+ {
37
+ "id": "face",
38
+ "name": "Face Processing",
39
+ "icon": "face.smiling",
40
+ "order": 6
41
+ },
42
+ {
43
+ "id": "generation",
44
+ "name": "Image Generation",
45
+ "icon": "sparkles",
46
+ "order": 7
47
+ },
48
+ {
49
+ "id": "video",
50
+ "name": "Video Processing",
51
+ "icon": "film",
52
+ "order": 8
53
+ },
54
+ {
55
+ "id": "audio",
56
+ "name": "Audio Processing",
57
+ "icon": "waveform.circle",
58
+ "order": 9
59
+ },
60
+ {
61
+ "id": "speech",
62
+ "name": "Speech & Music",
63
+ "icon": "music.note",
64
+ "order": 10
65
  }
66
  ],
67
  "models": [
68
+ {
69
+ "id": "rmbg_1_4",
70
+ "name": "RMBG-1.4",
71
+ "subtitle": "BRIA AI, 2023",
72
+ "category_id": "segmentation",
73
+ "description_md": "High-quality background removal. Outputs foreground with alpha mask. 1024×1024 input.",
74
+ "demo": {
75
+ "template": "image_in_out",
76
+ "config": {
77
+ "input_size": 1024,
78
+ "output_type": "mask"
79
+ }
80
+ },
81
+ "files": [
82
+ {
83
+ "name": "RMBG_1_4.mlpackage.zip",
84
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/rmbg/RMBG_1_4.mlpackage.zip",
85
+ "archive": "zip",
86
+ "size_bytes": 38771210,
87
+ "sha256": "a80dbb5f04c922a8fa698c38592e4e52af4e62471d70bc7c59c28a3355a1da95",
88
+ "compute_units": "cpuAndGPU",
89
+ "kind": "model"
90
+ }
91
+ ],
92
+ "requirements": {
93
+ "min_ios": "17.0",
94
+ "min_ram_mb": 300
95
+ },
96
+ "license": {
97
+ "name": "bria-rmbg-1.4",
98
+ "url": "https://huggingface.co/briaai/RMBG-1.4"
99
+ },
100
+ "upstream": {
101
+ "name": "briaai/RMBG-1.4",
102
+ "url": "https://huggingface.co/briaai/RMBG-1.4",
103
+ "year": 2023
104
+ }
105
+ },
106
+ {
107
+ "id": "ddcolor",
108
+ "name": "DDColor Tiny",
109
+ "subtitle": "Image Colorization, 2023",
110
+ "category_id": "enhancement",
111
+ "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input.",
112
+ "demo": {
113
+ "template": "image_in_out",
114
+ "config": {
115
+ "input_size": 512,
116
+ "output_type": "image"
117
+ }
118
+ },
119
+ "files": [
120
+ {
121
+ "name": "DDColor_Tiny.mlpackage.zip",
122
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/ddcolor/DDColor_Tiny.mlpackage.zip",
123
+ "archive": "zip",
124
+ "size_bytes": 212344570,
125
+ "sha256": "bfecea37d66005f602efe13978360b8e4707923234c3d1d00beeb4e36cb1b02c",
126
+ "compute_units": "all",
127
+ "kind": "model"
128
+ }
129
+ ],
130
+ "requirements": {
131
+ "min_ios": "17.0",
132
+ "min_ram_mb": 400
133
+ },
134
+ "license": {
135
+ "name": "Apache-2.0",
136
+ "url": "https://github.com/piddnad/DDColor"
137
+ },
138
+ "upstream": {
139
+ "name": "piddnad/DDColor",
140
+ "url": "https://github.com/piddnad/DDColor",
141
+ "year": 2023
142
+ }
143
+ },
144
+ {
145
+ "id": "sinsr",
146
+ "name": "SinSR",
147
+ "subtitle": "Single-Step Super-Resolution, 2024",
148
+ "category_id": "enhancement",
149
+ "description_md": "4× super-resolution via single-step diffusion. 256→1024. Swin Transformer denoiser (FP32).",
150
+ "demo": {
151
+ "template": "image_in_out",
152
+ "config": {
153
+ "input_size": 256,
154
+ "output_type": "image"
155
+ }
156
+ },
157
+ "files": [
158
+ {
159
+ "name": "SinSR_Encoder.mlpackage.zip",
160
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Encoder.mlpackage.zip",
161
+ "archive": "zip",
162
+ "size_bytes": 41246338,
163
+ "sha256": "fdec09d17561ec1bb5a2e829683d48c2b45e76b876285619a6e29a3523b8b7e2",
164
+ "compute_units": "cpuAndGPU",
165
+ "kind": "model"
166
+ },
167
+ {
168
+ "name": "SinSR_Denoiser.mlpackage.zip",
169
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Denoiser.mlpackage.zip",
170
+ "archive": "zip",
171
+ "size_bytes": 440014511,
172
+ "sha256": "b31374c2d539b2cdd81499d6062c801ca00e405f5a67507cd609d14e2d6d4beb",
173
+ "compute_units": "cpuOnly",
174
+ "kind": "model"
175
+ },
176
+ {
177
+ "name": "SinSR_Decoder.mlpackage.zip",
178
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Decoder.mlpackage.zip",
179
+ "archive": "zip",
180
+ "size_bytes": 60880285,
181
+ "sha256": "b8b9a7b52d6b240cf9fb3352b286ea83eb984fd73f5dd81c9f034f0016a5cb8c",
182
+ "compute_units": "cpuAndGPU",
183
+ "kind": "model"
184
+ }
185
+ ],
186
+ "requirements": {
187
+ "min_ios": "17.0",
188
+ "min_ram_mb": 600
189
+ },
190
+ "license": {
191
+ "name": "Apache-2.0",
192
+ "url": "https://github.com/wyf0912/SinSR"
193
+ },
194
+ "upstream": {
195
+ "name": "wyf0912/SinSR",
196
+ "url": "https://github.com/wyf0912/SinSR",
197
+ "year": 2024
198
+ }
199
+ },
200
+ {
201
+ "id": "yolo26s",
202
+ "name": "YOLO26s",
203
+ "subtitle": "NMS-Free Detection, 2026",
204
+ "category_id": "detection",
205
+ "description_md": "NMS-free object detection. 640×640 input, 80 COCO classes.",
206
+ "demo": {
207
+ "template": "image_detection",
208
+ "config": {
209
+ "input_size": 640,
210
+ "confidence_threshold": 0.25
211
+ }
212
+ },
213
+ "files": [
214
+ {
215
+ "name": "yolo26s.mlpackage.zip",
216
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolo26/yolo26s.mlpackage.zip",
217
+ "archive": "zip",
218
+ "size_bytes": 17697581,
219
+ "sha256": "0ec02fb0cf2dbd6e09601cbbc00a9734156ea4c2a52b0da23a984337074c6fd4",
220
+ "compute_units": "all",
221
+ "kind": "model"
222
+ }
223
+ ],
224
+ "requirements": {
225
+ "min_ios": "17.0",
226
+ "min_ram_mb": 300
227
+ },
228
+ "license": {
229
+ "name": "AGPL-3.0",
230
+ "url": "https://github.com/ultralytics/ultralytics"
231
+ },
232
+ "upstream": {
233
+ "name": "ultralytics/ultralytics",
234
+ "url": "https://github.com/ultralytics/ultralytics",
235
+ "year": 2026
236
+ }
237
+ },
238
+ {
239
+ "id": "yolo11s",
240
+ "name": "YOLO11s",
241
+ "subtitle": "Object Detection, 2024",
242
+ "category_id": "detection",
243
+ "description_md": "YOLO11 small detection with Vision framework NMS. 640×640 input.",
244
+ "demo": {
245
+ "template": "image_detection",
246
+ "config": {
247
+ "input_size": 640,
248
+ "confidence_threshold": 0.25
249
+ }
250
+ },
251
+ "files": [
252
+ {
253
+ "name": "yolo11s.mlpackage.zip",
254
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolov9/yolo11s.mlpackage.zip",
255
+ "archive": "zip",
256
+ "size_bytes": 17580204,
257
+ "sha256": "79e82aacc3ad20fc1eb990df6979fae9b927d4b06f33bd20ec0e1c0dcb7d1f6b",
258
+ "compute_units": "all",
259
+ "kind": "model"
260
+ }
261
+ ],
262
+ "requirements": {
263
+ "min_ios": "17.0",
264
+ "min_ram_mb": 300
265
+ },
266
+ "license": {
267
+ "name": "AGPL-3.0",
268
+ "url": "https://github.com/ultralytics/ultralytics"
269
+ },
270
+ "upstream": {
271
+ "name": "ultralytics/ultralytics",
272
+ "url": "https://github.com/ultralytics/ultralytics",
273
+ "year": 2024
274
+ }
275
+ },
276
+ {
277
+ "id": "yolov10n",
278
+ "name": "YOLOv10n",
279
+ "subtitle": "Object Detection, 2024",
280
+ "category_id": "detection",
281
+ "description_md": "YOLOv10 nano. 640×640 input. Dual-assignment strategy.",
282
+ "demo": {
283
+ "template": "image_detection",
284
+ "config": {
285
+ "input_size": 640,
286
+ "confidence_threshold": 0.25
287
+ }
288
+ },
289
+ "files": [
290
+ {
291
+ "name": "YOLOv10N.mlpackage.zip",
292
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolov10/YOLOv10N.mlpackage.zip",
293
+ "archive": "zip",
294
+ "size_bytes": 4309168,
295
+ "sha256": "9a687144a6b0b764f508c8f544fe46b6674629b8f09a1e99d8ca69b0be899891",
296
+ "compute_units": "all",
297
+ "kind": "model"
298
+ }
299
+ ],
300
+ "requirements": {
301
+ "min_ios": "17.0",
302
+ "min_ram_mb": 300
303
+ },
304
+ "license": {
305
+ "name": "AGPL-3.0",
306
+ "url": "https://github.com/THU-MIG/yolov10"
307
+ },
308
+ "upstream": {
309
+ "name": "THU-MIG/yolov10",
310
+ "url": "https://github.com/THU-MIG/yolov10",
311
+ "year": 2024
312
+ }
313
+ },
314
+ {
315
+ "id": "yoloworld",
316
+ "name": "YOLO-World",
317
+ "subtitle": "Open-Vocabulary Detection, 2024",
318
+ "category_id": "detection",
319
+ "description_md": "Open-vocabulary detection. Type any text query. YOLO-World V2-S + CLIP ViT-B/32.",
320
+ "demo": {
321
+ "template": "open_vocab_detection",
322
+ "config": {
323
+ "input_size": 640
324
+ }
325
+ },
326
+ "files": [
327
+ {
328
+ "name": "yoloworld_detector.mlpackage.zip",
329
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/yoloworld_detector.mlpackage.zip",
330
+ "archive": "zip",
331
+ "size_bytes": 23710620,
332
+ "sha256": "611d299ae74c83f90a5cc9f4585709859d5db735baa8ade721e0c2d99cd5af92",
333
+ "compute_units": "all",
334
+ "kind": "model"
335
+ },
336
+ {
337
+ "name": "clip_text_encoder.mlpackage.zip",
338
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/clip_text_encoder.mlpackage.zip",
339
+ "archive": "zip",
340
+ "size_bytes": 116681932,
341
+ "sha256": "45770a743297e8c2a57cc330d4f5c80f47734263680895b33b593b50dd2c382b",
342
+ "compute_units": "cpuOnly",
343
+ "kind": "model"
344
+ }
345
+ ],
346
+ "requirements": {
347
+ "min_ios": "17.0",
348
+ "min_ram_mb": 600
349
+ },
350
+ "license": {
351
+ "name": "GPL-3.0",
352
+ "url": "https://github.com/AILab-CVC/YOLO-World"
353
+ },
354
+ "upstream": {
355
+ "name": "AILab-CVC/YOLO-World",
356
+ "url": "https://github.com/AILab-CVC/YOLO-World",
357
+ "year": 2024
358
+ }
359
+ },
360
  {
361
  "id": "moge2_vitb_normal_504",
362
  "name": "MoGe-2 ViT-B (504×504)",
363
  "subtitle": "Microsoft, CVPR 2025",
364
  "category_id": "depth",
365
+ "description_md": "Monocular geometry from a single image. Metric depth, surface normals, confidence mask. DINOv2 ViT-B/14 backbone.",
 
366
  "demo": {
367
  "template": "depth_visualization",
368
  "config": {
369
  "input_size": 504,
370
+ "output_keys": [
371
+ "depth",
372
+ "normal",
373
+ "mask",
374
+ "metric_scale"
375
+ ],
376
  "depth_unit": "meters"
377
  }
378
  },
 
383
  "archive": "zip",
384
  "size_bytes": 193312088,
385
  "sha256": "f60cfb4804707a489d99e24453188cd31ddcabb299bbf6da4507edc9cecbf9e7",
386
+ "compute_units": "all",
387
+ "kind": "model"
388
  }
389
  ],
390
  "requirements": {
 
400
  "url": "https://github.com/microsoft/MoGe",
401
  "year": 2025
402
  }
403
+ },
404
+ {
405
+ "id": "siglip",
406
+ "name": "SigLIP",
407
+ "subtitle": "Zero-Shot Classification, 2023",
408
+ "category_id": "vision_language",
409
+ "description_md": "Zero-shot image classification. Dual encoder (image + text). 224×224 input.",
410
+ "demo": {
411
+ "template": "zero_shot_classify",
412
+ "config": {
413
+ "input_size": 224,
414
+ "image_encoder": "SigLIP_ImageEncoder.mlpackage.zip",
415
+ "text_encoder": "SigLIP_TextEncoder.mlpackage.zip",
416
+ "vocab_file": "siglip_vocab.json",
417
+ "prompt_template": "{}",
418
+ "logit_scale": 117.33
419
+ }
420
+ },
421
+ "files": [
422
+ {
423
+ "name": "SigLIP_ImageEncoder.mlpackage.zip",
424
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/SigLIP_ImageEncoder.mlpackage.zip",
425
+ "archive": "zip",
426
+ "size_bytes": 170352400,
427
+ "sha256": "98f6abf5f4aa145199f4ae22305f9c1d5929eee6b126daad84783b2b2090ee24",
428
+ "compute_units": "cpuOnly",
429
+ "kind": "model"
430
+ },
431
+ {
432
+ "name": "SigLIP_TextEncoder.mlpackage.zip",
433
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/SigLIP_TextEncoder.mlpackage.zip",
434
+ "archive": "zip",
435
+ "size_bytes": 203975769,
436
+ "sha256": "9dead2d58705838aef7ad83c3bf4036698c78d872ca1cdd04f2c4a6272009ccf",
437
+ "compute_units": "cpuOnly",
438
+ "kind": "model"
439
+ },
440
+ {
441
+ "name": "siglip_vocab.json",
442
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/siglip_vocab.json",
443
+ "size_bytes": 673754,
444
+ "sha256": "b94b3a58e04f619936b3890804dff7c478522c07515ff748cf127c5443ee5229",
445
+ "kind": "vocab"
446
+ }
447
+ ],
448
+ "requirements": {
449
+ "min_ios": "17.0",
450
+ "min_ram_mb": 800
451
+ },
452
+ "license": {
453
+ "name": "Apache-2.0",
454
+ "url": "https://github.com/google-research/big_vision"
455
+ },
456
+ "upstream": {
457
+ "name": "google-research/big_vision",
458
+ "url": "https://github.com/google-research/big_vision",
459
+ "year": 2023
460
+ }
461
+ },
462
+ {
463
+ "id": "florence2",
464
+ "name": "Florence-2",
465
+ "subtitle": "Microsoft, 2024",
466
+ "category_id": "vision_language",
467
+ "description_md": "Vision-language captioning, OCR, and VQA. Three-stage encoder-decoder. 768×768 input.",
468
+ "demo": {
469
+ "template": "image_to_text",
470
+ "config": {
471
+ "image_size": 768,
472
+ "max_tokens": 256,
473
+ "vision_encoder": "Florence2VisionEncoder.mlpackage.zip",
474
+ "text_encoder": "Florence2TextEncoder.mlpackage.zip",
475
+ "decoder": "Florence2Decoder.mlpackage.zip",
476
+ "vocab_file": "florence2_vocab.json",
477
+ "tasks": {
478
+ "caption": [
479
+ 0,
480
+ 2264,
481
+ 473,
482
+ 5,
483
+ 2274,
484
+ 6190,
485
+ 116,
486
+ 2
487
+ ],
488
+ "detailed_caption": [
489
+ 0,
490
+ 2264,
491
+ 473,
492
+ 5,
493
+ 31962,
494
+ 2274,
495
+ 6190,
496
+ 116,
497
+ 2
498
+ ],
499
+ "ocr": [
500
+ 0,
501
+ 2264,
502
+ 473,
503
+ 5,
504
+ 71307,
505
+ 116,
506
+ 2
507
+ ]
508
+ }
509
+ }
510
+ },
511
+ "files": [
512
+ {
513
+ "name": "Florence2VisionEncoder.mlpackage.zip",
514
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2VisionEncoder.mlpackage.zip",
515
+ "archive": "zip",
516
+ "size_bytes": 81198683,
517
+ "sha256": "9422f189c21220a0f9966eb9d780856772feb55597dcc579fc4e3c88990d0046",
518
+ "compute_units": "cpuOnly",
519
+ "kind": "model"
520
+ },
521
+ {
522
+ "name": "Florence2TextEncoder.mlpackage.zip",
523
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2TextEncoder.mlpackage.zip",
524
+ "archive": "zip",
525
+ "size_bytes": 72742890,
526
+ "sha256": "f985deeef0408ea8aac33ac4f5c6d9635cd9c64c98b53f85031db6e27f3bfd92",
527
+ "compute_units": "cpuOnly",
528
+ "kind": "model"
529
+ },
530
+ {
531
+ "name": "Florence2Decoder.mlpackage.zip",
532
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2Decoder.mlpackage.zip",
533
+ "archive": "zip",
534
+ "size_bytes": 85329746,
535
+ "sha256": "fe85a6faab5281272bcd79dabfbf87d60ba1a78dd9455e2bf71c67a134d61dc5",
536
+ "compute_units": "cpuOnly",
537
+ "kind": "model"
538
+ },
539
+ {
540
+ "name": "florence2_vocab.json",
541
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/florence2_vocab.json",
542
+ "size_bytes": 999352,
543
+ "sha256": "861fee9af5520403f6dbb4940d6af6627f1481b71cdc4a870f1f61344e57e645",
544
+ "kind": "vocab"
545
+ }
546
+ ],
547
+ "requirements": {
548
+ "min_ios": "17.0",
549
+ "min_ram_mb": 1200
550
+ },
551
+ "license": {
552
+ "name": "MIT",
553
+ "url": "https://huggingface.co/microsoft/Florence-2-base"
554
+ },
555
+ "upstream": {
556
+ "name": "microsoft/Florence-2",
557
+ "url": "https://huggingface.co/microsoft/Florence-2-base",
558
+ "year": 2024
559
+ }
560
+ },
561
+ {
562
+ "id": "face3d",
563
+ "name": "3DDFA V2",
564
+ "subtitle": "3D Face Reconstruction, 2020",
565
+ "category_id": "face",
566
+ "description_md": "Single-image 3D face reconstruction. Predicts 6 DoF pose + expression parameters.",
567
+ "demo": {
568
+ "template": "face_3d",
569
+ "config": {
570
+ "input_size": 120
571
+ }
572
+ },
573
+ "files": [
574
+ {
575
+ "name": "3DDFA_V2.mlpackage.zip",
576
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/face3d/3DDFA_V2.mlpackage.zip",
577
+ "archive": "zip",
578
+ "size_bytes": 6083375,
579
+ "sha256": "0f715dc220c046f558e3b8fc65246df9a2eec77182830a16628783430cdacdc8",
580
+ "compute_units": "all",
581
+ "kind": "model"
582
+ }
583
+ ],
584
+ "requirements": {
585
+ "min_ios": "17.0",
586
+ "min_ram_mb": 200
587
+ },
588
+ "license": {
589
+ "name": "MIT",
590
+ "url": "https://github.com/cleardusk/3DDFA_V2"
591
+ },
592
+ "upstream": {
593
+ "name": "cleardusk/3DDFA_V2",
594
+ "url": "https://github.com/cleardusk/3DDFA_V2",
595
+ "year": 2020
596
+ }
597
+ },
598
+ {
599
+ "id": "hypersd",
600
+ "name": "Hyper-SD (1-Step)",
601
+ "subtitle": "ByteDance, 2024",
602
+ "category_id": "generation",
603
+ "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512. Chunked UNet (6-bit).",
604
+ "demo": {
605
+ "template": "text_to_image",
606
+ "config": {
607
+ "image_size": 512,
608
+ "latent_size": 64,
609
+ "latent_channels": 4,
610
+ "steps": 1,
611
+ "guidance_scale": 1.0,
612
+ "text_encoder": "HyperSDTextEncoder.mlpackage.zip",
613
+ "unet_chunk1": "HyperSDUnetChunk1.mlpackage.zip",
614
+ "unet_chunk2": "HyperSDUnetChunk2.mlpackage.zip",
615
+ "vae_decoder": "HyperSDVAEDecoder.mlpackage.zip",
616
+ "vocab_file": "vocab.json",
617
+ "merges_file": "merges.txt"
618
+ }
619
+ },
620
+ "files": [
621
+ {
622
+ "name": "HyperSDTextEncoder.mlpackage.zip",
623
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDTextEncoder.mlpackage.zip",
624
+ "archive": "zip",
625
+ "size_bytes": 226397794,
626
+ "sha256": "201b0fcc3573811aac6a4e8545c695bc4fb2f7710ea0d60c227919d87b37687e",
627
+ "compute_units": "cpuAndNeuralEngine",
628
+ "kind": "model"
629
+ },
630
+ {
631
+ "name": "HyperSDUnetChunk1.mlpackage.zip",
632
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDUnetChunk1.mlpackage.zip",
633
+ "archive": "zip",
634
+ "size_bytes": 324819653,
635
+ "sha256": "279da11b8231aeeb9045f6ceabebb3a68c20a1b86ecc81aa6914b77ce76d5203",
636
+ "compute_units": "cpuAndNeuralEngine",
637
+ "kind": "model"
638
+ },
639
+ {
640
+ "name": "HyperSDUnetChunk2.mlpackage.zip",
641
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDUnetChunk2.mlpackage.zip",
642
+ "archive": "zip",
643
+ "size_bytes": 304530429,
644
+ "sha256": "0a700d11a105da589bb3e5666e38b9c72fa283149951b253fc11722e70e72faa",
645
+ "compute_units": "cpuAndNeuralEngine",
646
+ "kind": "model"
647
+ },
648
+ {
649
+ "name": "HyperSDVAEDecoder.mlpackage.zip",
650
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDVAEDecoder.mlpackage.zip",
651
+ "archive": "zip",
652
+ "size_bytes": 91282754,
653
+ "sha256": "1260371542d845a2261ed2de36c5fe3e9ccb740a6ceb59b1990705d125e8cf66",
654
+ "compute_units": "cpuAndNeuralEngine",
655
+ "kind": "model"
656
+ },
657
+ {
658
+ "name": "vocab.json",
659
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/vocab.json",
660
+ "size_bytes": 1059962,
661
+ "sha256": "e089ad92ba36837a0d31433e555c8f45fe601ab5c221d4f607ded32d9f7a4349",
662
+ "kind": "vocab"
663
+ },
664
+ {
665
+ "name": "merges.txt",
666
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/merges.txt",
667
+ "size_bytes": 524619,
668
+ "sha256": "9fd691f7c8039210e0fced15865466c65820d09b63988b0174bfe25de299051a",
669
+ "kind": "vocab"
670
+ }
671
+ ],
672
+ "requirements": {
673
+ "min_ios": "17.0",
674
+ "min_ram_mb": 1000
675
+ },
676
+ "license": {
677
+ "name": "OpenRAIL-M",
678
+ "url": "https://huggingface.co/ByteDance/Hyper-SD"
679
+ },
680
+ "upstream": {
681
+ "name": "ByteDance/Hyper-SD",
682
+ "url": "https://huggingface.co/ByteDance/Hyper-SD",
683
+ "year": 2024
684
+ }
685
+ },
686
+ {
687
+ "id": "matanyone",
688
+ "name": "MatAnyone",
689
+ "subtitle": "Video Matting, 2025",
690
+ "category_id": "video",
691
+ "description_md": "Temporally consistent video matting. 5-model pipeline with memory propagation.",
692
+ "demo": {
693
+ "template": "video_matting",
694
+ "config": {
695
+ "frame_size": 512,
696
+ "encoder": "MatAnyone_encoder.mlpackage.zip",
697
+ "mask_encoder": "MatAnyone_mask_encoder.mlpackage.zip",
698
+ "read_first": "MatAnyone_read_first.mlpackage.zip",
699
+ "read": "MatAnyone_read.mlpackage.zip",
700
+ "decoder": "MatAnyone_decoder.mlpackage.zip"
701
+ }
702
+ },
703
+ "files": [
704
+ {
705
+ "name": "MatAnyone_encoder.mlpackage.zip",
706
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_encoder.mlpackage.zip",
707
+ "archive": "zip",
708
+ "size_bytes": 17306121,
709
+ "sha256": "97ffd6bc4611f9a3351dc890fc00954ba48171e517e66a39f7a5f1f38110dfda",
710
+ "compute_units": "all",
711
+ "kind": "model"
712
+ },
713
+ {
714
+ "name": "MatAnyone_mask_encoder.mlpackage.zip",
715
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_mask_encoder.mlpackage.zip",
716
+ "archive": "zip",
717
+ "size_bytes": 16819866,
718
+ "sha256": "ba67559188ffc64d8e46418c051c6a55815d4482def17519fa518daac7d5a911",
719
+ "compute_units": "all",
720
+ "kind": "model"
721
+ },
722
+ {
723
+ "name": "MatAnyone_read_first.mlpackage.zip",
724
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_read_first.mlpackage.zip",
725
+ "archive": "zip",
726
+ "size_bytes": 21991849,
727
+ "sha256": "34daf7227dbcec7373a3fef175259fa7ec631ed8cb91d5595ca57ee9b22df7bb",
728
+ "compute_units": "cpuOnly",
729
+ "kind": "model"
730
+ },
731
+ {
732
+ "name": "MatAnyone_read.mlpackage.zip",
733
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_read.mlpackage.zip",
734
+ "archive": "zip",
735
+ "size_bytes": 22135429,
736
+ "sha256": "052e52c0ffb7ff9ede448128950cd4c1c9a96589b6900c82b5104d99addb7fa5",
737
+ "compute_units": "cpuOnly",
738
+ "kind": "model"
739
+ },
740
+ {
741
+ "name": "MatAnyone_decoder.mlpackage.zip",
742
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_decoder.mlpackage.zip",
743
+ "archive": "zip",
744
+ "size_bytes": 8807630,
745
+ "sha256": "67136aa67000e604838fe9aa7de151c514ef84f0b83f1da0f043cf70652d28eb",
746
+ "compute_units": "all",
747
+ "kind": "model"
748
+ }
749
+ ],
750
+ "requirements": {
751
+ "min_ios": "17.0",
752
+ "min_ram_mb": 800
753
+ },
754
+ "license": {
755
+ "name": "NTU S-Lab License 1.0",
756
+ "url": "https://github.com/pq-yang/MatAnyone"
757
+ },
758
+ "upstream": {
759
+ "name": "pq-yang/MatAnyone",
760
+ "url": "https://github.com/pq-yang/MatAnyone",
761
+ "year": 2025
762
+ }
763
+ },
764
+ {
765
+ "id": "demucs",
766
+ "name": "HTDemucs",
767
+ "subtitle": "Audio Source Separation",
768
+ "category_id": "audio",
769
+ "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, FP32.",
770
+ "demo": {
771
+ "template": "audio_in_out",
772
+ "config": {
773
+ "sample_rate": 44100,
774
+ "segment_length": 343980,
775
+ "output_stems": [
776
+ "drums",
777
+ "bass",
778
+ "vocals",
779
+ "other"
780
+ ]
781
+ }
782
+ },
783
+ "files": [
784
+ {
785
+ "name": "HTDemucs_SourceSeparation_F32.mlpackage.zip",
786
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/demucs/HTDemucs_SourceSeparation_F32.mlpackage.zip",
787
+ "archive": "zip",
788
+ "size_bytes": 79076395,
789
+ "sha256": "0fbb941e15a5b2fa425d14fe630ed4c14b6dee72780c1f5b2b05f58803bce5f7",
790
+ "compute_units": "cpuOnly",
791
+ "kind": "model"
792
+ }
793
+ ],
794
+ "requirements": {
795
+ "min_ios": "17.0",
796
+ "min_ram_mb": 1000
797
+ },
798
+ "license": {
799
+ "name": "MIT",
800
+ "url": "https://github.com/adefossez/demucs"
801
+ },
802
+ "upstream": {
803
+ "name": "adefossez/demucs",
804
+ "url": "https://github.com/adefossez/demucs",
805
+ "year": 2021
806
+ }
807
+ },
808
+ {
809
+ "id": "kokoro",
810
+ "name": "Kokoro-82M",
811
+ "subtitle": "Multilingual TTS",
812
+ "category_id": "speech",
813
+ "description_md": "English + Japanese text-to-speech. 24 kHz. StyleTTS2 + iSTFTNet vocoder. Multiple voices.",
814
+ "demo": {
815
+ "template": "text_to_audio",
816
+ "config": {
817
+ "mode": "tts",
818
+ "sample_rate": 24000,
819
+ "vocab_file": "kokoro_vocab.json",
820
+ "voices": [
821
+ "af_heart",
822
+ "af_bella",
823
+ "am_michael",
824
+ "bf_emma",
825
+ "bm_george"
826
+ ]
827
+ }
828
+ },
829
+ "files": [
830
+ {
831
+ "name": "Kokoro_Predictor.mlpackage.zip",
832
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Predictor.mlpackage.zip",
833
+ "archive": "zip",
834
+ "size_bytes": 72191470,
835
+ "sha256": "af1d55dc842980c32b5591a70f603941f11ab60a435bed0c13a107a8ef467bed",
836
+ "compute_units": "cpuAndGPU",
837
+ "kind": "model"
838
+ },
839
+ {
840
+ "name": "Kokoro_Decoder_128.mlpackage.zip",
841
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_128.mlpackage.zip",
842
+ "archive": "zip",
843
+ "size_bytes": 229120589,
844
+ "sha256": "cece0d072f5ba6aa3f729cf4c76b4de51823bcc65a26ab363c10441c3cd8b306",
845
+ "compute_units": "all",
846
+ "kind": "model"
847
+ },
848
+ {
849
+ "name": "Kokoro_Decoder_256.mlpackage.zip",
850
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_256.mlpackage.zip",
851
+ "archive": "zip",
852
+ "size_bytes": 229123438,
853
+ "sha256": "36d5e16d5c5ccb500fc96f1b07a1d5ac57b791f8e09e61b78319d76949003efe",
854
+ "compute_units": "all",
855
+ "kind": "model"
856
+ },
857
+ {
858
+ "name": "Kokoro_Decoder_512.mlpackage.zip",
859
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_512.mlpackage.zip",
860
+ "archive": "zip",
861
+ "size_bytes": 229128735,
862
+ "sha256": "0a44484c327e4fe8443b0bcf104d6964fe3f30d628c9e78aee3f31af7f2475dc",
863
+ "compute_units": "all",
864
+ "kind": "model"
865
+ },
866
+ {
867
+ "name": "kokoro_vocab.json",
868
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/kokoro_vocab.json",
869
+ "size_bytes": 1144,
870
+ "sha256": "70abefbe8a1c8865e43e0a43bbdc25b91a33e4aa053479d443ccf23e20a59e5d",
871
+ "kind": "vocab"
872
+ }
873
+ ],
874
+ "requirements": {
875
+ "min_ios": "17.0",
876
+ "min_ram_mb": 1000
877
+ },
878
+ "license": {
879
+ "name": "Apache-2.0",
880
+ "url": "https://huggingface.co/hexgrad/Kokoro-82M"
881
+ },
882
+ "upstream": {
883
+ "name": "hexgrad/Kokoro-82M",
884
+ "url": "https://huggingface.co/hexgrad/Kokoro-82M",
885
+ "year": 2024
886
+ }
887
+ },
888
+ {
889
+ "id": "stable_audio",
890
+ "name": "Stable Audio Open",
891
+ "subtitle": "Text-to-Music, 2024",
892
+ "category_id": "speech",
893
+ "description_md": "Text-to-music. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 + Oobleck VAE.",
894
+ "demo": {
895
+ "template": "text_to_audio",
896
+ "config": {
897
+ "mode": "music",
898
+ "sample_rate": 44100,
899
+ "max_duration": 11.9
900
+ }
901
+ },
902
+ "files": [
903
+ {
904
+ "name": "StableAudioT5Encoder.mlpackage.zip",
905
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioT5Encoder.mlpackage.zip",
906
+ "archive": "zip",
907
+ "size_bytes": 98538259,
908
+ "sha256": "319a8ba775d309240253ced68a03a3923d0aec9a79f608044f9403bdcfe4b741",
909
+ "compute_units": "cpuOnly",
910
+ "kind": "model"
911
+ },
912
+ {
913
+ "name": "StableAudioNumberEmbedder.mlpackage.zip",
914
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioNumberEmbedder.mlpackage.zip",
915
+ "archive": "zip",
916
+ "size_bytes": 376018,
917
+ "sha256": "04bdc5de00a2cf1c4a18f80c94f0d74ecfab41f3ad99f2fb7a031d6ff5af75da",
918
+ "compute_units": "cpuOnly",
919
+ "kind": "model"
920
+ },
921
+ {
922
+ "name": "StableAudioDiT.mlpackage.zip",
923
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioDiT.mlpackage.zip",
924
+ "archive": "zip",
925
+ "size_bytes": 1265748504,
926
+ "sha256": "b17da4fc4df857821d39dbdf7d3bfe7062a2272ab3e5df1284d545afb54047e4",
927
+ "compute_units": "cpuOnly",
928
+ "kind": "model"
929
+ },
930
+ {
931
+ "name": "StableAudioVAEDecoder.mlpackage.zip",
932
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioVAEDecoder.mlpackage.zip",
933
+ "archive": "zip",
934
+ "size_bytes": 144960275,
935
+ "sha256": "7207544cca9799cc1d6803c5e81badd0bb4527b2d3a64d5cab5700a5f19a9374",
936
+ "compute_units": "cpuAndGPU",
937
+ "kind": "model"
938
+ },
939
+ {
940
+ "name": "t5_vocab.json",
941
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/t5_vocab.json",
942
+ "size_bytes": 749757,
943
+ "sha256": "7c9ff3ac1b3dbcaa617ee659f2df68688cfd44f1a5eb3be3fa0a2f56c749d56a",
944
+ "kind": "vocab"
945
+ }
946
+ ],
947
+ "requirements": {
948
+ "min_ios": "17.0",
949
+ "min_ram_mb": 1200
950
+ },
951
+ "license": {
952
+ "name": "custom",
953
+ "url": "https://huggingface.co/stabilityai/stable-audio-open-small"
954
+ },
955
+ "upstream": {
956
+ "name": "stabilityai/stable-audio-open-small",
957
+ "url": "https://huggingface.co/stabilityai/stable-audio-open-small",
958
+ "year": 2024
959
+ }
960
+ },
961
+ {
962
+ "id": "openvoice",
963
+ "name": "OpenVoice V2",
964
+ "subtitle": "Voice Cloning",
965
+ "category_id": "audio",
966
+ "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio.",
967
+ "demo": {
968
+ "template": "audio_in_out",
969
+ "config": {
970
+ "sample_rate": 22050,
971
+ "output_stems": [
972
+ "converted"
973
+ ]
974
+ }
975
+ },
976
+ "files": [
977
+ {
978
+ "name": "OpenVoice_SpeakerEncoder.mlpackage.zip",
979
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_SpeakerEncoder.mlpackage.zip",
980
+ "archive": "zip",
981
+ "size_bytes": 1519880,
982
+ "sha256": "c3f2a96aaf5ecb5c5afc62b3d3dfbd47dc7ae64bc3edb7aa68befb54aef74459",
983
+ "compute_units": "cpuAndGPU",
984
+ "kind": "model"
985
+ },
986
+ {
987
+ "name": "OpenVoice_VoiceConverter.mlpackage.zip",
988
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_VoiceConverter.mlpackage.zip",
989
+ "archive": "zip",
990
+ "size_bytes": 59799630,
991
+ "sha256": "ef3ce8a2d1564aefa13830d7d0ca43f85e0aa62d5f59622c8bc456c307ab5e05",
992
+ "compute_units": "cpuAndGPU",
993
+ "kind": "model"
994
+ }
995
+ ],
996
+ "requirements": {
997
+ "min_ios": "17.0",
998
+ "min_ram_mb": 500
999
+ },
1000
+ "license": {
1001
+ "name": "MIT",
1002
+ "url": "https://github.com/myshell-ai/OpenVoice"
1003
+ },
1004
+ "upstream": {
1005
+ "name": "myshell-ai/OpenVoice",
1006
+ "url": "https://github.com/myshell-ai/OpenVoice",
1007
+ "year": 2023
1008
+ }
1009
+ },
1010
+ {
1011
+ "id": "diarization",
1012
+ "name": "Pyannote Diarization",
1013
+ "subtitle": "Speaker Identification",
1014
+ "category_id": "audio",
1015
+ "description_md": "Speaker diarization: who spoke when. 16 kHz mono, 10s segments.",
1016
+ "demo": {
1017
+ "template": "audio_in_out",
1018
+ "config": {
1019
+ "sample_rate": 16000,
1020
+ "output_stems": [
1021
+ "speaker_timeline"
1022
+ ]
1023
+ }
1024
+ },
1025
+ "files": [
1026
+ {
1027
+ "name": "SpeakerSegmentation.mlpackage.zip",
1028
+ "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/diarization/SpeakerSegmentation.mlpackage.zip",
1029
+ "archive": "zip",
1030
+ "size_bytes": 5327137,
1031
+ "sha256": "dcfa2b98900f2b99029abfb593644b70418186a6ec2e94c9a79c2b3d7a84378a",
1032
+ "compute_units": "cpuAndGPU",
1033
+ "kind": "model"
1034
+ }
1035
+ ],
1036
+ "requirements": {
1037
+ "min_ios": "17.0",
1038
+ "min_ram_mb": 200
1039
+ },
1040
+ "license": {
1041
+ "name": "MIT",
1042
+ "url": "https://github.com/pyannote/pyannote-audio"
1043
+ },
1044
+ "upstream": {
1045
+ "name": "pyannote/pyannote-audio",
1046
+ "url": "https://github.com/pyannote/pyannote-audio",
1047
+ "year": 2021
1048
+ }
1049
  }
1050
  ]
1051
+ }