StephanST committed on
Commit
67437d9
·
verified ·
1 Parent(s): 039a7d2

Upload folder using huggingface_hub

Browse files
h/cider-w8a8/README.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: nvidia-open-model-license
4
+ base_model: nvidia/C-RADIOv4-H
5
+ library_name: mlx
6
+ pipeline_tag: image-feature-extraction
7
+ tags:
8
+ - mlx
9
+ - c-radio
10
+ - vision
11
+ - embeddings
12
+ - quantized
13
+ - cider
14
+ - apple-silicon
15
+ ---
16
+
17
+ # C-RADIOv4-H MLX Cider W8A8
18
+
19
+ This is a self-contained MLX bundle converted from `nvidia/C-RADIOv4-H` for Apple
20
+ Silicon image embeddings.
21
+
22
+ Implementation repository:
23
+
24
+ https://github.com/stephansturges/c-radio_v4_MLX
25
+
26
+ ## Source
27
+
28
+ - Upstream model: https://huggingface.co/nvidia/C-RADIOv4-H
29
+ - Upstream revision: `0057b339059c0b9e1b4ba996f975410ebbfdfcc8`
30
+ - Converted bundle path in the local repo: `bundles/c-radiov4-h-cider-w8a8`
31
+
32
+ ## Format
33
+
34
+ - Runtime: MLX plus Cider
35
+ - Quantization: W8A8, per-channel int8 weights and online int8 activation quantization
36
+ - Required hardware: Apple M5 or newer
37
+ - Required package: https://github.com/Mininglamp-AI/cider
38
+ - Quantized tensors: 129
39
+ - Copied tensors: 261
40
+ - Bundle size observed locally: 685 MiB (about 718 MB; `model.safetensors` is 717,798,440 bytes)
41
+
42
+ ## Measured Accuracy
43
+
44
+ Cosine similarity of this bundle's outputs against the local bf16 MLX bundle's outputs on `data/golden_images/smoke.jpg`:
45
+
46
+ | Image size | Summary cosine | Spatial cosine |
47
+ | ---: | ---: | ---: |
48
+ | 256x256 | 0.998008 | 0.997055 |
49
+ | 512x512 | 0.997202 | 0.996210 |
50
+
51
+ ## Measured Speed
52
+
53
+ Apple M5 Max, `mlx==0.31.2`, Cider `0.7.0`, no output materialization:
54
+
55
+ | Resolution | Batch | p50 latency | Throughput |
56
+ | ---: | ---: | ---: | ---: |
57
+ | 256x256 | 1 | 15.5 ms | 64.6 images/s |
58
+ | 256x256 | 4 | 37.4 ms | 106.9 images/s |
59
+ | 512x512 | 1 | 47.1 ms | 21.2 images/s |
60
+ | 512x512 | 4 | 179.6 ms | 22.3 images/s |
61
+
62
+ ## Usage
63
+
64
+ Install Cider and the implementation package from the repository linked above in a Python `>=3.12` environment, then run:
65
+
66
+ ```sh
67
+ cradio-mlx embed \
68
+ --backend mlx-h \
69
+ --checkpoint /path/to/this/bundle \
70
+ --image image.jpg \
71
+ --image-size 512 \
72
+ --dtype bfloat16 \
73
+ --save-npz embedding.npz
74
+ ```
75
+
76
+ ## License
77
+
78
+ The implementation code in `c-radio_v4_MLX` is MIT licensed. The model weights and this
79
+ converted bundle are governed by NVIDIA's Open Model License Agreement, not by the MIT
80
+ license. Preserve NVIDIA provenance and license terms when redistributing this bundle.
81
+
82
+ NVIDIA Open Model License Agreement:
83
+
84
+ https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
h/cider-w8a8/config.json ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "adaptor_configs": {},
3
+ "adaptor_names": null,
4
+ "architectures": [
5
+ "RADIOModel"
6
+ ],
7
+ "args": {
8
+ "aa": null,
9
+ "amp": true,
10
+ "amp_dtype": "bfloat16",
11
+ "amp_impl": "native",
12
+ "aug_repeats": 0,
13
+ "aug_splits": 0,
14
+ "auto_workload_inspector": false,
15
+ "bn_eps": null,
16
+ "bn_momentum": null,
17
+ "cache_dir": null,
18
+ "channels_last": false,
19
+ "checkpoint_folder": null,
20
+ "checkpoint_hist": 10,
21
+ "chk_keep_forever": 100,
22
+ "class_map": "",
23
+ "clip_grad": null,
24
+ "clip_mode": "norm",
25
+ "cls_token_per_teacher": true,
26
+ "coco_annotations_file": "/datasets/coco2017-adlsa/annotations/captions_val2017.json",
27
+ "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
28
+ "color_jitter": 0.4,
29
+ "cooldown_epochs": 0,
30
+ "cpe_max_size": 2048,
31
+ "cpe_num_registers": null,
32
+ "crd_loss": false,
33
+ "crd_loss_weight": 0.8,
34
+ "crop_pct": null,
35
+ "cutmix": 0.0,
36
+ "cutmix_minmax": null,
37
+ "dataset_download": false,
38
+ "debug_full_knn": false,
39
+ "decay_epochs": 90,
40
+ "decay_milestones": [
41
+ 90,
42
+ 180,
43
+ 270
44
+ ],
45
+ "decay_rate": 0.1,
46
+ "depchain": true,
47
+ "detect_anomaly": false,
48
+ "dist_bn": "reduce",
49
+ "dist_norm_weight": 0.0,
50
+ "distributed": true,
51
+ "drop": 0.0,
52
+ "drop_block": null,
53
+ "drop_connect": null,
54
+ "drop_path": null,
55
+ "dtype": "float32",
56
+ "epoch": 299,
57
+ "epoch_repeats": 0.0,
58
+ "eval": false,
59
+ "eval_metric": "knn_top1",
60
+ "eval_teacher": false,
61
+ "eval_teacher_only": false,
62
+ "eval_throughput": false,
63
+ "fast_norm": false,
64
+ "fd_loss_fn": "MSE",
65
+ "feature_normalization": "PHI_STANDARDIZE",
66
+ "feature_summarizer": "cls_token",
67
+ "feature_upscale_factor": null,
68
+ "force_disable_damp": false,
69
+ "force_disable_spectral_reparam": false,
70
+ "force_new_wandb_id": false,
71
+ "force_spectral_reparam": false,
72
+ "freeze_bn": false,
73
+ "fsdp": true,
74
+ "full_equivariance": false,
75
+ "fuser": "",
76
+ "gp": null,
77
+ "grad_accum_steps": 1,
78
+ "grad_checkpointing": false,
79
+ "head_init_bias": null,
80
+ "head_init_scale": null,
81
+ "head_lr": null,
82
+ "head_warmup": 3,
83
+ "head_weight_decay": 0.0005,
84
+ "hflip": 0.5,
85
+ "img_size": null,
86
+ "in_chans": null,
87
+ "initial_checkpoint": null,
88
+ "input_size": null,
89
+ "interpolation": "",
90
+ "layer_decay": null,
91
+ "local_rank": 0,
92
+ "log_interval": 50,
93
+ "log_mlflow": false,
94
+ "log_teacher_timings": true,
95
+ "log_train_metrics_per_epoch": true,
96
+ "log_train_metrics_per_log_interval": true,
97
+ "log_wandb": true,
98
+ "loss_auto_balance": false,
99
+ "lr_base": 0.1,
100
+ "lr_base_scale": "",
101
+ "lr_base_size": 256,
102
+ "lr_cycle_decay": 0.5,
103
+ "lr_cycle_limit": 1,
104
+ "lr_cycle_mul": 1.0,
105
+ "lr_k_decay": 1.0,
106
+ "lr_noise": null,
107
+ "lr_noise_pct": 0.67,
108
+ "lr_noise_std": 1.0,
109
+ "mean": null,
110
+ "mesa": false,
111
+ "min_lr": 1e-05,
112
+ "mixup": 0.0,
113
+ "mixup_mode": "batch",
114
+ "mixup_off_epoch": 0,
115
+ "mixup_prob": 1.0,
116
+ "mixup_switch_prob": 0.5,
117
+ "mlp_hidden_size": 1520,
118
+ "mlp_num_inner": 2,
119
+ "mlp_version": "v2",
120
+ "model": "vit_huge_patch16_224",
121
+ "model_kwargs": {},
122
+ "model_norm": false,
123
+ "momentum": 0.9,
124
+ "no_custom_validation": false,
125
+ "no_ddp_bb": true,
126
+ "no_knn": false,
127
+ "no_prefetcher": false,
128
+ "no_resume_opt": false,
129
+ "no_save_checkpoint": false,
130
+ "no_val": false,
131
+ "num_classes": null,
132
+ "on_demand_workload_inspector": false,
133
+ "one_logger_app_tag": "",
134
+ "one_logger_is_baseline": false,
135
+ "one_logger_run_name": "",
136
+ "onelogger": null,
137
+ "opt_betas": null,
138
+ "opt_eps": null,
139
+ "overfit": false,
140
+ "patience_epochs": 10,
141
+ "perf_test_no_aug": false,
142
+ "perf_test_no_decode": false,
143
+ "perf_test_no_io": false,
144
+ "perf_test_only_dataloader": false,
145
+ "perf_test_simple_aug": false,
146
+ "pin_mem": false,
147
+ "prefetcher": true,
148
+ "pretrained": false,
149
+ "processed_neck_outputs": null,
150
+ "profile_train_exit_after_profiling": false,
151
+ "profile_train_export_chrome_trace": true,
152
+ "profile_train_export_csv": false,
153
+ "profile_train_iterations": 0,
154
+ "qradio": false,
155
+ "qradio_max_tokens": 512,
156
+ "qradio_min_tokens": 32,
157
+ "qradio_patch_token_mask_initial_ratio": 0.95,
158
+ "qradio_progressive_2d": false,
159
+ "qradio_quantizer": null,
160
+ "qradio_ramp_alpha": 1.5,
161
+ "rank": 0,
162
+ "ratio": [
163
+ 0.75,
164
+ 1.3333333333333333
165
+ ],
166
+ "recount": 1,
167
+ "recovery_interval": 0,
168
+ "register_multiple": 10,
169
+ "remode": "pixel",
170
+ "reprob": 0.0,
171
+ "reset_loss_state": true,
172
+ "resplit": false,
173
+ "sample_tracking": false,
174
+ "save_images": false,
175
+ "scale": [
176
+ 0.5,
177
+ 1.0
178
+ ],
179
+ "sched": "cosine",
180
+ "seed": 42,
181
+ "shift_equivariance": false,
182
+ "smoothing": 0.1,
183
+ "source_tracking": false,
184
+ "spectral_heads": false,
185
+ "spectral_reparam": false,
186
+ "spectral_weight_decay": null,
187
+ "split_bn": false,
188
+ "start_epoch": null,
189
+ "std": null,
190
+ "stream_teachers": false,
191
+ "student_intermediate_indices": null,
192
+ "student_load_skip_state_dict_keys_regex": null,
193
+ "student_reinit_model_layers_regex": null,
194
+ "student_strict_load_ignore_mismatched_shape_keys_regex": null,
195
+ "student_strict_load_ignore_missing_keys_regex": null,
196
+ "student_strict_load_ignore_unexpected_keys_regex": null,
197
+ "student_strict_load_state_dict": false,
198
+ "sync_bn": false,
199
+ "sync_resolutions_across_ranks": true,
200
+ "synchronize_step": false,
201
+ "teachers": [
202
+ {
203
+ "model": "siglip2-g-384",
204
+ "name": "siglip2-g",
205
+ "spatial_mlp_version": "attn",
206
+ "type": "siglip2",
207
+ "use_summary": true
208
+ },
209
+ {
210
+ "model": "dinov3_vit7b16",
211
+ "name": "dino_v3_7b",
212
+ "type": "dino_v3",
213
+ "use_summary": true
214
+ },
215
+ {
216
+ "model": "default",
217
+ "name": "sam3",
218
+ "type": "sam3",
219
+ "use_summary": false
220
+ }
221
+ ],
222
+ "timing_warmup_iters": 20,
223
+ "tokenizer_kwargs": {},
224
+ "tokenizer_type": null,
225
+ "tome": null,
226
+ "torchcompile": null,
227
+ "torchscript": false,
228
+ "train_interpolation": "random",
229
+ "train_split": "train",
230
+ "tta": 0,
231
+ "untie_neck_weights": false,
232
+ "use_coco": false,
233
+ "use_multi_epochs_loader": false,
234
+ "val_ema_only": false,
235
+ "val_split": "val",
236
+ "vflip": 0.0,
237
+ "vitdet_version": 1,
238
+ "wandb_entity": "",
239
+ "wandb_id": "",
240
+ "wandb_job_type": "",
241
+ "wandb_name": "",
242
+ "wandb_project": "",
243
+ "wandb_tags": null,
244
+ "warmup_lr": 1e-05,
245
+ "warmup_prefix": false,
246
+ "worker_seeding": "all",
247
+ "workers": 8,
248
+ "workload_inspector_analyze_nsys_traces": false,
249
+ "workload_inspector_baseline_start_iter": 1500,
250
+ "workload_inspector_major_slowdown_p95_factor": 10.0,
251
+ "workload_inspector_minor_slowdown_p95_factor": 3.0,
252
+ "workload_inspector_no_slowdown_check": false,
253
+ "workload_inspector_simulate_slowdown_num_times": 1,
254
+ "workload_inspector_simulate_slowdown_start_iter": null,
255
+ "world_size": 256
256
+ },
257
+ "auto_map": {
258
+ "AutoConfig": "hf_model.RADIOConfig",
259
+ "AutoModel": "hf_model.RADIOModel"
260
+ },
261
+ "feature_normalizer_config": null,
262
+ "inter_feature_normalizer_config": null,
263
+ "max_resolution": 2048,
264
+ "patch_size": 16,
265
+ "preferred_resolution": [
266
+ 512,
267
+ 512
268
+ ],
269
+ "torch_dtype": "float32",
270
+ "transformers_version": "4.51.3",
271
+ "version": "c-radio_v4-h",
272
+ "vitdet_window_size": null
273
+ }
h/cider-w8a8/manifest.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "created_at": "2026-05-22T10:33:29Z",
3
+ "dtype": "bfloat16",
4
+ "extra": {
5
+ "conversion_state": "quantized_self_contained",
6
+ "quantization_stats": {
7
+ "copied_tensors": 261,
8
+ "padded_features": 0,
9
+ "padded_tensors": 0,
10
+ "quantized_tensors": 129
11
+ },
12
+ "source_bundle": "bundles/c-radiov4-h-bf16",
13
+ "weights_file": "model.safetensors"
14
+ },
15
+ "license": "nvidia-open-model-license",
16
+ "manifest_version": 1,
17
+ "max_resolution": 2048,
18
+ "model_id": "nvidia/C-RADIOv4-H",
19
+ "patch_size": 16,
20
+ "preferred_resolution": 512,
21
+ "quantization": {
22
+ "activation_bits": 8,
23
+ "bits": 8,
24
+ "group_size": 0,
25
+ "mode": "cider-w8a8",
26
+ "runtime": "cider",
27
+ "scheme": "symmetric_per_channel",
28
+ "state": "packed_w8a8_runtime",
29
+ "weight_bits": 8
30
+ },
31
+ "revision": "0057b339059c0b9e1b4ba996f975410ebbfdfcc8",
32
+ "source_files": {
33
+ "README.md": "be08bbaf49b38c62eb8496ebfd9b92292b4258838ade3d0e13e18d86978192d7",
34
+ "config.json": "f141520df919f1c29cd4f5a95340d9a39b15fbacafa36ca1054c55c919b2af62",
35
+ "model.safetensors": "f15d9d0c907974743a26a448da573daa2f2b75019fc1b9f31a45f0fdf49a7ee1",
36
+ "preprocessor_config.json": "6ca88d165592015b02a183d26a20fc96748e837d1cb21bde5f611141aca593cf"
37
+ },
38
+ "variant": "h"
39
+ }
h/cider-w8a8/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f15d9d0c907974743a26a448da573daa2f2b75019fc1b9f31a45f0fdf49a7ee1
3
+ size 717798440
h/cider-w8a8/preprocessor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 512,
4
+ "width": 512
5
+ },
6
+ "do_center_crop": false,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": false,
9
+ "do_rescale": true,
10
+ "do_resize": false,
11
+ "image_processor_type": "CLIPImageProcessor",
12
+ "processor_class": "CLIPProcessor",
13
+ "resample": 3,
14
+ "size": {
15
+ "shortest_edge": 512
16
+ }
17
+ }