Safetensors
roman-bachmann committed on
Commit
bfb1430
·
0 Parent(s):

Initial commit

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +79 -0
  3. config.json +297 -0
  4. model.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apple-amlr
3
+ ---
4
+
5
+ # FlexTok: Resampling Images into 1D Token Sequences of Flexible Length
6
+
7
+ [`Website`](https://flextok.epfl.ch) | [`arXiv`](https://arxiv.org/abs/2502.13967) | [`GitHub`](https://github.com/apple/ml-flextok) | [`🤗 Demo`](https://huggingface.co/spaces/EPFL-VILAB/FlexTok) | [`BibTeX`](#citation)
8
+
9
+ Official implementation and pre-trained models for: <br>
10
+ [**FlexTok: Resampling Images into 1D Token Sequences of Flexible Length**](https://arxiv.org/abs/2502.13967), arXiv 2025 <br>
11
+ *[Roman Bachmann](https://roman-bachmann.github.io/)\*, [Jesse Allardice](https://github.com/JesseAllardice)\*, [David Mizrahi](https://dmizrahi.com/)\*, [Enrico Fini](https://scholar.google.com/citations?user=OQMtSKIAAAAJ), [Oğuzhan Fatih Kar](https://ofkar.github.io/), [Elmira Amirloo](https://elamirloo.github.io/), [Alaaeldin El-Nouby](https://aelnouby.github.io/), [Amir Zamir](https://vilab.epfl.ch/zamir/), [Afshin Dehghan](https://scholar.google.com/citations?user=wcX-UW4AAAAJ)*
12
+
13
+
14
+ ## Installation
15
+ For installation instructions, please see https://github.com/apple/ml-flextok.
16
+
17
+
18
+ ## Usage
19
+
20
+ To load the `FlexTok d18-d28 DFN` model directly from the Hugging Face Hub, call:
21
+ ```python
22
+ from flextok.flextok_wrapper import FlexTokFromHub
23
+ model = FlexTokFromHub.from_pretrained('EPFL-VILAB/flextok_d18_d28_dfn').eval()
24
+ ```
25
+
26
+ The model can also be loaded by manually downloading the `model.safetensors` checkpoint from this repository and loading it with our helper functions:
27
+ ```python
28
+ from hydra.utils import instantiate
29
+ from flextok.utils.checkpoint import load_safetensors
30
+
31
+ ckpt, config = load_safetensors('/path/to/model.safetensors')
32
+ model = instantiate(config).eval()
33
+ model.load_state_dict(ckpt)
34
+ ```
35
+
36
+ After loading a FlexTok model, image batches can be encoded using:
37
+ ```python
38
+ from flextok.utils.demo import imgs_from_urls
39
+ # Load example images of shape (B, 3, 256, 256), normalized to [-1,1]
40
+ imgs = imgs_from_urls(urls=['https://storage.googleapis.com/flextok_site/nb_demo_images/0.png'])
41
+
42
+ # tokens_list is a list of [1, 256] discrete token sequences
43
+ tokens_list = model.tokenize(imgs)
44
+ ```
45
+
46
+ The list of token sequences can be truncated in a nested fashion:
47
+ ```python
48
+ k_keep = 64 # For example, only keep the first 64 out of 256 tokens
49
+ tokens_list = [t[:,:k_keep] for t in tokens_list]
50
+ ```
51
+
52
+ To decode the tokens with FlexTok's rectified flow decoder, call:
53
+ ```python
54
+ # tokens_list is a list of [1, l] discrete token sequences, with l <= 256
55
+ # reconst is a [B, 3, 256, 256] tensor, normalized to [-1,1]
56
+ reconst = model.detokenize(
57
+ tokens_list,
58
+ timesteps=20, # Number of denoising steps
59
+ guidance_scale=7.5, # Classifier-free guidance scale
60
+ perform_norm_guidance=True, # See https://arxiv.org/abs/2410.02416
61
+ )
62
+ ```
63
+
64
+
65
+ ## Citation
66
+
67
+ If you find this repository helpful, please consider citing our work:
68
+ ```bibtex
69
+ @article{flextok,
70
+ title={{FlexTok}: Resampling Images into 1D Token Sequences of Flexible Length},
71
+ author={Roman Bachmann and Jesse Allardice and David Mizrahi and Enrico Fini and O{\u{g}}uzhan Fatih Kar and Elmira Amirloo and Alaaeldin El-Nouby and Amir Zamir and Afshin Dehghan},
72
+ journal={arXiv preprint arXiv:2502.13967},
73
+ year={2025},
74
+ }
75
+ ```
76
+
77
+ ## License
78
+
79
+ The model weights in this repository are released under the Apple Model License for Research.
config.json ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "encoder": {
3
+ "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
4
+ "module_dict": {
5
+ "enc_channels_to_last": {
6
+ "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
7
+ "read_key": "vae_latents",
8
+ "write_key": "vae_latents_bhwc",
9
+ "per_sample_op": {
10
+ "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
11
+ "_partial_": true
12
+ }
13
+ },
14
+ "enc_patch_emb": {
15
+ "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
16
+ "input_tensor_list_read_key": "vae_latents_bhwc",
17
+ "patches_list_write_key": "enc_vae_latents_patched",
18
+ "n_patches_write_key": "enc_n_patches",
19
+ "channels_in": 16,
20
+ "dim": 1152,
21
+ "patch_sizes": [
22
+ 2,
23
+ 2
24
+ ],
25
+ "flatten_patches": false
26
+ },
27
+ "enc_posemb_module": {
28
+ "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
29
+ "read_key": "enc_vae_latents_patched",
30
+ "write_key": "enc_vae_latents_patched",
31
+ "dim": 1152,
32
+ "max_sizes": [
33
+ 16,
34
+ 16
35
+ ],
36
+ "posemb_type": "sincos",
37
+ "posemb_scaling": "absolute"
38
+ },
39
+ "enc_register_module": {
40
+ "_target_": "flextok.model.preprocessors.registers.Registers1D",
41
+ "input_tensor_list_read_key": "enc_vae_latents_patched",
42
+ "register_sizes_read_write_key": "register_sizes",
43
+ "registers_write_key": "enc_registers",
44
+ "dim": 1152,
45
+ "n_min": 256,
46
+ "n_max": 256,
47
+ "size_sampling_mode": "uniform",
48
+ "ordering_mode": "nested"
49
+ },
50
+ "enc_seq_packer": {
51
+ "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
52
+ "input_list_read_keys": [
53
+ "enc_vae_latents_patched",
54
+ "enc_registers"
55
+ ],
56
+ "packed_seq_write_key": "enc_packed_seq",
57
+ "block_mask_write_key": "enc_block_mask",
58
+ "inner_packed_shapes_write_key": "enc_ps_inner",
59
+ "outer_packed_shapes_write_key": "enc_ps_outer",
60
+ "mask_mode": "causal_last",
61
+ "pad_to_multiple": 128
62
+ },
63
+ "enc_transformer": {
64
+ "_target_": "flextok.model.trunks.transformers.FlexTransformer",
65
+ "input_seq_read_key": "enc_packed_seq",
66
+ "output_seq_write_key": "enc_packed_seq",
67
+ "dim": 1152,
68
+ "depth": 18,
69
+ "block_mask_read_key": "enc_block_mask",
70
+ "use_act_checkpoint": true
71
+ },
72
+ "enc_unpacker": {
73
+ "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
74
+ "packed_seq_read_key": "enc_packed_seq",
75
+ "inner_seq_write_keys": [
76
+ "enc_vae_latents_patched",
77
+ "enc_registers"
78
+ ],
79
+ "inner_packed_shapes_read_key": "enc_ps_inner",
80
+ "outer_packed_shapes_read_key": "enc_ps_outer"
81
+ },
82
+ "enc_to_latents": {
83
+ "_target_": "flextok.model.postprocessors.heads.LinearHead",
84
+ "read_key": "enc_registers",
85
+ "write_key": "enc_registers",
86
+ "dim": 1152,
87
+ "dim_out": 6,
88
+ "use_mup_readout": false,
89
+ "weight_init_style": "zero",
90
+ "dtype_override": null
91
+ }
92
+ }
93
+ },
94
+ "decoder": {
95
+ "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
96
+ "module_dict": {
97
+ "dec_from_latents": {
98
+ "_target_": "flextok.model.preprocessors.linear.LinearLayer",
99
+ "read_key": "enc_registers_quant",
100
+ "write_key": "dec_registers_proj",
101
+ "dim_in": 6,
102
+ "dim": 1792
103
+ },
104
+ "dec_registers_posemb_module": {
105
+ "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
106
+ "read_key": "dec_registers_proj",
107
+ "write_key": "dec_registers_proj",
108
+ "dim": 1792,
109
+ "max_sizes": [
110
+ 256
111
+ ],
112
+ "posemb_type": "learnable_sum",
113
+ "posemb_scaling": "absolute"
114
+ },
115
+ "dec_nested_dropout": {
116
+ "_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout",
117
+ "read_write_key": "dec_registers_proj",
118
+ "dim": 1792,
119
+ "size_sampling_mode": "pow2"
120
+ },
121
+ "dec_latent_dropout": {
122
+ "_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond",
123
+ "read_write_key": "dec_registers_proj",
124
+ "dim": 1792,
125
+ "dropout_prob": 0.2
126
+ },
127
+ "dec_noise_channels_to_last": {
128
+ "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
129
+ "read_key": "vae_latents_noised",
130
+ "write_key": "vae_latents_noised_bhwc",
131
+ "per_sample_op": {
132
+ "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
133
+ "_partial_": true
134
+ }
135
+ },
136
+ "dec_noise_patch_emb": {
137
+ "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
138
+ "input_tensor_list_read_key": "vae_latents_noised_bhwc",
139
+ "patches_list_write_key": "vae_latents_noised_patched",
140
+ "n_patches_write_key": "dec_n_patches",
141
+ "channels_in": 16,
142
+ "dim": 1792,
143
+ "patch_sizes": [
144
+ 2,
145
+ 2
146
+ ],
147
+ "flatten_patches": false
148
+ },
149
+ "dec_patches_posemb_module": {
150
+ "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
151
+ "read_key": "vae_latents_noised_patched",
152
+ "write_key": "dec_patches",
153
+ "dim": 1792,
154
+ "max_sizes": [
155
+ 16,
156
+ 16
157
+ ],
158
+ "posemb_type": "sincos",
159
+ "posemb_scaling": "absolute"
160
+ },
161
+ "dec_seq_packer": {
162
+ "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
163
+ "input_list_read_keys": [
164
+ "dec_patches",
165
+ "dec_registers_proj"
166
+ ],
167
+ "packed_seq_write_key": "dec_packed_seq",
168
+ "block_mask_write_key": "dec_block_mask",
169
+ "inner_packed_shapes_write_key": "dec_ps_inner",
170
+ "outer_packed_shapes_write_key": "dec_ps_outer",
171
+ "emb_packing_fn_write_key": "emb_packing_fn",
172
+ "mask_mode": "full",
173
+ "pad_to_multiple": 128,
174
+ "per_subseq_embs": true
175
+ },
176
+ "dec_time_embedder": {
177
+ "_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder",
178
+ "timesteps_read_key": "timesteps",
179
+ "time_embedding_write_key": "dec_temb",
180
+ "dim": 1792,
181
+ "frequency_embedding_size": 256,
182
+ "max_timestep": 1000.0
183
+ },
184
+ "dec_transformer": {
185
+ "_target_": "flextok.model.trunks.transformers.FlexTransformer",
186
+ "input_seq_read_key": "dec_packed_seq",
187
+ "output_seq_write_key": "dec_packed_seq",
188
+ "dim": 1792,
189
+ "depth": 28,
190
+ "block_mask_read_key": "dec_block_mask",
191
+ "adaLN_emb_read_key": "dec_temb",
192
+ "adaLN_packing_fn_read_key": "emb_packing_fn",
193
+ "adaLN_expansion": 2,
194
+ "intermediate_layer_write_key": "dec_packed_seq_repa_layer",
195
+ "intermediate_layers": [
196
+ 1
197
+ ],
198
+ "use_act_checkpoint": true
199
+ },
200
+ "dec_unpacker": {
201
+ "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
202
+ "packed_seq_read_key": "dec_packed_seq",
203
+ "inner_seq_write_keys": [
204
+ "dec_patches",
205
+ "dec_registers_proj"
206
+ ],
207
+ "inner_packed_shapes_read_key": "dec_ps_inner",
208
+ "outer_packed_shapes_read_key": "dec_ps_outer"
209
+ },
210
+ "dec_repa_unpacker": {
211
+ "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
212
+ "packed_seq_read_key": "dec_packed_seq_repa_layer",
213
+ "inner_seq_write_keys": [
214
+ "dec_patches_repa_layer",
215
+ "dec_registers_repa_layer"
216
+ ],
217
+ "inner_packed_shapes_read_key": "dec_ps_inner",
218
+ "outer_packed_shapes_read_key": "dec_ps_outer"
219
+ },
220
+ "dec_to_patches": {
221
+ "_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead",
222
+ "read_key": "dec_patches",
223
+ "write_key": "dec_patches",
224
+ "dim": 1792,
225
+ "channels_out": 16,
226
+ "patch_sizes": [
227
+ 2,
228
+ 2
229
+ ],
230
+ "use_mup_readout": false,
231
+ "weight_init_style": "zero",
232
+ "adaLN_emb_read_key": "dec_temb"
233
+ },
234
+ "dec_channels_to_first": {
235
+ "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
236
+ "read_key": "dec_patches",
237
+ "write_key": "vae_latents_reconst",
238
+ "per_sample_op": {
239
+ "_target_": "flextok.model.utils.dict_ops.channels_last_to_first",
240
+ "_partial_": true
241
+ }
242
+ }
243
+ }
244
+ },
245
+ "pipeline": {
246
+ "_target_": "flextok.flow_matching.pipelines.MinRFPipeline",
247
+ "_partial_": true,
248
+ "target_sizes_read_key": null,
249
+ "latents_read_key": "enc_registers_quant",
250
+ "timesteps_read_key": "timesteps",
251
+ "noised_images_read_key": "vae_latents_noised",
252
+ "reconst_write_key": "vae_latents_reconst",
253
+ "out_channels": 16
254
+ },
255
+ "flow_matching_noise_module": {
256
+ "_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule",
257
+ "clean_images_read_key": "vae_latents",
258
+ "noised_images_write_key": "vae_latents_noised",
259
+ "noise_write_key": "flow_noise",
260
+ "timesteps_write_key": "timesteps",
261
+ "sigmas_write_key": "sigmas",
262
+ "ln": false,
263
+ "stratisfied": false,
264
+ "mode_scale": 0.25
265
+ },
266
+ "vae": {
267
+ "_target_": "flextok.vae_wrapper.StableDiffusionVAE",
268
+ "images_read_key": "rgb",
269
+ "vae_latents_read_key": "vae_latents_reconst",
270
+ "vae_latents_write_key": "vae_latents",
271
+ "images_reconst_write_key": "rgb_reconst",
272
+ "vae_kl_loss_write_key": "kl_loss",
273
+ "dtype_override": null,
274
+ "sample_posterior": true,
275
+ "compile_encode_fn": false,
276
+ "force_vae_encode": true,
277
+ "latent_channels": 16,
278
+ "scaling_factor": 0.88
279
+ },
280
+ "_target_": "flextok.flextok_wrapper.FlexTok",
281
+ "regularizer": {
282
+ "_target_": "flextok.regularizers.quantize_fsq.FSQ",
283
+ "latents_read_key": "enc_registers",
284
+ "quants_write_key": "enc_registers_quant",
285
+ "tokens_write_key": "tokens",
286
+ "levels": [
287
+ 8,
288
+ 8,
289
+ 8,
290
+ 5,
291
+ 5,
292
+ 5
293
+ ],
294
+ "drop_quant_p": 0.0,
295
+ "packed_call": false
296
+ }
297
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbc24fb097acee43eaeb3ee0ff4e3466875ae908fb0ac79ce698f32b7c0ad852
3
+ size 10163625244