| { |
| "encoder": { |
| "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper", |
| "module_dict": { |
| "enc_channels_to_last": { |
| "_target_": "flextok.model.utils.dict_ops.PerSampleOp", |
| "read_key": "vae_latents", |
| "write_key": "vae_latents_bhwc", |
| "per_sample_op": { |
| "_target_": "flextok.model.utils.dict_ops.channels_first_to_last", |
| "_partial_": true |
| } |
| }, |
| "enc_patch_emb": { |
| "_target_": "flextok.model.preprocessors.patching.PatchEmbedder", |
| "input_tensor_list_read_key": "vae_latents_bhwc", |
| "patches_list_write_key": "enc_vae_latents_patched", |
| "n_patches_write_key": "enc_n_patches", |
| "channels_in": 16, |
| "dim": 1152, |
| "patch_sizes": [ |
| 2, |
| 2 |
| ], |
| "flatten_patches": false |
| }, |
| "enc_posemb_module": { |
| "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder", |
| "read_key": "enc_vae_latents_patched", |
| "write_key": "enc_vae_latents_patched", |
| "dim": 1152, |
| "max_sizes": [ |
| 16, |
| 16 |
| ], |
| "posemb_type": "sincos", |
| "posemb_scaling": "absolute" |
| }, |
| "enc_register_module": { |
| "_target_": "flextok.model.preprocessors.registers.Registers1D", |
| "input_tensor_list_read_key": "enc_vae_latents_patched", |
| "register_sizes_read_write_key": "register_sizes", |
| "registers_write_key": "enc_registers", |
| "dim": 1152, |
| "n_min": 256, |
| "n_max": 256, |
| "size_sampling_mode": "uniform", |
| "ordering_mode": "nested" |
| }, |
| "enc_seq_packer": { |
| "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker", |
| "input_list_read_keys": [ |
| "enc_vae_latents_patched", |
| "enc_registers" |
| ], |
| "packed_seq_write_key": "enc_packed_seq", |
| "block_mask_write_key": "enc_block_mask", |
| "inner_packed_shapes_write_key": "enc_ps_inner", |
| "outer_packed_shapes_write_key": "enc_ps_outer", |
| "mask_mode": "causal_last", |
| "pad_to_multiple": 128 |
| }, |
| "enc_transformer": { |
| "_target_": "flextok.model.trunks.transformers.FlexTransformer", |
| "input_seq_read_key": "enc_packed_seq", |
| "output_seq_write_key": "enc_packed_seq", |
| "dim": 1152, |
| "depth": 18, |
| "block_mask_read_key": "enc_block_mask", |
| "use_act_checkpoint": true |
| }, |
| "enc_unpacker": { |
| "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker", |
| "packed_seq_read_key": "enc_packed_seq", |
| "inner_seq_write_keys": [ |
| "enc_vae_latents_patched", |
| "enc_registers" |
| ], |
| "inner_packed_shapes_read_key": "enc_ps_inner", |
| "outer_packed_shapes_read_key": "enc_ps_outer" |
| }, |
| "enc_to_latents": { |
| "_target_": "flextok.model.postprocessors.heads.LinearHead", |
| "read_key": "enc_registers", |
| "write_key": "enc_registers", |
| "dim": 1152, |
| "dim_out": 6, |
| "use_mup_readout": false, |
| "weight_init_style": "zero", |
| "dtype_override": null |
| } |
| } |
| }, |
| "decoder": { |
| "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper", |
| "module_dict": { |
| "dec_from_latents": { |
| "_target_": "flextok.model.preprocessors.linear.LinearLayer", |
| "read_key": "enc_registers_quant", |
| "write_key": "dec_registers_proj", |
| "dim_in": 6, |
| "dim": 1792 |
| }, |
| "dec_registers_posemb_module": { |
| "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder", |
| "read_key": "dec_registers_proj", |
| "write_key": "dec_registers_proj", |
| "dim": 1792, |
| "max_sizes": [ |
| 256 |
| ], |
| "posemb_type": "learnable_sum", |
| "posemb_scaling": "absolute" |
| }, |
| "dec_nested_dropout": { |
| "_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout", |
| "read_write_key": "dec_registers_proj", |
| "dim": 1792, |
| "size_sampling_mode": "pow2" |
| }, |
| "dec_latent_dropout": { |
| "_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond", |
| "read_write_key": "dec_registers_proj", |
| "dim": 1792, |
| "dropout_prob": 0.2 |
| }, |
| "dec_noise_channels_to_last": { |
| "_target_": "flextok.model.utils.dict_ops.PerSampleOp", |
| "read_key": "vae_latents_noised", |
| "write_key": "vae_latents_noised_bhwc", |
| "per_sample_op": { |
| "_target_": "flextok.model.utils.dict_ops.channels_first_to_last", |
| "_partial_": true |
| } |
| }, |
| "dec_noise_patch_emb": { |
| "_target_": "flextok.model.preprocessors.patching.PatchEmbedder", |
| "input_tensor_list_read_key": "vae_latents_noised_bhwc", |
| "patches_list_write_key": "vae_latents_noised_patched", |
| "n_patches_write_key": "dec_n_patches", |
| "channels_in": 16, |
| "dim": 1792, |
| "patch_sizes": [ |
| 2, |
| 2 |
| ], |
| "flatten_patches": false |
| }, |
| "dec_patches_posemb_module": { |
| "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder", |
| "read_key": "vae_latents_noised_patched", |
| "write_key": "dec_patches", |
| "dim": 1792, |
| "max_sizes": [ |
| 16, |
| 16 |
| ], |
| "posemb_type": "sincos", |
| "posemb_scaling": "absolute" |
| }, |
| "dec_seq_packer": { |
| "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker", |
| "input_list_read_keys": [ |
| "dec_patches", |
| "dec_registers_proj" |
| ], |
| "packed_seq_write_key": "dec_packed_seq", |
| "block_mask_write_key": "dec_block_mask", |
| "inner_packed_shapes_write_key": "dec_ps_inner", |
| "outer_packed_shapes_write_key": "dec_ps_outer", |
| "emb_packing_fn_write_key": "emb_packing_fn", |
| "mask_mode": "full", |
| "pad_to_multiple": 128, |
| "per_subseq_embs": true |
| }, |
| "dec_time_embedder": { |
| "_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder", |
| "timesteps_read_key": "timesteps", |
| "time_embedding_write_key": "dec_temb", |
| "dim": 1792, |
| "frequency_embedding_size": 256, |
| "max_timestep": 1000.0 |
| }, |
| "dec_transformer": { |
| "_target_": "flextok.model.trunks.transformers.FlexTransformer", |
| "input_seq_read_key": "dec_packed_seq", |
| "output_seq_write_key": "dec_packed_seq", |
| "dim": 1792, |
| "depth": 28, |
| "block_mask_read_key": "dec_block_mask", |
| "adaLN_emb_read_key": "dec_temb", |
| "adaLN_packing_fn_read_key": "emb_packing_fn", |
| "adaLN_expansion": 2, |
| "intermediate_layer_write_key": "dec_packed_seq_repa_layer", |
| "intermediate_layers": [ |
| 1 |
| ], |
| "use_act_checkpoint": true |
| }, |
| "dec_unpacker": { |
| "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker", |
| "packed_seq_read_key": "dec_packed_seq", |
| "inner_seq_write_keys": [ |
| "dec_patches", |
| "dec_registers_proj" |
| ], |
| "inner_packed_shapes_read_key": "dec_ps_inner", |
| "outer_packed_shapes_read_key": "dec_ps_outer" |
| }, |
| "dec_repa_unpacker": { |
| "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker", |
| "packed_seq_read_key": "dec_packed_seq_repa_layer", |
| "inner_seq_write_keys": [ |
| "dec_patches_repa_layer", |
| "dec_registers_repa_layer" |
| ], |
| "inner_packed_shapes_read_key": "dec_ps_inner", |
| "outer_packed_shapes_read_key": "dec_ps_outer" |
| }, |
| "dec_to_patches": { |
| "_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead", |
| "read_key": "dec_patches", |
| "write_key": "dec_patches", |
| "dim": 1792, |
| "channels_out": 16, |
| "patch_sizes": [ |
| 2, |
| 2 |
| ], |
| "use_mup_readout": false, |
| "weight_init_style": "zero", |
| "adaLN_emb_read_key": "dec_temb" |
| }, |
| "dec_channels_to_first": { |
| "_target_": "flextok.model.utils.dict_ops.PerSampleOp", |
| "read_key": "dec_patches", |
| "write_key": "vae_latents_reconst", |
| "per_sample_op": { |
| "_target_": "flextok.model.utils.dict_ops.channels_last_to_first", |
| "_partial_": true |
| } |
| } |
| } |
| }, |
| "pipeline": { |
| "_target_": "flextok.flow_matching.pipelines.MinRFPipeline", |
| "_partial_": true, |
| "target_sizes_read_key": null, |
| "latents_read_key": "enc_registers_quant", |
| "timesteps_read_key": "timesteps", |
| "noised_images_read_key": "vae_latents_noised", |
| "reconst_write_key": "vae_latents_reconst", |
| "out_channels": 16 |
| }, |
| "flow_matching_noise_module": { |
| "_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule", |
| "clean_images_read_key": "vae_latents", |
| "noised_images_write_key": "vae_latents_noised", |
| "noise_write_key": "flow_noise", |
| "timesteps_write_key": "timesteps", |
| "sigmas_write_key": "sigmas", |
| "ln": false, |
| "stratisfied": false, |
| "mode_scale": 0.25 |
| }, |
| "vae": { |
| "_target_": "flextok.vae_wrapper.StableDiffusionVAE", |
| "images_read_key": "rgb", |
| "vae_latents_read_key": "vae_latents_reconst", |
| "vae_latents_write_key": "vae_latents", |
| "images_reconst_write_key": "rgb_reconst", |
| "vae_kl_loss_write_key": "kl_loss", |
| "dtype_override": null, |
| "sample_posterior": true, |
| "compile_encode_fn": false, |
| "force_vae_encode": true, |
| "latent_channels": 16, |
| "scaling_factor": 0.88 |
| }, |
| "_target_": "flextok.flextok_wrapper.FlexTok", |
| "regularizer": { |
| "_target_": "flextok.regularizers.quantize_fsq.FSQ", |
| "latents_read_key": "enc_registers", |
| "quants_write_key": "enc_registers_quant", |
| "tokens_write_key": "tokens", |
| "levels": [ |
| 8, |
| 8, |
| 8, |
| 5, |
| 5, |
| 5 |
| ], |
| "drop_quant_p": 0.0, |
| "packed_call": false |
| } |
| } |