| { |
| "add_timestep_token": false, |
| "add_vision_branch": true, |
| "add_vision_branch_reuse_layernorm": false, |
| "add_vision_gen_mask_token": false, |
| "add_vision_soi_eoi_tokens": false, |
| "add_vision_soi_token": false, |
| "architectures": [ |
| "MultiModalityCausalLM" |
| ], |
| "batch_size_t2i": 16, |
| "ctrlnet_training": false, |
| "diffusion_decoder_config": { |
| "cls": "black-forest-labs/FLUX.1-dev", |
| "model_type": "diffusion_decoder", |
| "params": { |
| "controlnet_model_name_or_path": null, |
| "diffusion_decoder_text_dropout_prob": 0.0, |
| "num_double_layers": 1, |
| "num_single_layers": 4, |
| "pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev", |
| "revision": null, |
| "variant": null |
| } |
| }, |
| "e2e_training": false, |
| "fully_trainable": false, |
| "inner_dim": 3584, |
| "lambda_clip": 1.0, |
| "lambda_gpu": true, |
| "mar_style_vision_branch": false, |
| "max_seq_length": 300, |
| "model_type": "multi_modality", |
| "num_visual_gen_tokens": 256, |
| "precise_prompt_mask": 3, |
| "proportion_empty_prompts": 0.0, |
| "remove_vae": false, |
| "skip_text_part2": true, |
| "t2i_resolution": 448, |
| "timestep_sampling_strategy": "uniform", |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.49.0", |
| "use_2d_query_tokens": false, |
| "use_cache": false, |
| "use_clip_visual_encoder": true, |
| "use_discrete_visual_tokenizer": false, |
| "vae_scale_by_4": false, |
| "vae_w_ctrlnet_training": false, |
| "vae_wo_ctrlnet_training": false, |
| "vision_denoising_type": "mar", |
| "vision_gen_aligner_config": { |
| "cls": "MlpProjector", |
| "model_type": "vision_gen_aligner", |
| "params": { |
| "depth": 2, |
| "input_dim": 8, |
| "n_embed": 3584, |
| "projector_type": "mlp_gelu" |
| } |
| }, |
| "vision_gen_dec_config": { |
| "cls": "ShallowUViTDecoder", |
| "model_type": "vision_gen_dec", |
| "params": { |
| "block_out_channels": [ |
| 768 |
| ], |
| "elementwise_affine": true, |
| "frozen_vision_gen_encdec": false, |
| "hidden_size": 2048, |
| "in_channels": 768, |
| "layers_in_middle": 2, |
| "norm_eps": 1e-06, |
| "out_channels": null, |
| "upsamples": 1, |
| "use_bias": true, |
| "use_mid_block": true |
| } |
| }, |
| "vision_gen_enc_config": { |
| "cls": "ShallowUViTEncoder", |
| "model_type": "vision_gen_enc", |
| "params": { |
| "block_out_channels": [ |
| 768 |
| ], |
| "elementwise_affine": true, |
| "frozen_vision_gen_encdec": false, |
| "hidden_size": 2048, |
| "input_channels": null, |
| "kernel_size": 2, |
| "layers_in_middle": 2, |
| "norm_eps": 1e-06, |
| "num_extra_tensors": 5, |
| "padding": 0, |
| "stride": 2, |
| "use_bias": true, |
| "use_mid_block": true |
| } |
| }, |
| "vision_gen_head_config": { |
| "cls": "VisionHead", |
| "model_type": "vision_gen_head", |
| "params": { |
| "image_token_embed": 3584, |
| "image_token_size": 8192, |
| "n_embed": 3584 |
| } |
| }, |
| "vision_gen_tokenizer_config": { |
| "cls": "magvitv2", |
| "model_type": "vision_gen_tokenizer", |
| "params": { |
| "image_token_size": 8192, |
| "model_name_or_path": "showlab/magvitv2", |
| "n_embed": 8 |
| } |
| }, |
| "vision_gen_vae_config": { |
| "cls": "AutoencoderKL", |
| "model_type": "vision_gen_vae", |
| "params": { |
| "frozen_vision_gen_vae": false, |
| "huggingface_token": "", |
| "load_from_pretrained": true, |
| "model_name_or_path": null |
| } |
| }, |
| "vision_head_type": "linear", |
| "vision_language_model_config": { |
| "cls": "Qwen2_5_VLForConditionalGeneration", |
| "model_type": "vision_language_model", |
| "params": { |
| "frozen_modules_in_vlm": [ |
| "vision_language_model.visual", |
| "vision_language_model.lm_head", |
| "vision_language_model.model.embed_tokens", |
| "vision_language_model.model.layers", |
| "vision_language_model.model.norm" |
| ], |
| "huggingface_token": "", |
| "load_from_pretrained": true, |
| "model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct", |
| "remove_vision_und_encoder": false |
| } |
| }, |
| "vision_loss_type": "mse", |
| "vision_pos_emb_type": "learnable_pos_emb" |
| } |
|
|