{
"add_timestep_token": false,
"add_vision_branch": true,
"add_vision_branch_reuse_layernorm": false,
"add_vision_gen_mask_token": false,
"add_vision_soi_eoi_tokens": false,
"add_vision_soi_token": false,
"architectures": [
"MultiModalityCausalLM"
],
"batch_size_t2i": 16,
"ctrlnet_training": false,
"diffusion_decoder_config": {
"cls": "black-forest-labs/FLUX.1-dev",
"model_type": "diffusion_decoder",
"params": {
"controlnet_model_name_or_path": null,
"diffusion_decoder_text_dropout_prob": 0.0,
"num_double_layers": 1,
"num_single_layers": 4,
"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
"revision": null,
"variant": null
}
},
"e2e_training": false,
"fully_trainable": false,
"inner_dim": 3584,
"lambda_clip": 1.0,
"lambda_gpu": true,
"mar_style_vision_branch": false,
"max_seq_length": 300,
"model_type": "multi_modality",
"num_visual_gen_tokens": 256,
"precise_prompt_mask": 3,
"proportion_empty_prompts": 0.0,
"remove_vae": false,
"skip_text_part2": true,
"t2i_resolution": 448,
"timestep_sampling_strategy": "uniform",
"torch_dtype": "bfloat16",
"transformers_version": "4.49.0",
"use_2d_query_tokens": false,
"use_cache": false,
"use_clip_visual_encoder": true,
"use_discrete_visual_tokenizer": false,
"vae_scale_by_4": false,
"vae_w_ctrlnet_training": false,
"vae_wo_ctrlnet_training": false,
"vision_denoising_type": "mar",
"vision_gen_aligner_config": {
"cls": "MlpProjector",
"model_type": "vision_gen_aligner",
"params": {
"depth": 2,
"input_dim": 8,
"n_embed": 3584,
"projector_type": "mlp_gelu"
}
},
"vision_gen_dec_config": {
"cls": "ShallowUViTDecoder",
"model_type": "vision_gen_dec",
"params": {
"block_out_channels": [
768
],
"elementwise_affine": true,
"frozen_vision_gen_encdec": false,
"hidden_size": 2048,
"in_channels": 768,
"layers_in_middle": 2,
"norm_eps": 1e-06,
"out_channels": null,
"upsamples": 1,
"use_bias": true,
"use_mid_block": true
}
},
"vision_gen_enc_config": {
"cls": "ShallowUViTEncoder",
"model_type": "vision_gen_enc",
"params": {
"block_out_channels": [
768
],
"elementwise_affine": true,
"frozen_vision_gen_encdec": false,
"hidden_size": 2048,
"input_channels": null,
"kernel_size": 2,
"layers_in_middle": 2,
"norm_eps": 1e-06,
"num_extra_tensors": 5,
"padding": 0,
"stride": 2,
"use_bias": true,
"use_mid_block": true
}
},
"vision_gen_head_config": {
"cls": "VisionHead",
"model_type": "vision_gen_head",
"params": {
"image_token_embed": 3584,
"image_token_size": 8192,
"n_embed": 3584
}
},
"vision_gen_tokenizer_config": {
"cls": "magvitv2",
"model_type": "vision_gen_tokenizer",
"params": {
"image_token_size": 8192,
"model_name_or_path": "showlab/magvitv2",
"n_embed": 8
}
},
"vision_gen_vae_config": {
"cls": "AutoencoderKL",
"model_type": "vision_gen_vae",
"params": {
"frozen_vision_gen_vae": false,
"huggingface_token": "",
"load_from_pretrained": true,
"model_name_or_path": null
}
},
"vision_head_type": "linear",
"vision_language_model_config": {
"cls": "Qwen2_5_VLForConditionalGeneration",
"model_type": "vision_language_model",
"params": {
"frozen_modules_in_vlm": [
"vision_language_model.visual",
"vision_language_model.lm_head",
"vision_language_model.model.embed_tokens",
"vision_language_model.model.layers",
"vision_language_model.model.norm"
],
"huggingface_token": "",
"load_from_pretrained": true,
"model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
"remove_vision_und_encoder": false
}
},
"vision_loss_type": "mse",
"vision_pos_emb_type": "learnable_pos_emb"
}