model:
  modality: image

  remove_image_tokenizer: true

  meta_model:
    _target_: l3m.model.meta_models.InSeriesMetaModels
    models:
      image_tokenizer:
        _target_: flextok_ar.model.integration.ImageResamplerTokenizer
        model_id: ZhitongGao/GridAR_256
        force_vae_encode: true
        sample_posterior: true
        image_dims: [256, 256]
        read_key: image
        write_key: image_token_ids
        vae_latent_read_key: vae_latents
        del_decoders: false
        token_grid_size: [16, 16]

      text_encoder:
        _target_: flextok_ar.model.text_encoder.T5EmbedderWithMLP
        t5_embedder:
          _target_: flextok_ar.model.text_encoder.T5Embedder
          read_key: text
          text_embeddings_write_key: text_embeddings
          text_embeddings_mask_write_key: cross_attn_mask
          hf_hub_path: google/flan-t5-xl
          encoder_seqlen_max: 128
          decoder_seqlen: 256
          cond_dropout_p: 0.1
        mlp:
          _target_: flextok_ar.model.text_encoder.TextToEmbedMLP
          text_dim: 2048
          embed_dim: 2304
          act_layer:
            _target_: torch.nn.GELU
            _partial_: true
            approximate: tanh
          use_bias: false

      ar_image_model:
        _target_: l3m.model.meta_models.MetaModel
        preprocessor:
          _target_: flextok_ar.model.preprocessors.ARImageEmbedPreprocessor
          read_key: image_token_ids
          write_key:
            - input_embeddings
            - target_token_ids
          inference_read_key: pred_image_token_ids
          token_grid_size: [16, 16]
          codebook_size: 64000
          embed_dim: 2304
          pos_embed_type: absolute
          num_classes: null

        trunk:
          _target_: l3m.model.trunks.transformer_decoder.TransformerDecoder
          read_key: input_embeddings
          write_key: output_embeddings
          encoder_output_key: text_embeddings
          self_attn_mask_read_key: cross_attn_mask
          embed_dim: 2304
          num_blocks: 36
          mlp_ratio: 4
          norm_layer:
            _target_: l3m.model.layers.normalization.LayerNormFP32
            _partial_: true
            eps: 1.0e-05
          ffn_target:
            _target_: l3m.model.layers.ffn.SwiGLUFFN
            _partial_: true
          self_attn_target:
            _target_: l3m.model.layers.attention.EfficientAttention
            _partial_: true
            dim: 2304
            num_heads: 36
            qkv_bias: false
            is_causal: true
            qk_norm:
              _target_: l3m.model.layers.normalization.LayerNormFP32
              _partial_: true
              eps: 1.0e-05
            rope_pos_embed: null
          cross_attn_target:
            _target_: flextok_ar.model.attention.GeneralizedAttentionWithMask
            _partial_: true
            dim: 2304
            encoder_dim: 2304
            num_heads: 36
            qkv_bias: false
            is_causal: false
            qk_norm:
              _target_: l3m.model.layers.normalization.LayerNormFP32
              _partial_: true
              eps: 1.0e-05
          weight_init_style: jax
          post_trunk_norm: true
          use_bias: false

        postprocessor:
          _target_: torch.nn.Identity

        head:
          _target_: l3m.model.heads.classifier.LinearClassifier
          read_key: output_embeddings
          write_key: token_preds
          in_features: 2304
          out_features: 64000

generation:
  model_type: ar_text_to_image_model
  sample: true
  temperature: 1.0
  top_k: 0
  top_p: 0.0
  cfg_factor: 3.0
  num_keep_tokens: 256
  num_samples: 1
  timesteps: 25
  tokenizer_cfg_factor: 5.0
  tokenizer_perform_norm_guidance: true

decode:
  vae_image_sizes: 32

image:
  mean: [0.5, 0.5, 0.5]
  std: [0.5, 0.5, 0.5]
  size: 256

device: cuda
seed: 42