{ "vlm_cfg": { "vit_hidden_dim": 768, "vit_inter_dim": 3072, "vit_patch_size": 16, "vit_img_size": 512, "vit_n_heads": 12, "vit_dropout": 0.0, "vit_n_blocks": 12, "vit_ln_eps": 1e-06, "vit_cls_flag": false, "vit_model_type": "google/siglip2-base-patch16-512", "lm_hidden_dim": 576, "lm_inter_dim": 1536, "lm_rms_eps": 1e-05, "lm_re_base": 100000, "lm_max_position_embeddings": 8192, "lm_base_vocab_size": 49152, "extra_token_amount": 66, "lm_vocab_size": 49218, "lm_n_heads": 9, "lm_n_kv_heads": 3, "lm_dropout": 0.0, "lm_n_blocks": 30, "lm_attn_scaling": 1.0, "lm_pad_aware_rope": false, "lm_max_length": 2048, "lm_use_tokens": false, "lm_tie_weights": true, "lm_model_type": "HuggingFaceTB/SmolLM2-135M-Instruct", "lm_tokenizer": "HuggingFaceTB/SmolLM2-135M-Instruct", "lm_chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}", "mp_pixel_shuffle_factor": 4, "mp_image_token_length": 64, "max_img_size": 2048, "resize_to_max_side_len": false, "vlm_extra_tokens": { "image_token": "<|image|>", "global_image_token": "<|global_image|>", "r1c1": "<row_1_col_1>", "r1c2": "<row_1_col_2>", "r1c3": "<row_1_col_3>", "r1c4": "<row_1_col_4>", "r1c5": "<row_1_col_5>", "r1c6": "<row_1_col_6>", "r1c7": "<row_1_col_7>", "r1c8": "<row_1_col_8>", "r2c1": "<row_2_col_1>", "r2c2": "<row_2_col_2>", "r2c3": "<row_2_col_3>", "r2c4": "<row_2_col_4>", "r2c5": "<row_2_col_5>", "r2c6": "<row_2_col_6>", "r2c7": "<row_2_col_7>", "r2c8": "<row_2_col_8>", "r3c1": "<row_3_col_1>", "r3c2": "<row_3_col_2>", "r3c3": "<row_3_col_3>", "r3c4": "<row_3_col_4>", "r3c5": "<row_3_col_5>", "r3c6": "<row_3_col_6>", "r3c7": "<row_3_col_7>", "r3c8": "<row_3_col_8>", "r4c1": "<row_4_col_1>", "r4c2": "<row_4_col_2>", "r4c3": "<row_4_col_3>", "r4c4": "<row_4_col_4>", "r4c5": "<row_4_col_5>", "r4c6": "<row_4_col_6>", "r4c7": "<row_4_col_7>", "r4c8": "<row_4_col_8>", "r5c1": "<row_5_col_1>", "r5c2": "<row_5_col_2>", "r5c3": "<row_5_col_3>", "r5c4": "<row_5_col_4>", "r5c5": "<row_5_col_5>", "r5c6": "<row_5_col_6>", "r5c7": "<row_5_col_7>", "r5c8": "<row_5_col_8>", "r6c1": "<row_6_col_1>", "r6c2": "<row_6_col_2>", "r6c3": "<row_6_col_3>", "r6c4": "<row_6_col_4>", "r6c5": "<row_6_col_5>", "r6c6": "<row_6_col_6>", "r6c7": "<row_6_col_7>", "r6c8": "<row_6_col_8>", "r7c1": "<row_7_col_1>", "r7c2": "<row_7_col_2>", "r7c3": "<row_7_col_3>", "r7c4": "<row_7_col_4>", "r7c5": "<row_7_col_5>", "r7c6": "<row_7_col_6>", "r7c7": "<row_7_col_7>", "r7c8": "<row_7_col_8>", "r8c1": "<row_8_col_1>", "r8c2": "<row_8_col_2>", "r8c3": "<row_8_col_3>", "r8c4": "<row_8_col_4>", "r8c5": "<row_8_col_5>", "r8c6": "<row_8_col_6>", "r8c7": "<row_8_col_7>", "r8c8": "<row_8_col_8>" }, 
"vlm_load_backbone_weights": true, "use_kv_bridge": true, "kv_bridge_mode": "identity", "kv_bridge_affine_stack_depth": 1, "kv_bridge_adapter_expansion_factor": 1.0, "kv_bridge_use_gate": true }, "train_cfg": { "lr_mp": 5e-05, "lr_vision_backbone": 0.0, "lr_language_backbone": 1e-05, "lr_right_tower": 0.0, "lr_kv_bridge": 0.0001, "batch_size": 8, "gradient_accumulation_steps": 16, "max_grad_norm": 1.0, "max_training_steps": 2000, "warmup_ratio": 0.01, "stats_log_interval": 100, "compile": false, "eval_in_epochs": false, "eval_interval": 250, "use_packing": false, "max_images_per_example": 1, "max_images_per_knapsack": 4, "max_sample_length": 2048, "train_dataset_path": "patrickamadeus/the_cauldron", "train_dataset_name": [ "all" ], "train_split": "train", "val_split": "validation", "stream_dataset": false, "enable_source_filter": true, "allowed_dataset_sources": [ "ocrvqa" ], "relevance_min_rating": 1, "image_correspondence_min_rating": 1, "visual_dependency_min_rating": 1, "formatting_min_rating": 1, "wandb_entity": "HuggingFace", "log_wandb": true, "push_checkpoints_to_hub": true, "save_training_state_to_hub": true, "checkpoint_repo_pattern": "patrickamadeus/dt-ocrvqa-{i}", "hf_private": false, "push_final_model_to_hub": false } }