{ "architectures": [ "APMAE" ], "attention_scaler": "log_normalize", "base_learning_rate": 0.00015, "batch_size": 60, "correct_only": true, "dataset_location": "Razvan27/network_paper", "dataset_name": "JavaNearDedupFull", "dataset_split": "train", "dataset_split_seed": 42, "decoder_device": "cuda:0", "decoder_dim": 512, "decoder_dim_head": 64, "decoder_heads": 8, "decoder_layers": 8, "decoder_mlp_dim": 2048, "encoder_device": "cuda:0", "encoder_dim": 512, "encoder_dim_head": 64, "encoder_dropout": 0.0, "encoder_emb_dropout": 0.0, "encoder_heads": 16, "encoder_layers": 24, "encoder_mlp_dim": 2048, "encoder_pool": "cls", "head_selection_strategy": [ "layerwise", 0.25 ], "hf_datasets_cache": "./huggingface/datasets", "hf_home": "./huggingface", "hf_transformers_cache": "./huggingface/models", "hidden_act": "gelu", "initial_seed": 45, "iter_loader_workers": 8, "lang": "java", "layer_norm_eps": 1e-12, "mask_ratio": 0.5, "max_epochs": 1, "max_length": 256, "min_length": 256, "model_type": "ap_mae", "num_channels": 1, "num_classes": 2, "patch_size": 32, "qkv_bias": false, "queries": [ "random" ], "save_model_frequency": 15000, "target_model_device": "cuda:0", "target_model_name": "bigcode/starcoder2-7b", "torch_dtype": "float32", "train_batches": 150000, "transformers_version": "4.55.2", "val_batches": 120, "visualize_frequency": 2000, "vitmae_preload_name": null }