{
"architectures": [
"APMAE"
],
"attention_scaler": "log_normalize",
"base_learning_rate": 0.00015,
"batch_size": 60,
"correct_only": true,
"dataset_location": "Razvan27/network_paper",
"dataset_name": "JavaNearDedupFull",
"dataset_split": "train",
"dataset_split_seed": 42,
"decoder_device": "cuda:0",
"decoder_dim": 512,
"decoder_dim_head": 64,
"decoder_heads": 8,
"decoder_layers": 8,
"decoder_mlp_dim": 2048,
"encoder_device": "cuda:0",
"encoder_dim": 512,
"encoder_dim_head": 64,
"encoder_dropout": 0.0,
"encoder_emb_dropout": 0.0,
"encoder_heads": 16,
"encoder_layers": 24,
"encoder_mlp_dim": 2048,
"encoder_pool": "cls",
"head_selection_strategy": [
"layerwise",
0.25
],
"hf_datasets_cache": "./huggingface/datasets",
"hf_home": "./huggingface",
"hf_transformers_cache": "./huggingface/models",
"hidden_act": "gelu",
"initial_seed": 45,
"iter_loader_workers": 8,
"lang": "java",
"layer_norm_eps": 1e-12,
"mask_ratio": 0.5,
"max_epochs": 1,
"max_length": 256,
"min_length": 256,
"model_type": "ap_mae",
"num_channels": 1,
"num_classes": 2,
"patch_size": 32,
"qkv_bias": false,
"queries": [
"random"
],
"save_model_frequency": 15000,
"target_model_device": "cuda:0",
"target_model_name": "bigcode/starcoder2-3b",
"torch_dtype": "float32",
"train_batches": 150000,
"transformers_version": "4.55.2",
"val_batches": 120,
"visualize_frequency": 2000,
"vitmae_preload_name": null
}