{
  "architectures": [
    "APMAE"
  ],
  "attention_scaler": "log_normalize",
  "base_learning_rate": 0.00015,
  "batch_size": 60,
  "correct_only": true,
  "dataset_location": "Razvan27/network_paper",
  "dataset_name": "JavaNearDedupFull",
  "dataset_split": "train",
  "dataset_split_seed": 42,
  "decoder_device": "cuda:0",
  "decoder_dim": 512,
  "decoder_dim_head": 64,
  "decoder_heads": 8,
  "decoder_layers": 8,
  "decoder_mlp_dim": 2048,
  "encoder_device": "cuda:0",
  "encoder_dim": 512,
  "encoder_dim_head": 64,
  "encoder_dropout": 0.0,
  "encoder_emb_dropout": 0.0,
  "encoder_heads": 16,
  "encoder_layers": 24,
  "encoder_mlp_dim": 2048,
  "encoder_pool": "cls",
  "head_selection_strategy": [
    "layerwise",
    0.25
  ],
  "hf_datasets_cache": "./huggingface/datasets",
  "hf_home": "./huggingface",
  "hf_transformers_cache": "./huggingface/models",
  "hidden_act": "gelu",
  "initial_seed": 45,
  "iter_loader_workers": 8,
  "lang": "java",
  "layer_norm_eps": 1e-12,
  "mask_ratio": 0.5,
  "max_epochs": 1,
  "max_length": 256,
  "min_length": 256,
  "model_type": "ap_mae",
  "num_channels": 1,
  "num_classes": 2,
  "patch_size": 32,
  "qkv_bias": false,
  "queries": [
    "random"
  ],
  "save_model_frequency": 15000,
  "target_model_device": "cuda:0",
  "target_model_name": "bigcode/starcoder2-3b",
  "torch_dtype": "float32",
  "train_batches": 150000,
  "transformers_version": "4.55.2",
  "val_batches": 120,
  "visualize_frequency": 2000,
  "vitmae_preload_name": null
}