{
    "src_vocab_size": 250880,
    "report_every": 50,
    "save_data": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384/",
    "skip_empty_level": "silent",
    "decoder_start_token": "<s>",
    "seed": 1234,
    "log_file": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim/48-0-32-4096-16384-with-estim.log",
    "n_sample": 0,
    "tgt_vocab_size": 250880,
    "default_specials": [
        "<unk>",
        "<blank>",
        "<s>",
        "</s>"
    ],
    "model": {
        "rotary_theta": 10000,
        "hidden_size": 4096,
        "transformer_ff": 16384,
        "layers": 48,
        "parallel_residual": false,
        "mlp_activation_fn": "gelu",
        "add_ffnbias": true,
        "add_qkvbias": true,
        "norm_eps": 1e-05,
        "heads": 32,
        "embeddings": {
            "n_positions": 514,
            "word_vec_size": 4096,
            "src_word_vec_size": 4096,
            "position_shift": 2,
            "freeze_word_vecs_enc": true,
            "position_encoding_type": "Learned",
            "tgt_word_vec_size": 4096
        },
        "shared_layer_norm": false,
        "num_experts_per_tok": 0,
        "max_relative_positions": 0,
        "heads_kv": 32,
        "num_experts": 0,
        "architecture": "transformer_encoder",
        "sliding_window": 0,
        "share_decoder_embeddings": true,
        "left_pad": false,
        "add_estimator": true,
        "encoder": {
            "encoder_type": "transformer",
            "src_word_vec_size": 4096
        },
        "layer_norm": "standard",
        "rotary_interleave": false,
        "rotary_dim": 0
    },
    "src_vocab": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xl-eole/dict2.txt",
    "vocab_size_multiple": 1,
    "share_vocab": true,
    "tgt_vocab": null,
    "transforms": [
        "sentencepiece"
    ],
    "transforms_configs": {
        "onmt_tokenize": {},
        "tokendrop": {},
        "bpe": {},
        "filtertoolong": {
            "src_seq_length": 94,
            "tgt_seq_length": 94
        },
        "inlinetags": {},
        "clean": {},
        "suffix": {},
        "docify": {},
        "switchout": {},
        "uppercase": {},
        "terminology": {},
        "sentencepiece": {
            "tgt_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model",
            "src_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model"
        },
        "normalize": {},
        "bart": {},
        "insert_mask_before_placeholder": {},
        "prefix": {},
        "tokenmask": {}
    },
    "training": {
        "world_size": 1,
        "w_bit": 0,
        "group_size": 0,
        "batch_type": "sents",
        "param_init_glorot": true,
        "prefetch_factor": 400,
        "learning_rate_decay": 1.0,
        "decay_steps": 100000,
        "param_init": 0.0,
        "save_checkpoint_steps": 4000,
        "accum_count": [
            8
        ],
        "num_workers": 2,
        "model_dtype": "fp16",
        "start_decay_steps": 1000000,
        "label_smoothing": 0.1,
        "keep_checkpoint": 50,
        "train_from": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/",
        "valid_batch_size": 1,
        "estim_loss_lambda_steps": [
            0
        ],
        "quant_type": "bnb_NF4",
        "batch_size_multiple": 1,
        "attention_dropout": [
            0.0
        ],
        "learning_rate": 1.5e-05,
        "model_path": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim",
        "batch_size": 8,
        "dropout_steps": [
            0
        ],
        "dropout": [
            0.1
        ],
        "score_threshold": 0.0,
        "gpu_ranks": [
            0
        ],
        "optim": "fusedadam",
        "normalization": "tokens",
        "valid_steps": 1000,
        "train_steps": 4000,
        "adam_beta2": 0.998,
        "decay_method": "none",
        "estim_loss_lambda": [
            1.0
        ],
        "average_decay": 0.0,
        "accum_steps": [
            0
        ],
        "quant_layers": [
            "linear_values",
            "linear_query",
            "linear_keys",
            "final_linear",
            "gate_up_proj",
            "down_proj"
        ],
        "max_grad_norm": 1.0,
        "self_attn_backend": "pytorch",
        "freeze_encoder": true,
        "bucket_size": 262144
    },
    "data": {}
}