{ "_name": null, "architectures": [ "RobertaModel" ], "attention_probs_dropout_prob": 0.1, "bmuf": { "_name": null, "average_sync": false, "block_lr": 1.0, "block_momentum": 0.875, "distributed_world_size": 2, "global_sync_iter": 50, "use_nbm": false, "warmup_iterations": 500 }, "bos_token_id": 0, "bpe": null, "checkpoint": { "_name": null, "best_checkpoint_metric": "loss", "checkpoint_shard_count": 1, "checkpoint_suffix": "", "continue_once": null, "finetune_from_model": null, "keep_best_checkpoints": -1, "keep_interval_updates": -1, "keep_interval_updates_pattern": -1, "keep_last_epochs": -1, "load_checkpoint_on_all_dp_ranks": false, "maximize_best_checkpoint_metric": false, "model_parallel_size": 1, "no_epoch_checkpoints": true, "no_last_checkpoints": false, "no_save": false, "no_save_optimizer_state": false, "optimizer_overrides": "{}", "patience": -1, "reset_dataloader": false, "reset_lr_scheduler": false, "reset_meters": false, "reset_optimizer": false, "restore_file": "checkpoint_last.pt", "save_dir": "roberturk_checkpoints", "save_interval": 1, "save_interval_updates": 50000, "write_checkpoints_asynchronously": false }, "classifier_dropout": null, "common": { "_name": null, "aim_repo": null, "aim_run_hash": null, "all_gather_list_size": 16384, "amp": false, "amp_batch_retries": 2, "amp_init_scale": 128, "amp_scale_window": null, "azureml_logging": false, "bf16": false, "cpu": false, "empty_cache_freq": 0, "fp16": true, "fp16_init_scale": 128, "fp16_no_flatten_grads": false, "fp16_scale_tolerance": 0.0, "fp16_scale_window": null, "log_file": null, "log_format": "json", "log_interval": 200, "memory_efficient_bf16": false, "memory_efficient_fp16": false, "min_loss_scale": 0.0001, "model_parallel_size": 1, "no_progress_bar": false, "on_cpu_convert_precision": false, "plasma_path": "/tmp/plasma", "profile": false, "quantization_config_path": null, "reset_logging": false, "seed": 1, "suppress_crashes": false, "tensorboard_logdir": null, "threshold_loss_scale": null, "tpu": false, "use_plasma_view": false, "user_dir": null, "wandb_project": null }, "common_eval": { "_name": null, "model_overrides": "{}", "path": null, "post_process": null, "quiet": false, "results_path": null }, "criterion": { "_name": "masked_lm", "tpu": false }, "dataset": { "_name": null, "batch_size": 16, "batch_size_valid": 16, "combine_valid_subsets": null, "curriculum": 0, "data_buffer_size": 10, "dataset_impl": null, "disable_validation": false, "fixed_validation_seed": null, "gen_subset": "test", "grouped_shuffling": false, "ignore_unused_valid_subsets": true, "max_tokens": null, "max_tokens_valid": null, "max_valid_steps": null, "num_shards": 1, "num_workers": 1, "required_batch_size_multiple": 8, "required_seq_len_multiple": 1, "shard_id": 0, "skip_invalid_size_inputs_valid_test": true, "train_subset": "train", "update_epoch_batch_itr": false, "update_ordered_indices_seed": false, "valid_subset": "valid", "validate_after_updates": 0, "validate_interval": 1, "validate_interval_updates": 0 }, "distributed_training": { "_name": null, "broadcast_buffers": false, "bucket_cap_mb": 25, "cpu_offload": false, "ddp_backend": "pytorch_ddp", "ddp_comm_hook": "none", "device_id": 0, "distributed_backend": "nccl", "distributed_init_method": "tcp://localhost:51855", "distributed_no_spawn": false, "distributed_num_procs": 2, "distributed_port": 51855, "distributed_rank": 0, "distributed_world_size": 2, "fast_stat_sync": false, "find_unused_parameters": false, "fix_batches_to_gpus": false, "fp16": true, "fp32_reduce_scatter": false, "gradient_as_bucket_view": false, "heartbeat_timeout": -1, "localsgd_frequency": 3, "memory_efficient_fp16": false, "no_reshard_after_forward": false, "not_fsdp_flatten_parameters": false, "nprocs_per_node": 2, "pipeline_balance": null, "pipeline_checkpoint": "never", "pipeline_chunks": 0, "pipeline_decoder_balance": null, "pipeline_decoder_devices": null, "pipeline_devices": null, "pipeline_encoder_balance": null, "pipeline_encoder_devices": null, "pipeline_model_parallel": false, "slowmo_base_algorithm": "localsgd", "slowmo_momentum": null, "tpu": false, "use_sharded_state": false, "zero_sharding": "none" }, "ema": { "_name": null, "ema_decay": 0.9999, "ema_fp32": false, "ema_seed_model": null, "ema_start_update": 0, "ema_update_freq": 1, "store_ema": false }, "eos_token_id": 2, "eval_lm": { "_name": null, "context_window": 0, "output_word_probs": false, "output_word_stats": false, "softmax_batch": 9223372036854775807 }, "generation": { "_name": null, "beam": 5, "beam_mt": 0, "constraints": null, "decoding_format": null, "diverse_beam_groups": -1, "diverse_beam_strength": 0.5, "diversity_rate": -1.0, "eos_token": null, "iter_decode_eos_penalty": 0.0, "iter_decode_force_max_iter": false, "iter_decode_max_iter": 10, "iter_decode_with_beam": 1, "iter_decode_with_external_reranker": false, "lenpen": 1.0, "lenpen_mt": 1.0, "lm_path": null, "lm_weight": 0.0, "match_source_len": false, "max_len_a": 0.0, "max_len_a_mt": 0.0, "max_len_b": 200, "max_len_b_mt": 200, "min_len": 1, "nbest": 1, "no_beamable_mm": false, "no_early_stop": false, "no_repeat_ngram_size": 0, "no_seed_provided": false, "prefix_size": 0, "print_alignment": null, "print_step": false, "replace_unk": null, "retain_dropout": false, "retain_dropout_modules": null, "retain_iter_history": false, "sacrebleu": false, "sampling": false, "sampling_topk": -1, "sampling_topp": -1.0, "score_reference": false, "temperature": 1.0, "unkpen": 0.0, "unnormalized": false }, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "interactive": { "_name": null, "buffer_size": 0, "input": "-" }, "intermediate_size": 3072, "job_logging_cfg": { "disable_existing_loggers": false, "formatters": { "simple": { "format": "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s" } }, "handlers": { "console": { "class": "logging.StreamHandler", "formatter": "simple", "stream": "ext://sys.stdout" }, "file": { "class": "logging.FileHandler", "filename": "hydra_train.log", "formatter": "simple" } }, "root": { "handlers": [ "console", "file" ], "level": "INFO" }, "version": 1 }, "layer_norm_eps": 1e-12, "lr_scheduler": { "_name": "polynomial_decay", "end_learning_rate": 0.0, "force_anneal": null, "lr": [ 1e-05 ], "power": 1.0, "total_num_update": 10000000.0, "warmup_updates": 10000 }, "max_position_embeddings": 512, "model": { "_name": "roberta", "activation_dropout": 0.0, "activation_fn": "gelu", "adaptive_input": false, "attention_dropout": 0.1, "dropout": 0.1, "encoder_attention_heads": 12, "encoder_embed_dim": 768, "encoder_ffn_embed_dim": 3072, "encoder_layerdrop": 0.0, "encoder_layers": 12, "encoder_layers_to_keep": null, "encoder_learned_pos": true, "encoder_normalize_before": false, "layernorm_embedding": true, "max_positions": 256, "max_source_positions": 256, "no_scale_embedding": true, "no_token_positional_embeddings": false, "pooler_activation_fn": "tanh", "pooler_dropout": 0.0, "quant_noise_pq": 0, "quant_noise_pq_block_size": 8, "quant_noise_scalar": 0, "spectral_norm_classification_head": false, "untie_weights_roberta": false }, "model_type": "roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "optimization": { "_name": null, "clip_norm": 0.0, "debug_param_names": false, "lr": [ 1e-05 ], "max_epoch": 0, "max_update": 10000000, "sentence_avg": false, "skip_remainder_batch": false, "stop_min_lr": -1.0, "stop_time_hours": 0.0, "update_freq": [ 8 ], "use_bmuf": false }, "optimizer": { "_name": "adam", "adam_betas": "(0.9,0.98)", "adam_eps": 1e-06, "fp16_adam_stats": false, "lr": [ 1e-05 ], "tpu": false, "use_old_adam": false, "weight_decay": 0.01 }, "pad_token_id": 1, "position_embedding_type": "absolute", "scoring": null, "task": { "_name": "masked_lm", "d2v2_multi": false, "data": "data-bin/roberturk_bin", "freq_weighted_replacement": false, "include_index": true, "include_target_tokens": false, "leave_unmasked_prob": 0.1, "mask_multiple_length": 1, "mask_prob": 0.15, "mask_stdev": 0.0, "mask_whole_words": false, "random_token_prob": 0.1, "sample_break_mode": "complete", "seed": 1, "shorten_data_split_list": "", "shorten_method": "none", "skip_masking": false, "tokens_per_sample": 256 }, "tokenizer": null, "torch_dtype": "float32", "transformers_version": "4.35.2", "type_vocab_size": 2, "use_cache": true, "vocab_size": 50265 }