| { | |
| "_name": null, | |
| "architectures": [ | |
| "RobertaModel" | |
| ], | |
| "attention_probs_dropout_prob": 0.1, | |
| "bmuf": { | |
| "_name": null, | |
| "average_sync": false, | |
| "block_lr": 1.0, | |
| "block_momentum": 0.875, | |
| "distributed_world_size": 2, | |
| "global_sync_iter": 50, | |
| "use_nbm": false, | |
| "warmup_iterations": 500 | |
| }, | |
| "bos_token_id": 0, | |
| "bpe": null, | |
| "checkpoint": { | |
| "_name": null, | |
| "best_checkpoint_metric": "loss", | |
| "checkpoint_shard_count": 1, | |
| "checkpoint_suffix": "", | |
| "continue_once": null, | |
| "finetune_from_model": null, | |
| "keep_best_checkpoints": -1, | |
| "keep_interval_updates": -1, | |
| "keep_interval_updates_pattern": -1, | |
| "keep_last_epochs": -1, | |
| "load_checkpoint_on_all_dp_ranks": false, | |
| "maximize_best_checkpoint_metric": false, | |
| "model_parallel_size": 1, | |
| "no_epoch_checkpoints": true, | |
| "no_last_checkpoints": false, | |
| "no_save": false, | |
| "no_save_optimizer_state": false, | |
| "optimizer_overrides": "{}", | |
| "patience": -1, | |
| "reset_dataloader": false, | |
| "reset_lr_scheduler": false, | |
| "reset_meters": false, | |
| "reset_optimizer": false, | |
| "restore_file": "checkpoint_last.pt", | |
| "save_dir": "roberturk_checkpoints", | |
| "save_interval": 1, | |
| "save_interval_updates": 50000, | |
| "write_checkpoints_asynchronously": false | |
| }, | |
| "classifier_dropout": null, | |
| "common": { | |
| "_name": null, | |
| "aim_repo": null, | |
| "aim_run_hash": null, | |
| "all_gather_list_size": 16384, | |
| "amp": false, | |
| "amp_batch_retries": 2, | |
| "amp_init_scale": 128, | |
| "amp_scale_window": null, | |
| "azureml_logging": false, | |
| "bf16": false, | |
| "cpu": false, | |
| "empty_cache_freq": 0, | |
| "fp16": true, | |
| "fp16_init_scale": 128, | |
| "fp16_no_flatten_grads": false, | |
| "fp16_scale_tolerance": 0.0, | |
| "fp16_scale_window": null, | |
| "log_file": null, | |
| "log_format": "json", | |
| "log_interval": 200, | |
| "memory_efficient_bf16": false, | |
| "memory_efficient_fp16": false, | |
| "min_loss_scale": 0.0001, | |
| "model_parallel_size": 1, | |
| "no_progress_bar": false, | |
| "on_cpu_convert_precision": false, | |
| "plasma_path": "/tmp/plasma", | |
| "profile": false, | |
| "quantization_config_path": null, | |
| "reset_logging": false, | |
| "seed": 1, | |
| "suppress_crashes": false, | |
| "tensorboard_logdir": null, | |
| "threshold_loss_scale": null, | |
| "tpu": false, | |
| "use_plasma_view": false, | |
| "user_dir": null, | |
| "wandb_project": null | |
| }, | |
| "common_eval": { | |
| "_name": null, | |
| "model_overrides": "{}", | |
| "path": null, | |
| "post_process": null, | |
| "quiet": false, | |
| "results_path": null | |
| }, | |
| "criterion": { | |
| "_name": "masked_lm", | |
| "tpu": false | |
| }, | |
| "dataset": { | |
| "_name": null, | |
| "batch_size": 16, | |
| "batch_size_valid": 16, | |
| "combine_valid_subsets": null, | |
| "curriculum": 0, | |
| "data_buffer_size": 10, | |
| "dataset_impl": null, | |
| "disable_validation": false, | |
| "fixed_validation_seed": null, | |
| "gen_subset": "test", | |
| "grouped_shuffling": false, | |
| "ignore_unused_valid_subsets": true, | |
| "max_tokens": null, | |
| "max_tokens_valid": null, | |
| "max_valid_steps": null, | |
| "num_shards": 1, | |
| "num_workers": 1, | |
| "required_batch_size_multiple": 8, | |
| "required_seq_len_multiple": 1, | |
| "shard_id": 0, | |
| "skip_invalid_size_inputs_valid_test": true, | |
| "train_subset": "train", | |
| "update_epoch_batch_itr": false, | |
| "update_ordered_indices_seed": false, | |
| "valid_subset": "valid", | |
| "validate_after_updates": 0, | |
| "validate_interval": 1, | |
| "validate_interval_updates": 0 | |
| }, | |
| "distributed_training": { | |
| "_name": null, | |
| "broadcast_buffers": false, | |
| "bucket_cap_mb": 25, | |
| "cpu_offload": false, | |
| "ddp_backend": "pytorch_ddp", | |
| "ddp_comm_hook": "none", | |
| "device_id": 0, | |
| "distributed_backend": "nccl", | |
| "distributed_init_method": "tcp://localhost:51855", | |
| "distributed_no_spawn": false, | |
| "distributed_num_procs": 2, | |
| "distributed_port": 51855, | |
| "distributed_rank": 0, | |
| "distributed_world_size": 2, | |
| "fast_stat_sync": false, | |
| "find_unused_parameters": false, | |
| "fix_batches_to_gpus": false, | |
| "fp16": true, | |
| "fp32_reduce_scatter": false, | |
| "gradient_as_bucket_view": false, | |
| "heartbeat_timeout": -1, | |
| "localsgd_frequency": 3, | |
| "memory_efficient_fp16": false, | |
| "no_reshard_after_forward": false, | |
| "not_fsdp_flatten_parameters": false, | |
| "nprocs_per_node": 2, | |
| "pipeline_balance": null, | |
| "pipeline_checkpoint": "never", | |
| "pipeline_chunks": 0, | |
| "pipeline_decoder_balance": null, | |
| "pipeline_decoder_devices": null, | |
| "pipeline_devices": null, | |
| "pipeline_encoder_balance": null, | |
| "pipeline_encoder_devices": null, | |
| "pipeline_model_parallel": false, | |
| "slowmo_base_algorithm": "localsgd", | |
| "slowmo_momentum": null, | |
| "tpu": false, | |
| "use_sharded_state": false, | |
| "zero_sharding": "none" | |
| }, | |
| "ema": { | |
| "_name": null, | |
| "ema_decay": 0.9999, | |
| "ema_fp32": false, | |
| "ema_seed_model": null, | |
| "ema_start_update": 0, | |
| "ema_update_freq": 1, | |
| "store_ema": false | |
| }, | |
| "eos_token_id": 2, | |
| "eval_lm": { | |
| "_name": null, | |
| "context_window": 0, | |
| "output_word_probs": false, | |
| "output_word_stats": false, | |
| "softmax_batch": 9223372036854775807 | |
| }, | |
| "generation": { | |
| "_name": null, | |
| "beam": 5, | |
| "beam_mt": 0, | |
| "constraints": null, | |
| "decoding_format": null, | |
| "diverse_beam_groups": -1, | |
| "diverse_beam_strength": 0.5, | |
| "diversity_rate": -1.0, | |
| "eos_token": null, | |
| "iter_decode_eos_penalty": 0.0, | |
| "iter_decode_force_max_iter": false, | |
| "iter_decode_max_iter": 10, | |
| "iter_decode_with_beam": 1, | |
| "iter_decode_with_external_reranker": false, | |
| "lenpen": 1.0, | |
| "lenpen_mt": 1.0, | |
| "lm_path": null, | |
| "lm_weight": 0.0, | |
| "match_source_len": false, | |
| "max_len_a": 0.0, | |
| "max_len_a_mt": 0.0, | |
| "max_len_b": 200, | |
| "max_len_b_mt": 200, | |
| "min_len": 1, | |
| "nbest": 1, | |
| "no_beamable_mm": false, | |
| "no_early_stop": false, | |
| "no_repeat_ngram_size": 0, | |
| "no_seed_provided": false, | |
| "prefix_size": 0, | |
| "print_alignment": null, | |
| "print_step": false, | |
| "replace_unk": null, | |
| "retain_dropout": false, | |
| "retain_dropout_modules": null, | |
| "retain_iter_history": false, | |
| "sacrebleu": false, | |
| "sampling": false, | |
| "sampling_topk": -1, | |
| "sampling_topp": -1.0, | |
| "score_reference": false, | |
| "temperature": 1.0, | |
| "unkpen": 0.0, | |
| "unnormalized": false | |
| }, | |
| "hidden_act": "gelu", | |
| "hidden_dropout_prob": 0.1, | |
| "hidden_size": 768, | |
| "initializer_range": 0.02, | |
| "interactive": { | |
| "_name": null, | |
| "buffer_size": 0, | |
| "input": "-" | |
| }, | |
| "intermediate_size": 3072, | |
| "job_logging_cfg": { | |
| "disable_existing_loggers": false, | |
| "formatters": { | |
| "simple": { | |
| "format": "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s" | |
| } | |
| }, | |
| "handlers": { | |
| "console": { | |
| "class": "logging.StreamHandler", | |
| "formatter": "simple", | |
| "stream": "ext://sys.stdout" | |
| }, | |
| "file": { | |
| "class": "logging.FileHandler", | |
| "filename": "hydra_train.log", | |
| "formatter": "simple" | |
| } | |
| }, | |
| "root": { | |
| "handlers": [ | |
| "console", | |
| "file" | |
| ], | |
| "level": "INFO" | |
| }, | |
| "version": 1 | |
| }, | |
| "layer_norm_eps": 1e-12, | |
| "lr_scheduler": { | |
| "_name": "polynomial_decay", | |
| "end_learning_rate": 0.0, | |
| "force_anneal": null, | |
| "lr": [ | |
| 1e-05 | |
| ], | |
| "power": 1.0, | |
| "total_num_update": 10000000.0, | |
| "warmup_updates": 10000 | |
| }, | |
| "max_position_embeddings": 512, | |
| "model": { | |
| "_name": "roberta", | |
| "activation_dropout": 0.0, | |
| "activation_fn": "gelu", | |
| "adaptive_input": false, | |
| "attention_dropout": 0.1, | |
| "dropout": 0.1, | |
| "encoder_attention_heads": 12, | |
| "encoder_embed_dim": 768, | |
| "encoder_ffn_embed_dim": 3072, | |
| "encoder_layerdrop": 0.0, | |
| "encoder_layers": 12, | |
| "encoder_layers_to_keep": null, | |
| "encoder_learned_pos": true, | |
| "encoder_normalize_before": false, | |
| "layernorm_embedding": true, | |
| "max_positions": 256, | |
| "max_source_positions": 256, | |
| "no_scale_embedding": true, | |
| "no_token_positional_embeddings": false, | |
| "pooler_activation_fn": "tanh", | |
| "pooler_dropout": 0.0, | |
| "quant_noise_pq": 0, | |
| "quant_noise_pq_block_size": 8, | |
| "quant_noise_scalar": 0, | |
| "spectral_norm_classification_head": false, | |
| "untie_weights_roberta": false | |
| }, | |
| "model_type": "roberta", | |
| "num_attention_heads": 12, | |
| "num_hidden_layers": 12, | |
| "optimization": { | |
| "_name": null, | |
| "clip_norm": 0.0, | |
| "debug_param_names": false, | |
| "lr": [ | |
| 1e-05 | |
| ], | |
| "max_epoch": 0, | |
| "max_update": 10000000, | |
| "sentence_avg": false, | |
| "skip_remainder_batch": false, | |
| "stop_min_lr": -1.0, | |
| "stop_time_hours": 0.0, | |
| "update_freq": [ | |
| 8 | |
| ], | |
| "use_bmuf": false | |
| }, | |
| "optimizer": { | |
| "_name": "adam", | |
| "adam_betas": "(0.9,0.98)", | |
| "adam_eps": 1e-06, | |
| "fp16_adam_stats": false, | |
| "lr": [ | |
| 1e-05 | |
| ], | |
| "tpu": false, | |
| "use_old_adam": false, | |
| "weight_decay": 0.01 | |
| }, | |
| "pad_token_id": 1, | |
| "position_embedding_type": "absolute", | |
| "scoring": null, | |
| "task": { | |
| "_name": "masked_lm", | |
| "d2v2_multi": false, | |
| "data": "data-bin/roberturk_bin", | |
| "freq_weighted_replacement": false, | |
| "include_index": true, | |
| "include_target_tokens": false, | |
| "leave_unmasked_prob": 0.1, | |
| "mask_multiple_length": 1, | |
| "mask_prob": 0.15, | |
| "mask_stdev": 0.0, | |
| "mask_whole_words": false, | |
| "random_token_prob": 0.1, | |
| "sample_break_mode": "complete", | |
| "seed": 1, | |
| "shorten_data_split_list": "", | |
| "shorten_method": "none", | |
| "skip_masking": false, | |
| "tokens_per_sample": 256 | |
| }, | |
| "tokenizer": null, | |
| "torch_dtype": "float32", | |
| "transformers_version": "4.35.2", | |
| "type_vocab_size": 2, | |
| "use_cache": true, | |
| "vocab_size": 50265 | |
| } | |