| algorithms: |
| gradient_clipping: |
| clipping_threshold: 1.0 |
| clipping_type: norm |
| callbacks: |
| lr_monitor: {} |
| memory_monitor: {} |
| runtime_estimator: {} |
| speed_monitor: |
| window_size: 10 |
| console_log_interval: 50ba |
| cross_doc_attention: false |
| dataloaders: |
| - dataset: |
| batch_type: lm |
| local: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/ |
| max_seq_len: 2048 |
| shuffle: true |
| split: train |
| drop_last: false |
| name: train_loader_docs |
| num_workers: 0 |
| - dataset: |
| batch_type: fact |
| local: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/qa |
| masking: |
| cross_doc_attention: false |
| max_seq_len: 2048 |
| shuffle: true |
| split: qa_attribution_train |
| drop_last: false |
| name: train_q_a_url |
| num_workers: 0 |
| deepspeed_config: |
| bf16: |
| enabled: true |
| train_batch_size: 80 |
| zero_optimization: |
| allgather_bucket_size: 200000000.0 |
| contiguous_gradients: true |
| offload_optimizer: |
| device: cpu |
| pin_memory: true |
| overlap_comm: true |
| reduce_bucket_size: true |
| reduce_scatter: true |
| stage: 3 |
| device_eval_batch_size: 40 |
| device_train_microbatch_size: 2 |
| eval_first: false |
| eval_interval: 1 |
| eval_subset_num_batches: -1 |
| experiment: |
| data: |
| augment: |
| doc: |
| do: false |
| method: permute |
| n_sample_per_doc: 2 |
| finetune: |
| neg_create_probability: 0.0 |
| number_non_attributable_negatives: 0 |
| qa_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours |
| text_data_path: dataset/ours/pretrain |
| train_data_path: /root/autodl-tmp/intrinsic-source-citation/dataset/ours/pretrain/train |
| eval: |
| disable_all_eval: true |
| disable_attribution_eval: true |
| disable_non_attrib_eval: true |
| disable_qa_eval: true |
| icl_eval: false |
| ppl_eval: false |
| use_ais: false |
| experiment: |
| name: arxiv-citation-doc-id-begin |
| output_dir: outputs/experiments/ |
| model: |
| checkpoint: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints/latest-rank0.pt.tar |
| ckpt_dir: /root/autodl-tmp/intrinsic-source-citation/outputs/experiments/arxiv-citation-doc-id-begin/checkpoints |
| name: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T |
| train: |
| config_template_path: conf/templates/train_config.yaml |
| cross_doc_attention: false |
| device_eval_batch_size: 40 |
| device_train_microbatch_size: 2 |
| eval_first: false |
| finetune_q_a: false |
| finetune_q_a_doc_url: false |
| finetune_q_a_url: true |
| finetune_q_url_a: false |
| loss_type: mask |
| lr: 8.0e-05 |
| max_duration: 5ep |
| pretrain: true |
| q_a_url_predict_url_only: false |
| repeat_url_across_doc: false |
| save_folder: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints |
| sequential: false |
| url_location: no_url |
| url_loss_factor: 1.0 |
| weight_decay: 0.02 |
| global_seed: 17 |
| global_train_batch_size: 80 |
| log_to_console: true |
| loggers: |
| wandb: |
| project: intrinsic-source-citation |
| max_duration: 5ep |
| max_seq_len: 2048 |
| model: |
| ckpt_dir: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints |
| loss: |
| type: mask |
| url_loss_factor: 1.0 |
| name: hf_causal_lm |
| pretrained: true |
| pretrained_model_name_or_path: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T |
| ood_url_trie: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/unseen_url_trie.pkl |
| optimizer: |
| betas: |
| - 0.9 |
| - 0.98 |
| eps: 1.0e-06 |
| lr: 8.0e-05 |
| name: deepspeed_adam |
| weight_decay: 0.02 |
| precision: amp_bf16 |
| progress_bar: false |
| run_name: arxiv-citation-doc-id-begin |
| save_folder: outputs/experiments/arxiv-citation-doc-id-begin/checkpoints |
| save_interval: 1ep |
| save_num_checkpoints_to_keep: 1 |
| scheduler: |
| alpha_f: 0.1 |
| name: linear_decay_with_warmup |
| t_warmup: 1ep |
| seed: 17 |
| streaming: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/ |
| text_data_path: dataset/ours/pretrain |
| tokenizer: |
| kwargs: |
| model_max_length: 2048 |
| name: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming//tokenizer |
| tokenizer_name: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming//tokenizer |
| url_trie: outputs/experiments/arxiv-citation-doc-id-begin/data/streaming/url_trie.pkl |
| dist_timeout: 600.0 |
| n_gpus: 1 |
| device_train_batch_size: 80 |
| device_train_grad_accum: 40 |
| n_params: 1100056576 |
|
|