| 06/03/2024 09:30:43 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 8, distributed training: True, 16-bits training: False |
| 06/03/2024 09:30:43 - INFO - __main__ - Training parameters TrainingArguments( |
| _n_gpu=8, |
| adafactor=False, |
| adam_beta1=0.9, |
| adam_beta2=0.999, |
| adam_epsilon=1e-08, |
| analysis_dataset=legalbench_train_annotated, |
| analysis_mode=1.0, |
| auto_find_batch_size=False, |
| bf16=True, |
| bf16_full_eval=False, |
| data_seed=None, |
| dataloader_drop_last=False, |
| dataloader_num_workers=0, |
| dataloader_persistent_workers=False, |
| dataloader_pin_memory=True, |
| ddp_backend=None, |
| ddp_broadcast_buffers=None, |
| ddp_bucket_cap_mb=None, |
| ddp_find_unused_parameters=None, |
| ddp_timeout=1800, |
| debug=[], |
| deepspeed=None, |
| disable_tqdm=False, |
| dispatch_batches=None, |
| do_eval=True, |
| do_predict=False, |
| do_train=True, |
| eval_accumulation_steps=None, |
| eval_delay=0, |
| eval_steps=2000, |
| evaluation_strategy=steps, |
| fp16=False, |
| fp16_backend=auto, |
| fp16_full_eval=False, |
| fp16_opt_level=O1, |
| fsdp=[], |
| fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, |
| fsdp_min_num_params=0, |
| fsdp_transformer_layer_cls_to_wrap=None, |
| full_determinism=False, |
| gradient_accumulation_steps=32, |
| gradient_checkpointing=False, |
| gradient_checkpointing_kwargs=None, |
| granularity=sequence, |
| greater_is_better=False, |
| group_by_length=True, |
| half_precision_backend=auto, |
| hub_always_push=False, |
| hub_model_id=None, |
| hub_private_repo=False, |
| hub_strategy=every_save, |
| hub_token=<HUB_TOKEN>, |
| ignore_data_skip=False, |
| include_inputs_for_metrics=False, |
| include_num_input_tokens_seen=False, |
| include_tokens_per_second=False, |
| inner_lr=1e-05, |
| jit_mode_eval=False, |
| label_names=None, |
| label_smoothing_factor=0.0, |
| learning_rate=2e-05, |
| length_column_name=length, |
| load_best_model_at_end=True, |
| local_rank=0, |
| log_level=passive, |
| log_level_replica=warning, |
| log_on_each_node=True, |
| logging_dir=final_out/llama3-8b-instruct-final-less-lora-everything/runs/Jun03_09-30-41_sagemaker-data-sci-ml-p4d-24xlarge-7e6e825bf88acadc865d379de4ed, |
| logging_first_step=False, |
| logging_nan_inf_filter=True, |
| logging_steps=1.0, |
| logging_strategy=steps, |
| lr_scheduler_kwargs={}, |
| lr_scheduler_type=linear, |
| max_grad_norm=1.0, |
| max_steps=-1, |
| meta_epilson=1.0, |
| metric_for_best_model=eval_loss, |
| mp_parameters=, |
| neftune_noise_alpha=None, |
| no_cuda=False, |
| num_train_epochs=5.0, |
| optim=adamw_torch, |
| optim_args=None, |
| output_dir=final_out/llama3-8b-instruct-final-less-lora-everything, |
| overwrite_output_dir=False, |
| past_index=-1, |
| per_device_eval_batch_size=4, |
| per_device_train_batch_size=4, |
| prediction_loss_only=False, |
| push_to_hub=False, |
| push_to_hub_model_id=None, |
| push_to_hub_organization=None, |
| push_to_hub_token=<PUSH_TO_HUB_TOKEN>, |
| ray_scope=last, |
| remove_unused_columns=True, |
| report_to=['wandb'], |
| resume_from_checkpoint=None, |
| run_name=final_out/llama3-8b-instruct-final-less-lora-everything, |
| save_on_each_node=False, |
| save_only_model=False, |
| save_safetensors=True, |
| save_steps=2000, |
| save_strategy=steps, |
| save_total_limit=None, |
| seed=0, |
| select_frac=0.5, |
| skip_memory_metrics=True, |
| split_batches=False, |
| target_dataset=mmlu, |
| tf32=False, |
| torch_compile=False, |
| torch_compile_backend=None, |
| torch_compile_mode=None, |
| torchdynamo=None, |
| tpu_metrics_debug=False, |
| tpu_num_cores=None, |
| train_dataset_names=None, |
| use_cpu=False, |
| use_ipex=False, |
| use_legacy_prediction_loop=False, |
| use_mps_device=False, |
| warmup_ratio=0.03, |
| warmup_steps=0, |
| weight_decay=0.0, |
| weight_learning_rate=1e-05, |
| ) |
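Editor's note: with _n_gpu=8, per_device_train_batch_size=4, and gradient_accumulation_steps=32, the effective global batch size is 4 × 32 × 8 = 1024 sequences per optimizer step. Below is a minimal sketch of the stock portion of these arguments; the fields analysis_dataset, analysis_mode, granularity, inner_lr, meta_epilson, select_frac, target_dataset, train_dataset_names, and weight_learning_rate are this project's own extensions to TrainingArguments and are omitted.

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="final_out/llama3-8b-instruct-final-less-lora-everything",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=2000,
    save_strategy="steps",
    save_steps=2000,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=32,
    num_train_epochs=5.0,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.03,
    bf16=True,
    group_by_length=True,
    logging_steps=1,
    report_to=["wandb"],
    seed=0,
)
# Effective global batch: 4 per device * 32 accumulation steps * 8 GPUs = 1024.
```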
| 06/03/2024 09:30:43 - INFO - __main__ - Model parameters ModelArguments(model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct', reference_model_path=None, model_meta_name_or_path=None, config_name=None, tokenizer_name=None, cache_dir=None, use_fast_tokenizer=True, model_revision='main', use_auth_token=False, torch_dtype=None, token_wise=False, lora=True, lora_r=128, lora_alpha=512.0, lora_dropout=0.1, lora_target_modules=None) |
| 06/03/2024 09:30:43 - INFO - __main__ - Dataset parameters DataArguments(train_files=['legalbench_train_annotated'], overwrite_cache=False, preprocessing_num_workers=None, max_seq_length=8192, sample_data_seed=42, percentage=1.0, data_dir='data') |
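Editor's note: a hedged sketch of the adapter setup implied by the ModelArguments line (lora=True, lora_r=128, lora_alpha=512.0, lora_dropout=0.1, lora_target_modules=None). With target_modules left unset, peft falls back to its per-architecture default, which for Llama models is ["q_proj", "v_proj"], consistent with the module tree printed later in this log.

```python
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=128,             # lora_r
    lora_alpha=512,    # lora_alpha
    lora_dropout=0.1,  # lora_dropout
)
model = get_peft_model(model, lora_config)  # `model` assumed loaded beforehand
model.print_trainable_parameters()          # emits the "trainable params" line seen below
```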
| /opt/conda/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. |
| warnings.warn( |
| [INFO|tokenization_utils_base.py:2026] 2024-06-03 09:30:43,346 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/tokenizer.json |
| [INFO|tokenization_utils_base.py:2026] 2024-06-03 09:30:43,346 >> loading file added_tokens.json from cache at None |
| [INFO|tokenization_utils_base.py:2026] 2024-06-03 09:30:43,346 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/special_tokens_map.json |
| [INFO|tokenization_utils_base.py:2026] 2024-06-03 09:30:43,346 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/tokenizer_config.json |
| [WARNING|logging.py:314] 2024-06-03 09:30:43,664 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| Overwrite dataset info from restored data version if exists. |
| Loading Legalbench data (legalbench_train_annotated) |
| 06/03/2024 09:30:45 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists. |
| Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| 06/03/2024 09:30:45 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| Found cached dataset legalbench_train_annotated_instruct_dataset (/root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a) |
| 06/03/2024 09:30:45 - INFO - datasets.builder - Found cached dataset legalbench_train_annotated_instruct_dataset (/root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a) |
| Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| 06/03/2024 09:30:45 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| 100%|██████████| 1656/1656 [00:00<00:00, 6714.36it/s] |
| ================================================================================ |
| Size of combined datasets: 1656 |
| ================================================================================ |
| Spawning 10 processes |
| 06/03/2024 09:30:46 - INFO - datasets.arrow_dataset - Spawning 10 processes |
| Tokenizing and reformatting instruction data (num_proc=10): 100%|██████████| 1656/1656 [00:02<00:00, 705.79 examples/s] |
| Concatenating 10 shards |
| 06/03/2024 09:30:48 - INFO - datasets.arrow_dataset - Concatenating 10 shards |
| /opt/conda/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. |
| warnings.warn( |
| [INFO|configuration_utils.py:739] 2024-06-03 09:30:48,888 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/config.json |
| [INFO|configuration_utils.py:802] 2024-06-03 09:30:48,889 >> Model config LlamaConfig { |
| "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 128000, |
| "eos_token_id": 128009, |
| "hidden_act": "silu", |
| "hidden_size": 4096, |
| "initializer_range": 0.02, |
| "intermediate_size": 14336, |
| "max_position_embeddings": 8192, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 32, |
| "num_key_value_heads": 8, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 500000.0, |
| "tie_word_embeddings": false, |
| "transformers_version": "4.36.2", |
| "use_cache": true, |
| "vocab_size": 128256 |
| } |
|
|
| [INFO|modeling_utils.py:3344] 2024-06-03 09:30:48,932 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/model.safetensors.index.json |
| [INFO|configuration_utils.py:826] 2024-06-03 09:30:48,949 >> Generate config GenerationConfig { |
| "bos_token_id": 128000, |
| "eos_token_id": 128009 |
| } |
|
|
| Removed 0 oversized examples |
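Editor's note: the script's actual length filter is not shown in this log; a hypothetical reconstruction of what "Removed 0 oversized examples" could correspond to, assuming a tokenized datasets.Dataset and the max_seq_length=8192 from DataArguments.

```python
from datasets import Dataset

max_seq_length = 8192  # from DataArguments above
dataset = Dataset.from_dict({"input_ids": [[1, 2, 3], list(range(9000))]})  # toy stand-in
before = len(dataset)
dataset = dataset.filter(lambda ex: len(ex["input_ids"]) <= max_seq_length)
print(f"Removed {before - len(dataset)} oversized examples")  # -> Removed 1 ... here
```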
| Loading checkpoint shards: 100%|██████████| 4/4 [00:28<00:00, 7.08s/it] |
| [INFO|modeling_utils.py:4185] 2024-06-03 09:31:18,972 >> All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| [INFO|modeling_utils.py:4193] 2024-06-03 09:31:18,972 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
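Editor's note: a hedged sketch of the model load implied by the log. bf16=True appears in the training arguments; device_map="auto" is an assumption, but it is consistent with the is_model_parallel notice logged further below.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype=torch.bfloat16,  # matches bf16=True in the arguments
    device_map="auto",           # assumption; consistent with the
                                 # is_model_parallel notice below
)
```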
| [INFO|configuration_utils.py:781] 2024-06-03 09:31:19,343 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/generation_config.json |
| [INFO|configuration_utils.py:826] 2024-06-03 09:31:19,343 >> Generate config GenerationConfig { |
| "bos_token_id": 128000, |
| "do_sample": true, |
| "eos_token_id": [ |
| 128001, |
| 128009 |
| ], |
| "max_length": 4096, |
| "temperature": 0.6, |
| "top_p": 0.9 |
| } |
|
|
| 06/03/2024 09:31:20 - INFO - __main__ - Applied LoRA to model. |
| trainable params: 54,525,952 || all params: 8,084,852,736 || trainable%: 0.674421090655225 |
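Editor's note: both numbers on this line can be verified from the shapes printed in this log (hidden size 4096, KV projection width 1024, 32 decoder layers, r=128, and 128264 embedding rows after the 8 added special tokens).

```python
# h=4096, kv=1024 (8 KV heads * head_dim 128), ff=14336, r=128, 32 layers,
# V=128264 embedding rows (vocab_size 128256 + 8 added special tokens)
h, kv, ff, r, L, V = 4096, 1024, 14336, 128, 32, 128264
lora = L * (h * r + r * h + h * r + r * kv)          # q_proj + v_proj adapters
layer = 2 * h * h + 2 * h * kv + 3 * h * ff + 2 * h  # attn + MLP + 2 RMSNorms
base = 2 * V * h + L * layer + h                     # embeddings + lm_head + final norm
print(lora)         # 54525952   -> "trainable params"
print(base + lora)  # 8084852736 -> "all params"
```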
| /root/meta_weight_llm/less/train/data_arguments.py:53: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). |
|   com_len = (torch.tensor(labels) > -1).sum() |
| Map: 100%|██████████| 1656/1656 [00:00<00:00, 8620.75 examples/s] |
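Editor's note: the UserWarning above fires because `labels` is already a tensor when it reaches torch.tensor(). A possible fix for data_arguments.py line 53, with a stand-in tensor for self-containment:

```python
import torch

labels = torch.full((53,), -100, dtype=torch.long)  # stand-in for the real labels
labels[-2:] = torch.tensor([2201, 128001])
# `labels` is already a tensor here, which is what triggers the warning;
# a pure comparison needs no copy at all:
com_len = (labels > -1).sum()  # instead of (torch.tensor(labels) > -1).sum()
```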
| [train set] examples: 1656; # avg tokens: 262.96136474609375 |
| [train set] examples: 1656; # avg completion tokens: 2.4480676651000977 |
| 06/03/2024 09:31:20 - INFO - __main__ - Sample 788 of the training set: {'input_ids': tensor([128000, 128006, 882, 128007, 198, 39, 7596, 352, 374, |
| 459, 704, 8838, 98462, 5224, 11784, 311, 12391, 279, |
| 8206, 315, 279, 5030, 50697, 382, 48, 25, 2057, |
| 12391, 53215, 574, 264, 1695, 4394, 11, 264, 5224, |
| 1903, 304, 5590, 430, 568, 10456, 311, 2571, 14177, |
| 9875, 13, 128009, 128006, 78191, 128007, 2201, 128001]), 'labels': tensor([ -100, -100, -100, -100, -100, -100, -100, -100, -100, |
| -100, -100, -100, -100, -100, -100, -100, -100, -100, |
| -100, -100, -100, -100, -100, -100, -100, -100, -100, |
| -100, -100, -100, -100, -100, -100, -100, -100, -100, |
| -100, -100, -100, -100, -100, -100, -100, -100, -100, |
| -100, -100, -100, -100, -100, -100, 2201, 128001]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1]), 'n_unmaked_labels': tensor(2)}. |
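Editor's note: -100 is the default ignore_index of PyTorch's cross-entropy loss, so the masked prompt positions in the sample above contribute nothing to the loss; only the 2 unmasked completion tokens do (hence 'n_unmaked_labels': tensor(2) and the ~2.45 average completion tokens). A toy illustration, not the script's exact (shifted) loss computation:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(53, 128256)                    # 53 positions, Llama-3 vocab
labels = torch.full((53,), -100, dtype=torch.long)  # prompt positions masked out
labels[-2:] = torch.tensor([2201, 128001])          # completion tokens kept
loss = F.cross_entropy(logits, labels, ignore_index=-100)  # only 2 positions count
```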
| 06/03/2024 09:31:20 - INFO - __main__ - trainable model_params: 54525952 |
| Overwrite dataset info from restored data version if exists. |
| 06/03/2024 09:31:22 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists. |
| Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| 06/03/2024 09:31:22 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| Found cached dataset legalbench_train_annotated_instruct_dataset (/root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a) |
| 06/03/2024 09:31:22 - INFO - datasets.builder - Found cached dataset legalbench_train_annotated_instruct_dataset (/root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a) |
| Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| 06/03/2024 09:31:22 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/adminsafesign___legalbench_train_annotated_instruct_dataset/default/0.0.0/5da14aca2e7c80fd6c61cb95e2e40f2d723b957a |
| [WARNING|tokenization_utils_base.py:2605] 2024-06-03 09:31:22,541 >> Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`. |
| [INFO|trainer.py:396] 2024-06-03 09:31:24,080 >> You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching. |
| [INFO|trainer.py:519] 2024-06-03 09:31:24,084 >> max_steps is given, it will override any value given in num_train_epochs |
| [INFO|trainer.py:568] 2024-06-03 09:31:24,084 >> Using auto half precision backend |
| ******** Example starts ******** |
| <|start_header_id|>user<|end_header_id|> |
| A mark is generic if it is the common name for the product. A mark is descriptive if it describes a purpose, nature, or attribute of the product. A mark is suggestive if it suggests or implies a quality or characteristic of the product. A mark is arbitrary if it is a real English word that has no relation to the product. A mark is fanciful if it is an invented word. |
|
|
| Q: The mark 'Ivory' for a product made of elephant tusks. What is the type of mark?<|eot_id|><|start_header_id|>assistant<|end_header_id|>generic |
| ******** Example ends ******** |
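Editor's note: the block between the Example markers matches Llama-3's chat template with the gold label appended as the assistant completion; a hedged reconstruction, with the prompt abbreviated and `tokenizer` assumed in scope:

```python
messages = [
    {"role": "user", "content": "A mark is generic if ... What is the type of mark?"}
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
text += "generic"  # gold label appended as the assistant completion
```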
| PeftModelForCausalLM( |
| (base_model): LoraModel( |
| (model): LlamaForCausalLM( |
| (model): LlamaModel( |
| (embed_tokens): Embedding(128264, 4096) |
| (layers): ModuleList( |
| (0-31): 32 x LlamaDecoderLayer( |
| (self_attn): LlamaSdpaAttention( |
| (q_proj): lora.Linear( |
| (base_layer): Linear(in_features=4096, out_features=4096, bias=False) |
| (lora_dropout): ModuleDict( |
| (default): Dropout(p=0.1, inplace=False) |
| ) |
| (lora_A): ModuleDict( |
| (default): Linear(in_features=4096, out_features=128, bias=False) |
| ) |
| (lora_B): ModuleDict( |
| (default): Linear(in_features=128, out_features=4096, bias=False) |
| ) |
| (lora_embedding_A): ParameterDict() |
| (lora_embedding_B): ParameterDict() |
| ) |
| (k_proj): Linear(in_features=4096, out_features=1024, bias=False) |
| (v_proj): lora.Linear( |
| (base_layer): Linear(in_features=4096, out_features=1024, bias=False) |
| (lora_dropout): ModuleDict( |
| (default): Dropout(p=0.1, inplace=False) |
| ) |
| (lora_A): ModuleDict( |
| (default): Linear(in_features=4096, out_features=128, bias=False) |
| ) |
| (lora_B): ModuleDict( |
| (default): Linear(in_features=128, out_features=1024, bias=False) |
| ) |
| (lora_embedding_A): ParameterDict() |
| (lora_embedding_B): ParameterDict() |
| ) |
| (o_proj): Linear(in_features=4096, out_features=4096, bias=False) |
| (rotary_emb): LlamaRotaryEmbedding() |
| ) |
| (mlp): LlamaMLP( |
| (gate_proj): Linear(in_features=4096, out_features=14336, bias=False) |
| (up_proj): Linear(in_features=4096, out_features=14336, bias=False) |
| (down_proj): Linear(in_features=14336, out_features=4096, bias=False) |
| (act_fn): SiLU() |
| ) |
| (input_layernorm): LlamaRMSNorm() |
| (post_attention_layernorm): LlamaRMSNorm() |
| ) |
| ) |
| (norm): LlamaRMSNorm() |
| ) |
| (lm_head): Linear(in_features=4096, out_features=128264, bias=False) |
| ) |
| ) |
| ) |
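Editor's note: Embedding(128264, 4096) and the lm_head's out_features=128264 above are 8 rows larger than config.vocab_size (128256), matching the earlier "Special tokens have been added" warning. A hypothetical reconstruction, with token names assumed and `tokenizer`/`model` in scope:

```python
extra = [f"<|reserved_token_{i}|>" for i in range(8)]  # token names assumed
tokenizer.add_special_tokens({"additional_special_tokens": extra})
model.resize_token_embeddings(len(tokenizer))  # grows 128256 -> 128264
```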
| > /root/meta_weight_llm/less/train/train.py(192)main() |
| -> if analysis_dataset is not None: |
| (Pdb) 187 ), |
| 188 ) |
| 189 import pdb; pdb.set_trace() |
| 190 |
| 191 # Initial evaluation |
| 192 -> if analysis_dataset is not None: |
| 193 trainer.evaluate(analysis_dataset) |
| 194 |
| 195 # Training |
| 196 train_result = trainer.train() |
| 197 trainer.save_model( |
| (Pdb) [INFO|hub.py:748] 2024-06-03 09:32:39,136 >> Uploading the following files to adminsafesign/SafeSign-8B-Instruct-Test: README.md,adapter_model.safetensors,adapter_config.json |
| adapter_model.safetensors: 100%|██████████| 218M/218M [00:08<00:00, 26.3MB/s] |
| CommitInfo(commit_url='https://huggingface.co/adminsafesign/SafeSign-8B-Instruct-Test/commit/240fb3f8035f99be74d327b191a26fde6c606f26', commit_message='Upload model', commit_description='', oid='240fb3f8035f99be74d327b191a26fde6c606f26', pr_url=None, pr_revision=None, pr_num=None) |
| (Pdb) <bound method Trainer.push_to_hub of <transformers.trainer.Trainer object at 0x7fb9a8326140>> |
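Editor's note: entering trainer.push_to_hub at the (Pdb) prompt without parentheses only echoes the bound method, as seen above; the call form is what performs the upload:

```python
# At the pdb prompt:
#   trainer.push_to_hub     -> prints "<bound method Trainer.push_to_hub ...>"
#   trainer.push_to_hub()   -> actually commits and uploads
trainer.push_to_hub()
```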
| (Pdb) [INFO|trainer.py:2889] 2024-06-03 09:33:21,306 >> Saving model checkpoint to final_out/llama3-8b-instruct-final-less-lora-everything |
| [INFO|tokenization_utils_base.py:2432] 2024-06-03 09:33:23,479 >> tokenizer config file saved in final_out/llama3-8b-instruct-final-less-lora-everything/tokenizer_config.json |
| [INFO|tokenization_utils_base.py:2441] 2024-06-03 09:33:23,490 >> Special tokens file saved in final_out/llama3-8b-instruct-final-less-lora-everything/special_tokens_map.json |
| [INFO|modelcard.py:452] 2024-06-03 09:33:23,882 >> Dropping the following result as it does not have all the necessary fields: |
| {} |
| training_args.bin: 100%|██████████| 5.11k/5.11k [00:00<00:00, 32.0kB/s] |
| adapter_model.safetensors: 100%|██████████| 218M/218M [00:09<00:00, 21.8MB/s] |
| Upload 2 LFS files: 100%|██████████| 2/2 [00:10<00:00, 5.12s/it] |
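Editor's note: a hedged usage sketch for the uploaded adapter, with the repo id taken from the CommitInfo line earlier in the log:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = PeftModel.from_pretrained(base, "adminsafesign/SafeSign-8B-Instruct-Test")
model = model.merge_and_unload()  # optional: fold the adapter into the base weights
```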