diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..21cdde5d41ad5dd26b595186125a05c17768f1a1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/.ipynb_checkpoints/fim-checkpoint.py b/.ipynb_checkpoints/fim-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..ef1d57bc2cf9994a80ffa0239492bad0ba311854 --- /dev/null +++ b/.ipynb_checkpoints/fim-checkpoint.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2024 Sourab Mangrulkar. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import numpy as np + + +# this is expensive so we cache it +@functools.lru_cache(maxsize=None) +def get_fim_token_ids(tokenizer): + if "codellama" in tokenizer.name_or_path: + return ( + tokenizer.bos_token_id, + tokenizer.suffix_id, + tokenizer.prefix_id, + tokenizer.middle_id, + 0, + ) + elif "deepseek-coder" in tokenizer.name_or_path: + return ( + tokenizer.bos_token_id, + tokenizer.encode("<|fim▁hole|>", add_special_tokens=False)[0], + tokenizer.encode("<|fim▁begin|>", add_special_tokens=False)[0], + tokenizer.encode("<|fim▁end|>", add_special_tokens=False)[0], + tokenizer.encode("", add_special_tokens=False)[0], + ) + elif "stable-code" in tokenizer.name_or_path: + return ( + tokenizer.bos_token_id, + tokenizer.encode("")[0], + tokenizer.encode("")[0], + tokenizer.encode("")[0], + tokenizer.encode("")[0], + ) + else: + bos_token_id = None + try: + FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[ + "additional_special_tokens" + ][1:5] + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = ( + tokenizer.vocab[tok] + for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD] + ) + except KeyError: + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = ( + None, + None, + None, + None, + ) + return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id + + +def _bos_token_processing(prefix_token_list, bos_token): + if bos_token is not None: + # add the BOS token to the beginning of the list + prefix_token_list.insert(0, bos_token) + + return prefix_token_list + + +## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py +def permute( + sample, + np_rng, + suffix_tok_id, + prefix_tok_id, + middle_tok_id, + pad_tok_id, + fim_rate=0.5, + fim_spm_rate=0.5, + truncate_or_pad=False, + bos_token_id=None, +): + """ + Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, 
using two FIM modes: + PSM and SPM (with a probability of fim_spm_rate). + """ + + if np_rng.binomial(1, fim_rate): + boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2)) + boundaries.sort() + + prefix = np.array(sample[: boundaries[0]], dtype=np.int64) + middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64) + suffix = np.array(sample[boundaries[1] :], dtype=np.int64) + + if truncate_or_pad: + new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 + diff = new_length - len(sample) + if diff > 0: + if suffix.shape[0] <= diff: + return sample, np_rng + suffix = suffix[: suffix.shape[0] - diff] + elif diff < 0: + suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)]) + + if np_rng.binomial(1, fim_spm_rate): + prefix_special_tokens = _bos_token_processing( + [prefix_tok_id, suffix_tok_id], bos_token_id + ) + # SPM (variant 2 from FIM paper) + new_sample = np.concatenate( + [ + prefix_special_tokens, + suffix, + [middle_tok_id], + prefix, + middle, + ] + ) + else: + prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id) + # PSM + new_sample = np.concatenate( + [ + prefix_special_tokens, + prefix, + [suffix_tok_id], + suffix, + [middle_tok_id], + middle, + ] + ) + else: + # don't do FIM preproc + new_sample = sample + return list(new_sample), np_rng diff --git a/.ipynb_checkpoints/requirements-checkpoint.txt b/.ipynb_checkpoints/requirements-checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b7b96512c409868cf9c62d05ed6254a9dc6bd5f --- /dev/null +++ b/.ipynb_checkpoints/requirements-checkpoint.txt @@ -0,0 +1,14 @@ +git+https://github.com/huggingface/transformers +git+https://github.com/huggingface/accelerate +git+https://github.com/huggingface/peft +trl +huggingface-hub +bitsandbytes +evaluate +datasets +einops +wandb +tiktoken +deepspeed +tqdm +safetensors \ No newline at end of file diff --git a/.ipynb_checkpoints/run_peft-checkpoint.sh 
b/.ipynb_checkpoints/run_peft-checkpoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..540c0e1e72cd3ca63699ac7eaebf293d475951bd --- /dev/null +++ b/.ipynb_checkpoints/run_peft-checkpoint.sh @@ -0,0 +1,40 @@ +CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \ +--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \ +--dataset_name "smangrul/hug_stack" \ +--splits "train" \ +--max_seq_len 2048 \ +--max_steps 2000 \ +--save_steps 500 \ +--eval_steps 100 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--save_strategy "steps" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--learning_rate 3e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 0.1 \ +--warmup_ratio 0.1 \ +--max_grad_norm 1.0 \ +--output_dir "codellama-hugcoder" \ +--per_device_train_batch_size 4 \ +--per_device_eval_batch_size 4 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "text" \ +--test_size 0.1 \ +--fim_rate 0.5 \ +--fim_spm_rate 0.5 \ +--use_peft_lora True \ +--lora_r 32 \ +--lora_alpha 64 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--use_flash_attn True \ No newline at end of file diff --git a/.ipynb_checkpoints/train-checkpoint.py b/.ipynb_checkpoints/train-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d648ac1bcaddd7ff39da29193f1e7f2ccd9f35 --- /dev/null +++ b/.ipynb_checkpoints/train-checkpoint.py @@ -0,0 +1,495 @@ +# coding=utf-8 +# Copyright 2024 Sourab Mangrulkar. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Continued pre-training/fine-tuning of code LLMs for code autocompletion. +""" + +import gc +import os +import random +import sys +from typing import Optional +from dataclasses import dataclass, field + +import numpy as np +import torch +from datasets import load_dataset +from torch.utils.data import IterableDataset +from tqdm import tqdm +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + Trainer, + TrainingArguments, + HfArgumentParser, + set_seed, + BitsAndBytesConfig, +) + +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, replace_lora_weights_loftq +import fim + + +# Define and parse arguments. +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
+ """ + + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } + ) + lora_alpha: Optional[int] = field(default=16) + lora_dropout: Optional[float] = field(default=0.1) + lora_r: Optional[int] = field(default=64) + lora_target_modules: Optional[str] = field( + default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj", + metadata={ + "help": "comma separated list of target modules to apply LoRA layers to" + }, + ) + use_nested_quant: Optional[bool] = field( + default=False, + metadata={"help": "Activate nested quantization for 4bit base models"}, + ) + bnb_4bit_compute_dtype: Optional[str] = field( + default="float16", + metadata={"help": "Compute dtype for 4bit base models"}, + ) + bnb_4bit_quant_type: Optional[str] = field( + default="nf4", + metadata={"help": "Quantization type fp4 or nf4"}, + ) + use_flash_attn: Optional[bool] = field( + default=False, + metadata={"help": "Enables Flash attention for training."}, + ) + use_peft_lora: Optional[bool] = field( + default=False, + metadata={"help": "Enables PEFT LoRA for training."}, + ) + use_8bit_qunatization: Optional[bool] = field( + default=False, + metadata={"help": "Enables loading model in 8bit."}, + ) + use_4bit_quantization: Optional[bool] = field( + default=False, + metadata={"help": "Enables loading model in 4bit."}, + ) + use_reentrant: Optional[bool] = field( + default=False, + metadata={"help": "Gradient Checkpointing param. Refer the related docs"}, + ) + use_unsloth: Optional[bool] = field( + default=False, + metadata={"help": "Enables UnSloth for training."}, + ) + use_loftq: Optional[bool] = field( + default=False, + metadata={"help": "Enables LoftQ init for the LoRA adapters when using QLoRA."}, + ) + use_loftq_callback: Optional[bool] = field( + default=False, + metadata={"help": "Enables LoftQ callback comparing logits of base model to the ones from LoftQ init. 
Provides better init."}, + ) + + +@dataclass +class DataTrainingArguments: + dataset_name: Optional[str] = field( + default="smangrul/hug_stack", + metadata={"help": "The preference dataset to use."}, + ) + dataset_text_field: str = field( + default="text", metadata={"help": "Dataset field to use as input text."} + ) + max_seq_length: Optional[int] = field(default=4096) + test_size: Optional[float] = field(default=0.1) + fim_rate: Optional[float] = field(default=0.5) + fim_spm_rate: Optional[float] = field(default=0.5) + splits: Optional[str] = field( + default="train", + metadata={"help": "Comma separate list of the splits to use from the dataset."}, + ) + + +def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400): + """ + Estimate the average number of characters per token in the dataset. + """ + total_characters, total_tokens = 0, 0 + for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): + total_characters += len(example[data_column]) + total_tokens += len(tokenizer(example[data_column]).tokens()) + + return total_characters / total_tokens + + +class ConstantLengthDataset(IterableDataset): + """ + Iterable dataset that returns constant length chunks of tokens from stream of text files. + Args: + tokenizer (Tokenizer): The processor used for proccessing the data. + dataset (dataset.Dataset): Dataset with text files. + infinite (bool): If True the iterator is reset after dataset reaches end else stops. + seq_length (int): Length of token sequences to return. + num_of_sequences (int): Number of token sequences to keep in buffer. + chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer. + fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM. + fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM. + seed (int): Seed for random number generator. 
+ """ + + def __init__( + self, + tokenizer, + dataset, + infinite=False, + seq_length=1024, + num_of_sequences=1024, + chars_per_token=3.6, + content_field="content", + fim_rate=0.5, + fim_spm_rate=0.5, + seed=0, + shuffle=False, + ): + self.tokenizer = tokenizer + self.concat_token_id = tokenizer.eos_token_id + self.dataset = dataset + self.seq_length = seq_length + self.infinite = infinite + self.current_size = 0 + self.max_buffer_size = seq_length * chars_per_token * num_of_sequences + self.content_field = content_field + self.fim_rate = fim_rate + self.fim_spm_rate = fim_spm_rate + self.seed = seed + self.shuffle = shuffle + + ( + self.bos_token_id, + self.suffix_tok_id, + self.prefix_tok_id, + self.middle_tok_id, + self.pad_tok_id, + ) = fim.get_fim_token_ids(self.tokenizer) + if not self.suffix_tok_id and self.fim_rate > 0: + print("FIM is not supported by tokenizer, disabling FIM") + self.fim_rate = 0 + + def __iter__(self): + iterator = iter(self.dataset) + more_examples = True + np_rng = np.random.RandomState(seed=self.seed) + while more_examples: + buffer, buffer_len = [], 0 + while True: + if buffer_len >= self.max_buffer_size: + break + try: + buffer.append(next(iterator)[self.content_field]) + buffer_len += len(buffer[-1]) + except StopIteration: + if self.infinite: + iterator = iter(self.dataset) + else: + more_examples = False + break + tokenized_inputs = self.tokenizer( + buffer, truncation=False, add_special_tokens=False + )["input_ids"] + all_token_ids = [] + + for tokenized_input in tokenized_inputs: + # optionally do FIM permutations + if self.fim_rate > 0: + tokenized_input, np_rng = fim.permute( + tokenized_input, + np_rng, + self.suffix_tok_id, + self.prefix_tok_id, + self.middle_tok_id, + self.pad_tok_id, + fim_rate=self.fim_rate, + fim_spm_rate=self.fim_spm_rate, + truncate_or_pad=False, + bos_token_id=self.bos_token_id, + ) + + all_token_ids.extend(tokenized_input + [self.concat_token_id]) + examples = [] + for i in range(0, 
len(all_token_ids), self.seq_length): + input_ids = all_token_ids[i : i + self.seq_length] + if len(input_ids) == self.seq_length: + examples.append(input_ids) + if self.shuffle: + random.shuffle(examples) + for example in examples: + self.current_size += 1 + yield { + "input_ids": torch.LongTensor(example), + "labels": torch.LongTensor(example), + } + + +def create_datasets(tokenizer, args, seed): + dataset = load_dataset(args.dataset_name, split=args.splits) + dataset = dataset.train_test_split( + test_size=args.test_size, seed=seed, shuffle=True + ) + train_data = dataset["train"] + valid_data = dataset["test"] + print( + f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}" + ) + chars_per_token = chars_token_ratio(train_data, tokenizer, args.dataset_text_field) + print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}") + train_dataset = ConstantLengthDataset( + tokenizer, + train_data, + infinite=True, + seq_length=args.max_seq_length, + chars_per_token=chars_per_token, + content_field=args.dataset_text_field, + fim_rate=args.fim_rate, + fim_spm_rate=args.fim_spm_rate, + seed=seed, + shuffle=True, + ) + valid_dataset = ConstantLengthDataset( + tokenizer, + valid_data, + infinite=False, + seq_length=args.max_seq_length, + chars_per_token=chars_per_token, + content_field=args.dataset_text_field, + fim_rate=args.fim_rate, + fim_spm_rate=args.fim_spm_rate, + seed=seed, + ) + print(f"A sample of valid dataset: {next(iter(valid_dataset))}") + return train_dataset, valid_dataset + +def get_mae(x, y): + return (x - y).abs().mean() + + +def get_mse(x, y): + return torch.pow(x - y, 2).mean() + + +def error_report(x, y): + mae = get_mae(x, y) + mse = get_mse(x, y) + print( + f"Mean absolute error: {mae:>8.5f}\n" + f"Mean squared error: {mse:>8.5f}" + ) + + +def loftq_init(model, tokenizer, train_dataset, max_seq_length, args): + if args.use_loftq_callback: + compute_dtype = getattr(torch, 
args.bnb_4bit_compute_dtype) + base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=compute_dtype) + base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) + random_input_ids = torch.randint(0, len(train_dataset), size=(1,)).numpy().tolist() + random_inputs = [train_dataset[i]['content'] for i in random_input_ids] + random_inputs = tokenizer(random_inputs, return_tensors="pt", padding=True, truncation="max_length", max_length=max_seq_length) + logits_base = base_model(**random_inputs).logits + del base_model + gc.collect() + + def loftq_callback(model, module_name): + """Callable to replace weights with LoFTQ if the mse is lower than the current best one.""" + global current_mse + logits = model(**random_inputs).logits + mse = get_mse(logits_base, logits) + if mse < current_mse: + current_mse = mse + print(f"MSE improved for module {module_name}") + return True + print(f"MSE did not improve for module {module_name}") + return False + + replace_lora_weights_loftq(model, callback=loftq_callback) + logits_loftq_callback = model(**random_inputs).logits + error_report(logits_base, logits_loftq_callback) + else: + replace_lora_weights_loftq(model) + + +def create_and_prepare_model(args, data_args, training_args): + device_map = None + bnb_config = None + + load_in_8bit = args.use_8bit_qunatization + load_in_4bit = args.use_4bit_quantization + + if args.use_unsloth: + from unsloth import FastLanguageModel + + if args.use_4bit_quantization: + compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype) + + bnb_config = BitsAndBytesConfig( + load_in_4bit=args.use_4bit_quantization, + bnb_4bit_quant_type=args.bnb_4bit_quant_type, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=args.use_nested_quant, + ) + + if compute_dtype == torch.float16 and args.use_4bit_quantization: + major, _ = torch.cuda.get_device_capability() + if major >= 8: + print("=" * 80) + print( + "Your GPU supports bfloat16, you can 
accelerate training with the argument --bf16" + ) + print("=" * 80) + + if args.use_4bit_quantization or args.use_8bit_qunatization: + device_map = ( + int(os.environ.get("LOCAL_RANK", -1)) + if torch.distributed.is_available() and torch.distributed.is_initialized() + else "auto" + ) # {"": 0} + + if args.use_unsloth: + # Load model + model, _ = FastLanguageModel.from_pretrained( + model_name=args.model_name_or_path, + max_seq_length=data_args.max_seq_length, + dtype=None, + load_in_4bit=load_in_4bit, + ) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + load_in_8bit=load_in_8bit, + quantization_config=bnb_config, + device_map=device_map, + trust_remote_code=True, + attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", + ) + + if ( + (args.use_4bit_quantization or args.use_8bit_qunatization) + and args.use_peft_lora + and not args.use_unsloth + ): + model = prepare_model_for_kbit_training( + model, + use_gradient_checkpointing=training_args.gradient_checkpointing, + gradient_checkpointing_kwargs={"use_reentrant": model_args.use_reentrant}, + ) + + if args.use_peft_lora and not args.use_unsloth: + peft_config = LoraConfig( + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + r=args.lora_r, + bias="none", + task_type="CAUSAL_LM", + target_modules=args.lora_target_modules.split(",") + if args.lora_target_modules != "all-linear" + else args.lora_target_modules, + ) + model = get_peft_model(model, peft_config) + elif args.use_peft_lora and args.use_unsloth: + # Do model patching and add fast LoRA weights + model = FastLanguageModel.get_peft_model( + model, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + r=args.lora_r, + target_modules=args.lora_target_modules.split(",") + if args.lora_target_modules != "all-linear" + else args.lora_target_modules, + use_gradient_checkpointing=training_args.gradient_checkpointing, + random_state=training_args.seed, + 
max_seq_length=data_args.max_seq_length, + ) + return model + + +def main(model_args, data_args, training_args): + # Set seed for reproducibility + set_seed(training_args.seed) + + # load the tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + + # load the datasets + train_dataset, eval_dataset = create_datasets( + tokenizer, data_args, training_args.seed + ) + train_dataset.start_iteration = 0 + + model = create_and_prepare_model(model_args, data_args, training_args) + # gradient ckpt + model.config.use_cache = not training_args.gradient_checkpointing + training_args.gradient_checkpointing = ( + training_args.gradient_checkpointing and not model_args.use_unsloth + ) + if training_args.gradient_checkpointing: + training_args.gradient_checkpointing_kwargs = { + "use_reentrant": model_args.use_reentrant + } + + # trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + trainer.accelerator.print(f"{trainer.model}") + if model_args.use_peft_lora: + trainer.model.print_trainable_parameters() + + # LoftQ initialization when using QLoRA + if model_args.use_4bit_quantization and model_args.use_loftq: + loftq_init(trainer.model, tokenizer, train_dataset, data_args.max_seq_length ,model_args) + + # train + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + trainer.train(resume_from_checkpoint=checkpoint) + + # saving final model + if trainer.is_fsdp_enabled: + trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") + trainer.save_model() + + +if __name__ == "__main__": + parser = HfArgumentParser( + (ModelArguments, DataTrainingArguments, TrainingArguments) + ) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. 
+ model_args, data_args, training_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + main(model_args, data_args, training_args) diff --git a/__pycache__/fim.cpython-310.pyc b/__pycache__/fim.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b29077b42a2538a642ff269a6b4c4d01eb469dd2 Binary files /dev/null and b/__pycache__/fim.cpython-310.pyc differ diff --git a/codellama-hugcoder/README.md b/codellama-hugcoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..de5239e6f140c1823c2e3f7a14e5c9cacb55e3fd --- /dev/null +++ b/codellama-hugcoder/README.md @@ -0,0 +1,57 @@ +--- +library_name: peft +license: llama2 +base_model: codellama/CodeLlama-7b-Instruct-hf +tags: +- generated_from_trainer +model-index: +- name: codellama-hugcoder + results: [] +--- + + + +# codellama-hugcoder + +This model is a fine-tuned version of [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) on an unknown dataset. 
+ +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0003 +- train_batch_size: 4 +- eval_batch_size: 4 +- seed: 42 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 16 +- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- training_steps: 2000 + +### Training results + + + +### Framework versions + +- PEFT 0.15.2.dev0 +- Transformers 4.52.0.dev0 +- Pytorch 2.6.0+cu124 +- Datasets 3.2.0 +- Tokenizers 0.21.1 \ No newline at end of file diff --git a/codellama-hugcoder/adapter_config.json b/codellama-hugcoder/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1 --- /dev/null +++ b/codellama-hugcoder/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "k_proj", + "q_proj", + "v_proj", + "gate_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of 
file diff --git a/codellama-hugcoder/adapter_model.safetensors b/codellama-hugcoder/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..58d3ccd4c40a5bb55497cd8825213decfac35527 --- /dev/null +++ b/codellama-hugcoder/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8 +size 319876032 diff --git a/codellama-hugcoder/checkpoint-1000/README.md b/codellama-hugcoder/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +base_model: codellama/CodeLlama-7b-Instruct-hf +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.2.dev0 \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-1000/adapter_config.json b/codellama-hugcoder/checkpoint-1000/adapter_config.json new file 
mode 100644 index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "k_proj", + "q_proj", + "v_proj", + "gate_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-1000/adapter_model.safetensors b/codellama-hugcoder/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6ddbcefd786f5c0a7c637ee5516b92c5b877848 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a808764ea3b6733b0a7c7a6002b640b2b9246cabcd9ad2d940aa7f43c05d66e3 +size 319876032 diff --git a/codellama-hugcoder/checkpoint-1000/optimizer.pt b/codellama-hugcoder/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a7dc10c090ce071a89ce8347c0693338c37b5af --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4025993adcd424dc3d3b0c61b41a0e262786b3bb304e6a592a013e59b80a6b38 +size 640009682 
diff --git a/codellama-hugcoder/checkpoint-1000/rng_state.pth b/codellama-hugcoder/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..991eada42a154d777614c77b2064af65c7abfeeb --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f822cfb134cd0b1f54ce227e6d11176dede74f86c94420156b0a49753efe3b7 +size 14244 diff --git a/codellama-hugcoder/checkpoint-1000/scheduler.pt b/codellama-hugcoder/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b24b95d3a3c869a6e1fd81aef577a3b307bd3a6 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3569157643c45495d0de4a184cdcaab0e6cab5317a8ad5f0b1bbb2d736dd80d4 +size 1064 diff --git a/codellama-hugcoder/checkpoint-1000/trainer_state.json b/codellama-hugcoder/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dcb698779e3d5b5820312186b79098454e6552ba --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/trainer_state.json @@ -0,0 +1,1434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5, + "eval_steps": 100.0, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025, + "grad_norm": 0.09379793703556061, + "learning_rate": 5.999999999999999e-06, + "loss": 0.6799, + "step": 5 + }, + { + "epoch": 0.005, + "grad_norm": 0.1399833709001541, + "learning_rate": 1.3499999999999998e-05, + "loss": 0.6954, + "step": 10 + }, + { + "epoch": 0.0075, + "grad_norm": 0.08632303029298782, + "learning_rate": 2.1e-05, + "loss": 0.6921, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.10006701201200485, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.69, + "step": 20 + 
}, + { + "epoch": 0.0125, + "grad_norm": 0.07633858919143677, + "learning_rate": 3.5999999999999994e-05, + "loss": 0.6722, + "step": 25 + }, + { + "epoch": 0.015, + "grad_norm": 0.09399061650037766, + "learning_rate": 4.3499999999999993e-05, + "loss": 0.6453, + "step": 30 + }, + { + "epoch": 0.0175, + "grad_norm": 0.0843738541007042, + "learning_rate": 5.1e-05, + "loss": 0.6276, + "step": 35 + }, + { + "epoch": 0.02, + "grad_norm": 0.08583351224660873, + "learning_rate": 5.85e-05, + "loss": 0.58, + "step": 40 + }, + { + "epoch": 0.0225, + "grad_norm": 0.09571370482444763, + "learning_rate": 6.599999999999999e-05, + "loss": 0.6355, + "step": 45 + }, + { + "epoch": 0.025, + "grad_norm": 0.1083935871720314, + "learning_rate": 7.35e-05, + "loss": 0.589, + "step": 50 + }, + { + "epoch": 0.0275, + "grad_norm": 0.10387319326400757, + "learning_rate": 8.1e-05, + "loss": 0.6061, + "step": 55 + }, + { + "epoch": 0.03, + "grad_norm": 0.11083361506462097, + "learning_rate": 8.849999999999998e-05, + "loss": 0.572, + "step": 60 + }, + { + "epoch": 0.0325, + "grad_norm": 0.12665686011314392, + "learning_rate": 9.599999999999999e-05, + "loss": 0.5442, + "step": 65 + }, + { + "epoch": 0.035, + "grad_norm": 0.1308053582906723, + "learning_rate": 0.00010349999999999998, + "loss": 0.6524, + "step": 70 + }, + { + "epoch": 0.0375, + "grad_norm": 0.13535510003566742, + "learning_rate": 0.00011099999999999999, + "loss": 0.6404, + "step": 75 + }, + { + "epoch": 0.04, + "grad_norm": 0.12833671271800995, + "learning_rate": 0.0001185, + "loss": 0.5717, + "step": 80 + }, + { + "epoch": 0.0425, + "grad_norm": 0.11962099373340607, + "learning_rate": 0.00012599999999999997, + "loss": 0.6098, + "step": 85 + }, + { + "epoch": 0.045, + "grad_norm": 0.13898271322250366, + "learning_rate": 0.0001335, + "loss": 0.6099, + "step": 90 + }, + { + "epoch": 0.0475, + "grad_norm": 0.14486610889434814, + "learning_rate": 0.00014099999999999998, + "loss": 0.5744, + "step": 95 + }, + { + "epoch": 0.05, + 
"grad_norm": 0.1432138830423355, + "learning_rate": 0.00014849999999999998, + "loss": 0.5659, + "step": 100 + }, + { + "epoch": 0.0525, + "grad_norm": 0.13487878441810608, + "learning_rate": 0.000156, + "loss": 0.5622, + "step": 105 + }, + { + "epoch": 0.055, + "grad_norm": 0.12495309859514236, + "learning_rate": 0.0001635, + "loss": 0.5951, + "step": 110 + }, + { + "epoch": 0.0575, + "grad_norm": 0.13011734187602997, + "learning_rate": 0.00017099999999999998, + "loss": 0.6249, + "step": 115 + }, + { + "epoch": 0.06, + "grad_norm": 0.13987745344638824, + "learning_rate": 0.00017849999999999997, + "loss": 0.559, + "step": 120 + }, + { + "epoch": 0.0625, + "grad_norm": 0.13373605906963348, + "learning_rate": 0.000186, + "loss": 0.5475, + "step": 125 + }, + { + "epoch": 0.065, + "grad_norm": 0.12433867901563644, + "learning_rate": 0.0001935, + "loss": 0.5274, + "step": 130 + }, + { + "epoch": 0.0675, + "grad_norm": 0.11097615957260132, + "learning_rate": 0.000201, + "loss": 0.678, + "step": 135 + }, + { + "epoch": 0.07, + "grad_norm": 0.1155027225613594, + "learning_rate": 0.00020849999999999997, + "loss": 0.5611, + "step": 140 + }, + { + "epoch": 0.0725, + "grad_norm": 0.11431068181991577, + "learning_rate": 0.00021599999999999996, + "loss": 0.6054, + "step": 145 + }, + { + "epoch": 0.075, + "grad_norm": 0.09796140342950821, + "learning_rate": 0.00022349999999999998, + "loss": 0.5472, + "step": 150 + }, + { + "epoch": 0.0775, + "grad_norm": 0.09489257633686066, + "learning_rate": 0.00023099999999999998, + "loss": 0.4636, + "step": 155 + }, + { + "epoch": 0.08, + "grad_norm": 0.10787788033485413, + "learning_rate": 0.0002385, + "loss": 0.6164, + "step": 160 + }, + { + "epoch": 0.0825, + "grad_norm": 0.10261733084917068, + "learning_rate": 0.00024599999999999996, + "loss": 0.5408, + "step": 165 + }, + { + "epoch": 0.085, + "grad_norm": 0.11870352178812027, + "learning_rate": 0.0002535, + "loss": 0.5268, + "step": 170 + }, + { + "epoch": 0.0875, + "grad_norm": 
0.11910569667816162, + "learning_rate": 0.000261, + "loss": 0.5461, + "step": 175 + }, + { + "epoch": 0.09, + "grad_norm": 0.10083702206611633, + "learning_rate": 0.00026849999999999997, + "loss": 0.4794, + "step": 180 + }, + { + "epoch": 0.0925, + "grad_norm": 0.10453511029481888, + "learning_rate": 0.000276, + "loss": 0.5539, + "step": 185 + }, + { + "epoch": 0.095, + "grad_norm": 0.101403146982193, + "learning_rate": 0.00028349999999999995, + "loss": 0.5346, + "step": 190 + }, + { + "epoch": 0.0975, + "grad_norm": 0.10724789649248123, + "learning_rate": 0.00029099999999999997, + "loss": 0.6026, + "step": 195 + }, + { + "epoch": 0.1, + "grad_norm": 0.1140277311205864, + "learning_rate": 0.0002985, + "loss": 0.5193, + "step": 200 + }, + { + "epoch": 0.1025, + "grad_norm": 0.09706108272075653, + "learning_rate": 0.0002999963446058092, + "loss": 0.54, + "step": 205 + }, + { + "epoch": 0.105, + "grad_norm": 0.10003062337636948, + "learning_rate": 0.0002999814948722491, + "loss": 0.5365, + "step": 210 + }, + { + "epoch": 0.1075, + "grad_norm": 0.1078687533736229, + "learning_rate": 0.00029995522346717746, + "loss": 0.5889, + "step": 215 + }, + { + "epoch": 0.11, + "grad_norm": 0.10538115352392197, + "learning_rate": 0.0002999175323912636, + "loss": 0.5611, + "step": 220 + }, + { + "epoch": 0.1125, + "grad_norm": 0.1020808294415474, + "learning_rate": 0.00029986842451482874, + "loss": 0.6103, + "step": 225 + }, + { + "epoch": 0.115, + "grad_norm": 0.09635835886001587, + "learning_rate": 0.0002998079035776279, + "loss": 0.5229, + "step": 230 + }, + { + "epoch": 0.1175, + "grad_norm": 0.10287190228700638, + "learning_rate": 0.0002997359741885648, + "loss": 0.5312, + "step": 235 + }, + { + "epoch": 0.12, + "grad_norm": 0.09160075336694717, + "learning_rate": 0.0002996526418253408, + "loss": 0.5673, + "step": 240 + }, + { + "epoch": 0.1225, + "grad_norm": 0.08691006153821945, + "learning_rate": 0.000299557912834038, + "loss": 0.5326, + "step": 245 + }, + { + "epoch": 
0.125, + "grad_norm": 0.10096988826990128, + "learning_rate": 0.00029945179442863594, + "loss": 0.6004, + "step": 250 + }, + { + "epoch": 0.1275, + "grad_norm": 0.09594204276800156, + "learning_rate": 0.000299334294690462, + "loss": 0.5516, + "step": 255 + }, + { + "epoch": 0.13, + "grad_norm": 0.10281919687986374, + "learning_rate": 0.00029920542256757607, + "loss": 0.5515, + "step": 260 + }, + { + "epoch": 0.1325, + "grad_norm": 0.08547840267419815, + "learning_rate": 0.00029906518787408944, + "loss": 0.5243, + "step": 265 + }, + { + "epoch": 0.135, + "grad_norm": 0.10161560773849487, + "learning_rate": 0.0002989136012894168, + "loss": 0.5096, + "step": 270 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09101904183626175, + "learning_rate": 0.0002987506743574635, + "loss": 0.553, + "step": 275 + }, + { + "epoch": 0.14, + "grad_norm": 0.09769442677497864, + "learning_rate": 0.0002985764194857463, + "loss": 0.4953, + "step": 280 + }, + { + "epoch": 0.1425, + "grad_norm": 0.10991579294204712, + "learning_rate": 0.00029839084994444826, + "loss": 0.5152, + "step": 285 + }, + { + "epoch": 0.145, + "grad_norm": 0.09450916200876236, + "learning_rate": 0.00029819397986540836, + "loss": 0.5397, + "step": 290 + }, + { + "epoch": 0.1475, + "grad_norm": 0.10876069217920303, + "learning_rate": 0.0002979858242410454, + "loss": 0.4858, + "step": 295 + }, + { + "epoch": 0.15, + "grad_norm": 0.097995825111866, + "learning_rate": 0.00029776639892321606, + "loss": 0.5566, + "step": 300 + }, + { + "epoch": 0.1525, + "grad_norm": 0.1145048514008522, + "learning_rate": 0.0002975357206220079, + "loss": 0.4531, + "step": 305 + }, + { + "epoch": 0.155, + "grad_norm": 0.10271880775690079, + "learning_rate": 0.00029729380690446654, + "loss": 0.5199, + "step": 310 + }, + { + "epoch": 0.1575, + "grad_norm": 0.11095371842384338, + "learning_rate": 0.0002970406761932583, + "loss": 0.5416, + "step": 315 + }, + { + "epoch": 0.16, + "grad_norm": 0.09949438273906708, + "learning_rate": 
0.00029677634776526673, + "loss": 0.4841, + "step": 320 + }, + { + "epoch": 0.1625, + "grad_norm": 0.1163724958896637, + "learning_rate": 0.00029650084175012517, + "loss": 0.4913, + "step": 325 + }, + { + "epoch": 0.165, + "grad_norm": 0.10726840049028397, + "learning_rate": 0.00029621417912868323, + "loss": 0.5203, + "step": 330 + }, + { + "epoch": 0.1675, + "grad_norm": 0.09609931707382202, + "learning_rate": 0.00029591638173140947, + "loss": 0.5607, + "step": 335 + }, + { + "epoch": 0.17, + "grad_norm": 0.10824442654848099, + "learning_rate": 0.0002956074722367286, + "loss": 0.6004, + "step": 340 + }, + { + "epoch": 0.1725, + "grad_norm": 0.10465679317712784, + "learning_rate": 0.00029528747416929463, + "loss": 0.5216, + "step": 345 + }, + { + "epoch": 0.175, + "grad_norm": 0.10518354922533035, + "learning_rate": 0.0002949564118981994, + "loss": 0.499, + "step": 350 + }, + { + "epoch": 0.1775, + "grad_norm": 0.0955279991030693, + "learning_rate": 0.0002946143106351165, + "loss": 0.5607, + "step": 355 + }, + { + "epoch": 0.18, + "grad_norm": 0.11159654706716537, + "learning_rate": 0.0002942611964323817, + "loss": 0.5204, + "step": 360 + }, + { + "epoch": 0.1825, + "grad_norm": 0.09571187198162079, + "learning_rate": 0.0002938970961810086, + "loss": 0.6113, + "step": 365 + }, + { + "epoch": 0.185, + "grad_norm": 0.11854679882526398, + "learning_rate": 0.0002935220376086411, + "loss": 0.5639, + "step": 370 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1050512045621872, + "learning_rate": 0.0002931360492774415, + "loss": 0.548, + "step": 375 + }, + { + "epoch": 0.19, + "grad_norm": 0.1053968220949173, + "learning_rate": 0.0002927391605819157, + "loss": 0.5507, + "step": 380 + }, + { + "epoch": 0.1925, + "grad_norm": 0.10567320138216019, + "learning_rate": 0.00029233140174667445, + "loss": 0.5312, + "step": 385 + }, + { + "epoch": 0.195, + "grad_norm": 0.11914283782243729, + "learning_rate": 0.0002919128038241318, + "loss": 0.5961, + "step": 390 + }, + { + "epoch": 
0.1975, + "grad_norm": 0.09915795922279358, + "learning_rate": 0.0002914833986921401, + "loss": 0.5086, + "step": 395 + }, + { + "epoch": 0.2, + "grad_norm": 0.10796502232551575, + "learning_rate": 0.0002910432190515628, + "loss": 0.5585, + "step": 400 + }, + { + "epoch": 0.2025, + "grad_norm": 0.10748997330665588, + "learning_rate": 0.00029059229842378373, + "loss": 0.5466, + "step": 405 + }, + { + "epoch": 0.205, + "grad_norm": 0.10696308314800262, + "learning_rate": 0.0002901306711481544, + "loss": 0.5513, + "step": 410 + }, + { + "epoch": 0.2075, + "grad_norm": 0.10418657958507538, + "learning_rate": 0.0002896583723793792, + "loss": 0.5391, + "step": 415 + }, + { + "epoch": 0.21, + "grad_norm": 0.16421550512313843, + "learning_rate": 0.00028917543808483796, + "loss": 0.4699, + "step": 420 + }, + { + "epoch": 0.2125, + "grad_norm": 0.12929962575435638, + "learning_rate": 0.00028868190504184696, + "loss": 0.4984, + "step": 425 + }, + { + "epoch": 0.215, + "grad_norm": 0.10469454526901245, + "learning_rate": 0.00028817781083485816, + "loss": 0.5119, + "step": 430 + }, + { + "epoch": 0.2175, + "grad_norm": 0.0964970663189888, + "learning_rate": 0.00028766319385259713, + "loss": 0.5167, + "step": 435 + }, + { + "epoch": 0.22, + "grad_norm": 0.12395574152469635, + "learning_rate": 0.00028713809328513953, + "loss": 0.5692, + "step": 440 + }, + { + "epoch": 0.2225, + "grad_norm": 0.10189738124608994, + "learning_rate": 0.0002866025491209265, + "loss": 0.4628, + "step": 445 + }, + { + "epoch": 0.225, + "grad_norm": 0.10433454066514969, + "learning_rate": 0.0002860566021437197, + "loss": 0.4869, + "step": 450 + }, + { + "epoch": 0.2275, + "grad_norm": 0.13003456592559814, + "learning_rate": 0.0002855002939294951, + "loss": 0.5291, + "step": 455 + }, + { + "epoch": 0.23, + "grad_norm": 0.11692202836275101, + "learning_rate": 0.000284933666843277, + "loss": 0.5229, + "step": 460 + }, + { + "epoch": 0.2325, + "grad_norm": 0.10757846385240555, + "learning_rate": 
0.0002843567640359119, + "loss": 0.435, + "step": 465 + }, + { + "epoch": 0.235, + "grad_norm": 0.10775501281023026, + "learning_rate": 0.00028376962944078206, + "loss": 0.4418, + "step": 470 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11543692648410797, + "learning_rate": 0.00028317230777046015, + "loss": 0.4204, + "step": 475 + }, + { + "epoch": 0.24, + "grad_norm": 0.10946698486804962, + "learning_rate": 0.00028256484451330403, + "loss": 0.49, + "step": 480 + }, + { + "epoch": 0.2425, + "grad_norm": 0.11528221517801285, + "learning_rate": 0.00028194728592999247, + "loss": 0.4752, + "step": 485 + }, + { + "epoch": 0.245, + "grad_norm": 0.10474205762147903, + "learning_rate": 0.0002813196790500027, + "loss": 0.4847, + "step": 490 + }, + { + "epoch": 0.2475, + "grad_norm": 0.10768820345401764, + "learning_rate": 0.00028068207166802837, + "loss": 0.4664, + "step": 495 + }, + { + "epoch": 0.25, + "grad_norm": 0.12158560007810593, + "learning_rate": 0.00028003451234034037, + "loss": 0.4741, + "step": 500 + }, + { + "epoch": 0.2525, + "grad_norm": 0.11635497957468033, + "learning_rate": 0.0002793770503810886, + "loss": 0.4969, + "step": 505 + }, + { + "epoch": 0.255, + "grad_norm": 0.12205849587917328, + "learning_rate": 0.00027870973585854665, + "loss": 0.4798, + "step": 510 + }, + { + "epoch": 0.2575, + "grad_norm": 0.10270871222019196, + "learning_rate": 0.00027803261959129905, + "loss": 0.3888, + "step": 515 + }, + { + "epoch": 0.26, + "grad_norm": 0.11313367635011673, + "learning_rate": 0.0002773457531443712, + "loss": 0.4759, + "step": 520 + }, + { + "epoch": 0.2625, + "grad_norm": 0.12905193865299225, + "learning_rate": 0.00027664918882530225, + "loss": 0.4442, + "step": 525 + }, + { + "epoch": 0.265, + "grad_norm": 0.11690939962863922, + "learning_rate": 0.00027594297968016197, + "loss": 0.5535, + "step": 530 + }, + { + "epoch": 0.2675, + "grad_norm": 0.10021405667066574, + "learning_rate": 0.00027522717948951094, + "loss": 0.4717, + "step": 535 + }, + { + 
"epoch": 0.27, + "grad_norm": 0.10104178637266159, + "learning_rate": 0.0002745018427643051, + "loss": 0.4906, + "step": 540 + }, + { + "epoch": 0.2725, + "grad_norm": 0.12113891541957855, + "learning_rate": 0.00027376702474174425, + "loss": 0.5674, + "step": 545 + }, + { + "epoch": 0.275, + "grad_norm": 0.11330476403236389, + "learning_rate": 0.0002730227813810658, + "loss": 0.5184, + "step": 550 + }, + { + "epoch": 0.2775, + "grad_norm": 0.1025850847363472, + "learning_rate": 0.0002722691693592831, + "loss": 0.4395, + "step": 555 + }, + { + "epoch": 0.28, + "grad_norm": 0.11591499298810959, + "learning_rate": 0.0002715062460668694, + "loss": 0.5003, + "step": 560 + }, + { + "epoch": 0.2825, + "grad_norm": 0.11281153559684753, + "learning_rate": 0.0002707340696033871, + "loss": 0.4672, + "step": 565 + }, + { + "epoch": 0.285, + "grad_norm": 0.1123538464307785, + "learning_rate": 0.00026995269877306356, + "loss": 0.513, + "step": 570 + }, + { + "epoch": 0.2875, + "grad_norm": 0.10776390135288239, + "learning_rate": 0.0002691621930803127, + "loss": 0.4572, + "step": 575 + }, + { + "epoch": 0.29, + "grad_norm": 0.10008667409420013, + "learning_rate": 0.0002683626127252036, + "loss": 0.4618, + "step": 580 + }, + { + "epoch": 0.2925, + "grad_norm": 0.13961340487003326, + "learning_rate": 0.00026755401859887595, + "loss": 0.4819, + "step": 585 + }, + { + "epoch": 0.295, + "grad_norm": 0.1476685106754303, + "learning_rate": 0.00026673647227890316, + "loss": 0.4964, + "step": 590 + }, + { + "epoch": 0.2975, + "grad_norm": 0.09795507788658142, + "learning_rate": 0.00026591003602460263, + "loss": 0.4796, + "step": 595 + }, + { + "epoch": 0.3, + "grad_norm": 0.10903532058000565, + "learning_rate": 0.00026507477277229496, + "loss": 0.4775, + "step": 600 + }, + { + "epoch": 0.3025, + "grad_norm": 0.10258448123931885, + "learning_rate": 0.0002642307461305105, + "loss": 0.4519, + "step": 605 + }, + { + "epoch": 0.305, + "grad_norm": 0.11204435676336288, + "learning_rate": 
0.0002633780203751459, + "loss": 0.4451, + "step": 610 + }, + { + "epoch": 0.3075, + "grad_norm": 0.10147629678249359, + "learning_rate": 0.0002625166604445689, + "loss": 0.4256, + "step": 615 + }, + { + "epoch": 0.31, + "grad_norm": 0.10481107234954834, + "learning_rate": 0.00026164673193467306, + "loss": 0.4381, + "step": 620 + }, + { + "epoch": 0.3125, + "grad_norm": 0.10856641829013824, + "learning_rate": 0.00026076830109388255, + "loss": 0.4958, + "step": 625 + }, + { + "epoch": 0.315, + "grad_norm": 0.09918677806854248, + "learning_rate": 0.0002598814348181068, + "loss": 0.4335, + "step": 630 + }, + { + "epoch": 0.3175, + "grad_norm": 0.10417389869689941, + "learning_rate": 0.00025898620064564637, + "loss": 0.4603, + "step": 635 + }, + { + "epoch": 0.32, + "grad_norm": 0.0903329998254776, + "learning_rate": 0.00025808266675204954, + "loss": 0.3932, + "step": 640 + }, + { + "epoch": 0.3225, + "grad_norm": 0.11511855572462082, + "learning_rate": 0.0002571709019449205, + "loss": 0.4169, + "step": 645 + }, + { + "epoch": 0.325, + "grad_norm": 0.11355557292699814, + "learning_rate": 0.0002562509756586793, + "loss": 0.4455, + "step": 650 + }, + { + "epoch": 0.3275, + "grad_norm": 0.1271187961101532, + "learning_rate": 0.00025532295794927437, + "loss": 0.4902, + "step": 655 + }, + { + "epoch": 0.33, + "grad_norm": 0.11936645954847336, + "learning_rate": 0.0002543869194888471, + "loss": 0.4843, + "step": 660 + }, + { + "epoch": 0.3325, + "grad_norm": 0.11935465037822723, + "learning_rate": 0.00025344293156035044, + "loss": 0.4402, + "step": 665 + }, + { + "epoch": 0.335, + "grad_norm": 0.13073407113552094, + "learning_rate": 0.00025249106605211986, + "loss": 0.467, + "step": 670 + }, + { + "epoch": 0.3375, + "grad_norm": 0.10340435802936554, + "learning_rate": 0.0002515313954523991, + "loss": 0.4827, + "step": 675 + }, + { + "epoch": 0.34, + "grad_norm": 0.11634550243616104, + "learning_rate": 0.00025056399284381983, + "loss": 0.466, + "step": 680 + }, + { + "epoch": 
0.3425, + "grad_norm": 0.10582319647073746, + "learning_rate": 0.0002495889318978362, + "loss": 0.4751, + "step": 685 + }, + { + "epoch": 0.345, + "grad_norm": 0.16781780123710632, + "learning_rate": 0.00024860628686911436, + "loss": 0.4717, + "step": 690 + }, + { + "epoch": 0.3475, + "grad_norm": 0.11522196233272552, + "learning_rate": 0.0002476161325898776, + "loss": 0.4687, + "step": 695 + }, + { + "epoch": 0.35, + "grad_norm": 0.11830449104309082, + "learning_rate": 0.000246618544464208, + "loss": 0.436, + "step": 700 + }, + { + "epoch": 0.3525, + "grad_norm": 0.17485427856445312, + "learning_rate": 0.0002456135984623034, + "loss": 0.4284, + "step": 705 + }, + { + "epoch": 0.355, + "grad_norm": 0.12288108468055725, + "learning_rate": 0.00024460137111469296, + "loss": 0.4261, + "step": 710 + }, + { + "epoch": 0.3575, + "grad_norm": 0.11587081104516983, + "learning_rate": 0.0002435819395064079, + "loss": 0.4493, + "step": 715 + }, + { + "epoch": 0.36, + "grad_norm": 0.10690271109342575, + "learning_rate": 0.0002425553812711123, + "loss": 0.4648, + "step": 720 + }, + { + "epoch": 0.3625, + "grad_norm": 0.10404397547245026, + "learning_rate": 0.00024152177458519014, + "loss": 0.4634, + "step": 725 + }, + { + "epoch": 0.365, + "grad_norm": 0.11986954510211945, + "learning_rate": 0.00024048119816179236, + "loss": 0.4525, + "step": 730 + }, + { + "epoch": 0.3675, + "grad_norm": 0.10243026167154312, + "learning_rate": 0.00023943373124484234, + "loss": 0.4572, + "step": 735 + }, + { + "epoch": 0.37, + "grad_norm": 0.10386748611927032, + "learning_rate": 0.00023837945360300129, + "loss": 0.3884, + "step": 740 + }, + { + "epoch": 0.3725, + "grad_norm": 0.11165735125541687, + "learning_rate": 0.0002373184455235934, + "loss": 0.4902, + "step": 745 + }, + { + "epoch": 0.375, + "grad_norm": 0.09951601922512054, + "learning_rate": 0.00023625078780649178, + "loss": 0.4541, + "step": 750 + }, + { + "epoch": 0.3775, + "grad_norm": 0.10347504913806915, + "learning_rate": 
0.00023517656175796518, + "loss": 0.3871, + "step": 755 + }, + { + "epoch": 0.38, + "grad_norm": 0.10478132963180542, + "learning_rate": 0.00023409584918448627, + "loss": 0.4329, + "step": 760 + }, + { + "epoch": 0.3825, + "grad_norm": 0.1198212131857872, + "learning_rate": 0.00023300873238650159, + "loss": 0.425, + "step": 765 + }, + { + "epoch": 0.385, + "grad_norm": 0.1103711724281311, + "learning_rate": 0.00023191529415216434, + "loss": 0.4274, + "step": 770 + }, + { + "epoch": 0.3875, + "grad_norm": 0.09940385073423386, + "learning_rate": 0.00023081561775102944, + "loss": 0.4368, + "step": 775 + }, + { + "epoch": 0.39, + "grad_norm": 0.11599268019199371, + "learning_rate": 0.00022970978692771242, + "loss": 0.4386, + "step": 780 + }, + { + "epoch": 0.3925, + "grad_norm": 0.10101296752691269, + "learning_rate": 0.00022859788589551188, + "loss": 0.4696, + "step": 785 + }, + { + "epoch": 0.395, + "grad_norm": 0.10112808644771576, + "learning_rate": 0.00022747999932999624, + "loss": 0.4066, + "step": 790 + }, + { + "epoch": 0.3975, + "grad_norm": 0.09595459699630737, + "learning_rate": 0.00022635621236255567, + "loss": 0.4837, + "step": 795 + }, + { + "epoch": 0.4, + "grad_norm": 0.10761380940675735, + "learning_rate": 0.00022522661057391857, + "loss": 0.5446, + "step": 800 + }, + { + "epoch": 0.4025, + "grad_norm": 0.11919954419136047, + "learning_rate": 0.00022409127998763463, + "loss": 0.5027, + "step": 805 + }, + { + "epoch": 0.405, + "grad_norm": 0.10851597785949707, + "learning_rate": 0.00022295030706352356, + "loss": 0.4481, + "step": 810 + }, + { + "epoch": 0.4075, + "grad_norm": 0.10030311346054077, + "learning_rate": 0.00022180377869109104, + "loss": 0.4709, + "step": 815 + }, + { + "epoch": 0.41, + "grad_norm": 0.111280657351017, + "learning_rate": 0.00022065178218291147, + "loss": 0.4423, + "step": 820 + }, + { + "epoch": 0.4125, + "grad_norm": 0.11253602802753448, + "learning_rate": 0.00021949440526797926, + "loss": 0.4136, + "step": 825 + }, + { + 
"epoch": 0.415, + "grad_norm": 0.10805424302816391, + "learning_rate": 0.00021833173608502732, + "loss": 0.4656, + "step": 830 + }, + { + "epoch": 0.4175, + "grad_norm": 0.10983198881149292, + "learning_rate": 0.00021716386317581542, + "loss": 0.3687, + "step": 835 + }, + { + "epoch": 0.42, + "grad_norm": 0.10653118044137955, + "learning_rate": 0.00021599087547838727, + "loss": 0.4654, + "step": 840 + }, + { + "epoch": 0.4225, + "grad_norm": 0.10856354981660843, + "learning_rate": 0.00021481286232029735, + "loss": 0.4298, + "step": 845 + }, + { + "epoch": 0.425, + "grad_norm": 0.11233706772327423, + "learning_rate": 0.0002136299134118085, + "loss": 0.4484, + "step": 850 + }, + { + "epoch": 0.4275, + "grad_norm": 0.1085442528128624, + "learning_rate": 0.00021244211883906017, + "loss": 0.4776, + "step": 855 + }, + { + "epoch": 0.43, + "grad_norm": 0.12297824025154114, + "learning_rate": 0.0002112495690572077, + "loss": 0.4029, + "step": 860 + }, + { + "epoch": 0.4325, + "grad_norm": 0.10838114470243454, + "learning_rate": 0.00021005235488353428, + "loss": 0.4848, + "step": 865 + }, + { + "epoch": 0.435, + "grad_norm": 0.10273341834545135, + "learning_rate": 0.0002088505674905342, + "loss": 0.3989, + "step": 870 + }, + { + "epoch": 0.4375, + "grad_norm": 0.11189126968383789, + "learning_rate": 0.0002076442983989705, + "loss": 0.438, + "step": 875 + }, + { + "epoch": 0.44, + "grad_norm": 0.11592905968427658, + "learning_rate": 0.0002064336394709048, + "loss": 0.4786, + "step": 880 + }, + { + "epoch": 0.4425, + "grad_norm": 0.11230389773845673, + "learning_rate": 0.0002052186829027017, + "loss": 0.3999, + "step": 885 + }, + { + "epoch": 0.445, + "grad_norm": 0.12455113977193832, + "learning_rate": 0.00020399952121800767, + "loss": 0.4856, + "step": 890 + }, + { + "epoch": 0.4475, + "grad_norm": 0.1001812294125557, + "learning_rate": 0.00020277624726070526, + "loss": 0.4689, + "step": 895 + }, + { + "epoch": 0.45, + "grad_norm": 0.11319112777709961, + "learning_rate": 
0.00020154895418784242, + "loss": 0.3998, + "step": 900 + }, + { + "epoch": 0.4525, + "grad_norm": 0.11322236061096191, + "learning_rate": 0.00020031773546253824, + "loss": 0.4321, + "step": 905 + }, + { + "epoch": 0.455, + "grad_norm": 0.12924689054489136, + "learning_rate": 0.00019908268484686558, + "loss": 0.4208, + "step": 910 + }, + { + "epoch": 0.4575, + "grad_norm": 0.11435618251562119, + "learning_rate": 0.00019784389639471048, + "loss": 0.4682, + "step": 915 + }, + { + "epoch": 0.46, + "grad_norm": 0.10801081359386444, + "learning_rate": 0.00019660146444460975, + "loss": 0.428, + "step": 920 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10906939953565598, + "learning_rate": 0.0001953554836125667, + "loss": 0.4455, + "step": 925 + }, + { + "epoch": 0.465, + "grad_norm": 0.10790123790502548, + "learning_rate": 0.00019410604878484556, + "loss": 0.4544, + "step": 930 + }, + { + "epoch": 0.4675, + "grad_norm": 0.10536376386880875, + "learning_rate": 0.000192853255110746, + "loss": 0.376, + "step": 935 + }, + { + "epoch": 0.47, + "grad_norm": 0.11744682490825653, + "learning_rate": 0.00019159719799535668, + "loss": 0.3887, + "step": 940 + }, + { + "epoch": 0.4725, + "grad_norm": 0.12954068183898926, + "learning_rate": 0.00019033797309228983, + "loss": 0.4075, + "step": 945 + }, + { + "epoch": 0.475, + "grad_norm": 0.1401606798171997, + "learning_rate": 0.00018907567629639725, + "loss": 0.4454, + "step": 950 + }, + { + "epoch": 0.4775, + "grad_norm": 0.12059322744607925, + "learning_rate": 0.00018781040373646706, + "loss": 0.4339, + "step": 955 + }, + { + "epoch": 0.48, + "grad_norm": 0.11798987537622452, + "learning_rate": 0.00018654225176790336, + "loss": 0.4405, + "step": 960 + }, + { + "epoch": 0.4825, + "grad_norm": 0.11344211548566818, + "learning_rate": 0.00018527131696538846, + "loss": 0.4124, + "step": 965 + }, + { + "epoch": 0.485, + "grad_norm": 0.10373330116271973, + "learning_rate": 0.00018399769611552824, + "loss": 0.4329, + "step": 970 + }, + { + 
"epoch": 0.4875, + "grad_norm": 0.12053704261779785, + "learning_rate": 0.0001827214862094814, + "loss": 0.4944, + "step": 975 + }, + { + "epoch": 0.49, + "grad_norm": 0.141033336520195, + "learning_rate": 0.00018144278443557328, + "loss": 0.4569, + "step": 980 + }, + { + "epoch": 0.4925, + "grad_norm": 0.10922867804765701, + "learning_rate": 0.0001801616881718947, + "loss": 0.3879, + "step": 985 + }, + { + "epoch": 0.495, + "grad_norm": 0.09843657910823822, + "learning_rate": 0.00017887829497888612, + "loss": 0.4106, + "step": 990 + }, + { + "epoch": 0.4975, + "grad_norm": 0.12131062150001526, + "learning_rate": 0.000177592702591908, + "loss": 0.4023, + "step": 995 + }, + { + "epoch": 0.5, + "grad_norm": 0.11343283206224442, + "learning_rate": 0.00017630500891379806, + "loss": 0.4824, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.314789078859776e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codellama-hugcoder/checkpoint-1000/training_args.bin b/codellama-hugcoder/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b +size 5304 diff --git a/codellama-hugcoder/checkpoint-1500/README.md b/codellama-hugcoder/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354 --- /dev/null +++ 
b/codellama-hugcoder/checkpoint-1500/README.md @@ -0,0 +1,202 @@ +--- +base_model: codellama/CodeLlama-7b-Instruct-hf +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.2.dev0 \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-1500/adapter_config.json b/codellama-hugcoder/checkpoint-1500/adapter_config.json new file 
mode 100644 index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1500/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "k_proj", + "q_proj", + "v_proj", + "gate_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-1500/adapter_model.safetensors b/codellama-hugcoder/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b4a6a146f60aa497ffcbc2b5247f443943944031 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954883169196fec3dbbf2581acd2ff6690fa789729045bb04113f1bb36637c46 +size 319876032 diff --git a/codellama-hugcoder/checkpoint-1500/optimizer.pt b/codellama-hugcoder/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b62651c4a46378842c8470f5e4e0164bd1e32669 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135e0fda5af04719269dc4cca8199c95f610932728fc80b6e63f3d656098bd57 +size 640009682 
diff --git a/codellama-hugcoder/checkpoint-1500/rng_state.pth b/codellama-hugcoder/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f06fbb84d8d04c17347fd349f63eefcd95addf6d --- /dev/null +++ b/codellama-hugcoder/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda3c0b12e2631264746b16f7dd8a85fd763004a3c1d20e136ad6fae01987d26 +size 14244 diff --git a/codellama-hugcoder/checkpoint-1500/scheduler.pt b/codellama-hugcoder/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8076c3f23ed44081a0388e235c4ca5039d23d13 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:046c4144f3d3e450ad1c1129a3ed6e680f6f65f10c488eeb2fd00b8cd376efa0 +size 1064 diff --git a/codellama-hugcoder/checkpoint-1500/trainer_state.json b/codellama-hugcoder/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea00c923cf539fa4f4d768dc48177acff1149bf3 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1500/trainer_state.json @@ -0,0 +1,2134 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.75, + "eval_steps": 100.0, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025, + "grad_norm": 0.09379793703556061, + "learning_rate": 5.999999999999999e-06, + "loss": 0.6799, + "step": 5 + }, + { + "epoch": 0.005, + "grad_norm": 0.1399833709001541, + "learning_rate": 1.3499999999999998e-05, + "loss": 0.6954, + "step": 10 + }, + { + "epoch": 0.0075, + "grad_norm": 0.08632303029298782, + "learning_rate": 2.1e-05, + "loss": 0.6921, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.10006701201200485, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.69, + "step": 20 + 
}, + { + "epoch": 0.0125, + "grad_norm": 0.07633858919143677, + "learning_rate": 3.5999999999999994e-05, + "loss": 0.6722, + "step": 25 + }, + { + "epoch": 0.015, + "grad_norm": 0.09399061650037766, + "learning_rate": 4.3499999999999993e-05, + "loss": 0.6453, + "step": 30 + }, + { + "epoch": 0.0175, + "grad_norm": 0.0843738541007042, + "learning_rate": 5.1e-05, + "loss": 0.6276, + "step": 35 + }, + { + "epoch": 0.02, + "grad_norm": 0.08583351224660873, + "learning_rate": 5.85e-05, + "loss": 0.58, + "step": 40 + }, + { + "epoch": 0.0225, + "grad_norm": 0.09571370482444763, + "learning_rate": 6.599999999999999e-05, + "loss": 0.6355, + "step": 45 + }, + { + "epoch": 0.025, + "grad_norm": 0.1083935871720314, + "learning_rate": 7.35e-05, + "loss": 0.589, + "step": 50 + }, + { + "epoch": 0.0275, + "grad_norm": 0.10387319326400757, + "learning_rate": 8.1e-05, + "loss": 0.6061, + "step": 55 + }, + { + "epoch": 0.03, + "grad_norm": 0.11083361506462097, + "learning_rate": 8.849999999999998e-05, + "loss": 0.572, + "step": 60 + }, + { + "epoch": 0.0325, + "grad_norm": 0.12665686011314392, + "learning_rate": 9.599999999999999e-05, + "loss": 0.5442, + "step": 65 + }, + { + "epoch": 0.035, + "grad_norm": 0.1308053582906723, + "learning_rate": 0.00010349999999999998, + "loss": 0.6524, + "step": 70 + }, + { + "epoch": 0.0375, + "grad_norm": 0.13535510003566742, + "learning_rate": 0.00011099999999999999, + "loss": 0.6404, + "step": 75 + }, + { + "epoch": 0.04, + "grad_norm": 0.12833671271800995, + "learning_rate": 0.0001185, + "loss": 0.5717, + "step": 80 + }, + { + "epoch": 0.0425, + "grad_norm": 0.11962099373340607, + "learning_rate": 0.00012599999999999997, + "loss": 0.6098, + "step": 85 + }, + { + "epoch": 0.045, + "grad_norm": 0.13898271322250366, + "learning_rate": 0.0001335, + "loss": 0.6099, + "step": 90 + }, + { + "epoch": 0.0475, + "grad_norm": 0.14486610889434814, + "learning_rate": 0.00014099999999999998, + "loss": 0.5744, + "step": 95 + }, + { + "epoch": 0.05, + 
"grad_norm": 0.1432138830423355, + "learning_rate": 0.00014849999999999998, + "loss": 0.5659, + "step": 100 + }, + { + "epoch": 0.0525, + "grad_norm": 0.13487878441810608, + "learning_rate": 0.000156, + "loss": 0.5622, + "step": 105 + }, + { + "epoch": 0.055, + "grad_norm": 0.12495309859514236, + "learning_rate": 0.0001635, + "loss": 0.5951, + "step": 110 + }, + { + "epoch": 0.0575, + "grad_norm": 0.13011734187602997, + "learning_rate": 0.00017099999999999998, + "loss": 0.6249, + "step": 115 + }, + { + "epoch": 0.06, + "grad_norm": 0.13987745344638824, + "learning_rate": 0.00017849999999999997, + "loss": 0.559, + "step": 120 + }, + { + "epoch": 0.0625, + "grad_norm": 0.13373605906963348, + "learning_rate": 0.000186, + "loss": 0.5475, + "step": 125 + }, + { + "epoch": 0.065, + "grad_norm": 0.12433867901563644, + "learning_rate": 0.0001935, + "loss": 0.5274, + "step": 130 + }, + { + "epoch": 0.0675, + "grad_norm": 0.11097615957260132, + "learning_rate": 0.000201, + "loss": 0.678, + "step": 135 + }, + { + "epoch": 0.07, + "grad_norm": 0.1155027225613594, + "learning_rate": 0.00020849999999999997, + "loss": 0.5611, + "step": 140 + }, + { + "epoch": 0.0725, + "grad_norm": 0.11431068181991577, + "learning_rate": 0.00021599999999999996, + "loss": 0.6054, + "step": 145 + }, + { + "epoch": 0.075, + "grad_norm": 0.09796140342950821, + "learning_rate": 0.00022349999999999998, + "loss": 0.5472, + "step": 150 + }, + { + "epoch": 0.0775, + "grad_norm": 0.09489257633686066, + "learning_rate": 0.00023099999999999998, + "loss": 0.4636, + "step": 155 + }, + { + "epoch": 0.08, + "grad_norm": 0.10787788033485413, + "learning_rate": 0.0002385, + "loss": 0.6164, + "step": 160 + }, + { + "epoch": 0.0825, + "grad_norm": 0.10261733084917068, + "learning_rate": 0.00024599999999999996, + "loss": 0.5408, + "step": 165 + }, + { + "epoch": 0.085, + "grad_norm": 0.11870352178812027, + "learning_rate": 0.0002535, + "loss": 0.5268, + "step": 170 + }, + { + "epoch": 0.0875, + "grad_norm": 
0.11910569667816162, + "learning_rate": 0.000261, + "loss": 0.5461, + "step": 175 + }, + { + "epoch": 0.09, + "grad_norm": 0.10083702206611633, + "learning_rate": 0.00026849999999999997, + "loss": 0.4794, + "step": 180 + }, + { + "epoch": 0.0925, + "grad_norm": 0.10453511029481888, + "learning_rate": 0.000276, + "loss": 0.5539, + "step": 185 + }, + { + "epoch": 0.095, + "grad_norm": 0.101403146982193, + "learning_rate": 0.00028349999999999995, + "loss": 0.5346, + "step": 190 + }, + { + "epoch": 0.0975, + "grad_norm": 0.10724789649248123, + "learning_rate": 0.00029099999999999997, + "loss": 0.6026, + "step": 195 + }, + { + "epoch": 0.1, + "grad_norm": 0.1140277311205864, + "learning_rate": 0.0002985, + "loss": 0.5193, + "step": 200 + }, + { + "epoch": 0.1025, + "grad_norm": 0.09706108272075653, + "learning_rate": 0.0002999963446058092, + "loss": 0.54, + "step": 205 + }, + { + "epoch": 0.105, + "grad_norm": 0.10003062337636948, + "learning_rate": 0.0002999814948722491, + "loss": 0.5365, + "step": 210 + }, + { + "epoch": 0.1075, + "grad_norm": 0.1078687533736229, + "learning_rate": 0.00029995522346717746, + "loss": 0.5889, + "step": 215 + }, + { + "epoch": 0.11, + "grad_norm": 0.10538115352392197, + "learning_rate": 0.0002999175323912636, + "loss": 0.5611, + "step": 220 + }, + { + "epoch": 0.1125, + "grad_norm": 0.1020808294415474, + "learning_rate": 0.00029986842451482874, + "loss": 0.6103, + "step": 225 + }, + { + "epoch": 0.115, + "grad_norm": 0.09635835886001587, + "learning_rate": 0.0002998079035776279, + "loss": 0.5229, + "step": 230 + }, + { + "epoch": 0.1175, + "grad_norm": 0.10287190228700638, + "learning_rate": 0.0002997359741885648, + "loss": 0.5312, + "step": 235 + }, + { + "epoch": 0.12, + "grad_norm": 0.09160075336694717, + "learning_rate": 0.0002996526418253408, + "loss": 0.5673, + "step": 240 + }, + { + "epoch": 0.1225, + "grad_norm": 0.08691006153821945, + "learning_rate": 0.000299557912834038, + "loss": 0.5326, + "step": 245 + }, + { + "epoch": 
0.125, + "grad_norm": 0.10096988826990128, + "learning_rate": 0.00029945179442863594, + "loss": 0.6004, + "step": 250 + }, + { + "epoch": 0.1275, + "grad_norm": 0.09594204276800156, + "learning_rate": 0.000299334294690462, + "loss": 0.5516, + "step": 255 + }, + { + "epoch": 0.13, + "grad_norm": 0.10281919687986374, + "learning_rate": 0.00029920542256757607, + "loss": 0.5515, + "step": 260 + }, + { + "epoch": 0.1325, + "grad_norm": 0.08547840267419815, + "learning_rate": 0.00029906518787408944, + "loss": 0.5243, + "step": 265 + }, + { + "epoch": 0.135, + "grad_norm": 0.10161560773849487, + "learning_rate": 0.0002989136012894168, + "loss": 0.5096, + "step": 270 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09101904183626175, + "learning_rate": 0.0002987506743574635, + "loss": 0.553, + "step": 275 + }, + { + "epoch": 0.14, + "grad_norm": 0.09769442677497864, + "learning_rate": 0.0002985764194857463, + "loss": 0.4953, + "step": 280 + }, + { + "epoch": 0.1425, + "grad_norm": 0.10991579294204712, + "learning_rate": 0.00029839084994444826, + "loss": 0.5152, + "step": 285 + }, + { + "epoch": 0.145, + "grad_norm": 0.09450916200876236, + "learning_rate": 0.00029819397986540836, + "loss": 0.5397, + "step": 290 + }, + { + "epoch": 0.1475, + "grad_norm": 0.10876069217920303, + "learning_rate": 0.0002979858242410454, + "loss": 0.4858, + "step": 295 + }, + { + "epoch": 0.15, + "grad_norm": 0.097995825111866, + "learning_rate": 0.00029776639892321606, + "loss": 0.5566, + "step": 300 + }, + { + "epoch": 0.1525, + "grad_norm": 0.1145048514008522, + "learning_rate": 0.0002975357206220079, + "loss": 0.4531, + "step": 305 + }, + { + "epoch": 0.155, + "grad_norm": 0.10271880775690079, + "learning_rate": 0.00029729380690446654, + "loss": 0.5199, + "step": 310 + }, + { + "epoch": 0.1575, + "grad_norm": 0.11095371842384338, + "learning_rate": 0.0002970406761932583, + "loss": 0.5416, + "step": 315 + }, + { + "epoch": 0.16, + "grad_norm": 0.09949438273906708, + "learning_rate": 
0.00029677634776526673, + "loss": 0.4841, + "step": 320 + }, + { + "epoch": 0.1625, + "grad_norm": 0.1163724958896637, + "learning_rate": 0.00029650084175012517, + "loss": 0.4913, + "step": 325 + }, + { + "epoch": 0.165, + "grad_norm": 0.10726840049028397, + "learning_rate": 0.00029621417912868323, + "loss": 0.5203, + "step": 330 + }, + { + "epoch": 0.1675, + "grad_norm": 0.09609931707382202, + "learning_rate": 0.00029591638173140947, + "loss": 0.5607, + "step": 335 + }, + { + "epoch": 0.17, + "grad_norm": 0.10824442654848099, + "learning_rate": 0.0002956074722367286, + "loss": 0.6004, + "step": 340 + }, + { + "epoch": 0.1725, + "grad_norm": 0.10465679317712784, + "learning_rate": 0.00029528747416929463, + "loss": 0.5216, + "step": 345 + }, + { + "epoch": 0.175, + "grad_norm": 0.10518354922533035, + "learning_rate": 0.0002949564118981994, + "loss": 0.499, + "step": 350 + }, + { + "epoch": 0.1775, + "grad_norm": 0.0955279991030693, + "learning_rate": 0.0002946143106351165, + "loss": 0.5607, + "step": 355 + }, + { + "epoch": 0.18, + "grad_norm": 0.11159654706716537, + "learning_rate": 0.0002942611964323817, + "loss": 0.5204, + "step": 360 + }, + { + "epoch": 0.1825, + "grad_norm": 0.09571187198162079, + "learning_rate": 0.0002938970961810086, + "loss": 0.6113, + "step": 365 + }, + { + "epoch": 0.185, + "grad_norm": 0.11854679882526398, + "learning_rate": 0.0002935220376086411, + "loss": 0.5639, + "step": 370 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1050512045621872, + "learning_rate": 0.0002931360492774415, + "loss": 0.548, + "step": 375 + }, + { + "epoch": 0.19, + "grad_norm": 0.1053968220949173, + "learning_rate": 0.0002927391605819157, + "loss": 0.5507, + "step": 380 + }, + { + "epoch": 0.1925, + "grad_norm": 0.10567320138216019, + "learning_rate": 0.00029233140174667445, + "loss": 0.5312, + "step": 385 + }, + { + "epoch": 0.195, + "grad_norm": 0.11914283782243729, + "learning_rate": 0.0002919128038241318, + "loss": 0.5961, + "step": 390 + }, + { + "epoch": 
0.1975, + "grad_norm": 0.09915795922279358, + "learning_rate": 0.0002914833986921401, + "loss": 0.5086, + "step": 395 + }, + { + "epoch": 0.2, + "grad_norm": 0.10796502232551575, + "learning_rate": 0.0002910432190515628, + "loss": 0.5585, + "step": 400 + }, + { + "epoch": 0.2025, + "grad_norm": 0.10748997330665588, + "learning_rate": 0.00029059229842378373, + "loss": 0.5466, + "step": 405 + }, + { + "epoch": 0.205, + "grad_norm": 0.10696308314800262, + "learning_rate": 0.0002901306711481544, + "loss": 0.5513, + "step": 410 + }, + { + "epoch": 0.2075, + "grad_norm": 0.10418657958507538, + "learning_rate": 0.0002896583723793792, + "loss": 0.5391, + "step": 415 + }, + { + "epoch": 0.21, + "grad_norm": 0.16421550512313843, + "learning_rate": 0.00028917543808483796, + "loss": 0.4699, + "step": 420 + }, + { + "epoch": 0.2125, + "grad_norm": 0.12929962575435638, + "learning_rate": 0.00028868190504184696, + "loss": 0.4984, + "step": 425 + }, + { + "epoch": 0.215, + "grad_norm": 0.10469454526901245, + "learning_rate": 0.00028817781083485816, + "loss": 0.5119, + "step": 430 + }, + { + "epoch": 0.2175, + "grad_norm": 0.0964970663189888, + "learning_rate": 0.00028766319385259713, + "loss": 0.5167, + "step": 435 + }, + { + "epoch": 0.22, + "grad_norm": 0.12395574152469635, + "learning_rate": 0.00028713809328513953, + "loss": 0.5692, + "step": 440 + }, + { + "epoch": 0.2225, + "grad_norm": 0.10189738124608994, + "learning_rate": 0.0002866025491209265, + "loss": 0.4628, + "step": 445 + }, + { + "epoch": 0.225, + "grad_norm": 0.10433454066514969, + "learning_rate": 0.0002860566021437197, + "loss": 0.4869, + "step": 450 + }, + { + "epoch": 0.2275, + "grad_norm": 0.13003456592559814, + "learning_rate": 0.0002855002939294951, + "loss": 0.5291, + "step": 455 + }, + { + "epoch": 0.23, + "grad_norm": 0.11692202836275101, + "learning_rate": 0.000284933666843277, + "loss": 0.5229, + "step": 460 + }, + { + "epoch": 0.2325, + "grad_norm": 0.10757846385240555, + "learning_rate": 
0.0002843567640359119, + "loss": 0.435, + "step": 465 + }, + { + "epoch": 0.235, + "grad_norm": 0.10775501281023026, + "learning_rate": 0.00028376962944078206, + "loss": 0.4418, + "step": 470 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11543692648410797, + "learning_rate": 0.00028317230777046015, + "loss": 0.4204, + "step": 475 + }, + { + "epoch": 0.24, + "grad_norm": 0.10946698486804962, + "learning_rate": 0.00028256484451330403, + "loss": 0.49, + "step": 480 + }, + { + "epoch": 0.2425, + "grad_norm": 0.11528221517801285, + "learning_rate": 0.00028194728592999247, + "loss": 0.4752, + "step": 485 + }, + { + "epoch": 0.245, + "grad_norm": 0.10474205762147903, + "learning_rate": 0.0002813196790500027, + "loss": 0.4847, + "step": 490 + }, + { + "epoch": 0.2475, + "grad_norm": 0.10768820345401764, + "learning_rate": 0.00028068207166802837, + "loss": 0.4664, + "step": 495 + }, + { + "epoch": 0.25, + "grad_norm": 0.12158560007810593, + "learning_rate": 0.00028003451234034037, + "loss": 0.4741, + "step": 500 + }, + { + "epoch": 0.2525, + "grad_norm": 0.11635497957468033, + "learning_rate": 0.0002793770503810886, + "loss": 0.4969, + "step": 505 + }, + { + "epoch": 0.255, + "grad_norm": 0.12205849587917328, + "learning_rate": 0.00027870973585854665, + "loss": 0.4798, + "step": 510 + }, + { + "epoch": 0.2575, + "grad_norm": 0.10270871222019196, + "learning_rate": 0.00027803261959129905, + "loss": 0.3888, + "step": 515 + }, + { + "epoch": 0.26, + "grad_norm": 0.11313367635011673, + "learning_rate": 0.0002773457531443712, + "loss": 0.4759, + "step": 520 + }, + { + "epoch": 0.2625, + "grad_norm": 0.12905193865299225, + "learning_rate": 0.00027664918882530225, + "loss": 0.4442, + "step": 525 + }, + { + "epoch": 0.265, + "grad_norm": 0.11690939962863922, + "learning_rate": 0.00027594297968016197, + "loss": 0.5535, + "step": 530 + }, + { + "epoch": 0.2675, + "grad_norm": 0.10021405667066574, + "learning_rate": 0.00027522717948951094, + "loss": 0.4717, + "step": 535 + }, + { + 
"epoch": 0.27, + "grad_norm": 0.10104178637266159, + "learning_rate": 0.0002745018427643051, + "loss": 0.4906, + "step": 540 + }, + { + "epoch": 0.2725, + "grad_norm": 0.12113891541957855, + "learning_rate": 0.00027376702474174425, + "loss": 0.5674, + "step": 545 + }, + { + "epoch": 0.275, + "grad_norm": 0.11330476403236389, + "learning_rate": 0.0002730227813810658, + "loss": 0.5184, + "step": 550 + }, + { + "epoch": 0.2775, + "grad_norm": 0.1025850847363472, + "learning_rate": 0.0002722691693592831, + "loss": 0.4395, + "step": 555 + }, + { + "epoch": 0.28, + "grad_norm": 0.11591499298810959, + "learning_rate": 0.0002715062460668694, + "loss": 0.5003, + "step": 560 + }, + { + "epoch": 0.2825, + "grad_norm": 0.11281153559684753, + "learning_rate": 0.0002707340696033871, + "loss": 0.4672, + "step": 565 + }, + { + "epoch": 0.285, + "grad_norm": 0.1123538464307785, + "learning_rate": 0.00026995269877306356, + "loss": 0.513, + "step": 570 + }, + { + "epoch": 0.2875, + "grad_norm": 0.10776390135288239, + "learning_rate": 0.0002691621930803127, + "loss": 0.4572, + "step": 575 + }, + { + "epoch": 0.29, + "grad_norm": 0.10008667409420013, + "learning_rate": 0.0002683626127252036, + "loss": 0.4618, + "step": 580 + }, + { + "epoch": 0.2925, + "grad_norm": 0.13961340487003326, + "learning_rate": 0.00026755401859887595, + "loss": 0.4819, + "step": 585 + }, + { + "epoch": 0.295, + "grad_norm": 0.1476685106754303, + "learning_rate": 0.00026673647227890316, + "loss": 0.4964, + "step": 590 + }, + { + "epoch": 0.2975, + "grad_norm": 0.09795507788658142, + "learning_rate": 0.00026591003602460263, + "loss": 0.4796, + "step": 595 + }, + { + "epoch": 0.3, + "grad_norm": 0.10903532058000565, + "learning_rate": 0.00026507477277229496, + "loss": 0.4775, + "step": 600 + }, + { + "epoch": 0.3025, + "grad_norm": 0.10258448123931885, + "learning_rate": 0.0002642307461305105, + "loss": 0.4519, + "step": 605 + }, + { + "epoch": 0.305, + "grad_norm": 0.11204435676336288, + "learning_rate": 
0.0002633780203751459, + "loss": 0.4451, + "step": 610 + }, + { + "epoch": 0.3075, + "grad_norm": 0.10147629678249359, + "learning_rate": 0.0002625166604445689, + "loss": 0.4256, + "step": 615 + }, + { + "epoch": 0.31, + "grad_norm": 0.10481107234954834, + "learning_rate": 0.00026164673193467306, + "loss": 0.4381, + "step": 620 + }, + { + "epoch": 0.3125, + "grad_norm": 0.10856641829013824, + "learning_rate": 0.00026076830109388255, + "loss": 0.4958, + "step": 625 + }, + { + "epoch": 0.315, + "grad_norm": 0.09918677806854248, + "learning_rate": 0.0002598814348181068, + "loss": 0.4335, + "step": 630 + }, + { + "epoch": 0.3175, + "grad_norm": 0.10417389869689941, + "learning_rate": 0.00025898620064564637, + "loss": 0.4603, + "step": 635 + }, + { + "epoch": 0.32, + "grad_norm": 0.0903329998254776, + "learning_rate": 0.00025808266675204954, + "loss": 0.3932, + "step": 640 + }, + { + "epoch": 0.3225, + "grad_norm": 0.11511855572462082, + "learning_rate": 0.0002571709019449205, + "loss": 0.4169, + "step": 645 + }, + { + "epoch": 0.325, + "grad_norm": 0.11355557292699814, + "learning_rate": 0.0002562509756586793, + "loss": 0.4455, + "step": 650 + }, + { + "epoch": 0.3275, + "grad_norm": 0.1271187961101532, + "learning_rate": 0.00025532295794927437, + "loss": 0.4902, + "step": 655 + }, + { + "epoch": 0.33, + "grad_norm": 0.11936645954847336, + "learning_rate": 0.0002543869194888471, + "loss": 0.4843, + "step": 660 + }, + { + "epoch": 0.3325, + "grad_norm": 0.11935465037822723, + "learning_rate": 0.00025344293156035044, + "loss": 0.4402, + "step": 665 + }, + { + "epoch": 0.335, + "grad_norm": 0.13073407113552094, + "learning_rate": 0.00025249106605211986, + "loss": 0.467, + "step": 670 + }, + { + "epoch": 0.3375, + "grad_norm": 0.10340435802936554, + "learning_rate": 0.0002515313954523991, + "loss": 0.4827, + "step": 675 + }, + { + "epoch": 0.34, + "grad_norm": 0.11634550243616104, + "learning_rate": 0.00025056399284381983, + "loss": 0.466, + "step": 680 + }, + { + "epoch": 
0.3425, + "grad_norm": 0.10582319647073746, + "learning_rate": 0.0002495889318978362, + "loss": 0.4751, + "step": 685 + }, + { + "epoch": 0.345, + "grad_norm": 0.16781780123710632, + "learning_rate": 0.00024860628686911436, + "loss": 0.4717, + "step": 690 + }, + { + "epoch": 0.3475, + "grad_norm": 0.11522196233272552, + "learning_rate": 0.0002476161325898776, + "loss": 0.4687, + "step": 695 + }, + { + "epoch": 0.35, + "grad_norm": 0.11830449104309082, + "learning_rate": 0.000246618544464208, + "loss": 0.436, + "step": 700 + }, + { + "epoch": 0.3525, + "grad_norm": 0.17485427856445312, + "learning_rate": 0.0002456135984623034, + "loss": 0.4284, + "step": 705 + }, + { + "epoch": 0.355, + "grad_norm": 0.12288108468055725, + "learning_rate": 0.00024460137111469296, + "loss": 0.4261, + "step": 710 + }, + { + "epoch": 0.3575, + "grad_norm": 0.11587081104516983, + "learning_rate": 0.0002435819395064079, + "loss": 0.4493, + "step": 715 + }, + { + "epoch": 0.36, + "grad_norm": 0.10690271109342575, + "learning_rate": 0.0002425553812711123, + "loss": 0.4648, + "step": 720 + }, + { + "epoch": 0.3625, + "grad_norm": 0.10404397547245026, + "learning_rate": 0.00024152177458519014, + "loss": 0.4634, + "step": 725 + }, + { + "epoch": 0.365, + "grad_norm": 0.11986954510211945, + "learning_rate": 0.00024048119816179236, + "loss": 0.4525, + "step": 730 + }, + { + "epoch": 0.3675, + "grad_norm": 0.10243026167154312, + "learning_rate": 0.00023943373124484234, + "loss": 0.4572, + "step": 735 + }, + { + "epoch": 0.37, + "grad_norm": 0.10386748611927032, + "learning_rate": 0.00023837945360300129, + "loss": 0.3884, + "step": 740 + }, + { + "epoch": 0.3725, + "grad_norm": 0.11165735125541687, + "learning_rate": 0.0002373184455235934, + "loss": 0.4902, + "step": 745 + }, + { + "epoch": 0.375, + "grad_norm": 0.09951601922512054, + "learning_rate": 0.00023625078780649178, + "loss": 0.4541, + "step": 750 + }, + { + "epoch": 0.3775, + "grad_norm": 0.10347504913806915, + "learning_rate": 
0.00023517656175796518, + "loss": 0.3871, + "step": 755 + }, + { + "epoch": 0.38, + "grad_norm": 0.10478132963180542, + "learning_rate": 0.00023409584918448627, + "loss": 0.4329, + "step": 760 + }, + { + "epoch": 0.3825, + "grad_norm": 0.1198212131857872, + "learning_rate": 0.00023300873238650159, + "loss": 0.425, + "step": 765 + }, + { + "epoch": 0.385, + "grad_norm": 0.1103711724281311, + "learning_rate": 0.00023191529415216434, + "loss": 0.4274, + "step": 770 + }, + { + "epoch": 0.3875, + "grad_norm": 0.09940385073423386, + "learning_rate": 0.00023081561775102944, + "loss": 0.4368, + "step": 775 + }, + { + "epoch": 0.39, + "grad_norm": 0.11599268019199371, + "learning_rate": 0.00022970978692771242, + "loss": 0.4386, + "step": 780 + }, + { + "epoch": 0.3925, + "grad_norm": 0.10101296752691269, + "learning_rate": 0.00022859788589551188, + "loss": 0.4696, + "step": 785 + }, + { + "epoch": 0.395, + "grad_norm": 0.10112808644771576, + "learning_rate": 0.00022747999932999624, + "loss": 0.4066, + "step": 790 + }, + { + "epoch": 0.3975, + "grad_norm": 0.09595459699630737, + "learning_rate": 0.00022635621236255567, + "loss": 0.4837, + "step": 795 + }, + { + "epoch": 0.4, + "grad_norm": 0.10761380940675735, + "learning_rate": 0.00022522661057391857, + "loss": 0.5446, + "step": 800 + }, + { + "epoch": 0.4025, + "grad_norm": 0.11919954419136047, + "learning_rate": 0.00022409127998763463, + "loss": 0.5027, + "step": 805 + }, + { + "epoch": 0.405, + "grad_norm": 0.10851597785949707, + "learning_rate": 0.00022295030706352356, + "loss": 0.4481, + "step": 810 + }, + { + "epoch": 0.4075, + "grad_norm": 0.10030311346054077, + "learning_rate": 0.00022180377869109104, + "loss": 0.4709, + "step": 815 + }, + { + "epoch": 0.41, + "grad_norm": 0.111280657351017, + "learning_rate": 0.00022065178218291147, + "loss": 0.4423, + "step": 820 + }, + { + "epoch": 0.4125, + "grad_norm": 0.11253602802753448, + "learning_rate": 0.00021949440526797926, + "loss": 0.4136, + "step": 825 + }, + { + 
"epoch": 0.415, + "grad_norm": 0.10805424302816391, + "learning_rate": 0.00021833173608502732, + "loss": 0.4656, + "step": 830 + }, + { + "epoch": 0.4175, + "grad_norm": 0.10983198881149292, + "learning_rate": 0.00021716386317581542, + "loss": 0.3687, + "step": 835 + }, + { + "epoch": 0.42, + "grad_norm": 0.10653118044137955, + "learning_rate": 0.00021599087547838727, + "loss": 0.4654, + "step": 840 + }, + { + "epoch": 0.4225, + "grad_norm": 0.10856354981660843, + "learning_rate": 0.00021481286232029735, + "loss": 0.4298, + "step": 845 + }, + { + "epoch": 0.425, + "grad_norm": 0.11233706772327423, + "learning_rate": 0.0002136299134118085, + "loss": 0.4484, + "step": 850 + }, + { + "epoch": 0.4275, + "grad_norm": 0.1085442528128624, + "learning_rate": 0.00021244211883906017, + "loss": 0.4776, + "step": 855 + }, + { + "epoch": 0.43, + "grad_norm": 0.12297824025154114, + "learning_rate": 0.0002112495690572077, + "loss": 0.4029, + "step": 860 + }, + { + "epoch": 0.4325, + "grad_norm": 0.10838114470243454, + "learning_rate": 0.00021005235488353428, + "loss": 0.4848, + "step": 865 + }, + { + "epoch": 0.435, + "grad_norm": 0.10273341834545135, + "learning_rate": 0.0002088505674905342, + "loss": 0.3989, + "step": 870 + }, + { + "epoch": 0.4375, + "grad_norm": 0.11189126968383789, + "learning_rate": 0.0002076442983989705, + "loss": 0.438, + "step": 875 + }, + { + "epoch": 0.44, + "grad_norm": 0.11592905968427658, + "learning_rate": 0.0002064336394709048, + "loss": 0.4786, + "step": 880 + }, + { + "epoch": 0.4425, + "grad_norm": 0.11230389773845673, + "learning_rate": 0.0002052186829027017, + "loss": 0.3999, + "step": 885 + }, + { + "epoch": 0.445, + "grad_norm": 0.12455113977193832, + "learning_rate": 0.00020399952121800767, + "loss": 0.4856, + "step": 890 + }, + { + "epoch": 0.4475, + "grad_norm": 0.1001812294125557, + "learning_rate": 0.00020277624726070526, + "loss": 0.4689, + "step": 895 + }, + { + "epoch": 0.45, + "grad_norm": 0.11319112777709961, + "learning_rate": 
0.00020154895418784242, + "loss": 0.3998, + "step": 900 + }, + { + "epoch": 0.4525, + "grad_norm": 0.11322236061096191, + "learning_rate": 0.00020031773546253824, + "loss": 0.4321, + "step": 905 + }, + { + "epoch": 0.455, + "grad_norm": 0.12924689054489136, + "learning_rate": 0.00019908268484686558, + "loss": 0.4208, + "step": 910 + }, + { + "epoch": 0.4575, + "grad_norm": 0.11435618251562119, + "learning_rate": 0.00019784389639471048, + "loss": 0.4682, + "step": 915 + }, + { + "epoch": 0.46, + "grad_norm": 0.10801081359386444, + "learning_rate": 0.00019660146444460975, + "loss": 0.428, + "step": 920 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10906939953565598, + "learning_rate": 0.0001953554836125667, + "loss": 0.4455, + "step": 925 + }, + { + "epoch": 0.465, + "grad_norm": 0.10790123790502548, + "learning_rate": 0.00019410604878484556, + "loss": 0.4544, + "step": 930 + }, + { + "epoch": 0.4675, + "grad_norm": 0.10536376386880875, + "learning_rate": 0.000192853255110746, + "loss": 0.376, + "step": 935 + }, + { + "epoch": 0.47, + "grad_norm": 0.11744682490825653, + "learning_rate": 0.00019159719799535668, + "loss": 0.3887, + "step": 940 + }, + { + "epoch": 0.4725, + "grad_norm": 0.12954068183898926, + "learning_rate": 0.00019033797309228983, + "loss": 0.4075, + "step": 945 + }, + { + "epoch": 0.475, + "grad_norm": 0.1401606798171997, + "learning_rate": 0.00018907567629639725, + "loss": 0.4454, + "step": 950 + }, + { + "epoch": 0.4775, + "grad_norm": 0.12059322744607925, + "learning_rate": 0.00018781040373646706, + "loss": 0.4339, + "step": 955 + }, + { + "epoch": 0.48, + "grad_norm": 0.11798987537622452, + "learning_rate": 0.00018654225176790336, + "loss": 0.4405, + "step": 960 + }, + { + "epoch": 0.4825, + "grad_norm": 0.11344211548566818, + "learning_rate": 0.00018527131696538846, + "loss": 0.4124, + "step": 965 + }, + { + "epoch": 0.485, + "grad_norm": 0.10373330116271973, + "learning_rate": 0.00018399769611552824, + "loss": 0.4329, + "step": 970 + }, + { + 
"epoch": 0.4875, + "grad_norm": 0.12053704261779785, + "learning_rate": 0.0001827214862094814, + "loss": 0.4944, + "step": 975 + }, + { + "epoch": 0.49, + "grad_norm": 0.141033336520195, + "learning_rate": 0.00018144278443557328, + "loss": 0.4569, + "step": 980 + }, + { + "epoch": 0.4925, + "grad_norm": 0.10922867804765701, + "learning_rate": 0.0001801616881718947, + "loss": 0.3879, + "step": 985 + }, + { + "epoch": 0.495, + "grad_norm": 0.09843657910823822, + "learning_rate": 0.00017887829497888612, + "loss": 0.4106, + "step": 990 + }, + { + "epoch": 0.4975, + "grad_norm": 0.12131062150001526, + "learning_rate": 0.000177592702591908, + "loss": 0.4023, + "step": 995 + }, + { + "epoch": 0.5, + "grad_norm": 0.11343283206224442, + "learning_rate": 0.00017630500891379806, + "loss": 0.4824, + "step": 1000 + }, + { + "epoch": 0.5025, + "grad_norm": 0.11050508171319962, + "learning_rate": 0.00017501531200741534, + "loss": 0.4098, + "step": 1005 + }, + { + "epoch": 0.505, + "grad_norm": 0.11737144738435745, + "learning_rate": 0.00017372371008817256, + "loss": 0.3943, + "step": 1010 + }, + { + "epoch": 0.5075, + "grad_norm": 0.11473528295755386, + "learning_rate": 0.00017243030151655643, + "loss": 0.3796, + "step": 1015 + }, + { + "epoch": 0.51, + "grad_norm": 0.13086555898189545, + "learning_rate": 0.00017113518479063738, + "loss": 0.4367, + "step": 1020 + }, + { + "epoch": 0.5125, + "grad_norm": 0.11752833425998688, + "learning_rate": 0.00016983845853856837, + "loss": 0.4097, + "step": 1025 + }, + { + "epoch": 0.515, + "grad_norm": 0.11596900969743729, + "learning_rate": 0.0001685402215110739, + "loss": 0.3812, + "step": 1030 + }, + { + "epoch": 0.5175, + "grad_norm": 0.11850260943174362, + "learning_rate": 0.00016724057257392998, + "loss": 0.4354, + "step": 1035 + }, + { + "epoch": 0.52, + "grad_norm": 0.12466365844011307, + "learning_rate": 0.00016593961070043498, + "loss": 0.4317, + "step": 1040 + }, + { + "epoch": 0.5225, + "grad_norm": 0.11178991943597794, + 
"learning_rate": 0.0001646374349638724, + "loss": 0.3936, + "step": 1045 + }, + { + "epoch": 0.525, + "grad_norm": 0.11252165585756302, + "learning_rate": 0.00016333414452996623, + "loss": 0.386, + "step": 1050 + }, + { + "epoch": 0.5275, + "grad_norm": 0.12886975705623627, + "learning_rate": 0.0001620298386493288, + "loss": 0.3965, + "step": 1055 + }, + { + "epoch": 0.53, + "grad_norm": 0.11716549098491669, + "learning_rate": 0.00016072461664990288, + "loss": 0.3924, + "step": 1060 + }, + { + "epoch": 0.5325, + "grad_norm": 0.11604485660791397, + "learning_rate": 0.000159418577929397, + "loss": 0.3624, + "step": 1065 + }, + { + "epoch": 0.535, + "grad_norm": 0.11538460850715637, + "learning_rate": 0.00015811182194771633, + "loss": 0.4338, + "step": 1070 + }, + { + "epoch": 0.5375, + "grad_norm": 0.11618762463331223, + "learning_rate": 0.00015680444821938804, + "loss": 0.4058, + "step": 1075 + }, + { + "epoch": 0.54, + "grad_norm": 0.11750835925340652, + "learning_rate": 0.00015549655630598343, + "loss": 0.4422, + "step": 1080 + }, + { + "epoch": 0.5425, + "grad_norm": 0.12725204229354858, + "learning_rate": 0.00015418824580853535, + "loss": 0.4422, + "step": 1085 + }, + { + "epoch": 0.545, + "grad_norm": 0.11274927109479904, + "learning_rate": 0.00015287961635995347, + "loss": 0.4229, + "step": 1090 + }, + { + "epoch": 0.5475, + "grad_norm": 0.11833129078149796, + "learning_rate": 0.00015157076761743686, + "loss": 0.4442, + "step": 1095 + }, + { + "epoch": 0.55, + "grad_norm": 0.11384794861078262, + "learning_rate": 0.00015026179925488475, + "loss": 0.4528, + "step": 1100 + }, + { + "epoch": 0.5525, + "grad_norm": 0.11864661425352097, + "learning_rate": 0.00014895281095530575, + "loss": 0.3988, + "step": 1105 + }, + { + "epoch": 0.555, + "grad_norm": 0.11673832684755325, + "learning_rate": 0.00014764390240322691, + "loss": 0.3544, + "step": 1110 + }, + { + "epoch": 0.5575, + "grad_norm": 0.1174502745270729, + "learning_rate": 0.00014633517327710202, + "loss": 
0.4034, + "step": 1115 + }, + { + "epoch": 0.56, + "grad_norm": 0.12685547769069672, + "learning_rate": 0.00014502672324172107, + "loss": 0.3595, + "step": 1120 + }, + { + "epoch": 0.5625, + "grad_norm": 0.12368053942918777, + "learning_rate": 0.00014371865194062007, + "loss": 0.3395, + "step": 1125 + }, + { + "epoch": 0.565, + "grad_norm": 0.1077839657664299, + "learning_rate": 0.000142411058988493, + "loss": 0.4199, + "step": 1130 + }, + { + "epoch": 0.5675, + "grad_norm": 0.11699855327606201, + "learning_rate": 0.00014110404396360576, + "loss": 0.3443, + "step": 1135 + }, + { + "epoch": 0.57, + "grad_norm": 0.13238464295864105, + "learning_rate": 0.0001397977064002128, + "loss": 0.3499, + "step": 1140 + }, + { + "epoch": 0.5725, + "grad_norm": 0.11482933163642883, + "learning_rate": 0.0001384921457809772, + "loss": 0.3619, + "step": 1145 + }, + { + "epoch": 0.575, + "grad_norm": 0.13390353322029114, + "learning_rate": 0.00013718746152939487, + "loss": 0.3684, + "step": 1150 + }, + { + "epoch": 0.5775, + "grad_norm": 0.11464900523424149, + "learning_rate": 0.00013588375300222283, + "loss": 0.3313, + "step": 1155 + }, + { + "epoch": 0.58, + "grad_norm": 0.10367871820926666, + "learning_rate": 0.00013458111948191296, + "loss": 0.3323, + "step": 1160 + }, + { + "epoch": 0.5825, + "grad_norm": 0.12259294092655182, + "learning_rate": 0.0001332796601690512, + "loss": 0.3986, + "step": 1165 + }, + { + "epoch": 0.585, + "grad_norm": 0.10923358052968979, + "learning_rate": 0.00013197947417480292, + "loss": 0.3808, + "step": 1170 + }, + { + "epoch": 0.5875, + "grad_norm": 0.12479504942893982, + "learning_rate": 0.0001306806605133656, + "loss": 0.4429, + "step": 1175 + }, + { + "epoch": 0.59, + "grad_norm": 0.11521733552217484, + "learning_rate": 0.000129383318094428, + "loss": 0.4778, + "step": 1180 + }, + { + "epoch": 0.5925, + "grad_norm": 0.14112086594104767, + "learning_rate": 0.00012808754571563827, + "loss": 0.4634, + "step": 1185 + }, + { + "epoch": 0.595, + 
"grad_norm": 0.12947902083396912, + "learning_rate": 0.00012679344205507981, + "loss": 0.4439, + "step": 1190 + }, + { + "epoch": 0.5975, + "grad_norm": 0.13288578391075134, + "learning_rate": 0.0001255011056637567, + "loss": 0.4402, + "step": 1195 + }, + { + "epoch": 0.6, + "grad_norm": 0.1216069906949997, + "learning_rate": 0.00012421063495808853, + "loss": 0.4203, + "step": 1200 + }, + { + "epoch": 0.6025, + "grad_norm": 0.11649637669324875, + "learning_rate": 0.000122922128212416, + "loss": 0.4512, + "step": 1205 + }, + { + "epoch": 0.605, + "grad_norm": 0.1201406940817833, + "learning_rate": 0.00012163568355151628, + "loss": 0.3725, + "step": 1210 + }, + { + "epoch": 0.6075, + "grad_norm": 0.12117727100849152, + "learning_rate": 0.00012035139894313107, + "loss": 0.4352, + "step": 1215 + }, + { + "epoch": 0.61, + "grad_norm": 0.11709322035312653, + "learning_rate": 0.00011906937219050556, + "loss": 0.4189, + "step": 1220 + }, + { + "epoch": 0.6125, + "grad_norm": 0.11865726858377457, + "learning_rate": 0.0001177897009249405, + "loss": 0.3796, + "step": 1225 + }, + { + "epoch": 0.615, + "grad_norm": 0.10807759314775467, + "learning_rate": 0.0001165124825983573, + "loss": 0.4465, + "step": 1230 + }, + { + "epoch": 0.6175, + "grad_norm": 0.13788209855556488, + "learning_rate": 0.00011523781447587641, + "loss": 0.4994, + "step": 1235 + }, + { + "epoch": 0.62, + "grad_norm": 0.12921364605426788, + "learning_rate": 0.00011396579362841044, + "loss": 0.4251, + "step": 1240 + }, + { + "epoch": 0.6225, + "grad_norm": 0.12162365019321442, + "learning_rate": 0.0001126965169252718, + "loss": 0.3864, + "step": 1245 + }, + { + "epoch": 0.625, + "grad_norm": 0.12897826731204987, + "learning_rate": 0.00011143008102679559, + "loss": 0.3753, + "step": 1250 + }, + { + "epoch": 0.6275, + "grad_norm": 0.116109699010849, + "learning_rate": 0.00011016658237697866, + "loss": 0.3296, + "step": 1255 + }, + { + "epoch": 0.63, + "grad_norm": 0.12935414910316467, + "learning_rate": 
0.00010890611719613512, + "loss": 0.3797, + "step": 1260 + }, + { + "epoch": 0.6325, + "grad_norm": 0.13730891048908234, + "learning_rate": 0.0001076487814735685, + "loss": 0.3711, + "step": 1265 + }, + { + "epoch": 0.635, + "grad_norm": 0.13870631158351898, + "learning_rate": 0.00010639467096026211, + "loss": 0.4328, + "step": 1270 + }, + { + "epoch": 0.6375, + "grad_norm": 0.11644043773412704, + "learning_rate": 0.00010514388116158701, + "loss": 0.3283, + "step": 1275 + }, + { + "epoch": 0.64, + "grad_norm": 0.12221091985702515, + "learning_rate": 0.00010389650733002894, + "loss": 0.3898, + "step": 1280 + }, + { + "epoch": 0.6425, + "grad_norm": 0.12048634141683578, + "learning_rate": 0.00010265264445793464, + "loss": 0.3256, + "step": 1285 + }, + { + "epoch": 0.645, + "grad_norm": 0.1250566840171814, + "learning_rate": 0.00010141238727027761, + "loss": 0.408, + "step": 1290 + }, + { + "epoch": 0.6475, + "grad_norm": 0.13518592715263367, + "learning_rate": 0.00010017583021744454, + "loss": 0.3763, + "step": 1295 + }, + { + "epoch": 0.65, + "grad_norm": 0.13047736883163452, + "learning_rate": 9.89430674680425e-05, + "loss": 0.3989, + "step": 1300 + }, + { + "epoch": 0.6525, + "grad_norm": 0.11474955826997757, + "learning_rate": 9.771419290172773e-05, + "loss": 0.3374, + "step": 1305 + }, + { + "epoch": 0.655, + "grad_norm": 0.11670063436031342, + "learning_rate": 9.648930010205619e-05, + "loss": 0.3343, + "step": 1310 + }, + { + "epoch": 0.6575, + "grad_norm": 0.15385080873966217, + "learning_rate": 9.526848234935704e-05, + "loss": 0.3432, + "step": 1315 + }, + { + "epoch": 0.66, + "grad_norm": 0.13441519439220428, + "learning_rate": 9.405183261362863e-05, + "loss": 0.3116, + "step": 1320 + }, + { + "epoch": 0.6625, + "grad_norm": 0.14772167801856995, + "learning_rate": 9.283944354745888e-05, + "loss": 0.3613, + "step": 1325 + }, + { + "epoch": 0.665, + "grad_norm": 0.12146154791116714, + "learning_rate": 9.163140747896907e-05, + "loss": 0.3411, + "step": 1330 + 
}, + { + "epoch": 0.6675, + "grad_norm": 0.1333102583885193, + "learning_rate": 9.042781640478291e-05, + "loss": 0.396, + "step": 1335 + }, + { + "epoch": 0.67, + "grad_norm": 0.12051521986722946, + "learning_rate": 8.922876198302062e-05, + "loss": 0.3837, + "step": 1340 + }, + { + "epoch": 0.6725, + "grad_norm": 0.12071400880813599, + "learning_rate": 8.803433552631874e-05, + "loss": 0.354, + "step": 1345 + }, + { + "epoch": 0.675, + "grad_norm": 0.11258620023727417, + "learning_rate": 8.684462799487635e-05, + "loss": 0.3197, + "step": 1350 + }, + { + "epoch": 0.6775, + "grad_norm": 0.11908067762851715, + "learning_rate": 8.565972998952814e-05, + "loss": 0.377, + "step": 1355 + }, + { + "epoch": 0.68, + "grad_norm": 0.1252991259098053, + "learning_rate": 8.447973174484469e-05, + "loss": 0.3438, + "step": 1360 + }, + { + "epoch": 0.6825, + "grad_norm": 0.12832245230674744, + "learning_rate": 8.330472312226091e-05, + "loss": 0.346, + "step": 1365 + }, + { + "epoch": 0.685, + "grad_norm": 0.1396942287683487, + "learning_rate": 8.213479360323258e-05, + "loss": 0.3886, + "step": 1370 + }, + { + "epoch": 0.6875, + "grad_norm": 0.12938210368156433, + "learning_rate": 8.097003228242225e-05, + "loss": 0.3699, + "step": 1375 + }, + { + "epoch": 0.69, + "grad_norm": 0.12459377944469452, + "learning_rate": 7.9810527860914e-05, + "loss": 0.3892, + "step": 1380 + }, + { + "epoch": 0.6925, + "grad_norm": 0.1360333263874054, + "learning_rate": 7.86563686394587e-05, + "loss": 0.3423, + "step": 1385 + }, + { + "epoch": 0.695, + "grad_norm": 0.1357765644788742, + "learning_rate": 7.750764251174963e-05, + "loss": 0.408, + "step": 1390 + }, + { + "epoch": 0.6975, + "grad_norm": 0.14453718066215515, + "learning_rate": 7.636443695772887e-05, + "loss": 0.3398, + "step": 1395 + }, + { + "epoch": 0.7, + "grad_norm": 0.11541519314050674, + "learning_rate": 7.522683903692547e-05, + "loss": 0.4203, + "step": 1400 + }, + { + "epoch": 0.7025, + "grad_norm": 0.13344840705394745, + 
"learning_rate": 7.409493538182545e-05, + "loss": 0.3694, + "step": 1405 + }, + { + "epoch": 0.705, + "grad_norm": 0.13069866597652435, + "learning_rate": 7.296881219127452e-05, + "loss": 0.3889, + "step": 1410 + }, + { + "epoch": 0.7075, + "grad_norm": 0.12457838654518127, + "learning_rate": 7.184855522391359e-05, + "loss": 0.3342, + "step": 1415 + }, + { + "epoch": 0.71, + "grad_norm": 0.11990659683942795, + "learning_rate": 7.073424979164794e-05, + "loss": 0.3855, + "step": 1420 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1389523446559906, + "learning_rate": 6.962598075315046e-05, + "loss": 0.3943, + "step": 1425 + }, + { + "epoch": 0.715, + "grad_norm": 0.14108599722385406, + "learning_rate": 6.852383250739938e-05, + "loss": 0.388, + "step": 1430 + }, + { + "epoch": 0.7175, + "grad_norm": 0.1342005580663681, + "learning_rate": 6.742788898725065e-05, + "loss": 0.3602, + "step": 1435 + }, + { + "epoch": 0.72, + "grad_norm": 0.13516324758529663, + "learning_rate": 6.633823365304648e-05, + "loss": 0.3935, + "step": 1440 + }, + { + "epoch": 0.7225, + "grad_norm": 0.1302197426557541, + "learning_rate": 6.52549494862593e-05, + "loss": 0.3618, + "step": 1445 + }, + { + "epoch": 0.725, + "grad_norm": 0.12428996711969376, + "learning_rate": 6.417811898317259e-05, + "loss": 0.3338, + "step": 1450 + }, + { + "epoch": 0.7275, + "grad_norm": 0.11249776184558868, + "learning_rate": 6.31078241485982e-05, + "loss": 0.3819, + "step": 1455 + }, + { + "epoch": 0.73, + "grad_norm": 0.1359994113445282, + "learning_rate": 6.204414648963159e-05, + "loss": 0.3356, + "step": 1460 + }, + { + "epoch": 0.7325, + "grad_norm": 0.1118568629026413, + "learning_rate": 6.098716700944479e-05, + "loss": 0.3223, + "step": 1465 + }, + { + "epoch": 0.735, + "grad_norm": 0.12038140743970871, + "learning_rate": 5.993696620111741e-05, + "loss": 0.3481, + "step": 1470 + }, + { + "epoch": 0.7375, + "grad_norm": 0.12787550687789917, + "learning_rate": 5.889362404150703e-05, + "loss": 0.3766, + "step": 
1475 + }, + { + "epoch": 0.74, + "grad_norm": 0.12134893983602524, + "learning_rate": 5.7857219985158506e-05, + "loss": 0.2916, + "step": 1480 + }, + { + "epoch": 0.7425, + "grad_norm": 0.1274223029613495, + "learning_rate": 5.682783295825345e-05, + "loss": 0.3095, + "step": 1485 + }, + { + "epoch": 0.745, + "grad_norm": 0.11817299574613571, + "learning_rate": 5.580554135259932e-05, + "loss": 0.3422, + "step": 1490 + }, + { + "epoch": 0.7475, + "grad_norm": 0.1348387748003006, + "learning_rate": 5.479042301965987e-05, + "loss": 0.4044, + "step": 1495 + }, + { + "epoch": 0.75, + "grad_norm": 0.14032681286334991, + "learning_rate": 5.378255526462631e-05, + "loss": 0.337, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.972183618289664e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codellama-hugcoder/checkpoint-1500/training_args.bin b/codellama-hugcoder/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93 --- /dev/null +++ b/codellama-hugcoder/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b +size 5304 diff --git a/codellama-hugcoder/checkpoint-2000/README.md b/codellama-hugcoder/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354 --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/README.md @@ -0,0 +1,202 @@ +--- +base_model: codellama/CodeLlama-7b-Instruct-hf +library_name: 
peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.2.dev0 \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-2000/adapter_config.json b/codellama-hugcoder/checkpoint-2000/adapter_config.json new file 
mode 100644 index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1 --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "k_proj", + "q_proj", + "v_proj", + "gate_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-2000/adapter_model.safetensors b/codellama-hugcoder/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..58d3ccd4c40a5bb55497cd8825213decfac35527 --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8 +size 319876032 diff --git a/codellama-hugcoder/checkpoint-2000/optimizer.pt b/codellama-hugcoder/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cc780744f8d5428af28cca5b52ee03127c7c1a7 --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:688ec5889a6aa6b6675276da1e991b1ffaf231ca0b9db550ca1055ee967ab484 +size 640009682 
diff --git a/codellama-hugcoder/checkpoint-2000/rng_state.pth b/codellama-hugcoder/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c0e5b52927ff54f84fe5d982d2c372833bb465f --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d88eee16810615d69e99ef0af6ae2767f80f0c756dab6f8b6315f916e0a2772d +size 14180 diff --git a/codellama-hugcoder/checkpoint-2000/scheduler.pt b/codellama-hugcoder/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1aa08fb4ca7865e35617d28dc511dd492902a0c --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0af176d761d71fce3fbce7001f4850782b022af8f40338e8e88b22363a32018f +size 1064 diff --git a/codellama-hugcoder/checkpoint-2000/trainer_state.json b/codellama-hugcoder/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a1c35192c6d64496c6c47b59e3c26bf2ca1fbb --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/trainer_state.json @@ -0,0 +1,2834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100.0, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025, + "grad_norm": 0.09379793703556061, + "learning_rate": 5.999999999999999e-06, + "loss": 0.6799, + "step": 5 + }, + { + "epoch": 0.005, + "grad_norm": 0.1399833709001541, + "learning_rate": 1.3499999999999998e-05, + "loss": 0.6954, + "step": 10 + }, + { + "epoch": 0.0075, + "grad_norm": 0.08632303029298782, + "learning_rate": 2.1e-05, + "loss": 0.6921, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.10006701201200485, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.69, + "step": 20 + 
}, + { + "epoch": 0.0125, + "grad_norm": 0.07633858919143677, + "learning_rate": 3.5999999999999994e-05, + "loss": 0.6722, + "step": 25 + }, + { + "epoch": 0.015, + "grad_norm": 0.09399061650037766, + "learning_rate": 4.3499999999999993e-05, + "loss": 0.6453, + "step": 30 + }, + { + "epoch": 0.0175, + "grad_norm": 0.0843738541007042, + "learning_rate": 5.1e-05, + "loss": 0.6276, + "step": 35 + }, + { + "epoch": 0.02, + "grad_norm": 0.08583351224660873, + "learning_rate": 5.85e-05, + "loss": 0.58, + "step": 40 + }, + { + "epoch": 0.0225, + "grad_norm": 0.09571370482444763, + "learning_rate": 6.599999999999999e-05, + "loss": 0.6355, + "step": 45 + }, + { + "epoch": 0.025, + "grad_norm": 0.1083935871720314, + "learning_rate": 7.35e-05, + "loss": 0.589, + "step": 50 + }, + { + "epoch": 0.0275, + "grad_norm": 0.10387319326400757, + "learning_rate": 8.1e-05, + "loss": 0.6061, + "step": 55 + }, + { + "epoch": 0.03, + "grad_norm": 0.11083361506462097, + "learning_rate": 8.849999999999998e-05, + "loss": 0.572, + "step": 60 + }, + { + "epoch": 0.0325, + "grad_norm": 0.12665686011314392, + "learning_rate": 9.599999999999999e-05, + "loss": 0.5442, + "step": 65 + }, + { + "epoch": 0.035, + "grad_norm": 0.1308053582906723, + "learning_rate": 0.00010349999999999998, + "loss": 0.6524, + "step": 70 + }, + { + "epoch": 0.0375, + "grad_norm": 0.13535510003566742, + "learning_rate": 0.00011099999999999999, + "loss": 0.6404, + "step": 75 + }, + { + "epoch": 0.04, + "grad_norm": 0.12833671271800995, + "learning_rate": 0.0001185, + "loss": 0.5717, + "step": 80 + }, + { + "epoch": 0.0425, + "grad_norm": 0.11962099373340607, + "learning_rate": 0.00012599999999999997, + "loss": 0.6098, + "step": 85 + }, + { + "epoch": 0.045, + "grad_norm": 0.13898271322250366, + "learning_rate": 0.0001335, + "loss": 0.6099, + "step": 90 + }, + { + "epoch": 0.0475, + "grad_norm": 0.14486610889434814, + "learning_rate": 0.00014099999999999998, + "loss": 0.5744, + "step": 95 + }, + { + "epoch": 0.05, + 
"grad_norm": 0.1432138830423355, + "learning_rate": 0.00014849999999999998, + "loss": 0.5659, + "step": 100 + }, + { + "epoch": 0.0525, + "grad_norm": 0.13487878441810608, + "learning_rate": 0.000156, + "loss": 0.5622, + "step": 105 + }, + { + "epoch": 0.055, + "grad_norm": 0.12495309859514236, + "learning_rate": 0.0001635, + "loss": 0.5951, + "step": 110 + }, + { + "epoch": 0.0575, + "grad_norm": 0.13011734187602997, + "learning_rate": 0.00017099999999999998, + "loss": 0.6249, + "step": 115 + }, + { + "epoch": 0.06, + "grad_norm": 0.13987745344638824, + "learning_rate": 0.00017849999999999997, + "loss": 0.559, + "step": 120 + }, + { + "epoch": 0.0625, + "grad_norm": 0.13373605906963348, + "learning_rate": 0.000186, + "loss": 0.5475, + "step": 125 + }, + { + "epoch": 0.065, + "grad_norm": 0.12433867901563644, + "learning_rate": 0.0001935, + "loss": 0.5274, + "step": 130 + }, + { + "epoch": 0.0675, + "grad_norm": 0.11097615957260132, + "learning_rate": 0.000201, + "loss": 0.678, + "step": 135 + }, + { + "epoch": 0.07, + "grad_norm": 0.1155027225613594, + "learning_rate": 0.00020849999999999997, + "loss": 0.5611, + "step": 140 + }, + { + "epoch": 0.0725, + "grad_norm": 0.11431068181991577, + "learning_rate": 0.00021599999999999996, + "loss": 0.6054, + "step": 145 + }, + { + "epoch": 0.075, + "grad_norm": 0.09796140342950821, + "learning_rate": 0.00022349999999999998, + "loss": 0.5472, + "step": 150 + }, + { + "epoch": 0.0775, + "grad_norm": 0.09489257633686066, + "learning_rate": 0.00023099999999999998, + "loss": 0.4636, + "step": 155 + }, + { + "epoch": 0.08, + "grad_norm": 0.10787788033485413, + "learning_rate": 0.0002385, + "loss": 0.6164, + "step": 160 + }, + { + "epoch": 0.0825, + "grad_norm": 0.10261733084917068, + "learning_rate": 0.00024599999999999996, + "loss": 0.5408, + "step": 165 + }, + { + "epoch": 0.085, + "grad_norm": 0.11870352178812027, + "learning_rate": 0.0002535, + "loss": 0.5268, + "step": 170 + }, + { + "epoch": 0.0875, + "grad_norm": 
0.11910569667816162, + "learning_rate": 0.000261, + "loss": 0.5461, + "step": 175 + }, + { + "epoch": 0.09, + "grad_norm": 0.10083702206611633, + "learning_rate": 0.00026849999999999997, + "loss": 0.4794, + "step": 180 + }, + { + "epoch": 0.0925, + "grad_norm": 0.10453511029481888, + "learning_rate": 0.000276, + "loss": 0.5539, + "step": 185 + }, + { + "epoch": 0.095, + "grad_norm": 0.101403146982193, + "learning_rate": 0.00028349999999999995, + "loss": 0.5346, + "step": 190 + }, + { + "epoch": 0.0975, + "grad_norm": 0.10724789649248123, + "learning_rate": 0.00029099999999999997, + "loss": 0.6026, + "step": 195 + }, + { + "epoch": 0.1, + "grad_norm": 0.1140277311205864, + "learning_rate": 0.0002985, + "loss": 0.5193, + "step": 200 + }, + { + "epoch": 0.1025, + "grad_norm": 0.09706108272075653, + "learning_rate": 0.0002999963446058092, + "loss": 0.54, + "step": 205 + }, + { + "epoch": 0.105, + "grad_norm": 0.10003062337636948, + "learning_rate": 0.0002999814948722491, + "loss": 0.5365, + "step": 210 + }, + { + "epoch": 0.1075, + "grad_norm": 0.1078687533736229, + "learning_rate": 0.00029995522346717746, + "loss": 0.5889, + "step": 215 + }, + { + "epoch": 0.11, + "grad_norm": 0.10538115352392197, + "learning_rate": 0.0002999175323912636, + "loss": 0.5611, + "step": 220 + }, + { + "epoch": 0.1125, + "grad_norm": 0.1020808294415474, + "learning_rate": 0.00029986842451482874, + "loss": 0.6103, + "step": 225 + }, + { + "epoch": 0.115, + "grad_norm": 0.09635835886001587, + "learning_rate": 0.0002998079035776279, + "loss": 0.5229, + "step": 230 + }, + { + "epoch": 0.1175, + "grad_norm": 0.10287190228700638, + "learning_rate": 0.0002997359741885648, + "loss": 0.5312, + "step": 235 + }, + { + "epoch": 0.12, + "grad_norm": 0.09160075336694717, + "learning_rate": 0.0002996526418253408, + "loss": 0.5673, + "step": 240 + }, + { + "epoch": 0.1225, + "grad_norm": 0.08691006153821945, + "learning_rate": 0.000299557912834038, + "loss": 0.5326, + "step": 245 + }, + { + "epoch": 
0.125, + "grad_norm": 0.10096988826990128, + "learning_rate": 0.00029945179442863594, + "loss": 0.6004, + "step": 250 + }, + { + "epoch": 0.1275, + "grad_norm": 0.09594204276800156, + "learning_rate": 0.000299334294690462, + "loss": 0.5516, + "step": 255 + }, + { + "epoch": 0.13, + "grad_norm": 0.10281919687986374, + "learning_rate": 0.00029920542256757607, + "loss": 0.5515, + "step": 260 + }, + { + "epoch": 0.1325, + "grad_norm": 0.08547840267419815, + "learning_rate": 0.00029906518787408944, + "loss": 0.5243, + "step": 265 + }, + { + "epoch": 0.135, + "grad_norm": 0.10161560773849487, + "learning_rate": 0.0002989136012894168, + "loss": 0.5096, + "step": 270 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09101904183626175, + "learning_rate": 0.0002987506743574635, + "loss": 0.553, + "step": 275 + }, + { + "epoch": 0.14, + "grad_norm": 0.09769442677497864, + "learning_rate": 0.0002985764194857463, + "loss": 0.4953, + "step": 280 + }, + { + "epoch": 0.1425, + "grad_norm": 0.10991579294204712, + "learning_rate": 0.00029839084994444826, + "loss": 0.5152, + "step": 285 + }, + { + "epoch": 0.145, + "grad_norm": 0.09450916200876236, + "learning_rate": 0.00029819397986540836, + "loss": 0.5397, + "step": 290 + }, + { + "epoch": 0.1475, + "grad_norm": 0.10876069217920303, + "learning_rate": 0.0002979858242410454, + "loss": 0.4858, + "step": 295 + }, + { + "epoch": 0.15, + "grad_norm": 0.097995825111866, + "learning_rate": 0.00029776639892321606, + "loss": 0.5566, + "step": 300 + }, + { + "epoch": 0.1525, + "grad_norm": 0.1145048514008522, + "learning_rate": 0.0002975357206220079, + "loss": 0.4531, + "step": 305 + }, + { + "epoch": 0.155, + "grad_norm": 0.10271880775690079, + "learning_rate": 0.00029729380690446654, + "loss": 0.5199, + "step": 310 + }, + { + "epoch": 0.1575, + "grad_norm": 0.11095371842384338, + "learning_rate": 0.0002970406761932583, + "loss": 0.5416, + "step": 315 + }, + { + "epoch": 0.16, + "grad_norm": 0.09949438273906708, + "learning_rate": 
0.00029677634776526673, + "loss": 0.4841, + "step": 320 + }, + { + "epoch": 0.1625, + "grad_norm": 0.1163724958896637, + "learning_rate": 0.00029650084175012517, + "loss": 0.4913, + "step": 325 + }, + { + "epoch": 0.165, + "grad_norm": 0.10726840049028397, + "learning_rate": 0.00029621417912868323, + "loss": 0.5203, + "step": 330 + }, + { + "epoch": 0.1675, + "grad_norm": 0.09609931707382202, + "learning_rate": 0.00029591638173140947, + "loss": 0.5607, + "step": 335 + }, + { + "epoch": 0.17, + "grad_norm": 0.10824442654848099, + "learning_rate": 0.0002956074722367286, + "loss": 0.6004, + "step": 340 + }, + { + "epoch": 0.1725, + "grad_norm": 0.10465679317712784, + "learning_rate": 0.00029528747416929463, + "loss": 0.5216, + "step": 345 + }, + { + "epoch": 0.175, + "grad_norm": 0.10518354922533035, + "learning_rate": 0.0002949564118981994, + "loss": 0.499, + "step": 350 + }, + { + "epoch": 0.1775, + "grad_norm": 0.0955279991030693, + "learning_rate": 0.0002946143106351165, + "loss": 0.5607, + "step": 355 + }, + { + "epoch": 0.18, + "grad_norm": 0.11159654706716537, + "learning_rate": 0.0002942611964323817, + "loss": 0.5204, + "step": 360 + }, + { + "epoch": 0.1825, + "grad_norm": 0.09571187198162079, + "learning_rate": 0.0002938970961810086, + "loss": 0.6113, + "step": 365 + }, + { + "epoch": 0.185, + "grad_norm": 0.11854679882526398, + "learning_rate": 0.0002935220376086411, + "loss": 0.5639, + "step": 370 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1050512045621872, + "learning_rate": 0.0002931360492774415, + "loss": 0.548, + "step": 375 + }, + { + "epoch": 0.19, + "grad_norm": 0.1053968220949173, + "learning_rate": 0.0002927391605819157, + "loss": 0.5507, + "step": 380 + }, + { + "epoch": 0.1925, + "grad_norm": 0.10567320138216019, + "learning_rate": 0.00029233140174667445, + "loss": 0.5312, + "step": 385 + }, + { + "epoch": 0.195, + "grad_norm": 0.11914283782243729, + "learning_rate": 0.0002919128038241318, + "loss": 0.5961, + "step": 390 + }, + { + "epoch": 
0.1975, + "grad_norm": 0.09915795922279358, + "learning_rate": 0.0002914833986921401, + "loss": 0.5086, + "step": 395 + }, + { + "epoch": 0.2, + "grad_norm": 0.10796502232551575, + "learning_rate": 0.0002910432190515628, + "loss": 0.5585, + "step": 400 + }, + { + "epoch": 0.2025, + "grad_norm": 0.10748997330665588, + "learning_rate": 0.00029059229842378373, + "loss": 0.5466, + "step": 405 + }, + { + "epoch": 0.205, + "grad_norm": 0.10696308314800262, + "learning_rate": 0.0002901306711481544, + "loss": 0.5513, + "step": 410 + }, + { + "epoch": 0.2075, + "grad_norm": 0.10418657958507538, + "learning_rate": 0.0002896583723793792, + "loss": 0.5391, + "step": 415 + }, + { + "epoch": 0.21, + "grad_norm": 0.16421550512313843, + "learning_rate": 0.00028917543808483796, + "loss": 0.4699, + "step": 420 + }, + { + "epoch": 0.2125, + "grad_norm": 0.12929962575435638, + "learning_rate": 0.00028868190504184696, + "loss": 0.4984, + "step": 425 + }, + { + "epoch": 0.215, + "grad_norm": 0.10469454526901245, + "learning_rate": 0.00028817781083485816, + "loss": 0.5119, + "step": 430 + }, + { + "epoch": 0.2175, + "grad_norm": 0.0964970663189888, + "learning_rate": 0.00028766319385259713, + "loss": 0.5167, + "step": 435 + }, + { + "epoch": 0.22, + "grad_norm": 0.12395574152469635, + "learning_rate": 0.00028713809328513953, + "loss": 0.5692, + "step": 440 + }, + { + "epoch": 0.2225, + "grad_norm": 0.10189738124608994, + "learning_rate": 0.0002866025491209265, + "loss": 0.4628, + "step": 445 + }, + { + "epoch": 0.225, + "grad_norm": 0.10433454066514969, + "learning_rate": 0.0002860566021437197, + "loss": 0.4869, + "step": 450 + }, + { + "epoch": 0.2275, + "grad_norm": 0.13003456592559814, + "learning_rate": 0.0002855002939294951, + "loss": 0.5291, + "step": 455 + }, + { + "epoch": 0.23, + "grad_norm": 0.11692202836275101, + "learning_rate": 0.000284933666843277, + "loss": 0.5229, + "step": 460 + }, + { + "epoch": 0.2325, + "grad_norm": 0.10757846385240555, + "learning_rate": 
0.0002843567640359119, + "loss": 0.435, + "step": 465 + }, + { + "epoch": 0.235, + "grad_norm": 0.10775501281023026, + "learning_rate": 0.00028376962944078206, + "loss": 0.4418, + "step": 470 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11543692648410797, + "learning_rate": 0.00028317230777046015, + "loss": 0.4204, + "step": 475 + }, + { + "epoch": 0.24, + "grad_norm": 0.10946698486804962, + "learning_rate": 0.00028256484451330403, + "loss": 0.49, + "step": 480 + }, + { + "epoch": 0.2425, + "grad_norm": 0.11528221517801285, + "learning_rate": 0.00028194728592999247, + "loss": 0.4752, + "step": 485 + }, + { + "epoch": 0.245, + "grad_norm": 0.10474205762147903, + "learning_rate": 0.0002813196790500027, + "loss": 0.4847, + "step": 490 + }, + { + "epoch": 0.2475, + "grad_norm": 0.10768820345401764, + "learning_rate": 0.00028068207166802837, + "loss": 0.4664, + "step": 495 + }, + { + "epoch": 0.25, + "grad_norm": 0.12158560007810593, + "learning_rate": 0.00028003451234034037, + "loss": 0.4741, + "step": 500 + }, + { + "epoch": 0.2525, + "grad_norm": 0.11635497957468033, + "learning_rate": 0.0002793770503810886, + "loss": 0.4969, + "step": 505 + }, + { + "epoch": 0.255, + "grad_norm": 0.12205849587917328, + "learning_rate": 0.00027870973585854665, + "loss": 0.4798, + "step": 510 + }, + { + "epoch": 0.2575, + "grad_norm": 0.10270871222019196, + "learning_rate": 0.00027803261959129905, + "loss": 0.3888, + "step": 515 + }, + { + "epoch": 0.26, + "grad_norm": 0.11313367635011673, + "learning_rate": 0.0002773457531443712, + "loss": 0.4759, + "step": 520 + }, + { + "epoch": 0.2625, + "grad_norm": 0.12905193865299225, + "learning_rate": 0.00027664918882530225, + "loss": 0.4442, + "step": 525 + }, + { + "epoch": 0.265, + "grad_norm": 0.11690939962863922, + "learning_rate": 0.00027594297968016197, + "loss": 0.5535, + "step": 530 + }, + { + "epoch": 0.2675, + "grad_norm": 0.10021405667066574, + "learning_rate": 0.00027522717948951094, + "loss": 0.4717, + "step": 535 + }, + { + 
"epoch": 0.27, + "grad_norm": 0.10104178637266159, + "learning_rate": 0.0002745018427643051, + "loss": 0.4906, + "step": 540 + }, + { + "epoch": 0.2725, + "grad_norm": 0.12113891541957855, + "learning_rate": 0.00027376702474174425, + "loss": 0.5674, + "step": 545 + }, + { + "epoch": 0.275, + "grad_norm": 0.11330476403236389, + "learning_rate": 0.0002730227813810658, + "loss": 0.5184, + "step": 550 + }, + { + "epoch": 0.2775, + "grad_norm": 0.1025850847363472, + "learning_rate": 0.0002722691693592831, + "loss": 0.4395, + "step": 555 + }, + { + "epoch": 0.28, + "grad_norm": 0.11591499298810959, + "learning_rate": 0.0002715062460668694, + "loss": 0.5003, + "step": 560 + }, + { + "epoch": 0.2825, + "grad_norm": 0.11281153559684753, + "learning_rate": 0.0002707340696033871, + "loss": 0.4672, + "step": 565 + }, + { + "epoch": 0.285, + "grad_norm": 0.1123538464307785, + "learning_rate": 0.00026995269877306356, + "loss": 0.513, + "step": 570 + }, + { + "epoch": 0.2875, + "grad_norm": 0.10776390135288239, + "learning_rate": 0.0002691621930803127, + "loss": 0.4572, + "step": 575 + }, + { + "epoch": 0.29, + "grad_norm": 0.10008667409420013, + "learning_rate": 0.0002683626127252036, + "loss": 0.4618, + "step": 580 + }, + { + "epoch": 0.2925, + "grad_norm": 0.13961340487003326, + "learning_rate": 0.00026755401859887595, + "loss": 0.4819, + "step": 585 + }, + { + "epoch": 0.295, + "grad_norm": 0.1476685106754303, + "learning_rate": 0.00026673647227890316, + "loss": 0.4964, + "step": 590 + }, + { + "epoch": 0.2975, + "grad_norm": 0.09795507788658142, + "learning_rate": 0.00026591003602460263, + "loss": 0.4796, + "step": 595 + }, + { + "epoch": 0.3, + "grad_norm": 0.10903532058000565, + "learning_rate": 0.00026507477277229496, + "loss": 0.4775, + "step": 600 + }, + { + "epoch": 0.3025, + "grad_norm": 0.10258448123931885, + "learning_rate": 0.0002642307461305105, + "loss": 0.4519, + "step": 605 + }, + { + "epoch": 0.305, + "grad_norm": 0.11204435676336288, + "learning_rate": 
0.0002633780203751459, + "loss": 0.4451, + "step": 610 + }, + { + "epoch": 0.3075, + "grad_norm": 0.10147629678249359, + "learning_rate": 0.0002625166604445689, + "loss": 0.4256, + "step": 615 + }, + { + "epoch": 0.31, + "grad_norm": 0.10481107234954834, + "learning_rate": 0.00026164673193467306, + "loss": 0.4381, + "step": 620 + }, + { + "epoch": 0.3125, + "grad_norm": 0.10856641829013824, + "learning_rate": 0.00026076830109388255, + "loss": 0.4958, + "step": 625 + }, + { + "epoch": 0.315, + "grad_norm": 0.09918677806854248, + "learning_rate": 0.0002598814348181068, + "loss": 0.4335, + "step": 630 + }, + { + "epoch": 0.3175, + "grad_norm": 0.10417389869689941, + "learning_rate": 0.00025898620064564637, + "loss": 0.4603, + "step": 635 + }, + { + "epoch": 0.32, + "grad_norm": 0.0903329998254776, + "learning_rate": 0.00025808266675204954, + "loss": 0.3932, + "step": 640 + }, + { + "epoch": 0.3225, + "grad_norm": 0.11511855572462082, + "learning_rate": 0.0002571709019449205, + "loss": 0.4169, + "step": 645 + }, + { + "epoch": 0.325, + "grad_norm": 0.11355557292699814, + "learning_rate": 0.0002562509756586793, + "loss": 0.4455, + "step": 650 + }, + { + "epoch": 0.3275, + "grad_norm": 0.1271187961101532, + "learning_rate": 0.00025532295794927437, + "loss": 0.4902, + "step": 655 + }, + { + "epoch": 0.33, + "grad_norm": 0.11936645954847336, + "learning_rate": 0.0002543869194888471, + "loss": 0.4843, + "step": 660 + }, + { + "epoch": 0.3325, + "grad_norm": 0.11935465037822723, + "learning_rate": 0.00025344293156035044, + "loss": 0.4402, + "step": 665 + }, + { + "epoch": 0.335, + "grad_norm": 0.13073407113552094, + "learning_rate": 0.00025249106605211986, + "loss": 0.467, + "step": 670 + }, + { + "epoch": 0.3375, + "grad_norm": 0.10340435802936554, + "learning_rate": 0.0002515313954523991, + "loss": 0.4827, + "step": 675 + }, + { + "epoch": 0.34, + "grad_norm": 0.11634550243616104, + "learning_rate": 0.00025056399284381983, + "loss": 0.466, + "step": 680 + }, + { + "epoch": 
0.3425, + "grad_norm": 0.10582319647073746, + "learning_rate": 0.0002495889318978362, + "loss": 0.4751, + "step": 685 + }, + { + "epoch": 0.345, + "grad_norm": 0.16781780123710632, + "learning_rate": 0.00024860628686911436, + "loss": 0.4717, + "step": 690 + }, + { + "epoch": 0.3475, + "grad_norm": 0.11522196233272552, + "learning_rate": 0.0002476161325898776, + "loss": 0.4687, + "step": 695 + }, + { + "epoch": 0.35, + "grad_norm": 0.11830449104309082, + "learning_rate": 0.000246618544464208, + "loss": 0.436, + "step": 700 + }, + { + "epoch": 0.3525, + "grad_norm": 0.17485427856445312, + "learning_rate": 0.0002456135984623034, + "loss": 0.4284, + "step": 705 + }, + { + "epoch": 0.355, + "grad_norm": 0.12288108468055725, + "learning_rate": 0.00024460137111469296, + "loss": 0.4261, + "step": 710 + }, + { + "epoch": 0.3575, + "grad_norm": 0.11587081104516983, + "learning_rate": 0.0002435819395064079, + "loss": 0.4493, + "step": 715 + }, + { + "epoch": 0.36, + "grad_norm": 0.10690271109342575, + "learning_rate": 0.0002425553812711123, + "loss": 0.4648, + "step": 720 + }, + { + "epoch": 0.3625, + "grad_norm": 0.10404397547245026, + "learning_rate": 0.00024152177458519014, + "loss": 0.4634, + "step": 725 + }, + { + "epoch": 0.365, + "grad_norm": 0.11986954510211945, + "learning_rate": 0.00024048119816179236, + "loss": 0.4525, + "step": 730 + }, + { + "epoch": 0.3675, + "grad_norm": 0.10243026167154312, + "learning_rate": 0.00023943373124484234, + "loss": 0.4572, + "step": 735 + }, + { + "epoch": 0.37, + "grad_norm": 0.10386748611927032, + "learning_rate": 0.00023837945360300129, + "loss": 0.3884, + "step": 740 + }, + { + "epoch": 0.3725, + "grad_norm": 0.11165735125541687, + "learning_rate": 0.0002373184455235934, + "loss": 0.4902, + "step": 745 + }, + { + "epoch": 0.375, + "grad_norm": 0.09951601922512054, + "learning_rate": 0.00023625078780649178, + "loss": 0.4541, + "step": 750 + }, + { + "epoch": 0.3775, + "grad_norm": 0.10347504913806915, + "learning_rate": 
0.00023517656175796518, + "loss": 0.3871, + "step": 755 + }, + { + "epoch": 0.38, + "grad_norm": 0.10478132963180542, + "learning_rate": 0.00023409584918448627, + "loss": 0.4329, + "step": 760 + }, + { + "epoch": 0.3825, + "grad_norm": 0.1198212131857872, + "learning_rate": 0.00023300873238650159, + "loss": 0.425, + "step": 765 + }, + { + "epoch": 0.385, + "grad_norm": 0.1103711724281311, + "learning_rate": 0.00023191529415216434, + "loss": 0.4274, + "step": 770 + }, + { + "epoch": 0.3875, + "grad_norm": 0.09940385073423386, + "learning_rate": 0.00023081561775102944, + "loss": 0.4368, + "step": 775 + }, + { + "epoch": 0.39, + "grad_norm": 0.11599268019199371, + "learning_rate": 0.00022970978692771242, + "loss": 0.4386, + "step": 780 + }, + { + "epoch": 0.3925, + "grad_norm": 0.10101296752691269, + "learning_rate": 0.00022859788589551188, + "loss": 0.4696, + "step": 785 + }, + { + "epoch": 0.395, + "grad_norm": 0.10112808644771576, + "learning_rate": 0.00022747999932999624, + "loss": 0.4066, + "step": 790 + }, + { + "epoch": 0.3975, + "grad_norm": 0.09595459699630737, + "learning_rate": 0.00022635621236255567, + "loss": 0.4837, + "step": 795 + }, + { + "epoch": 0.4, + "grad_norm": 0.10761380940675735, + "learning_rate": 0.00022522661057391857, + "loss": 0.5446, + "step": 800 + }, + { + "epoch": 0.4025, + "grad_norm": 0.11919954419136047, + "learning_rate": 0.00022409127998763463, + "loss": 0.5027, + "step": 805 + }, + { + "epoch": 0.405, + "grad_norm": 0.10851597785949707, + "learning_rate": 0.00022295030706352356, + "loss": 0.4481, + "step": 810 + }, + { + "epoch": 0.4075, + "grad_norm": 0.10030311346054077, + "learning_rate": 0.00022180377869109104, + "loss": 0.4709, + "step": 815 + }, + { + "epoch": 0.41, + "grad_norm": 0.111280657351017, + "learning_rate": 0.00022065178218291147, + "loss": 0.4423, + "step": 820 + }, + { + "epoch": 0.4125, + "grad_norm": 0.11253602802753448, + "learning_rate": 0.00021949440526797926, + "loss": 0.4136, + "step": 825 + }, + { + 
"epoch": 0.415, + "grad_norm": 0.10805424302816391, + "learning_rate": 0.00021833173608502732, + "loss": 0.4656, + "step": 830 + }, + { + "epoch": 0.4175, + "grad_norm": 0.10983198881149292, + "learning_rate": 0.00021716386317581542, + "loss": 0.3687, + "step": 835 + }, + { + "epoch": 0.42, + "grad_norm": 0.10653118044137955, + "learning_rate": 0.00021599087547838727, + "loss": 0.4654, + "step": 840 + }, + { + "epoch": 0.4225, + "grad_norm": 0.10856354981660843, + "learning_rate": 0.00021481286232029735, + "loss": 0.4298, + "step": 845 + }, + { + "epoch": 0.425, + "grad_norm": 0.11233706772327423, + "learning_rate": 0.0002136299134118085, + "loss": 0.4484, + "step": 850 + }, + { + "epoch": 0.4275, + "grad_norm": 0.1085442528128624, + "learning_rate": 0.00021244211883906017, + "loss": 0.4776, + "step": 855 + }, + { + "epoch": 0.43, + "grad_norm": 0.12297824025154114, + "learning_rate": 0.0002112495690572077, + "loss": 0.4029, + "step": 860 + }, + { + "epoch": 0.4325, + "grad_norm": 0.10838114470243454, + "learning_rate": 0.00021005235488353428, + "loss": 0.4848, + "step": 865 + }, + { + "epoch": 0.435, + "grad_norm": 0.10273341834545135, + "learning_rate": 0.0002088505674905342, + "loss": 0.3989, + "step": 870 + }, + { + "epoch": 0.4375, + "grad_norm": 0.11189126968383789, + "learning_rate": 0.0002076442983989705, + "loss": 0.438, + "step": 875 + }, + { + "epoch": 0.44, + "grad_norm": 0.11592905968427658, + "learning_rate": 0.0002064336394709048, + "loss": 0.4786, + "step": 880 + }, + { + "epoch": 0.4425, + "grad_norm": 0.11230389773845673, + "learning_rate": 0.0002052186829027017, + "loss": 0.3999, + "step": 885 + }, + { + "epoch": 0.445, + "grad_norm": 0.12455113977193832, + "learning_rate": 0.00020399952121800767, + "loss": 0.4856, + "step": 890 + }, + { + "epoch": 0.4475, + "grad_norm": 0.1001812294125557, + "learning_rate": 0.00020277624726070526, + "loss": 0.4689, + "step": 895 + }, + { + "epoch": 0.45, + "grad_norm": 0.11319112777709961, + "learning_rate": 
0.00020154895418784242, + "loss": 0.3998, + "step": 900 + }, + { + "epoch": 0.4525, + "grad_norm": 0.11322236061096191, + "learning_rate": 0.00020031773546253824, + "loss": 0.4321, + "step": 905 + }, + { + "epoch": 0.455, + "grad_norm": 0.12924689054489136, + "learning_rate": 0.00019908268484686558, + "loss": 0.4208, + "step": 910 + }, + { + "epoch": 0.4575, + "grad_norm": 0.11435618251562119, + "learning_rate": 0.00019784389639471048, + "loss": 0.4682, + "step": 915 + }, + { + "epoch": 0.46, + "grad_norm": 0.10801081359386444, + "learning_rate": 0.00019660146444460975, + "loss": 0.428, + "step": 920 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10906939953565598, + "learning_rate": 0.0001953554836125667, + "loss": 0.4455, + "step": 925 + }, + { + "epoch": 0.465, + "grad_norm": 0.10790123790502548, + "learning_rate": 0.00019410604878484556, + "loss": 0.4544, + "step": 930 + }, + { + "epoch": 0.4675, + "grad_norm": 0.10536376386880875, + "learning_rate": 0.000192853255110746, + "loss": 0.376, + "step": 935 + }, + { + "epoch": 0.47, + "grad_norm": 0.11744682490825653, + "learning_rate": 0.00019159719799535668, + "loss": 0.3887, + "step": 940 + }, + { + "epoch": 0.4725, + "grad_norm": 0.12954068183898926, + "learning_rate": 0.00019033797309228983, + "loss": 0.4075, + "step": 945 + }, + { + "epoch": 0.475, + "grad_norm": 0.1401606798171997, + "learning_rate": 0.00018907567629639725, + "loss": 0.4454, + "step": 950 + }, + { + "epoch": 0.4775, + "grad_norm": 0.12059322744607925, + "learning_rate": 0.00018781040373646706, + "loss": 0.4339, + "step": 955 + }, + { + "epoch": 0.48, + "grad_norm": 0.11798987537622452, + "learning_rate": 0.00018654225176790336, + "loss": 0.4405, + "step": 960 + }, + { + "epoch": 0.4825, + "grad_norm": 0.11344211548566818, + "learning_rate": 0.00018527131696538846, + "loss": 0.4124, + "step": 965 + }, + { + "epoch": 0.485, + "grad_norm": 0.10373330116271973, + "learning_rate": 0.00018399769611552824, + "loss": 0.4329, + "step": 970 + }, + { + 
"epoch": 0.4875, + "grad_norm": 0.12053704261779785, + "learning_rate": 0.0001827214862094814, + "loss": 0.4944, + "step": 975 + }, + { + "epoch": 0.49, + "grad_norm": 0.141033336520195, + "learning_rate": 0.00018144278443557328, + "loss": 0.4569, + "step": 980 + }, + { + "epoch": 0.4925, + "grad_norm": 0.10922867804765701, + "learning_rate": 0.0001801616881718947, + "loss": 0.3879, + "step": 985 + }, + { + "epoch": 0.495, + "grad_norm": 0.09843657910823822, + "learning_rate": 0.00017887829497888612, + "loss": 0.4106, + "step": 990 + }, + { + "epoch": 0.4975, + "grad_norm": 0.12131062150001526, + "learning_rate": 0.000177592702591908, + "loss": 0.4023, + "step": 995 + }, + { + "epoch": 0.5, + "grad_norm": 0.11343283206224442, + "learning_rate": 0.00017630500891379806, + "loss": 0.4824, + "step": 1000 + }, + { + "epoch": 0.5025, + "grad_norm": 0.11050508171319962, + "learning_rate": 0.00017501531200741534, + "loss": 0.4098, + "step": 1005 + }, + { + "epoch": 0.505, + "grad_norm": 0.11737144738435745, + "learning_rate": 0.00017372371008817256, + "loss": 0.3943, + "step": 1010 + }, + { + "epoch": 0.5075, + "grad_norm": 0.11473528295755386, + "learning_rate": 0.00017243030151655643, + "loss": 0.3796, + "step": 1015 + }, + { + "epoch": 0.51, + "grad_norm": 0.13086555898189545, + "learning_rate": 0.00017113518479063738, + "loss": 0.4367, + "step": 1020 + }, + { + "epoch": 0.5125, + "grad_norm": 0.11752833425998688, + "learning_rate": 0.00016983845853856837, + "loss": 0.4097, + "step": 1025 + }, + { + "epoch": 0.515, + "grad_norm": 0.11596900969743729, + "learning_rate": 0.0001685402215110739, + "loss": 0.3812, + "step": 1030 + }, + { + "epoch": 0.5175, + "grad_norm": 0.11850260943174362, + "learning_rate": 0.00016724057257392998, + "loss": 0.4354, + "step": 1035 + }, + { + "epoch": 0.52, + "grad_norm": 0.12466365844011307, + "learning_rate": 0.00016593961070043498, + "loss": 0.4317, + "step": 1040 + }, + { + "epoch": 0.5225, + "grad_norm": 0.11178991943597794, + 
"learning_rate": 0.0001646374349638724, + "loss": 0.3936, + "step": 1045 + }, + { + "epoch": 0.525, + "grad_norm": 0.11252165585756302, + "learning_rate": 0.00016333414452996623, + "loss": 0.386, + "step": 1050 + }, + { + "epoch": 0.5275, + "grad_norm": 0.12886975705623627, + "learning_rate": 0.0001620298386493288, + "loss": 0.3965, + "step": 1055 + }, + { + "epoch": 0.53, + "grad_norm": 0.11716549098491669, + "learning_rate": 0.00016072461664990288, + "loss": 0.3924, + "step": 1060 + }, + { + "epoch": 0.5325, + "grad_norm": 0.11604485660791397, + "learning_rate": 0.000159418577929397, + "loss": 0.3624, + "step": 1065 + }, + { + "epoch": 0.535, + "grad_norm": 0.11538460850715637, + "learning_rate": 0.00015811182194771633, + "loss": 0.4338, + "step": 1070 + }, + { + "epoch": 0.5375, + "grad_norm": 0.11618762463331223, + "learning_rate": 0.00015680444821938804, + "loss": 0.4058, + "step": 1075 + }, + { + "epoch": 0.54, + "grad_norm": 0.11750835925340652, + "learning_rate": 0.00015549655630598343, + "loss": 0.4422, + "step": 1080 + }, + { + "epoch": 0.5425, + "grad_norm": 0.12725204229354858, + "learning_rate": 0.00015418824580853535, + "loss": 0.4422, + "step": 1085 + }, + { + "epoch": 0.545, + "grad_norm": 0.11274927109479904, + "learning_rate": 0.00015287961635995347, + "loss": 0.4229, + "step": 1090 + }, + { + "epoch": 0.5475, + "grad_norm": 0.11833129078149796, + "learning_rate": 0.00015157076761743686, + "loss": 0.4442, + "step": 1095 + }, + { + "epoch": 0.55, + "grad_norm": 0.11384794861078262, + "learning_rate": 0.00015026179925488475, + "loss": 0.4528, + "step": 1100 + }, + { + "epoch": 0.5525, + "grad_norm": 0.11864661425352097, + "learning_rate": 0.00014895281095530575, + "loss": 0.3988, + "step": 1105 + }, + { + "epoch": 0.555, + "grad_norm": 0.11673832684755325, + "learning_rate": 0.00014764390240322691, + "loss": 0.3544, + "step": 1110 + }, + { + "epoch": 0.5575, + "grad_norm": 0.1174502745270729, + "learning_rate": 0.00014633517327710202, + "loss": 
0.4034, + "step": 1115 + }, + { + "epoch": 0.56, + "grad_norm": 0.12685547769069672, + "learning_rate": 0.00014502672324172107, + "loss": 0.3595, + "step": 1120 + }, + { + "epoch": 0.5625, + "grad_norm": 0.12368053942918777, + "learning_rate": 0.00014371865194062007, + "loss": 0.3395, + "step": 1125 + }, + { + "epoch": 0.565, + "grad_norm": 0.1077839657664299, + "learning_rate": 0.000142411058988493, + "loss": 0.4199, + "step": 1130 + }, + { + "epoch": 0.5675, + "grad_norm": 0.11699855327606201, + "learning_rate": 0.00014110404396360576, + "loss": 0.3443, + "step": 1135 + }, + { + "epoch": 0.57, + "grad_norm": 0.13238464295864105, + "learning_rate": 0.0001397977064002128, + "loss": 0.3499, + "step": 1140 + }, + { + "epoch": 0.5725, + "grad_norm": 0.11482933163642883, + "learning_rate": 0.0001384921457809772, + "loss": 0.3619, + "step": 1145 + }, + { + "epoch": 0.575, + "grad_norm": 0.13390353322029114, + "learning_rate": 0.00013718746152939487, + "loss": 0.3684, + "step": 1150 + }, + { + "epoch": 0.5775, + "grad_norm": 0.11464900523424149, + "learning_rate": 0.00013588375300222283, + "loss": 0.3313, + "step": 1155 + }, + { + "epoch": 0.58, + "grad_norm": 0.10367871820926666, + "learning_rate": 0.00013458111948191296, + "loss": 0.3323, + "step": 1160 + }, + { + "epoch": 0.5825, + "grad_norm": 0.12259294092655182, + "learning_rate": 0.0001332796601690512, + "loss": 0.3986, + "step": 1165 + }, + { + "epoch": 0.585, + "grad_norm": 0.10923358052968979, + "learning_rate": 0.00013197947417480292, + "loss": 0.3808, + "step": 1170 + }, + { + "epoch": 0.5875, + "grad_norm": 0.12479504942893982, + "learning_rate": 0.0001306806605133656, + "loss": 0.4429, + "step": 1175 + }, + { + "epoch": 0.59, + "grad_norm": 0.11521733552217484, + "learning_rate": 0.000129383318094428, + "loss": 0.4778, + "step": 1180 + }, + { + "epoch": 0.5925, + "grad_norm": 0.14112086594104767, + "learning_rate": 0.00012808754571563827, + "loss": 0.4634, + "step": 1185 + }, + { + "epoch": 0.595, + 
"grad_norm": 0.12947902083396912, + "learning_rate": 0.00012679344205507981, + "loss": 0.4439, + "step": 1190 + }, + { + "epoch": 0.5975, + "grad_norm": 0.13288578391075134, + "learning_rate": 0.0001255011056637567, + "loss": 0.4402, + "step": 1195 + }, + { + "epoch": 0.6, + "grad_norm": 0.1216069906949997, + "learning_rate": 0.00012421063495808853, + "loss": 0.4203, + "step": 1200 + }, + { + "epoch": 0.6025, + "grad_norm": 0.11649637669324875, + "learning_rate": 0.000122922128212416, + "loss": 0.4512, + "step": 1205 + }, + { + "epoch": 0.605, + "grad_norm": 0.1201406940817833, + "learning_rate": 0.00012163568355151628, + "loss": 0.3725, + "step": 1210 + }, + { + "epoch": 0.6075, + "grad_norm": 0.12117727100849152, + "learning_rate": 0.00012035139894313107, + "loss": 0.4352, + "step": 1215 + }, + { + "epoch": 0.61, + "grad_norm": 0.11709322035312653, + "learning_rate": 0.00011906937219050556, + "loss": 0.4189, + "step": 1220 + }, + { + "epoch": 0.6125, + "grad_norm": 0.11865726858377457, + "learning_rate": 0.0001177897009249405, + "loss": 0.3796, + "step": 1225 + }, + { + "epoch": 0.615, + "grad_norm": 0.10807759314775467, + "learning_rate": 0.0001165124825983573, + "loss": 0.4465, + "step": 1230 + }, + { + "epoch": 0.6175, + "grad_norm": 0.13788209855556488, + "learning_rate": 0.00011523781447587641, + "loss": 0.4994, + "step": 1235 + }, + { + "epoch": 0.62, + "grad_norm": 0.12921364605426788, + "learning_rate": 0.00011396579362841044, + "loss": 0.4251, + "step": 1240 + }, + { + "epoch": 0.6225, + "grad_norm": 0.12162365019321442, + "learning_rate": 0.0001126965169252718, + "loss": 0.3864, + "step": 1245 + }, + { + "epoch": 0.625, + "grad_norm": 0.12897826731204987, + "learning_rate": 0.00011143008102679559, + "loss": 0.3753, + "step": 1250 + }, + { + "epoch": 0.6275, + "grad_norm": 0.116109699010849, + "learning_rate": 0.00011016658237697866, + "loss": 0.3296, + "step": 1255 + }, + { + "epoch": 0.63, + "grad_norm": 0.12935414910316467, + "learning_rate": 
0.00010890611719613512, + "loss": 0.3797, + "step": 1260 + }, + { + "epoch": 0.6325, + "grad_norm": 0.13730891048908234, + "learning_rate": 0.0001076487814735685, + "loss": 0.3711, + "step": 1265 + }, + { + "epoch": 0.635, + "grad_norm": 0.13870631158351898, + "learning_rate": 0.00010639467096026211, + "loss": 0.4328, + "step": 1270 + }, + { + "epoch": 0.6375, + "grad_norm": 0.11644043773412704, + "learning_rate": 0.00010514388116158701, + "loss": 0.3283, + "step": 1275 + }, + { + "epoch": 0.64, + "grad_norm": 0.12221091985702515, + "learning_rate": 0.00010389650733002894, + "loss": 0.3898, + "step": 1280 + }, + { + "epoch": 0.6425, + "grad_norm": 0.12048634141683578, + "learning_rate": 0.00010265264445793464, + "loss": 0.3256, + "step": 1285 + }, + { + "epoch": 0.645, + "grad_norm": 0.1250566840171814, + "learning_rate": 0.00010141238727027761, + "loss": 0.408, + "step": 1290 + }, + { + "epoch": 0.6475, + "grad_norm": 0.13518592715263367, + "learning_rate": 0.00010017583021744454, + "loss": 0.3763, + "step": 1295 + }, + { + "epoch": 0.65, + "grad_norm": 0.13047736883163452, + "learning_rate": 9.89430674680425e-05, + "loss": 0.3989, + "step": 1300 + }, + { + "epoch": 0.6525, + "grad_norm": 0.11474955826997757, + "learning_rate": 9.771419290172773e-05, + "loss": 0.3374, + "step": 1305 + }, + { + "epoch": 0.655, + "grad_norm": 0.11670063436031342, + "learning_rate": 9.648930010205619e-05, + "loss": 0.3343, + "step": 1310 + }, + { + "epoch": 0.6575, + "grad_norm": 0.15385080873966217, + "learning_rate": 9.526848234935704e-05, + "loss": 0.3432, + "step": 1315 + }, + { + "epoch": 0.66, + "grad_norm": 0.13441519439220428, + "learning_rate": 9.405183261362863e-05, + "loss": 0.3116, + "step": 1320 + }, + { + "epoch": 0.6625, + "grad_norm": 0.14772167801856995, + "learning_rate": 9.283944354745888e-05, + "loss": 0.3613, + "step": 1325 + }, + { + "epoch": 0.665, + "grad_norm": 0.12146154791116714, + "learning_rate": 9.163140747896907e-05, + "loss": 0.3411, + "step": 1330 + 
}, + { + "epoch": 0.6675, + "grad_norm": 0.1333102583885193, + "learning_rate": 9.042781640478291e-05, + "loss": 0.396, + "step": 1335 + }, + { + "epoch": 0.67, + "grad_norm": 0.12051521986722946, + "learning_rate": 8.922876198302062e-05, + "loss": 0.3837, + "step": 1340 + }, + { + "epoch": 0.6725, + "grad_norm": 0.12071400880813599, + "learning_rate": 8.803433552631874e-05, + "loss": 0.354, + "step": 1345 + }, + { + "epoch": 0.675, + "grad_norm": 0.11258620023727417, + "learning_rate": 8.684462799487635e-05, + "loss": 0.3197, + "step": 1350 + }, + { + "epoch": 0.6775, + "grad_norm": 0.11908067762851715, + "learning_rate": 8.565972998952814e-05, + "loss": 0.377, + "step": 1355 + }, + { + "epoch": 0.68, + "grad_norm": 0.1252991259098053, + "learning_rate": 8.447973174484469e-05, + "loss": 0.3438, + "step": 1360 + }, + { + "epoch": 0.6825, + "grad_norm": 0.12832245230674744, + "learning_rate": 8.330472312226091e-05, + "loss": 0.346, + "step": 1365 + }, + { + "epoch": 0.685, + "grad_norm": 0.1396942287683487, + "learning_rate": 8.213479360323258e-05, + "loss": 0.3886, + "step": 1370 + }, + { + "epoch": 0.6875, + "grad_norm": 0.12938210368156433, + "learning_rate": 8.097003228242225e-05, + "loss": 0.3699, + "step": 1375 + }, + { + "epoch": 0.69, + "grad_norm": 0.12459377944469452, + "learning_rate": 7.9810527860914e-05, + "loss": 0.3892, + "step": 1380 + }, + { + "epoch": 0.6925, + "grad_norm": 0.1360333263874054, + "learning_rate": 7.86563686394587e-05, + "loss": 0.3423, + "step": 1385 + }, + { + "epoch": 0.695, + "grad_norm": 0.1357765644788742, + "learning_rate": 7.750764251174963e-05, + "loss": 0.408, + "step": 1390 + }, + { + "epoch": 0.6975, + "grad_norm": 0.14453718066215515, + "learning_rate": 7.636443695772887e-05, + "loss": 0.3398, + "step": 1395 + }, + { + "epoch": 0.7, + "grad_norm": 0.11541519314050674, + "learning_rate": 7.522683903692547e-05, + "loss": 0.4203, + "step": 1400 + }, + { + "epoch": 0.7025, + "grad_norm": 0.13344840705394745, + 
"learning_rate": 7.409493538182545e-05, + "loss": 0.3694, + "step": 1405 + }, + { + "epoch": 0.705, + "grad_norm": 0.13069866597652435, + "learning_rate": 7.296881219127452e-05, + "loss": 0.3889, + "step": 1410 + }, + { + "epoch": 0.7075, + "grad_norm": 0.12457838654518127, + "learning_rate": 7.184855522391359e-05, + "loss": 0.3342, + "step": 1415 + }, + { + "epoch": 0.71, + "grad_norm": 0.11990659683942795, + "learning_rate": 7.073424979164794e-05, + "loss": 0.3855, + "step": 1420 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1389523446559906, + "learning_rate": 6.962598075315046e-05, + "loss": 0.3943, + "step": 1425 + }, + { + "epoch": 0.715, + "grad_norm": 0.14108599722385406, + "learning_rate": 6.852383250739938e-05, + "loss": 0.388, + "step": 1430 + }, + { + "epoch": 0.7175, + "grad_norm": 0.1342005580663681, + "learning_rate": 6.742788898725065e-05, + "loss": 0.3602, + "step": 1435 + }, + { + "epoch": 0.72, + "grad_norm": 0.13516324758529663, + "learning_rate": 6.633823365304648e-05, + "loss": 0.3935, + "step": 1440 + }, + { + "epoch": 0.7225, + "grad_norm": 0.1302197426557541, + "learning_rate": 6.52549494862593e-05, + "loss": 0.3618, + "step": 1445 + }, + { + "epoch": 0.725, + "grad_norm": 0.12428996711969376, + "learning_rate": 6.417811898317259e-05, + "loss": 0.3338, + "step": 1450 + }, + { + "epoch": 0.7275, + "grad_norm": 0.11249776184558868, + "learning_rate": 6.31078241485982e-05, + "loss": 0.3819, + "step": 1455 + }, + { + "epoch": 0.73, + "grad_norm": 0.1359994113445282, + "learning_rate": 6.204414648963159e-05, + "loss": 0.3356, + "step": 1460 + }, + { + "epoch": 0.7325, + "grad_norm": 0.1118568629026413, + "learning_rate": 6.098716700944479e-05, + "loss": 0.3223, + "step": 1465 + }, + { + "epoch": 0.735, + "grad_norm": 0.12038140743970871, + "learning_rate": 5.993696620111741e-05, + "loss": 0.3481, + "step": 1470 + }, + { + "epoch": 0.7375, + "grad_norm": 0.12787550687789917, + "learning_rate": 5.889362404150703e-05, + "loss": 0.3766, + "step": 
1475 + }, + { + "epoch": 0.74, + "grad_norm": 0.12134893983602524, + "learning_rate": 5.7857219985158506e-05, + "loss": 0.2916, + "step": 1480 + }, + { + "epoch": 0.7425, + "grad_norm": 0.1274223029613495, + "learning_rate": 5.682783295825345e-05, + "loss": 0.3095, + "step": 1485 + }, + { + "epoch": 0.745, + "grad_norm": 0.11817299574613571, + "learning_rate": 5.580554135259932e-05, + "loss": 0.3422, + "step": 1490 + }, + { + "epoch": 0.7475, + "grad_norm": 0.1348387748003006, + "learning_rate": 5.479042301965987e-05, + "loss": 0.4044, + "step": 1495 + }, + { + "epoch": 0.75, + "grad_norm": 0.14032681286334991, + "learning_rate": 5.378255526462631e-05, + "loss": 0.337, + "step": 1500 + }, + { + "epoch": 0.7525, + "grad_norm": 0.1196574866771698, + "learning_rate": 5.2782014840530366e-05, + "loss": 0.3638, + "step": 1505 + }, + { + "epoch": 0.755, + "grad_norm": 0.1307535171508789, + "learning_rate": 5.178887794239904e-05, + "loss": 0.3514, + "step": 1510 + }, + { + "epoch": 0.7575, + "grad_norm": 0.12303224951028824, + "learning_rate": 5.080322020145224e-05, + "loss": 0.3825, + "step": 1515 + }, + { + "epoch": 0.76, + "grad_norm": 0.11517804116010666, + "learning_rate": 4.9825116679343025e-05, + "loss": 0.3474, + "step": 1520 + }, + { + "epoch": 0.7625, + "grad_norm": 0.1276445835828781, + "learning_rate": 4.885464186244154e-05, + "loss": 0.3084, + "step": 1525 + }, + { + "epoch": 0.765, + "grad_norm": 0.12166495621204376, + "learning_rate": 4.789186965616232e-05, + "loss": 0.2949, + "step": 1530 + }, + { + "epoch": 0.7675, + "grad_norm": 0.13007108867168427, + "learning_rate": 4.6936873379336564e-05, + "loss": 0.3336, + "step": 1535 + }, + { + "epoch": 0.77, + "grad_norm": 0.12368687242269516, + "learning_rate": 4.598972575862803e-05, + "loss": 0.3443, + "step": 1540 + }, + { + "epoch": 0.7725, + "grad_norm": 0.11817432940006256, + "learning_rate": 4.5050498922995166e-05, + "loss": 0.3198, + "step": 1545 + }, + { + "epoch": 0.775, + "grad_norm": 
0.13239014148712158, + "learning_rate": 4.4119264398197843e-05, + "loss": 0.3145, + "step": 1550 + }, + { + "epoch": 0.7775, + "grad_norm": 0.12305855751037598, + "learning_rate": 4.319609310135054e-05, + "loss": 0.3276, + "step": 1555 + }, + { + "epoch": 0.78, + "grad_norm": 0.13063360750675201, + "learning_rate": 4.228105533552169e-05, + "loss": 0.4115, + "step": 1560 + }, + { + "epoch": 0.7825, + "grad_norm": 0.12751415371894836, + "learning_rate": 4.137422078437991e-05, + "loss": 0.4113, + "step": 1565 + }, + { + "epoch": 0.785, + "grad_norm": 0.1429520696401596, + "learning_rate": 4.0475658506887136e-05, + "loss": 0.3634, + "step": 1570 + }, + { + "epoch": 0.7875, + "grad_norm": 0.13072626292705536, + "learning_rate": 3.9585436932039846e-05, + "loss": 0.3914, + "step": 1575 + }, + { + "epoch": 0.79, + "grad_norm": 0.13076546788215637, + "learning_rate": 3.870362385365755e-05, + "loss": 0.3153, + "step": 1580 + }, + { + "epoch": 0.7925, + "grad_norm": 0.11764945089817047, + "learning_rate": 3.7830286425220234e-05, + "loss": 0.331, + "step": 1585 + }, + { + "epoch": 0.795, + "grad_norm": 0.12469421327114105, + "learning_rate": 3.696549115475434e-05, + "loss": 0.3667, + "step": 1590 + }, + { + "epoch": 0.7975, + "grad_norm": 0.13257570564746857, + "learning_rate": 3.6109303899767875e-05, + "loss": 0.3775, + "step": 1595 + }, + { + "epoch": 0.8, + "grad_norm": 0.1399105191230774, + "learning_rate": 3.5261789862235235e-05, + "loss": 0.3786, + "step": 1600 + }, + { + "epoch": 0.8025, + "grad_norm": 0.1299823671579361, + "learning_rate": 3.442301358363163e-05, + "loss": 0.3984, + "step": 1605 + }, + { + "epoch": 0.805, + "grad_norm": 0.12068431079387665, + "learning_rate": 3.359303894001809e-05, + "loss": 0.3416, + "step": 1610 + }, + { + "epoch": 0.8075, + "grad_norm": 0.12825050950050354, + "learning_rate": 3.277192913717717e-05, + "loss": 0.3973, + "step": 1615 + }, + { + "epoch": 0.81, + "grad_norm": 0.12794139981269836, + "learning_rate": 3.195974670579941e-05, 
+ "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 0.8125, + "grad_norm": 0.1178906112909317, + "learning_rate": 3.115655349672141e-05, + "loss": 0.3549, + "step": 1625 + }, + { + "epoch": 0.815, + "grad_norm": 0.11859016120433807, + "learning_rate": 3.036241067621575e-05, + "loss": 0.3113, + "step": 1630 + }, + { + "epoch": 0.8175, + "grad_norm": 0.12508928775787354, + "learning_rate": 2.9577378721332843e-05, + "loss": 0.3802, + "step": 1635 + }, + { + "epoch": 0.82, + "grad_norm": 0.1293668895959854, + "learning_rate": 2.8801517415295455e-05, + "loss": 0.3098, + "step": 1640 + }, + { + "epoch": 0.8225, + "grad_norm": 0.12039236724376678, + "learning_rate": 2.8034885842945865e-05, + "loss": 0.2876, + "step": 1645 + }, + { + "epoch": 0.825, + "grad_norm": 0.14805036783218384, + "learning_rate": 2.7277542386246454e-05, + "loss": 0.3618, + "step": 1650 + }, + { + "epoch": 0.8275, + "grad_norm": 0.12638579308986664, + "learning_rate": 2.6529544719833706e-05, + "loss": 0.3328, + "step": 1655 + }, + { + "epoch": 0.83, + "grad_norm": 0.12427478283643723, + "learning_rate": 2.5790949806625838e-05, + "loss": 0.3394, + "step": 1660 + }, + { + "epoch": 0.8325, + "grad_norm": 0.1283419132232666, + "learning_rate": 2.5061813893485085e-05, + "loss": 0.3392, + "step": 1665 + }, + { + "epoch": 0.835, + "grad_norm": 0.12487384676933289, + "learning_rate": 2.434219250693419e-05, + "loss": 0.3592, + "step": 1670 + }, + { + "epoch": 0.8375, + "grad_norm": 0.14032793045043945, + "learning_rate": 2.363214044892788e-05, + "loss": 0.4099, + "step": 1675 + }, + { + "epoch": 0.84, + "grad_norm": 0.10917101800441742, + "learning_rate": 2.293171179267946e-05, + "loss": 0.3204, + "step": 1680 + }, + { + "epoch": 0.8425, + "grad_norm": 0.1253073364496231, + "learning_rate": 2.2240959878542848e-05, + "loss": 0.3378, + "step": 1685 + }, + { + "epoch": 0.845, + "grad_norm": 0.14096981287002563, + "learning_rate": 2.155993730995077e-05, + "loss": 0.378, + "step": 1690 + }, + { + "epoch": 0.8475, 
+ "grad_norm": 0.12039178609848022, + "learning_rate": 2.0888695949408468e-05, + "loss": 0.3197, + "step": 1695 + }, + { + "epoch": 0.85, + "grad_norm": 0.12723132967948914, + "learning_rate": 2.0227286914544353e-05, + "loss": 0.3241, + "step": 1700 + }, + { + "epoch": 0.8525, + "grad_norm": 0.1309029906988144, + "learning_rate": 1.9575760574217147e-05, + "loss": 0.3743, + "step": 1705 + }, + { + "epoch": 0.855, + "grad_norm": 0.1324499100446701, + "learning_rate": 1.893416654468022e-05, + "loss": 0.345, + "step": 1710 + }, + { + "epoch": 0.8575, + "grad_norm": 0.11905783414840698, + "learning_rate": 1.8302553685802917e-05, + "loss": 0.3514, + "step": 1715 + }, + { + "epoch": 0.86, + "grad_norm": 0.12570443749427795, + "learning_rate": 1.768097009734985e-05, + "loss": 0.3791, + "step": 1720 + }, + { + "epoch": 0.8625, + "grad_norm": 0.13414913415908813, + "learning_rate": 1.7069463115317788e-05, + "loss": 0.3575, + "step": 1725 + }, + { + "epoch": 0.865, + "grad_norm": 0.1283785104751587, + "learning_rate": 1.6468079308331023e-05, + "loss": 0.3496, + "step": 1730 + }, + { + "epoch": 0.8675, + "grad_norm": 0.11180217564105988, + "learning_rate": 1.587686447409478e-05, + "loss": 0.3245, + "step": 1735 + }, + { + "epoch": 0.87, + "grad_norm": 0.13804157078266144, + "learning_rate": 1.5295863635907667e-05, + "loss": 0.367, + "step": 1740 + }, + { + "epoch": 0.8725, + "grad_norm": 0.12629055976867676, + "learning_rate": 1.4725121039232945e-05, + "loss": 0.293, + "step": 1745 + }, + { + "epoch": 0.875, + "grad_norm": 0.12774884700775146, + "learning_rate": 1.4164680148329088e-05, + "loss": 0.3798, + "step": 1750 + }, + { + "epoch": 0.8775, + "grad_norm": 0.11681339889764786, + "learning_rate": 1.3614583642939718e-05, + "loss": 0.3474, + "step": 1755 + }, + { + "epoch": 0.88, + "grad_norm": 0.14510560035705566, + "learning_rate": 1.3074873415043591e-05, + "loss": 0.3999, + "step": 1760 + }, + { + "epoch": 0.8825, + "grad_norm": 0.1168401762843132, + "learning_rate": 
1.2545590565664054e-05, + "loss": 0.3398, + "step": 1765 + }, + { + "epoch": 0.885, + "grad_norm": 0.1411600410938263, + "learning_rate": 1.2026775401739348e-05, + "loss": 0.3346, + "step": 1770 + }, + { + "epoch": 0.8875, + "grad_norm": 0.12797729671001434, + "learning_rate": 1.1518467433052863e-05, + "loss": 0.3742, + "step": 1775 + }, + { + "epoch": 0.89, + "grad_norm": 0.12946921586990356, + "learning_rate": 1.1020705369224414e-05, + "loss": 0.3436, + "step": 1780 + }, + { + "epoch": 0.8925, + "grad_norm": 0.13285613059997559, + "learning_rate": 1.0533527116762296e-05, + "loss": 0.3186, + "step": 1785 + }, + { + "epoch": 0.895, + "grad_norm": 0.15213604271411896, + "learning_rate": 1.005696977617666e-05, + "loss": 0.3629, + "step": 1790 + }, + { + "epoch": 0.8975, + "grad_norm": 0.12391404062509537, + "learning_rate": 9.591069639154008e-06, + "loss": 0.3421, + "step": 1795 + }, + { + "epoch": 0.9, + "grad_norm": 0.11592845618724823, + "learning_rate": 9.135862185793636e-06, + "loss": 0.3107, + "step": 1800 + }, + { + "epoch": 0.9025, + "grad_norm": 0.12540902197360992, + "learning_rate": 8.691382081905496e-06, + "loss": 0.3605, + "step": 1805 + }, + { + "epoch": 0.905, + "grad_norm": 0.14459215104579926, + "learning_rate": 8.257663176370389e-06, + "loss": 0.3884, + "step": 1810 + }, + { + "epoch": 0.9075, + "grad_norm": 0.14139464497566223, + "learning_rate": 7.834738498562165e-06, + "loss": 0.3728, + "step": 1815 + }, + { + "epoch": 0.91, + "grad_norm": 0.12125397473573685, + "learning_rate": 7.422640255832446e-06, + "loss": 0.3237, + "step": 1820 + }, + { + "epoch": 0.9125, + "grad_norm": 0.13039612770080566, + "learning_rate": 7.021399831057961e-06, + "loss": 0.3055, + "step": 1825 + }, + { + "epoch": 0.915, + "grad_norm": 0.1337701678276062, + "learning_rate": 6.631047780250481e-06, + "loss": 0.368, + "step": 1830 + }, + { + "epoch": 0.9175, + "grad_norm": 0.13020606338977814, + "learning_rate": 6.251613830230013e-06, + "loss": 0.3262, + "step": 1835 + }, + 
{ + "epoch": 0.92, + "grad_norm": 0.12915077805519104, + "learning_rate": 5.883126876360872e-06, + "loss": 0.3428, + "step": 1840 + }, + { + "epoch": 0.9225, + "grad_norm": 0.12774400413036346, + "learning_rate": 5.525614980351284e-06, + "loss": 0.3735, + "step": 1845 + }, + { + "epoch": 0.925, + "grad_norm": 0.12587039172649384, + "learning_rate": 5.1791053681162545e-06, + "loss": 0.3402, + "step": 1850 + }, + { + "epoch": 0.9275, + "grad_norm": 0.12152459472417831, + "learning_rate": 4.843624427704329e-06, + "loss": 0.2968, + "step": 1855 + }, + { + "epoch": 0.93, + "grad_norm": 0.11444247514009476, + "learning_rate": 4.519197707287986e-06, + "loss": 0.3448, + "step": 1860 + }, + { + "epoch": 0.9325, + "grad_norm": 0.12532518804073334, + "learning_rate": 4.2058499132180734e-06, + "loss": 0.3613, + "step": 1865 + }, + { + "epoch": 0.935, + "grad_norm": 0.14186476171016693, + "learning_rate": 3.903604908142266e-06, + "loss": 0.2887, + "step": 1870 + }, + { + "epoch": 0.9375, + "grad_norm": 0.13014192879199982, + "learning_rate": 3.6124857091878845e-06, + "loss": 0.2679, + "step": 1875 + }, + { + "epoch": 0.94, + "grad_norm": 0.1259031891822815, + "learning_rate": 3.3325144862090648e-06, + "loss": 0.2993, + "step": 1880 + }, + { + "epoch": 0.9425, + "grad_norm": 0.12168288230895996, + "learning_rate": 3.0637125600983916e-06, + "loss": 0.3317, + "step": 1885 + }, + { + "epoch": 0.945, + "grad_norm": 0.12291324138641357, + "learning_rate": 2.8061004011632302e-06, + "loss": 0.3311, + "step": 1890 + }, + { + "epoch": 0.9475, + "grad_norm": 0.13629783689975739, + "learning_rate": 2.5596976275668757e-06, + "loss": 0.3456, + "step": 1895 + }, + { + "epoch": 0.95, + "grad_norm": 0.17415851354599, + "learning_rate": 2.324523003834511e-06, + "loss": 0.3589, + "step": 1900 + }, + { + "epoch": 0.9525, + "grad_norm": 0.1330641210079193, + "learning_rate": 2.100594439424269e-06, + "loss": 0.3826, + "step": 1905 + }, + { + "epoch": 0.955, + "grad_norm": 0.14203837513923645, + 
"learning_rate": 1.8879289873632907e-06, + "loss": 0.3807, + "step": 1910 + }, + { + "epoch": 0.9575, + "grad_norm": 0.1222100704908371, + "learning_rate": 1.686542842949129e-06, + "loss": 0.3084, + "step": 1915 + }, + { + "epoch": 0.96, + "grad_norm": 0.1441483348608017, + "learning_rate": 1.4964513425163694e-06, + "loss": 0.3871, + "step": 1920 + }, + { + "epoch": 0.9625, + "grad_norm": 0.1402144581079483, + "learning_rate": 1.3176689622687474e-06, + "loss": 0.3192, + "step": 1925 + }, + { + "epoch": 0.965, + "grad_norm": 0.13284745812416077, + "learning_rate": 1.1502093171766979e-06, + "loss": 0.359, + "step": 1930 + }, + { + "epoch": 0.9675, + "grad_norm": 0.1253402829170227, + "learning_rate": 9.94085159940533e-07, + "loss": 0.3214, + "step": 1935 + }, + { + "epoch": 0.97, + "grad_norm": 0.13589312136173248, + "learning_rate": 8.493083800193034e-07, + "loss": 0.3524, + "step": 1940 + }, + { + "epoch": 0.9725, + "grad_norm": 0.13623379170894623, + "learning_rate": 7.158900027253223e-07, + "loss": 0.3711, + "step": 1945 + }, + { + "epoch": 0.975, + "grad_norm": 0.12516111135482788, + "learning_rate": 5.9384018838457e-07, + "loss": 0.3487, + "step": 1950 + }, + { + "epoch": 0.9775, + "grad_norm": 0.1211727038025856, + "learning_rate": 4.831682315629304e-07, + "loss": 0.3079, + "step": 1955 + }, + { + "epoch": 0.98, + "grad_norm": 0.1348896622657776, + "learning_rate": 3.8388256035840615e-07, + "loss": 0.322, + "step": 1960 + }, + { + "epoch": 0.9825, + "grad_norm": 0.12953124940395355, + "learning_rate": 2.959907357592661e-07, + "loss": 0.3054, + "step": 1965 + }, + { + "epoch": 0.985, + "grad_norm": 0.12745600938796997, + "learning_rate": 2.1949945106823909e-07, + "loss": 0.3208, + "step": 1970 + }, + { + "epoch": 0.9875, + "grad_norm": 0.13108642399311066, + "learning_rate": 1.544145313928047e-07, + "loss": 0.3641, + "step": 1975 + }, + { + "epoch": 0.99, + "grad_norm": 0.12415596097707748, + "learning_rate": 1.0074093320156517e-07, + "loss": 0.3141, + "step": 
1980 + }, + { + "epoch": 0.9925, + "grad_norm": 0.12116590887308121, + "learning_rate": 5.8482743946847153e-08, + "loss": 0.3085, + "step": 1985 + }, + { + "epoch": 0.995, + "grad_norm": 0.12617753446102142, + "learning_rate": 2.764318175336733e-08, + "loss": 0.316, + "step": 1990 + }, + { + "epoch": 0.9975, + "grad_norm": 0.13097520172595978, + "learning_rate": 8.224595173178527e-09, + "loss": 0.2772, + "step": 1995 + }, + { + "epoch": 1.0, + "grad_norm": 0.1454041749238968, + "learning_rate": 2.284630068460913e-10, + "loss": 0.3226, + "step": 2000 + } + ], + "logging_steps": 5, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.629578157719552e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codellama-hugcoder/checkpoint-2000/training_args.bin b/codellama-hugcoder/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93 --- /dev/null +++ b/codellama-hugcoder/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b +size 5304 diff --git a/codellama-hugcoder/checkpoint-500/README.md b/codellama-hugcoder/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354 --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +base_model: codellama/CodeLlama-7b-Instruct-hf +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information 
Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.2.dev0 \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-500/adapter_config.json b/codellama-hugcoder/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1 --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "k_proj", + "q_proj", + "v_proj", + "gate_proj", + "o_proj" 
+ ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama-hugcoder/checkpoint-500/adapter_model.safetensors b/codellama-hugcoder/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..efa8f0d96c743dc9c45dbe44b6751e8b825ccea1 --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba0a03baab18f0cdae4dfc77bf7b41f7d1435807efac74517b5672e9ef8bedf1 +size 319876032 diff --git a/codellama-hugcoder/checkpoint-500/optimizer.pt b/codellama-hugcoder/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7893a45a12038c1e0135346bdb8820585f26d596 --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dad4d0839af192a8e721c020748fcd5843aa02d4b867cd03a6da416f3b15a8e +size 640009682 diff --git a/codellama-hugcoder/checkpoint-500/rng_state.pth b/codellama-hugcoder/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..35d7a6ea2da3f55cb152eac5b23f5fd9af183676 --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b3fe293b4ac5ae1cf2f114644c15f2a8317440ebc1144a8065f3fe94c0e32b8 +size 14244 diff --git a/codellama-hugcoder/checkpoint-500/scheduler.pt b/codellama-hugcoder/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..573d26ce871909d4cca478cf0893de681cf14192 --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12f207d7fee0843ba3ccc634c56e770b9b0bfb3e3b7ef4379b8fc405b4c45a03 +size 1064 diff --git a/codellama-hugcoder/checkpoint-500/trainer_state.json 
b/codellama-hugcoder/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7d81dd11b86352eda67139874fba12a09a20421d --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/trainer_state.json @@ -0,0 +1,734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 100.0, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025, + "grad_norm": 0.09379793703556061, + "learning_rate": 5.999999999999999e-06, + "loss": 0.6799, + "step": 5 + }, + { + "epoch": 0.005, + "grad_norm": 0.1399833709001541, + "learning_rate": 1.3499999999999998e-05, + "loss": 0.6954, + "step": 10 + }, + { + "epoch": 0.0075, + "grad_norm": 0.08632303029298782, + "learning_rate": 2.1e-05, + "loss": 0.6921, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.10006701201200485, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.69, + "step": 20 + }, + { + "epoch": 0.0125, + "grad_norm": 0.07633858919143677, + "learning_rate": 3.5999999999999994e-05, + "loss": 0.6722, + "step": 25 + }, + { + "epoch": 0.015, + "grad_norm": 0.09399061650037766, + "learning_rate": 4.3499999999999993e-05, + "loss": 0.6453, + "step": 30 + }, + { + "epoch": 0.0175, + "grad_norm": 0.0843738541007042, + "learning_rate": 5.1e-05, + "loss": 0.6276, + "step": 35 + }, + { + "epoch": 0.02, + "grad_norm": 0.08583351224660873, + "learning_rate": 5.85e-05, + "loss": 0.58, + "step": 40 + }, + { + "epoch": 0.0225, + "grad_norm": 0.09571370482444763, + "learning_rate": 6.599999999999999e-05, + "loss": 0.6355, + "step": 45 + }, + { + "epoch": 0.025, + "grad_norm": 0.1083935871720314, + "learning_rate": 7.35e-05, + "loss": 0.589, + "step": 50 + }, + { + "epoch": 0.0275, + "grad_norm": 0.10387319326400757, + "learning_rate": 8.1e-05, + "loss": 0.6061, + "step": 55 + }, + { + "epoch": 0.03, + "grad_norm": 
0.11083361506462097, + "learning_rate": 8.849999999999998e-05, + "loss": 0.572, + "step": 60 + }, + { + "epoch": 0.0325, + "grad_norm": 0.12665686011314392, + "learning_rate": 9.599999999999999e-05, + "loss": 0.5442, + "step": 65 + }, + { + "epoch": 0.035, + "grad_norm": 0.1308053582906723, + "learning_rate": 0.00010349999999999998, + "loss": 0.6524, + "step": 70 + }, + { + "epoch": 0.0375, + "grad_norm": 0.13535510003566742, + "learning_rate": 0.00011099999999999999, + "loss": 0.6404, + "step": 75 + }, + { + "epoch": 0.04, + "grad_norm": 0.12833671271800995, + "learning_rate": 0.0001185, + "loss": 0.5717, + "step": 80 + }, + { + "epoch": 0.0425, + "grad_norm": 0.11962099373340607, + "learning_rate": 0.00012599999999999997, + "loss": 0.6098, + "step": 85 + }, + { + "epoch": 0.045, + "grad_norm": 0.13898271322250366, + "learning_rate": 0.0001335, + "loss": 0.6099, + "step": 90 + }, + { + "epoch": 0.0475, + "grad_norm": 0.14486610889434814, + "learning_rate": 0.00014099999999999998, + "loss": 0.5744, + "step": 95 + }, + { + "epoch": 0.05, + "grad_norm": 0.1432138830423355, + "learning_rate": 0.00014849999999999998, + "loss": 0.5659, + "step": 100 + }, + { + "epoch": 0.0525, + "grad_norm": 0.13487878441810608, + "learning_rate": 0.000156, + "loss": 0.5622, + "step": 105 + }, + { + "epoch": 0.055, + "grad_norm": 0.12495309859514236, + "learning_rate": 0.0001635, + "loss": 0.5951, + "step": 110 + }, + { + "epoch": 0.0575, + "grad_norm": 0.13011734187602997, + "learning_rate": 0.00017099999999999998, + "loss": 0.6249, + "step": 115 + }, + { + "epoch": 0.06, + "grad_norm": 0.13987745344638824, + "learning_rate": 0.00017849999999999997, + "loss": 0.559, + "step": 120 + }, + { + "epoch": 0.0625, + "grad_norm": 0.13373605906963348, + "learning_rate": 0.000186, + "loss": 0.5475, + "step": 125 + }, + { + "epoch": 0.065, + "grad_norm": 0.12433867901563644, + "learning_rate": 0.0001935, + "loss": 0.5274, + "step": 130 + }, + { + "epoch": 0.0675, + "grad_norm": 
0.11097615957260132, + "learning_rate": 0.000201, + "loss": 0.678, + "step": 135 + }, + { + "epoch": 0.07, + "grad_norm": 0.1155027225613594, + "learning_rate": 0.00020849999999999997, + "loss": 0.5611, + "step": 140 + }, + { + "epoch": 0.0725, + "grad_norm": 0.11431068181991577, + "learning_rate": 0.00021599999999999996, + "loss": 0.6054, + "step": 145 + }, + { + "epoch": 0.075, + "grad_norm": 0.09796140342950821, + "learning_rate": 0.00022349999999999998, + "loss": 0.5472, + "step": 150 + }, + { + "epoch": 0.0775, + "grad_norm": 0.09489257633686066, + "learning_rate": 0.00023099999999999998, + "loss": 0.4636, + "step": 155 + }, + { + "epoch": 0.08, + "grad_norm": 0.10787788033485413, + "learning_rate": 0.0002385, + "loss": 0.6164, + "step": 160 + }, + { + "epoch": 0.0825, + "grad_norm": 0.10261733084917068, + "learning_rate": 0.00024599999999999996, + "loss": 0.5408, + "step": 165 + }, + { + "epoch": 0.085, + "grad_norm": 0.11870352178812027, + "learning_rate": 0.0002535, + "loss": 0.5268, + "step": 170 + }, + { + "epoch": 0.0875, + "grad_norm": 0.11910569667816162, + "learning_rate": 0.000261, + "loss": 0.5461, + "step": 175 + }, + { + "epoch": 0.09, + "grad_norm": 0.10083702206611633, + "learning_rate": 0.00026849999999999997, + "loss": 0.4794, + "step": 180 + }, + { + "epoch": 0.0925, + "grad_norm": 0.10453511029481888, + "learning_rate": 0.000276, + "loss": 0.5539, + "step": 185 + }, + { + "epoch": 0.095, + "grad_norm": 0.101403146982193, + "learning_rate": 0.00028349999999999995, + "loss": 0.5346, + "step": 190 + }, + { + "epoch": 0.0975, + "grad_norm": 0.10724789649248123, + "learning_rate": 0.00029099999999999997, + "loss": 0.6026, + "step": 195 + }, + { + "epoch": 0.1, + "grad_norm": 0.1140277311205864, + "learning_rate": 0.0002985, + "loss": 0.5193, + "step": 200 + }, + { + "epoch": 0.1025, + "grad_norm": 0.09706108272075653, + "learning_rate": 0.0002999963446058092, + "loss": 0.54, + "step": 205 + }, + { + "epoch": 0.105, + "grad_norm": 
0.10003062337636948, + "learning_rate": 0.0002999814948722491, + "loss": 0.5365, + "step": 210 + }, + { + "epoch": 0.1075, + "grad_norm": 0.1078687533736229, + "learning_rate": 0.00029995522346717746, + "loss": 0.5889, + "step": 215 + }, + { + "epoch": 0.11, + "grad_norm": 0.10538115352392197, + "learning_rate": 0.0002999175323912636, + "loss": 0.5611, + "step": 220 + }, + { + "epoch": 0.1125, + "grad_norm": 0.1020808294415474, + "learning_rate": 0.00029986842451482874, + "loss": 0.6103, + "step": 225 + }, + { + "epoch": 0.115, + "grad_norm": 0.09635835886001587, + "learning_rate": 0.0002998079035776279, + "loss": 0.5229, + "step": 230 + }, + { + "epoch": 0.1175, + "grad_norm": 0.10287190228700638, + "learning_rate": 0.0002997359741885648, + "loss": 0.5312, + "step": 235 + }, + { + "epoch": 0.12, + "grad_norm": 0.09160075336694717, + "learning_rate": 0.0002996526418253408, + "loss": 0.5673, + "step": 240 + }, + { + "epoch": 0.1225, + "grad_norm": 0.08691006153821945, + "learning_rate": 0.000299557912834038, + "loss": 0.5326, + "step": 245 + }, + { + "epoch": 0.125, + "grad_norm": 0.10096988826990128, + "learning_rate": 0.00029945179442863594, + "loss": 0.6004, + "step": 250 + }, + { + "epoch": 0.1275, + "grad_norm": 0.09594204276800156, + "learning_rate": 0.000299334294690462, + "loss": 0.5516, + "step": 255 + }, + { + "epoch": 0.13, + "grad_norm": 0.10281919687986374, + "learning_rate": 0.00029920542256757607, + "loss": 0.5515, + "step": 260 + }, + { + "epoch": 0.1325, + "grad_norm": 0.08547840267419815, + "learning_rate": 0.00029906518787408944, + "loss": 0.5243, + "step": 265 + }, + { + "epoch": 0.135, + "grad_norm": 0.10161560773849487, + "learning_rate": 0.0002989136012894168, + "loss": 0.5096, + "step": 270 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09101904183626175, + "learning_rate": 0.0002987506743574635, + "loss": 0.553, + "step": 275 + }, + { + "epoch": 0.14, + "grad_norm": 0.09769442677497864, + "learning_rate": 0.0002985764194857463, + "loss": 
0.4953, + "step": 280 + }, + { + "epoch": 0.1425, + "grad_norm": 0.10991579294204712, + "learning_rate": 0.00029839084994444826, + "loss": 0.5152, + "step": 285 + }, + { + "epoch": 0.145, + "grad_norm": 0.09450916200876236, + "learning_rate": 0.00029819397986540836, + "loss": 0.5397, + "step": 290 + }, + { + "epoch": 0.1475, + "grad_norm": 0.10876069217920303, + "learning_rate": 0.0002979858242410454, + "loss": 0.4858, + "step": 295 + }, + { + "epoch": 0.15, + "grad_norm": 0.097995825111866, + "learning_rate": 0.00029776639892321606, + "loss": 0.5566, + "step": 300 + }, + { + "epoch": 0.1525, + "grad_norm": 0.1145048514008522, + "learning_rate": 0.0002975357206220079, + "loss": 0.4531, + "step": 305 + }, + { + "epoch": 0.155, + "grad_norm": 0.10271880775690079, + "learning_rate": 0.00029729380690446654, + "loss": 0.5199, + "step": 310 + }, + { + "epoch": 0.1575, + "grad_norm": 0.11095371842384338, + "learning_rate": 0.0002970406761932583, + "loss": 0.5416, + "step": 315 + }, + { + "epoch": 0.16, + "grad_norm": 0.09949438273906708, + "learning_rate": 0.00029677634776526673, + "loss": 0.4841, + "step": 320 + }, + { + "epoch": 0.1625, + "grad_norm": 0.1163724958896637, + "learning_rate": 0.00029650084175012517, + "loss": 0.4913, + "step": 325 + }, + { + "epoch": 0.165, + "grad_norm": 0.10726840049028397, + "learning_rate": 0.00029621417912868323, + "loss": 0.5203, + "step": 330 + }, + { + "epoch": 0.1675, + "grad_norm": 0.09609931707382202, + "learning_rate": 0.00029591638173140947, + "loss": 0.5607, + "step": 335 + }, + { + "epoch": 0.17, + "grad_norm": 0.10824442654848099, + "learning_rate": 0.0002956074722367286, + "loss": 0.6004, + "step": 340 + }, + { + "epoch": 0.1725, + "grad_norm": 0.10465679317712784, + "learning_rate": 0.00029528747416929463, + "loss": 0.5216, + "step": 345 + }, + { + "epoch": 0.175, + "grad_norm": 0.10518354922533035, + "learning_rate": 0.0002949564118981994, + "loss": 0.499, + "step": 350 + }, + { + "epoch": 0.1775, + "grad_norm": 
0.0955279991030693, + "learning_rate": 0.0002946143106351165, + "loss": 0.5607, + "step": 355 + }, + { + "epoch": 0.18, + "grad_norm": 0.11159654706716537, + "learning_rate": 0.0002942611964323817, + "loss": 0.5204, + "step": 360 + }, + { + "epoch": 0.1825, + "grad_norm": 0.09571187198162079, + "learning_rate": 0.0002938970961810086, + "loss": 0.6113, + "step": 365 + }, + { + "epoch": 0.185, + "grad_norm": 0.11854679882526398, + "learning_rate": 0.0002935220376086411, + "loss": 0.5639, + "step": 370 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1050512045621872, + "learning_rate": 0.0002931360492774415, + "loss": 0.548, + "step": 375 + }, + { + "epoch": 0.19, + "grad_norm": 0.1053968220949173, + "learning_rate": 0.0002927391605819157, + "loss": 0.5507, + "step": 380 + }, + { + "epoch": 0.1925, + "grad_norm": 0.10567320138216019, + "learning_rate": 0.00029233140174667445, + "loss": 0.5312, + "step": 385 + }, + { + "epoch": 0.195, + "grad_norm": 0.11914283782243729, + "learning_rate": 0.0002919128038241318, + "loss": 0.5961, + "step": 390 + }, + { + "epoch": 0.1975, + "grad_norm": 0.09915795922279358, + "learning_rate": 0.0002914833986921401, + "loss": 0.5086, + "step": 395 + }, + { + "epoch": 0.2, + "grad_norm": 0.10796502232551575, + "learning_rate": 0.0002910432190515628, + "loss": 0.5585, + "step": 400 + }, + { + "epoch": 0.2025, + "grad_norm": 0.10748997330665588, + "learning_rate": 0.00029059229842378373, + "loss": 0.5466, + "step": 405 + }, + { + "epoch": 0.205, + "grad_norm": 0.10696308314800262, + "learning_rate": 0.0002901306711481544, + "loss": 0.5513, + "step": 410 + }, + { + "epoch": 0.2075, + "grad_norm": 0.10418657958507538, + "learning_rate": 0.0002896583723793792, + "loss": 0.5391, + "step": 415 + }, + { + "epoch": 0.21, + "grad_norm": 0.16421550512313843, + "learning_rate": 0.00028917543808483796, + "loss": 0.4699, + "step": 420 + }, + { + "epoch": 0.2125, + "grad_norm": 0.12929962575435638, + "learning_rate": 0.00028868190504184696, + "loss": 
0.4984, + "step": 425 + }, + { + "epoch": 0.215, + "grad_norm": 0.10469454526901245, + "learning_rate": 0.00028817781083485816, + "loss": 0.5119, + "step": 430 + }, + { + "epoch": 0.2175, + "grad_norm": 0.0964970663189888, + "learning_rate": 0.00028766319385259713, + "loss": 0.5167, + "step": 435 + }, + { + "epoch": 0.22, + "grad_norm": 0.12395574152469635, + "learning_rate": 0.00028713809328513953, + "loss": 0.5692, + "step": 440 + }, + { + "epoch": 0.2225, + "grad_norm": 0.10189738124608994, + "learning_rate": 0.0002866025491209265, + "loss": 0.4628, + "step": 445 + }, + { + "epoch": 0.225, + "grad_norm": 0.10433454066514969, + "learning_rate": 0.0002860566021437197, + "loss": 0.4869, + "step": 450 + }, + { + "epoch": 0.2275, + "grad_norm": 0.13003456592559814, + "learning_rate": 0.0002855002939294951, + "loss": 0.5291, + "step": 455 + }, + { + "epoch": 0.23, + "grad_norm": 0.11692202836275101, + "learning_rate": 0.000284933666843277, + "loss": 0.5229, + "step": 460 + }, + { + "epoch": 0.2325, + "grad_norm": 0.10757846385240555, + "learning_rate": 0.0002843567640359119, + "loss": 0.435, + "step": 465 + }, + { + "epoch": 0.235, + "grad_norm": 0.10775501281023026, + "learning_rate": 0.00028376962944078206, + "loss": 0.4418, + "step": 470 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11543692648410797, + "learning_rate": 0.00028317230777046015, + "loss": 0.4204, + "step": 475 + }, + { + "epoch": 0.24, + "grad_norm": 0.10946698486804962, + "learning_rate": 0.00028256484451330403, + "loss": 0.49, + "step": 480 + }, + { + "epoch": 0.2425, + "grad_norm": 0.11528221517801285, + "learning_rate": 0.00028194728592999247, + "loss": 0.4752, + "step": 485 + }, + { + "epoch": 0.245, + "grad_norm": 0.10474205762147903, + "learning_rate": 0.0002813196790500027, + "loss": 0.4847, + "step": 490 + }, + { + "epoch": 0.2475, + "grad_norm": 0.10768820345401764, + "learning_rate": 0.00028068207166802837, + "loss": 0.4664, + "step": 495 + }, + { + "epoch": 0.25, + "grad_norm": 
0.12158560007810593, + "learning_rate": 0.00028003451234034037, + "loss": 0.4741, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.57394539429888e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codellama-hugcoder/checkpoint-500/training_args.bin b/codellama-hugcoder/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93 --- /dev/null +++ b/codellama-hugcoder/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b +size 5304 diff --git a/codellama-hugcoder/training_args.bin b/codellama-hugcoder/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93 --- /dev/null +++ b/codellama-hugcoder/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b +size 5304 diff --git a/configs/deepspeed_config.yaml b/configs/deepspeed_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7aad2cf962436ad604aa39e18b05c62879001373 --- /dev/null +++ b/configs/deepspeed_config.yaml @@ -0,0 +1,22 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 
+main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/configs/fsdp_config.yaml b/configs/fsdp_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1089362fb2b8527f36e08ebb439644a89dbed6c0 --- /dev/null +++ b/configs/fsdp_config.yaml @@ -0,0 +1,25 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: 1 + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/fim.py b/fim.py new file mode 100644 index 0000000000000000000000000000000000000000..ef1d57bc2cf9994a80ffa0239492bad0ba311854 --- /dev/null +++ b/fim.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2024 Sourab Mangrulkar. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import numpy as np + + +# this is expensive so we cache it +@functools.lru_cache(maxsize=None) +def get_fim_token_ids(tokenizer): + if "codellama" in tokenizer.name_or_path: + return ( + tokenizer.bos_token_id, + tokenizer.suffix_id, + tokenizer.prefix_id, + tokenizer.middle_id, + 0, + ) + elif "deepseek-coder" in tokenizer.name_or_path: + return ( + tokenizer.bos_token_id, + tokenizer.encode("<|fim▁hole|>", add_special_tokens=False)[0], + tokenizer.encode("<|fim▁begin|>", add_special_tokens=False)[0], + tokenizer.encode("<|fim▁end|>", add_special_tokens=False)[0], + tokenizer.encode("", add_special_tokens=False)[0], + ) + elif "stable-code" in tokenizer.name_or_path: + return ( + tokenizer.bos_token_id, + tokenizer.encode("")[0], + tokenizer.encode("")[0], + tokenizer.encode("")[0], + tokenizer.encode("")[0], + ) + else: + bos_token_id = None + try: + FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[ + "additional_special_tokens" + ][1:5] + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = ( + tokenizer.vocab[tok] + for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD] + ) + except KeyError: + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = ( + None, + None, + None, + None, + ) + return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id + + +def _bos_token_processing(prefix_token_list, bos_token): + if bos_token is not None: + # add the BOS token to the beginning of the list + prefix_token_list.insert(0, bos_token) + + return prefix_token_list + + +## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py +def permute( + sample, + np_rng, + suffix_tok_id, + prefix_tok_id, + middle_tok_id, + pad_tok_id, + fim_rate=0.5, + fim_spm_rate=0.5, + truncate_or_pad=False, + bos_token_id=None, +): + """ + Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, 
using two FIM modes: + PSM and SPM (with a probability of fim_spm_rate). + """ + + if np_rng.binomial(1, fim_rate): + boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2)) + boundaries.sort() + + prefix = np.array(sample[: boundaries[0]], dtype=np.int64) + middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64) + suffix = np.array(sample[boundaries[1] :], dtype=np.int64) + + if truncate_or_pad: + new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 + diff = new_length - len(sample) + if diff > 0: + if suffix.shape[0] <= diff: + return sample, np_rng + suffix = suffix[: suffix.shape[0] - diff] + elif diff < 0: + suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)]) + + if np_rng.binomial(1, fim_spm_rate): + prefix_special_tokens = _bos_token_processing( + [prefix_tok_id, suffix_tok_id], bos_token_id + ) + # SPM (variant 2 from FIM paper) + new_sample = np.concatenate( + [ + prefix_special_tokens, + suffix, + [middle_tok_id], + prefix, + middle, + ] + ) + else: + prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id) + # PSM + new_sample = np.concatenate( + [ + prefix_special_tokens, + prefix, + [suffix_tok_id], + suffix, + [middle_tok_id], + middle, + ] + ) + else: + # don't do FIM preproc + new_sample = sample + return list(new_sample), np_rng diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b7b96512c409868cf9c62d05ed6254a9dc6bd5f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +git+https://github.com/huggingface/transformers +git+https://github.com/huggingface/accelerate +git+https://github.com/huggingface/peft +trl +huggingface-hub +bitsandbytes +evaluate +datasets +einops +wandb +tiktoken +deepspeed +tqdm +safetensors \ No newline at end of file diff --git a/run_deepspeed.sh b/run_deepspeed.sh new file mode 100644 index 0000000000000000000000000000000000000000..4118c1ea6f811eaa05cc0c7ef15b34b0b8becf23 
--- /dev/null +++ b/run_deepspeed.sh @@ -0,0 +1,33 @@ +accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \ +--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \ +--dataset_name "smangrul/hug_stack" \ +--splits "train" \ +--max_seq_len 2048 \ +--max_steps 2000 \ +--save_steps 500 \ +--eval_steps 100 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--evaluation_strategy "steps" \ +--save_strategy "steps" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--learning_rate 2e-5 \ +--lr_scheduler_type "cosine" \ +--weight_decay 0.1 \ +--warmup_ratio 0.1 \ +--max_grad_norm 1.0 \ +--output_dir "codellama-hugcoder-df" \ +--per_device_train_batch_size 16 \ +--per_device_eval_batch_size 16 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant False \ +--dataset_text_field "text" \ +--test_size 0.1 \ +--fim_rate 0.5 \ +--fim_spm_rate 0.5 \ +--use_flash_attn True \ No newline at end of file diff --git a/run_fsdp.sh b/run_fsdp.sh new file mode 100644 index 0000000000000000000000000000000000000000..b8f816f6d05a629d27aa022c2424b8a12b27752f --- /dev/null +++ b/run_fsdp.sh @@ -0,0 +1,33 @@ +accelerate launch --config_file "configs/fsdp_config.yaml" train.py \ +--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \ +--dataset_name "smangrul/hug_stack" \ +--splits "train" \ +--max_seq_len 2048 \ +--max_steps 1000 \ +--save_steps 500 \ +--eval_steps 100 \ +--logging_steps 25 \ +--log_level "info" \ +--logging_strategy "steps" \ +--evaluation_strategy "steps" \ +--save_strategy "steps" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 0.1 \ +--warmup_ratio 0.1 \ +--max_grad_norm 1.0 \ +--output_dir "codellama-hugcoder-fsdp" \ +--per_device_train_batch_size 16 \ +--per_device_eval_batch_size 16 \ +--gradient_accumulation_steps 4 \ 
+--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "text" \ +--test_size 0.1 \ +--fim_rate 0.5 \ +--fim_spm_rate 0.5 \ +--use_flash_attn True \ No newline at end of file diff --git a/run_peft.sh b/run_peft.sh new file mode 100644 index 0000000000000000000000000000000000000000..540c0e1e72cd3ca63699ac7eaebf293d475951bd --- /dev/null +++ b/run_peft.sh @@ -0,0 +1,40 @@ +CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \ +--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \ +--dataset_name "smangrul/hug_stack" \ +--splits "train" \ +--max_seq_len 2048 \ +--max_steps 2000 \ +--save_steps 500 \ +--eval_steps 100 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--save_strategy "steps" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--learning_rate 3e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 0.1 \ +--warmup_ratio 0.1 \ +--max_grad_norm 1.0 \ +--output_dir "codellama-hugcoder" \ +--per_device_train_batch_size 4 \ +--per_device_eval_batch_size 4 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "text" \ +--test_size 0.1 \ +--fim_rate 0.5 \ +--fim_spm_rate 0.5 \ +--use_peft_lora True \ +--lora_r 32 \ +--lora_alpha 64 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--use_flash_attn True \ No newline at end of file diff --git a/run_unsloth_peft.sh b/run_unsloth_peft.sh new file mode 100644 index 0000000000000000000000000000000000000000..8d8a6b489e29c01cf5f7cdfc65cb97ce2da7de08 --- /dev/null +++ b/run_unsloth_peft.sh @@ -0,0 +1,43 @@ +CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python train.py \ +--seed 11 \ +--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \ +--dataset_name "smangrul/hug_stack" \ +--splits "train" \ +--max_seq_len 2048 \ 
+--max_steps 2000 \ +--save_steps 500 \ +--eval_steps 100 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--evaluation_strategy "steps" \ +--save_strategy "steps" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--learning_rate 2e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 0.1 \ +--warmup_ratio 0.1 \ +--max_grad_norm 1.0 \ +--output_dir "codellama-hugcoder" \ +--per_device_train_batch_size 16 \ +--per_device_eval_batch_size 16 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "text" \ +--test_size 0.1 \ +--fim_rate 0.5 \ +--fim_spm_rate 0.0 \ +--use_peft_lora True \ +--lora_r 16 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--use_flash_attn True \ +--use_unsloth True \ No newline at end of file diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d648ac1bcaddd7ff39da29193f1e7f2ccd9f35 --- /dev/null +++ b/train.py @@ -0,0 +1,495 @@ +# coding=utf-8 +# Copyright 2024 Sourab Mangrulkar. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Continued pre-training/fine-tuning of code LLMs for code autocompletion. 
import gc
import os
import random
import sys
from typing import Optional
from dataclasses import dataclass, field

import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    set_seed,
    BitsAndBytesConfig,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, replace_lora_weights_loftq
import fim


# Define and parse arguments.
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)
    lora_target_modules: Optional[str] = field(
        default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
        metadata={
            "help": "comma separated list of target modules to apply LoRA layers to"
        },
    )
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate nested quantization for 4bit base models"},
    )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="float16",
        metadata={"help": "Compute dtype for 4bit base models"},
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata={"help": "Quantization type fp4 or nf4"},
    )
    use_flash_attn: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables Flash attention for training."},
    )
    use_peft_lora: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables PEFT LoRA for training."},
    )
    # The misspelling is part of the command-line interface
    # (--use_8bit_qunatization), so the field name is left unchanged.
    use_8bit_qunatization: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables loading model in 8bit."},
    )
    use_4bit_quantization: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables loading model in 4bit."},
    )
    use_reentrant: Optional[bool] = field(
        default=False,
        metadata={"help": "Gradient Checkpointing param. Refer the related docs"},
    )
    use_unsloth: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables UnSloth for training."},
    )
    use_loftq: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables LoftQ init for the LoRA adapters when using QLoRA."},
    )
    use_loftq_callback: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables LoftQ callback comparing logits of base model to the ones from LoftQ init. Provides better init."},
    )


@dataclass
class DataTrainingArguments:
    """Arguments describing the dataset and the FIM sampling rates."""

    dataset_name: Optional[str] = field(
        default="smangrul/hug_stack",
        metadata={"help": "The preference dataset to use."},
    )
    dataset_text_field: str = field(
        default="text", metadata={"help": "Dataset field to use as input text."}
    )
    max_seq_length: Optional[int] = field(default=4096)
    test_size: Optional[float] = field(default=0.1)
    fim_rate: Optional[float] = field(default=0.5)
    fim_spm_rate: Optional[float] = field(default=0.5)
    splits: Optional[str] = field(
        default="train",
        metadata={"help": "Comma separate list of the splits to use from the dataset."},
    )


def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    Only the first ``nb_examples`` examples are inspected.

    Raises:
        ValueError: if no tokens were produced (for example an empty dataset),
            since the ratio is undefined in that case.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        total_characters += len(example[data_column])
        total_tokens += len(tokenizer(example[data_column]).tokens())

    if total_tokens == 0:
        raise ValueError(
            "Cannot estimate the character/token ratio: no tokens were produced."
        )
    return total_characters / total_tokens


class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for processing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (float): Number of characters per token used to estimate number of tokens in text buffer.
            content_field (str): Key of the text column in each dataset example.
            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permutations that will use SPM.
            seed (int): Seed for random number generator.
            shuffle (bool): If True the chunks built from each buffer are shuffled before being yielded.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field="content",
        fim_rate=0.5,
        fim_spm_rate=0.5,
        seed=0,
        shuffle=False,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        # Buffer budget is expressed in characters, not tokens.
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field
        self.fim_rate = fim_rate
        self.fim_spm_rate = fim_spm_rate
        self.seed = seed
        self.shuffle = shuffle

        (
            self.bos_token_id,
            self.suffix_tok_id,
            self.prefix_tok_id,
            self.middle_tok_id,
            self.pad_tok_id,
        ) = fim.get_fim_token_ids(self.tokenizer)
        # Compare against None explicitly: a plain truthiness test would
        # also disable FIM for a tokenizer whose suffix token id is 0.
        if self.suffix_tok_id is None and self.fim_rate > 0:
            print("FIM is not supported by tokenizer, disabling FIM")
            self.fim_rate = 0

    def __iter__(self):
        """Yield ``{"input_ids", "labels"}`` dicts of exactly ``seq_length`` tokens."""
        iterator = iter(self.dataset)
        more_examples = True
        np_rng = np.random.RandomState(seed=self.seed)
        while more_examples:
            # Fill a text buffer up to the character budget.
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(
                buffer, truncation=False, add_special_tokens=False
            )["input_ids"]
            all_token_ids = []

            for tokenized_input in tokenized_inputs:
                # optionally do FIM permutations
                if self.fim_rate > 0:
                    tokenized_input, np_rng = fim.permute(
                        tokenized_input,
                        np_rng,
                        self.suffix_tok_id,
                        self.prefix_tok_id,
                        self.middle_tok_id,
                        self.pad_tok_id,
                        fim_rate=self.fim_rate,
                        fim_spm_rate=self.fim_spm_rate,
                        truncate_or_pad=False,
                        bos_token_id=self.bos_token_id,
                    )

                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            # Cut the concatenated stream into fixed-size chunks; a trailing
            # chunk shorter than seq_length is dropped.
            examples = []
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    examples.append(input_ids)
            if self.shuffle:
                random.shuffle(examples)
            for example in examples:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(example),
                    "labels": torch.LongTensor(example),
                }


def create_datasets(tokenizer, args, seed):
    """Load the dataset, split it, and wrap both parts in ``ConstantLengthDataset``.

    Returns:
        ``(train_dataset, valid_dataset)``; the training dataset is infinite
        and shuffled, the validation dataset is finite.
    """
    dataset = load_dataset(args.dataset_name, split=args.splits)
    dataset = dataset.train_test_split(
        test_size=args.test_size, seed=seed, shuffle=True
    )
    train_data = dataset["train"]
    valid_data = dataset["test"]
    print(
        f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
    )
    chars_per_token = chars_token_ratio(train_data, tokenizer, args.dataset_text_field)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        infinite=True,
        seq_length=args.max_seq_length,
        chars_per_token=chars_per_token,
        content_field=args.dataset_text_field,
        fim_rate=args.fim_rate,
        fim_spm_rate=args.fim_spm_rate,
        seed=seed,
        shuffle=True,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        infinite=False,
        seq_length=args.max_seq_length,
        chars_per_token=chars_per_token,
        content_field=args.dataset_text_field,
        fim_rate=args.fim_rate,
        fim_spm_rate=args.fim_spm_rate,
        seed=seed,
    )
    print(f"A sample of valid dataset: {next(iter(valid_dataset))}")
    return train_dataset, valid_dataset


def get_mae(x, y):
    """Return the mean absolute error between tensors ``x`` and ``y``."""
    return (x - y).abs().mean()


def get_mse(x, y):
    """Return the mean squared error between tensors ``x`` and ``y``."""
    return torch.pow(x - y, 2).mean()


def error_report(x, y):
    """Print the mean absolute and mean squared error between ``x`` and ``y``."""
    mae = get_mae(x, y)
    mse = get_mse(x, y)
    print(
        f"Mean absolute error: {mae:>8.5f}\n"
        f"Mean squared error:  {mse:>8.5f}"
    )


def loftq_init(model, tokenizer, train_dataset, max_seq_length, args):
    """Replace the LoRA weights of ``model`` with a LoftQ initialisation.

    With ``args.use_loftq_callback`` set, one training example is run through
    the unquantised base model and a per-module replacement is accepted only
    if it lowers the MSE between the base logits and the adapted model's
    logits. Otherwise LoftQ is applied unconditionally.

    Args:
        model: The PEFT model whose LoRA weights are replaced in place.
        tokenizer: Tokenizer used to encode the probe example.
        train_dataset: Either a ``ConstantLengthDataset`` (its underlying
            ``dataset`` and ``content_field`` are used) or an indexable
            dataset with a ``"content"`` column.
        max_seq_length: Truncation length for the probe example.
        args: ``ModelArguments`` instance.
    """
    if not args.use_loftq_callback:
        replace_lora_weights_loftq(model)
        return

    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
    base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=compute_dtype)
    base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

    # ConstantLengthDataset is an IterableDataset without __len__ or
    # __getitem__, so sample from the dataset it wraps and use its text column.
    source = getattr(train_dataset, "dataset", train_dataset)
    text_field = getattr(train_dataset, "content_field", "content")
    index = torch.randint(0, len(source), size=(1,)).item()
    # A single example needs no padding; "max_length" is a padding strategy,
    # not a truncation strategy, so plain truncation is requested instead.
    probe_inputs = tokenizer(
        [source[index][text_field]],
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    )
    with torch.no_grad():
        logits_base = base_model(**probe_inputs).logits
    del base_model
    gc.collect()

    # NOTE(review): assumes all parameters of `model` sit on one device --
    # confirm for sharded/offloaded setups.
    device = next(model.parameters()).device
    probe_inputs = {key: value.to(device) for key, value in probe_inputs.items()}
    logits_base = logits_base.to(device)
    best_mse = float("inf")

    def loftq_callback(model, module_name):
        """Callable to replace weights with LoFTQ if the mse is lower than the current best one."""
        nonlocal best_mse
        with torch.no_grad():
            logits = model(**probe_inputs).logits
        mse = get_mse(logits_base, logits)
        if mse < best_mse:
            best_mse = mse
            print(f"MSE improved for module {module_name}")
            return True
        print(f"MSE did not improve for module {module_name}")
        return False

    replace_lora_weights_loftq(model, callback=loftq_callback)
    with torch.no_grad():
        logits_loftq_callback = model(**probe_inputs).logits
    error_report(logits_base, logits_loftq_callback)


def create_and_prepare_model(args, data_args, training_args):
    """Load the base model and optionally quantise it and attach LoRA adapters.

    Args:
        args: ``ModelArguments`` instance.
        data_args: ``DataTrainingArguments`` instance (``max_seq_length`` is read).
        training_args: ``TrainingArguments`` instance (``gradient_checkpointing``
            and ``seed`` are read).

    Returns:
        The model, wrapped as a PEFT model when ``args.use_peft_lora`` is set.
    """
    device_map = None
    bnb_config = None

    load_in_8bit = args.use_8bit_qunatization
    load_in_4bit = args.use_4bit_quantization

    if args.use_unsloth:
        from unsloth import FastLanguageModel

    if args.use_4bit_quantization:
        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=args.use_4bit_quantization,
            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.use_nested_quant,
        )

        # Only query the device when CUDA is actually present.
        if compute_dtype == torch.float16 and torch.cuda.is_available():
            major, _ = torch.cuda.get_device_capability()
            if major >= 8:
                print("=" * 80)
                print(
                    "Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
                )
                print("=" * 80)

    if args.use_4bit_quantization or args.use_8bit_qunatization:
        device_map = (
            int(os.environ.get("LOCAL_RANK", -1))
            if torch.distributed.is_available() and torch.distributed.is_initialized()
            else "auto"
        )  # {"": 0}

    if args.use_unsloth:
        # Load model
        model, _ = FastLanguageModel.from_pretrained(
            model_name=args.model_name_or_path,
            max_seq_length=data_args.max_seq_length,
            dtype=None,
            load_in_4bit=load_in_4bit,
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            load_in_8bit=load_in_8bit,
            quantization_config=bnb_config,
            device_map=device_map,
            trust_remote_code=True,
            attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
        )

    if (
        (args.use_4bit_quantization or args.use_8bit_qunatization)
        and args.use_peft_lora
        and not args.use_unsloth
    ):
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=training_args.gradient_checkpointing,
            # Read the flag from this function's own argument; the module
            # global ``model_args`` exists only when run as a script.
            gradient_checkpointing_kwargs={"use_reentrant": args.use_reentrant},
        )

    target_modules = (
        args.lora_target_modules.split(",")
        if args.lora_target_modules != "all-linear"
        else args.lora_target_modules
    )
    if args.use_peft_lora and not args.use_unsloth:
        peft_config = LoraConfig(
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            r=args.lora_r,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=target_modules,
        )
        model = get_peft_model(model, peft_config)
    elif args.use_peft_lora and args.use_unsloth:
        # Do model patching and add fast LoRA weights
        model = FastLanguageModel.get_peft_model(
            model,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            r=args.lora_r,
            target_modules=target_modules,
            use_gradient_checkpointing=training_args.gradient_checkpointing,
            random_state=training_args.seed,
            max_seq_length=data_args.max_seq_length,
        )
    return model


def main(model_args, data_args, training_args):
    """Build the datasets and model, train, and save the final model."""
    # Set seed for reproducibility
    set_seed(training_args.seed)

    # load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

    # load the datasets
    train_dataset, eval_dataset = create_datasets(
        tokenizer, data_args, training_args.seed
    )
    train_dataset.start_iteration = 0

    model = create_and_prepare_model(model_args, data_args, training_args)
    # gradient ckpt
    model.config.use_cache = not training_args.gradient_checkpointing
    training_args.gradient_checkpointing = (
        training_args.gradient_checkpointing and not model_args.use_unsloth
    )
    if training_args.gradient_checkpointing:
        training_args.gradient_checkpointing_kwargs = {
            "use_reentrant": model_args.use_reentrant
        }

    # trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.accelerator.print(f"{trainer.model}")
    if model_args.use_peft_lora:
        trainer.model.print_trainable_parameters()

    # LoftQ initialization when using QLoRA
    if model_args.use_4bit_quantization and model_args.use_loftq:
        loftq_init(trainer.model, tokenizer, train_dataset, data_args.max_seq_length, model_args)

    # train
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    trainer.train(resume_from_checkpoint=checkpoint)

    # saving final model
    if trainer.is_fsdp_enabled:
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
    trainer.save_model()


if __name__ == "__main__":
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    main(model_args, data_args, training_args)


# ---------------------------------------------------------------------------
# Non-Python patch text that shared these source lines (kept for reference):
#   diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log
#   new file mode 100644
#   index 0000000000000000000000000000000000000000..ee5bf05e14a33a7993620274bc5fffbfa49ca1f9
#   --- /dev/null
#   +++ b/wandb/debug-internal.log
#   @@ -0,0 +1,15 @@
#   +{"time":"2025-05-16T07:37:47.253769219Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log"}
#   +{"time":"2025-05-16T07:37:47.470171926Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"}
#   +{"time":"2025-05-16T07:37:47.471043556Z","level":"INFO","msg":"created new stream","id":"jc2tz43q"}
#   +{"time":"2025-05-16T07:37:47.471071988Z","level":"INFO","msg":"stream: started","id":"jc2tz43q"}
#   +{"time":"2025-05-16T07:37:47.47110475Z","level":"INFO","msg":"writer: Do: started","stream_id":"jc2tz43q"}
#   +{"time":"2025-05-16T07:37:47.471115831Z","level":"INFO","msg":"handler: started","stream_id":"jc2tz43q"}
#   +{"time":"2025-05-16T07:37:47.471180815Z","level":"INFO","msg":"sender: started","stream_id":"jc2tz43q"}
#   +{"time":"2025-05-16T07:37:47.477686926Z","level":"INFO","msg":"Starting system monitor"}
#   +{"time":"2025-05-17T22:27:35.403843016Z","level":"INFO","msg":"stream: closing","id":"jc2tz43q"}
#   +{"time":"2025-05-17T22:27:35.404804572Z","level":"INFO","msg":"Stopping system monitor"}
#   +{"time":"2025-05-17T22:27:35.404850555Z","level":"INFO","msg":"Stopped system monitor"}
#   +{"time":"2025-05-17T22:27:35.40493109Z","level":"INFO","msg":"handler: closed","stream_id":"jc2tz43q"}
#   +{"time":"2025-05-17T22:27:35.404943001Z","level":"INFO","msg":"writer: Close: closed","stream_id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.404963782Z","level":"INFO","msg":"sender: closed","stream_id":"jc2tz43q"} +{"time":"2025-05-17T22:27:35.405060219Z","level":"INFO","msg":"stream: closed","id":"jc2tz43q"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..067508921187c34e3dc0bd7885357818572fbf91 --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,26 @@ +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Configure stats pid to 29365 +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():852] calling init triggers +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():893] starting 
backend +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():897] sending inform_init request +2025-05-16 07:37:47,236 INFO MainThread:29365 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-16 07:37:47,236 INFO MainThread:29365 [wandb_init.py:init():907] backend started and connected +2025-05-16 07:37:47,237 INFO MainThread:29365 [wandb_init.py:init():1005] updated telemetry +2025-05-16 07:37:47,244 INFO MainThread:29365 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-05-16 07:37:47,473 INFO MainThread:29365 [wandb_init.py:init():1104] starting run threads in backend +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_console_start():2573] atexit reg +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-05-16 07:37:47,857 INFO MainThread:29365 [wandb_init.py:init():1150] run started, returning control to user process +2025-05-16 07:37:47,859 INFO MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'down_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj', 'gate_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 
'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 
'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-36-11_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': 
None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False} +2025-05-16 07:37:47,866 INFO MainThread:29365 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - > +2025-05-16 07:37:47,866 INFO MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None +2025-05-17 22:27:35,403 INFO MsgRouterThr:29365 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles. 
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/files/output.log b/wandb/offline-run-20250516_073234-ywsmjz3f/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/files/requirements.txt b/wandb/offline-run-20250516_073234-ywsmjz3f/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3 --- /dev/null +++ b/wandb/offline-run-20250516_073234-ywsmjz3f/files/requirements.txt @@ -0,0 +1,359 @@ +huggingface-hub==0.31.2 +tokenizers==0.21.1 +bitsandbytes==0.45.5 +py-cpuinfo==9.0.0 +nvidia-ml-py==12.575.51 +hjson==3.1.0 +smmap==5.0.2 +setproctitle==1.3.6 +sentry-sdk==2.28.0 +ninja==1.11.1.4 +msgpack==1.1.0 +einops==0.8.1 +docker-pycreds==0.4.0 +gitdb==4.0.12 +GitPython==3.1.44 +wandb==0.19.11 +transformers==4.52.0.dev0 +deepspeed==0.16.7 +accelerate==1.8.0.dev0 +peft==0.15.2.dev0 +trl==0.17.0 +flash_attn==2.7.4.post1 +APScheduler==3.10.4 +Authlib==1.3.1 +Deprecated==1.2.18 +Flask-Cors==4.0.1 +Mako==1.3.8 +Markdown==3.6 +PyJWT==2.8.0 +PyMySQL==1.1.1 +PyPika==0.48.9 +RTFDE==0.1.2 +SQLAlchemy==2.0.31 +XlsxWriter==3.2.2 +aiohttp==3.9.5 +alembic==1.13.2 +annotated-types==0.7.0 +anthropic==0.45.2 +asgiref==3.8.1 +async-timeout==4.0.3 +av==12.3.0 +backoff==2.2.1 +bcrypt==4.1.3 +beautifulsoup4==4.12.3 +bidict==0.23.1 +black==24.8.0 +blinker==1.9.0 +boto3==1.34.153 +botocore==1.34.162 +build==1.2.2.post1 +cachetools==5.5.1 +chardet==5.2.0 +chroma-hnswlib==0.7.5 +chromadb==0.5.4 +click==8.1.8 +colorclass==2.2.2 +coloredlogs==15.0.1 +compressed-rtf==1.0.6 +cryptography==44.0.0 +ctranslate2==4.5.0 +dataclasses-json==0.6.7 +deepdiff==8.1.1 +distro==1.9.0 +dnspython==2.7.0 +docker==7.1.0 +docx2txt==0.8 +duckduckgo_search==6.2.13 +durationpy==0.9 +easygui==0.98.3 +ebcdic==1.1.1 +ecdsa==0.19.0 +email_validator==2.2.0 +emoji==2.14.1 +extract-msg==0.52.0 +fake-useragent==1.5.1 
+fastapi==0.111.0 +fastapi-cli==0.0.7 +faster-whisper==1.0.2 +filetype==1.2.0 +Flask==3.0.3 +flatbuffers==25.1.24 +fonttools==4.55.8 +fpdf2==2.7.9 +google-ai-generativelanguage==0.6.6 +google-api-core==2.24.1 +google-api-python-client==2.160.0 +google-auth==2.38.0 +google-auth-httplib2==0.2.0 +google-generativeai==0.7.2 +googleapis-common-protos==1.66.0 +greenlet==3.1.1 +grpcio==1.70.0 +grpcio-status==1.62.3 +httplib2==0.22.0 +httptools==0.6.4 +humanfriendly==10.0 +importlib_metadata==8.4.0 +importlib_resources==6.5.2 +iniconfig==2.0.0 +itsdangerous==2.2.0 +jiter==0.8.2 +jmespath==1.0.1 +joblib==1.4.2 +jsonpatch==1.33 +jsonpath-python==1.0.6 +kubernetes==32.0.0 +langchain==0.2.11 +langchain-chroma==0.1.2 +langchain-community==0.2.10 +langchain-core==0.2.43 +langchain-text-splitters==0.2.4 +langdetect==1.0.9 +langfuse==2.39.2 +langsmith==0.1.147 +lark==1.1.9 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.26.0 +mdurl==0.1.2 +mmh3==5.1.0 +monotonic==1.6 +msoffcrypto-tool==5.4.2 +mypy-extensions==1.0.0 +nltk==3.9.1 +numpy==1.26.4 +oauthlib==3.2.2 +olefile==0.47 +oletools==0.60.2 +onnxruntime==1.20.1 +openai==1.61.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.10.0.84 +opentelemetry-api==1.27.0 +opentelemetry-exporter-otlp-proto-common==1.27.0 +opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-instrumentation==0.48b0 +opentelemetry-instrumentation-asgi==0.48b0 +opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-proto==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-semantic-conventions==0.48b0 +opentelemetry-util-http==0.48b0 +orderly-set==5.2.3 +orjson==3.10.15 +packaging==23.2 +pandas==2.2.2 +passlib==1.7.4 +pathspec==0.12.1 +pcodedmp==1.2.6 +peewee==3.17.6 +peewee-migrate==1.12.2 +pillow==11.1.0 +pluggy==1.5.0 +posthog==3.11.0 +primp==0.11.0 +proto-plus==1.26.0 +protobuf==4.25.6 +psycopg2-binary==2.9.9 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pyclipper==1.3.0.post6 +pydantic==2.8.2 +pydantic_core==2.20.1 +pydub==0.25.1 +pymongo==4.11 
+pypandoc==1.13 +pyparsing==3.2.1 +pypdf==4.3.1 +pyproject_hooks==1.2.0 +pytest==8.2.2 +pytest-docker==3.1.1 +python-dotenv==1.0.1 +python-engineio==4.11.2 +python-iso639==2025.1.28 +python-jose==3.3.0 +python-magic==0.4.27 +python-multipart==0.0.9 +python-pptx==1.0.0 +python-socketio==5.11.3 +pytube==15.0.0 +pyxlsb==1.0.10 +rank-bm25==0.2.2 +RapidFuzz==3.12.1 +rapidocr-onnxruntime==1.3.24 +red-black-tree-mod==1.20 +redis==5.2.1 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rich==13.9.4 +rich-toolkit==0.13.2 +rsa==4.9 +s3transfer==0.10.4 +scikit-learn==1.6.1 +scipy==1.15.1 +sentence-transformers==3.0.1 +shapely==2.0.7 +shellingham==1.5.4 +simple-websocket==1.1.0 +starlette==0.37.2 +tabulate==0.9.0 +tenacity==8.5.0 +threadpoolctl==3.5.0 +tiktoken==0.8.0 +typer==0.15.1 +typing-inspect==0.9.0 +tzlocal==5.2 +ujson==5.10.0 +unstructured==0.15.0 +unstructured-client==0.25.9 +uritemplate==4.1.1 +uvicorn==0.22.0 +uvloop==0.21.0 +validators==0.33.0 +watchfiles==1.0.4 +websockets==14.2 +Werkzeug==3.1.3 +wrapt==1.17.2 +wsproto==1.2.0 +xlrd==2.0.1 +youtube-transcript-api==0.6.2 +zipp==3.21.0 +aiohappyeyeballs==2.4.4 +aiosignal==1.3.2 +datasets==3.2.0 +dill==0.3.8 +et_xmlfile==2.0.0 +evaluate==0.4.3 +filelock==3.17.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +networkx==3.4.2 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openpyxl==3.1.5 +propcache==0.2.1 +pyarrow==19.0.0 +pytz==2025.1 +regex==2024.11.6 +safetensors==0.5.2 +sentencepiece==0.2.0 +sympy==1.13.1 +torch==2.6.0 +tqdm==4.67.1 +triton==3.2.0 +tzdata==2025.1 +xxhash==3.5.0 +yarl==1.18.3 +MarkupSafe==3.0.2 
+PyYAML==6.0.2 +Send2Trash==1.8.3 +anyio==4.8.0 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==3.0.0 +async-lru==2.0.4 +attrs==25.1.0 +babel==2.17.0 +bleach==6.2.0 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +comm==0.2.2 +debugpy==1.8.12 +decorator==5.1.1 +defusedxml==0.7.1 +exceptiongroup==1.2.2 +executing==2.2.0 +fastjsonschema==2.21.1 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +idna==3.10 +ipykernel==6.29.5 +ipython==8.32.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.2 +Jinja2==3.1.5 +json5==0.10.0 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter==1.1.1 +jupyter_client==8.6.3 +jupyter-console==6.6.3 +jupyter_core==5.7.2 +jupyter-events==0.11.0 +jupyter-lsp==2.2.5 +jupyter_server==2.15.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.3.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +matplotlib-inline==0.1.7 +mistune==3.1.1 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook==7.3.2 +notebook_shim==0.2.4 +overrides==7.7.0 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.1 +prompt_toolkit==3.0.50 +psutil==6.1.1 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.19.1 +python-dateutil==2.9.0.post0 +python-json-logger==3.2.1 +pyzmq==26.2.1 +referencing==0.36.2 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.22.3 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.4.0 +tomli==2.2.1 +tornado==6.4.2 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20241206 +typing_extensions==4.12.2 +uri-template==1.3.0 +urllib3==2.3.0 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +pip==22.0.2 +setuptools==59.6.0 +wheel==0.37.1 diff --git 
a/wandb/offline-run-20250516_073234-ywsmjz3f/files/wandb-metadata.json b/wandb/offline-run-20250516_073234-ywsmjz3f/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7371377be31edea723d676c7a39aefcd7fc28bcc --- /dev/null +++ b/wandb/offline-run-20250516_073234-ywsmjz3f/files/wandb-metadata.json @@ -0,0 +1,162 @@ +{ + "os": "Linux-5.10.236-228.935.amzn2.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.12", + "startedAt": "2025-05-16T07:32:34.057084Z", + "args": [ + "--model_name_or_path", + "codellama/CodeLlama-7b-Instruct-hf", + "--dataset_name", + "smangrul/hug_stack", + "--splits", + "train", + "--max_seq_len", + "2048", + "--max_steps", + "2000", + "--save_steps", + "500", + "--eval_steps", + "100", + "--logging_steps", + "5", + "--log_level", + "info", + "--logging_strategy", + "steps", + "--save_strategy", + "steps", + "--push_to_hub", + "--hub_private_repo", + "True", + "--hub_strategy", + "every_save", + "--bf16", + "True", + "--learning_rate", + "3e-4", + "--lr_scheduler_type", + "cosine", + "--weight_decay", + "0.1", + "--warmup_ratio", + "0.1", + "--max_grad_norm", + "1.0", + "--output_dir", + "codellama-hugcoder", + "--per_device_train_batch_size", + "16", + "--per_device_eval_batch_size", + "16", + "--gradient_accumulation_steps", + "4", + "--gradient_checkpointing", + "True", + "--use_reentrant", + "True", + "--dataset_text_field", + "text", + "--test_size", + "0.1", + "--fim_rate", + "0.5", + "--fim_spm_rate", + "0.5", + "--use_peft_lora", + "True", + "--lora_r", + "32", + "--lora_alpha", + "64", + "--lora_dropout", + "0.1", + "--lora_target_modules", + "all-linear", + "--use_4bit_quantization", + "True", + "--use_nested_quant", + "True", + "--bnb_4bit_compute_dtype", + "bfloat16", + "--use_flash_attn", + "True" + ], + "program": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/train.py", + "codePath": 
"personal_copilot/training/train.py", + "git": { + "remote": "https://github.com/pacman100/LLM-Workshop.git", + "commit": "0ba41561ce6ea16d3993069c03ec1dca3ab6769d" + }, + "root": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training", + "host": "project-finecode-65846d7984-mzlgr", + "executable": "/usr/bin/python3", + "codePathLocal": "train.py", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA L4", + "gpu_count": 8, + "disk": { + "/": { + "total": "161048670208", + "used": "79656607744" + } + }, + "memory": { + "total": "781916942336" + }, + "cpu": { + "count": 96, + "countLogical": 192 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..1e49a796bb0a79c02b80688d665ca8732b41c4d3 --- /dev/null +++ b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log @@ -0,0 +1,13 @@ 
+{"time":"2025-05-16T07:30:58.699801538Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmptbc9b5yr/port-24729.txt","pid":24729,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-05-16T07:30:58.726124342Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":24729} +{"time":"2025-05-16T07:30:58.732312551Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44465,"Zone":""}} +{"time":"2025-05-16T07:30:58.74125048Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:34.066161637Z","level":"INFO","msg":"handleInformInit: received","streamId":"ywsmjz3f","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:34.365776057Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ywsmjz3f","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:45.38484093Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:45.384915055Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:45.38497659Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:45.384963659Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-16T07:32:45.386267549Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:45.386310042Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:46530"} +{"time":"2025-05-16T07:32:45.386340054Z","level":"INFO","msg":"server is closed"} diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..0aec8ce735dabb0e7ca0e293430496b387b6fdb3 
--- /dev/null +++ b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-05-16T07:32:34.149387625Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log"} +{"time":"2025-05-16T07:32:34.364789908Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"} +{"time":"2025-05-16T07:32:34.365750715Z","level":"INFO","msg":"created new stream","id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:34.365769606Z","level":"INFO","msg":"stream: started","id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:34.365807029Z","level":"INFO","msg":"sender: started","stream_id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:34.365799878Z","level":"INFO","msg":"writer: Do: started","stream_id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:34.366790667Z","level":"INFO","msg":"handler: started","stream_id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:34.371019219Z","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-16T07:32:45.384954188Z","level":"INFO","msg":"stream: closing","id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:45.38498709Z","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-16T07:32:45.385024963Z","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-16T07:32:45.385961187Z","level":"INFO","msg":"handler: closed","stream_id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:45.385974788Z","level":"INFO","msg":"writer: Close: closed","stream_id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:45.386016981Z","level":"INFO","msg":"sender: closed","stream_id":"ywsmjz3f"} +{"time":"2025-05-16T07:32:45.386092937Z","level":"INFO","msg":"stream: closed","id":"ywsmjz3f"} diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log new file mode 100644 index 
0000000000000000000000000000000000000000..2b76c6311f0e613e8f0ba2e409c9cd74990344ce --- /dev/null +++ b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log @@ -0,0 +1,26 @@ +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_setup.py:_flush():70] Configure stats pid to 24729 +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_init.py:init():852] calling init triggers +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_init.py:init():893] starting backend +2025-05-16 07:32:34,052 INFO MainThread:24729 [wandb_init.py:init():897] sending inform_init request +2025-05-16 07:32:34,056 INFO MainThread:24729 [backend.py:_multiprocessing_setup():101] multiprocessing 
start_methods=fork,spawn,forkserver, using: spawn +2025-05-16 07:32:34,056 INFO MainThread:24729 [wandb_init.py:init():907] backend started and connected +2025-05-16 07:32:34,057 INFO MainThread:24729 [wandb_init.py:init():1005] updated telemetry +2025-05-16 07:32:34,089 INFO MainThread:24729 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-05-16 07:32:34,368 INFO MainThread:24729 [wandb_init.py:init():1104] starting run threads in backend +2025-05-16 07:32:34,755 INFO MainThread:24729 [wandb_run.py:_console_start():2573] atexit reg +2025-05-16 07:32:34,755 INFO MainThread:24729 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-05-16 07:32:34,756 INFO MainThread:24729 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-05-16 07:32:34,756 INFO MainThread:24729 [wandb_run.py:_redirect():2513] Redirects installed. +2025-05-16 07:32:34,758 INFO MainThread:24729 [wandb_init.py:init():1150] run started, returning control to user process +2025-05-16 07:32:34,760 INFO MainThread:24729 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj', 'down_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 
'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': 
True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-30-12_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 
'eval_use_gather_object': False, 'average_tokens_across_devices': False} +2025-05-16 07:32:34,766 INFO MainThread:24729 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - > +2025-05-16 07:32:34,766 INFO MainThread:24729 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None +2025-05-16 07:32:45,384 INFO MsgRouterThr:24729 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles. diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/run-ywsmjz3f.wandb b/wandb/offline-run-20250516_073234-ywsmjz3f/run-ywsmjz3f.wandb new file mode 100644 index 0000000000000000000000000000000000000000..b09c8d72a495b81524c1015c362d5dd85d4a09a3 Binary files /dev/null and b/wandb/offline-run-20250516_073234-ywsmjz3f/run-ywsmjz3f.wandb differ diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/files/output.log b/wandb/offline-run-20250516_073527-tbn7d6q6/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/files/requirements.txt b/wandb/offline-run-20250516_073527-tbn7d6q6/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3 --- /dev/null +++ b/wandb/offline-run-20250516_073527-tbn7d6q6/files/requirements.txt @@ -0,0 +1,359 @@ +huggingface-hub==0.31.2 +tokenizers==0.21.1 +bitsandbytes==0.45.5 +py-cpuinfo==9.0.0 +nvidia-ml-py==12.575.51 +hjson==3.1.0 +smmap==5.0.2 +setproctitle==1.3.6 +sentry-sdk==2.28.0 +ninja==1.11.1.4 +msgpack==1.1.0 +einops==0.8.1 +docker-pycreds==0.4.0 +gitdb==4.0.12 +GitPython==3.1.44 +wandb==0.19.11 +transformers==4.52.0.dev0 +deepspeed==0.16.7 +accelerate==1.8.0.dev0 +peft==0.15.2.dev0 +trl==0.17.0 +flash_attn==2.7.4.post1 +APScheduler==3.10.4 +Authlib==1.3.1 +Deprecated==1.2.18 +Flask-Cors==4.0.1 +Mako==1.3.8 +Markdown==3.6 +PyJWT==2.8.0 +PyMySQL==1.1.1 
+PyPika==0.48.9 +RTFDE==0.1.2 +SQLAlchemy==2.0.31 +XlsxWriter==3.2.2 +aiohttp==3.9.5 +alembic==1.13.2 +annotated-types==0.7.0 +anthropic==0.45.2 +asgiref==3.8.1 +async-timeout==4.0.3 +av==12.3.0 +backoff==2.2.1 +bcrypt==4.1.3 +beautifulsoup4==4.12.3 +bidict==0.23.1 +black==24.8.0 +blinker==1.9.0 +boto3==1.34.153 +botocore==1.34.162 +build==1.2.2.post1 +cachetools==5.5.1 +chardet==5.2.0 +chroma-hnswlib==0.7.5 +chromadb==0.5.4 +click==8.1.8 +colorclass==2.2.2 +coloredlogs==15.0.1 +compressed-rtf==1.0.6 +cryptography==44.0.0 +ctranslate2==4.5.0 +dataclasses-json==0.6.7 +deepdiff==8.1.1 +distro==1.9.0 +dnspython==2.7.0 +docker==7.1.0 +docx2txt==0.8 +duckduckgo_search==6.2.13 +durationpy==0.9 +easygui==0.98.3 +ebcdic==1.1.1 +ecdsa==0.19.0 +email_validator==2.2.0 +emoji==2.14.1 +extract-msg==0.52.0 +fake-useragent==1.5.1 +fastapi==0.111.0 +fastapi-cli==0.0.7 +faster-whisper==1.0.2 +filetype==1.2.0 +Flask==3.0.3 +flatbuffers==25.1.24 +fonttools==4.55.8 +fpdf2==2.7.9 +google-ai-generativelanguage==0.6.6 +google-api-core==2.24.1 +google-api-python-client==2.160.0 +google-auth==2.38.0 +google-auth-httplib2==0.2.0 +google-generativeai==0.7.2 +googleapis-common-protos==1.66.0 +greenlet==3.1.1 +grpcio==1.70.0 +grpcio-status==1.62.3 +httplib2==0.22.0 +httptools==0.6.4 +humanfriendly==10.0 +importlib_metadata==8.4.0 +importlib_resources==6.5.2 +iniconfig==2.0.0 +itsdangerous==2.2.0 +jiter==0.8.2 +jmespath==1.0.1 +joblib==1.4.2 +jsonpatch==1.33 +jsonpath-python==1.0.6 +kubernetes==32.0.0 +langchain==0.2.11 +langchain-chroma==0.1.2 +langchain-community==0.2.10 +langchain-core==0.2.43 +langchain-text-splitters==0.2.4 +langdetect==1.0.9 +langfuse==2.39.2 +langsmith==0.1.147 +lark==1.1.9 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.26.0 +mdurl==0.1.2 +mmh3==5.1.0 +monotonic==1.6 +msoffcrypto-tool==5.4.2 +mypy-extensions==1.0.0 +nltk==3.9.1 +numpy==1.26.4 +oauthlib==3.2.2 +olefile==0.47 +oletools==0.60.2 +onnxruntime==1.20.1 +openai==1.61.0 +opencv-python==4.11.0.86 
+opencv-python-headless==4.10.0.84 +opentelemetry-api==1.27.0 +opentelemetry-exporter-otlp-proto-common==1.27.0 +opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-instrumentation==0.48b0 +opentelemetry-instrumentation-asgi==0.48b0 +opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-proto==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-semantic-conventions==0.48b0 +opentelemetry-util-http==0.48b0 +orderly-set==5.2.3 +orjson==3.10.15 +packaging==23.2 +pandas==2.2.2 +passlib==1.7.4 +pathspec==0.12.1 +pcodedmp==1.2.6 +peewee==3.17.6 +peewee-migrate==1.12.2 +pillow==11.1.0 +pluggy==1.5.0 +posthog==3.11.0 +primp==0.11.0 +proto-plus==1.26.0 +protobuf==4.25.6 +psycopg2-binary==2.9.9 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pyclipper==1.3.0.post6 +pydantic==2.8.2 +pydantic_core==2.20.1 +pydub==0.25.1 +pymongo==4.11 +pypandoc==1.13 +pyparsing==3.2.1 +pypdf==4.3.1 +pyproject_hooks==1.2.0 +pytest==8.2.2 +pytest-docker==3.1.1 +python-dotenv==1.0.1 +python-engineio==4.11.2 +python-iso639==2025.1.28 +python-jose==3.3.0 +python-magic==0.4.27 +python-multipart==0.0.9 +python-pptx==1.0.0 +python-socketio==5.11.3 +pytube==15.0.0 +pyxlsb==1.0.10 +rank-bm25==0.2.2 +RapidFuzz==3.12.1 +rapidocr-onnxruntime==1.3.24 +red-black-tree-mod==1.20 +redis==5.2.1 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rich==13.9.4 +rich-toolkit==0.13.2 +rsa==4.9 +s3transfer==0.10.4 +scikit-learn==1.6.1 +scipy==1.15.1 +sentence-transformers==3.0.1 +shapely==2.0.7 +shellingham==1.5.4 +simple-websocket==1.1.0 +starlette==0.37.2 +tabulate==0.9.0 +tenacity==8.5.0 +threadpoolctl==3.5.0 +tiktoken==0.8.0 +typer==0.15.1 +typing-inspect==0.9.0 +tzlocal==5.2 +ujson==5.10.0 +unstructured==0.15.0 +unstructured-client==0.25.9 +uritemplate==4.1.1 +uvicorn==0.22.0 +uvloop==0.21.0 +validators==0.33.0 +watchfiles==1.0.4 +websockets==14.2 +Werkzeug==3.1.3 +wrapt==1.17.2 +wsproto==1.2.0 +xlrd==2.0.1 +youtube-transcript-api==0.6.2 +zipp==3.21.0 +aiohappyeyeballs==2.4.4 +aiosignal==1.3.2 
+datasets==3.2.0 +dill==0.3.8 +et_xmlfile==2.0.0 +evaluate==0.4.3 +filelock==3.17.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +networkx==3.4.2 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openpyxl==3.1.5 +propcache==0.2.1 +pyarrow==19.0.0 +pytz==2025.1 +regex==2024.11.6 +safetensors==0.5.2 +sentencepiece==0.2.0 +sympy==1.13.1 +torch==2.6.0 +tqdm==4.67.1 +triton==3.2.0 +tzdata==2025.1 +xxhash==3.5.0 +yarl==1.18.3 +MarkupSafe==3.0.2 +PyYAML==6.0.2 +Send2Trash==1.8.3 +anyio==4.8.0 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==3.0.0 +async-lru==2.0.4 +attrs==25.1.0 +babel==2.17.0 +bleach==6.2.0 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +comm==0.2.2 +debugpy==1.8.12 +decorator==5.1.1 +defusedxml==0.7.1 +exceptiongroup==1.2.2 +executing==2.2.0 +fastjsonschema==2.21.1 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +idna==3.10 +ipykernel==6.29.5 +ipython==8.32.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.2 +Jinja2==3.1.5 +json5==0.10.0 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter==1.1.1 +jupyter_client==8.6.3 +jupyter-console==6.6.3 +jupyter_core==5.7.2 +jupyter-events==0.11.0 +jupyter-lsp==2.2.5 +jupyter_server==2.15.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.3.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +matplotlib-inline==0.1.7 +mistune==3.1.1 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook==7.3.2 +notebook_shim==0.2.4 +overrides==7.7.0 +pandocfilters==1.5.1 
+parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.1 +prompt_toolkit==3.0.50 +psutil==6.1.1 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.19.1 +python-dateutil==2.9.0.post0 +python-json-logger==3.2.1 +pyzmq==26.2.1 +referencing==0.36.2 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.22.3 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.4.0 +tomli==2.2.1 +tornado==6.4.2 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20241206 +typing_extensions==4.12.2 +uri-template==1.3.0 +urllib3==2.3.0 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +pip==22.0.2 +setuptools==59.6.0 +wheel==0.37.1 diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/files/wandb-metadata.json b/wandb/offline-run-20250516_073527-tbn7d6q6/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..cdbd59cdd313d131b898c80fd5671300bab841ef --- /dev/null +++ b/wandb/offline-run-20250516_073527-tbn7d6q6/files/wandb-metadata.json @@ -0,0 +1,162 @@ +{ + "os": "Linux-5.10.236-228.935.amzn2.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.12", + "startedAt": "2025-05-16T07:35:27.841578Z", + "args": [ + "--model_name_or_path", + "codellama/CodeLlama-7b-Instruct-hf", + "--dataset_name", + "smangrul/hug_stack", + "--splits", + "train", + "--max_seq_len", + "2048", + "--max_steps", + "2000", + "--save_steps", + "500", + "--eval_steps", + "100", + "--logging_steps", + "5", + "--log_level", + "info", + "--logging_strategy", + "steps", + "--save_strategy", + "steps", + "--push_to_hub", + "--hub_private_repo", + "True", + "--hub_strategy", + "every_save", + "--bf16", + "True", + "--learning_rate", + "3e-4", + "--lr_scheduler_type", + "cosine", + "--weight_decay", + "0.1", + "--warmup_ratio", + "0.1", + "--max_grad_norm", + "1.0", + "--output_dir", + "codellama-hugcoder", + 
"--per_device_train_batch_size", + "8", + "--per_device_eval_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--gradient_checkpointing", + "True", + "--use_reentrant", + "True", + "--dataset_text_field", + "text", + "--test_size", + "0.1", + "--fim_rate", + "0.5", + "--fim_spm_rate", + "0.5", + "--use_peft_lora", + "True", + "--lora_r", + "32", + "--lora_alpha", + "64", + "--lora_dropout", + "0.1", + "--lora_target_modules", + "all-linear", + "--use_4bit_quantization", + "True", + "--use_nested_quant", + "True", + "--bnb_4bit_compute_dtype", + "bfloat16", + "--use_flash_attn", + "True" + ], + "program": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/train.py", + "codePath": "personal_copilot/training/train.py", + "git": { + "remote": "https://github.com/pacman100/LLM-Workshop.git", + "commit": "0ba41561ce6ea16d3993069c03ec1dca3ab6769d" + }, + "root": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training", + "host": "project-finecode-65846d7984-mzlgr", + "executable": "/usr/bin/python3", + "codePathLocal": "train.py", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA L4", + "gpu_count": 8, + "disk": { + "/": { + "total": "161048670208", + "used": "79644426240" + } + }, + "memory": { + "total": "781916942336" + }, + "cpu": { + "count": 96, + "countLogical": 192 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + 
}, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..08b937ac63e90b8036187a5ee0c734bc1a24a9e5 --- /dev/null +++ b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log @@ -0,0 +1,13 @@ +{"time":"2025-05-16T07:35:23.964788517Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpru0m0q6i/port-27688.txt","pid":27688,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-05-16T07:35:23.968897511Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":27688} +{"time":"2025-05-16T07:35:23.969782233Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":39579,"Zone":""}} +{"time":"2025-05-16T07:35:24.077965245Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:27.860328522Z","level":"INFO","msg":"handleInformInit: received","streamId":"tbn7d6q6","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:28.313655136Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tbn7d6q6","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:40.331700042Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:40.331802539Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-16T07:35:40.331789688Z","level":"INFO","msg":"connection: 
closing","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:40.33196098Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:40.332470885Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:40.332500567Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:48002"} +{"time":"2025-05-16T07:35:40.332516958Z","level":"INFO","msg":"server is closed"} diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..fca616e55a6dab66f5a227eadab2723bafa7f53a --- /dev/null +++ b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-05-16T07:35:27.893672998Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log"} +{"time":"2025-05-16T07:35:28.312975339Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"} +{"time":"2025-05-16T07:35:28.313625704Z","level":"INFO","msg":"created new stream","id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:28.313648245Z","level":"INFO","msg":"stream: started","id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:28.313670627Z","level":"INFO","msg":"writer: Do: started","stream_id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:28.31370909Z","level":"INFO","msg":"handler: started","stream_id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:28.313708829Z","level":"INFO","msg":"sender: started","stream_id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:28.318320488Z","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-16T07:35:40.331838151Z","level":"INFO","msg":"stream: closing","id":"tbn7d6q6"} 
+{"time":"2025-05-16T07:35:40.331893235Z","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-16T07:35:40.331940419Z","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-16T07:35:40.332018254Z","level":"INFO","msg":"handler: closed","stream_id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:40.332025334Z","level":"INFO","msg":"writer: Close: closed","stream_id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:40.332069427Z","level":"INFO","msg":"sender: closed","stream_id":"tbn7d6q6"} +{"time":"2025-05-16T07:35:40.332288503Z","level":"INFO","msg":"stream: closed","id":"tbn7d6q6"} diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..403ba9e322b3620de5896b4a25ab7f9464c0115b --- /dev/null +++ b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log @@ -0,0 +1,26 @@ +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_setup.py:_flush():70] Configure stats pid to 27688 +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_init.py:setup_run_log_directory():725] 
Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_init.py:init():852] calling init triggers +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_init.py:init():893] starting backend +2025-05-16 07:35:27,828 INFO MainThread:27688 [wandb_init.py:init():897] sending inform_init request +2025-05-16 07:35:27,841 INFO MainThread:27688 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-16 07:35:27,841 INFO MainThread:27688 [wandb_init.py:init():907] backend started and connected +2025-05-16 07:35:27,842 INFO MainThread:27688 [wandb_init.py:init():1005] updated telemetry +2025-05-16 07:35:27,851 INFO MainThread:27688 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-05-16 07:35:28,315 INFO MainThread:27688 [wandb_init.py:init():1104] starting run threads in backend +2025-05-16 07:35:28,703 INFO MainThread:27688 [wandb_run.py:_console_start():2573] atexit reg +2025-05-16 07:35:28,703 INFO MainThread:27688 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-05-16 07:35:28,703 INFO MainThread:27688 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-05-16 07:35:28,703 INFO MainThread:27688 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-05-16 07:35:28,705 INFO MainThread:27688 [wandb_init.py:init():1150] run started, returning control to user process +2025-05-16 07:35:28,707 INFO MainThread:27688 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj', 'k_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 
'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 
'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-34-08_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': 
None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False} +2025-05-16 07:35:28,713 INFO MainThread:27688 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - > +2025-05-16 07:35:28,713 INFO MainThread:27688 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None +2025-05-16 07:35:40,331 INFO MsgRouterThr:27688 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles. 
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/run-tbn7d6q6.wandb b/wandb/offline-run-20250516_073527-tbn7d6q6/run-tbn7d6q6.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7ebfb43e2a2f1440ad5f94590d197d68174a58fc Binary files /dev/null and b/wandb/offline-run-20250516_073527-tbn7d6q6/run-tbn7d6q6.wandb differ diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/output-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/output-checkpoint.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/requirements-checkpoint.txt b/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/requirements-checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3 --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/requirements-checkpoint.txt @@ -0,0 +1,359 @@ +huggingface-hub==0.31.2 +tokenizers==0.21.1 +bitsandbytes==0.45.5 +py-cpuinfo==9.0.0 +nvidia-ml-py==12.575.51 +hjson==3.1.0 +smmap==5.0.2 +setproctitle==1.3.6 +sentry-sdk==2.28.0 +ninja==1.11.1.4 +msgpack==1.1.0 +einops==0.8.1 +docker-pycreds==0.4.0 +gitdb==4.0.12 +GitPython==3.1.44 +wandb==0.19.11 +transformers==4.52.0.dev0 +deepspeed==0.16.7 +accelerate==1.8.0.dev0 +peft==0.15.2.dev0 +trl==0.17.0 +flash_attn==2.7.4.post1 +APScheduler==3.10.4 +Authlib==1.3.1 +Deprecated==1.2.18 +Flask-Cors==4.0.1 +Mako==1.3.8 +Markdown==3.6 +PyJWT==2.8.0 +PyMySQL==1.1.1 +PyPika==0.48.9 +RTFDE==0.1.2 +SQLAlchemy==2.0.31 +XlsxWriter==3.2.2 +aiohttp==3.9.5 +alembic==1.13.2 +annotated-types==0.7.0 +anthropic==0.45.2 +asgiref==3.8.1 +async-timeout==4.0.3 +av==12.3.0 +backoff==2.2.1 +bcrypt==4.1.3 +beautifulsoup4==4.12.3 +bidict==0.23.1 +black==24.8.0 +blinker==1.9.0 +boto3==1.34.153 +botocore==1.34.162 
+build==1.2.2.post1 +cachetools==5.5.1 +chardet==5.2.0 +chroma-hnswlib==0.7.5 +chromadb==0.5.4 +click==8.1.8 +colorclass==2.2.2 +coloredlogs==15.0.1 +compressed-rtf==1.0.6 +cryptography==44.0.0 +ctranslate2==4.5.0 +dataclasses-json==0.6.7 +deepdiff==8.1.1 +distro==1.9.0 +dnspython==2.7.0 +docker==7.1.0 +docx2txt==0.8 +duckduckgo_search==6.2.13 +durationpy==0.9 +easygui==0.98.3 +ebcdic==1.1.1 +ecdsa==0.19.0 +email_validator==2.2.0 +emoji==2.14.1 +extract-msg==0.52.0 +fake-useragent==1.5.1 +fastapi==0.111.0 +fastapi-cli==0.0.7 +faster-whisper==1.0.2 +filetype==1.2.0 +Flask==3.0.3 +flatbuffers==25.1.24 +fonttools==4.55.8 +fpdf2==2.7.9 +google-ai-generativelanguage==0.6.6 +google-api-core==2.24.1 +google-api-python-client==2.160.0 +google-auth==2.38.0 +google-auth-httplib2==0.2.0 +google-generativeai==0.7.2 +googleapis-common-protos==1.66.0 +greenlet==3.1.1 +grpcio==1.70.0 +grpcio-status==1.62.3 +httplib2==0.22.0 +httptools==0.6.4 +humanfriendly==10.0 +importlib_metadata==8.4.0 +importlib_resources==6.5.2 +iniconfig==2.0.0 +itsdangerous==2.2.0 +jiter==0.8.2 +jmespath==1.0.1 +joblib==1.4.2 +jsonpatch==1.33 +jsonpath-python==1.0.6 +kubernetes==32.0.0 +langchain==0.2.11 +langchain-chroma==0.1.2 +langchain-community==0.2.10 +langchain-core==0.2.43 +langchain-text-splitters==0.2.4 +langdetect==1.0.9 +langfuse==2.39.2 +langsmith==0.1.147 +lark==1.1.9 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.26.0 +mdurl==0.1.2 +mmh3==5.1.0 +monotonic==1.6 +msoffcrypto-tool==5.4.2 +mypy-extensions==1.0.0 +nltk==3.9.1 +numpy==1.26.4 +oauthlib==3.2.2 +olefile==0.47 +oletools==0.60.2 +onnxruntime==1.20.1 +openai==1.61.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.10.0.84 +opentelemetry-api==1.27.0 +opentelemetry-exporter-otlp-proto-common==1.27.0 +opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-instrumentation==0.48b0 +opentelemetry-instrumentation-asgi==0.48b0 +opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-proto==1.27.0 +opentelemetry-sdk==1.27.0 
+opentelemetry-semantic-conventions==0.48b0 +opentelemetry-util-http==0.48b0 +orderly-set==5.2.3 +orjson==3.10.15 +packaging==23.2 +pandas==2.2.2 +passlib==1.7.4 +pathspec==0.12.1 +pcodedmp==1.2.6 +peewee==3.17.6 +peewee-migrate==1.12.2 +pillow==11.1.0 +pluggy==1.5.0 +posthog==3.11.0 +primp==0.11.0 +proto-plus==1.26.0 +protobuf==4.25.6 +psycopg2-binary==2.9.9 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pyclipper==1.3.0.post6 +pydantic==2.8.2 +pydantic_core==2.20.1 +pydub==0.25.1 +pymongo==4.11 +pypandoc==1.13 +pyparsing==3.2.1 +pypdf==4.3.1 +pyproject_hooks==1.2.0 +pytest==8.2.2 +pytest-docker==3.1.1 +python-dotenv==1.0.1 +python-engineio==4.11.2 +python-iso639==2025.1.28 +python-jose==3.3.0 +python-magic==0.4.27 +python-multipart==0.0.9 +python-pptx==1.0.0 +python-socketio==5.11.3 +pytube==15.0.0 +pyxlsb==1.0.10 +rank-bm25==0.2.2 +RapidFuzz==3.12.1 +rapidocr-onnxruntime==1.3.24 +red-black-tree-mod==1.20 +redis==5.2.1 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rich==13.9.4 +rich-toolkit==0.13.2 +rsa==4.9 +s3transfer==0.10.4 +scikit-learn==1.6.1 +scipy==1.15.1 +sentence-transformers==3.0.1 +shapely==2.0.7 +shellingham==1.5.4 +simple-websocket==1.1.0 +starlette==0.37.2 +tabulate==0.9.0 +tenacity==8.5.0 +threadpoolctl==3.5.0 +tiktoken==0.8.0 +typer==0.15.1 +typing-inspect==0.9.0 +tzlocal==5.2 +ujson==5.10.0 +unstructured==0.15.0 +unstructured-client==0.25.9 +uritemplate==4.1.1 +uvicorn==0.22.0 +uvloop==0.21.0 +validators==0.33.0 +watchfiles==1.0.4 +websockets==14.2 +Werkzeug==3.1.3 +wrapt==1.17.2 +wsproto==1.2.0 +xlrd==2.0.1 +youtube-transcript-api==0.6.2 +zipp==3.21.0 +aiohappyeyeballs==2.4.4 +aiosignal==1.3.2 +datasets==3.2.0 +dill==0.3.8 +et_xmlfile==2.0.0 +evaluate==0.4.3 +filelock==3.17.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +networkx==3.4.2 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 
+nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openpyxl==3.1.5 +propcache==0.2.1 +pyarrow==19.0.0 +pytz==2025.1 +regex==2024.11.6 +safetensors==0.5.2 +sentencepiece==0.2.0 +sympy==1.13.1 +torch==2.6.0 +tqdm==4.67.1 +triton==3.2.0 +tzdata==2025.1 +xxhash==3.5.0 +yarl==1.18.3 +MarkupSafe==3.0.2 +PyYAML==6.0.2 +Send2Trash==1.8.3 +anyio==4.8.0 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==3.0.0 +async-lru==2.0.4 +attrs==25.1.0 +babel==2.17.0 +bleach==6.2.0 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +comm==0.2.2 +debugpy==1.8.12 +decorator==5.1.1 +defusedxml==0.7.1 +exceptiongroup==1.2.2 +executing==2.2.0 +fastjsonschema==2.21.1 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +idna==3.10 +ipykernel==6.29.5 +ipython==8.32.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.2 +Jinja2==3.1.5 +json5==0.10.0 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter==1.1.1 +jupyter_client==8.6.3 +jupyter-console==6.6.3 +jupyter_core==5.7.2 +jupyter-events==0.11.0 +jupyter-lsp==2.2.5 +jupyter_server==2.15.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.3.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +matplotlib-inline==0.1.7 +mistune==3.1.1 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook==7.3.2 +notebook_shim==0.2.4 +overrides==7.7.0 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.1 +prompt_toolkit==3.0.50 +psutil==6.1.1 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.19.1 +python-dateutil==2.9.0.post0 +python-json-logger==3.2.1 +pyzmq==26.2.1 +referencing==0.36.2 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 
+rpds-py==0.22.3 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.4.0 +tomli==2.2.1 +tornado==6.4.2 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20241206 +typing_extensions==4.12.2 +uri-template==1.3.0 +urllib3==2.3.0 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +pip==22.0.2 +setuptools==59.6.0 +wheel==0.37.1 diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/output.log b/wandb/offline-run-20250516_073747-jc2tz43q/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/requirements.txt b/wandb/offline-run-20250516_073747-jc2tz43q/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3 --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/files/requirements.txt @@ -0,0 +1,359 @@ +huggingface-hub==0.31.2 +tokenizers==0.21.1 +bitsandbytes==0.45.5 +py-cpuinfo==9.0.0 +nvidia-ml-py==12.575.51 +hjson==3.1.0 +smmap==5.0.2 +setproctitle==1.3.6 +sentry-sdk==2.28.0 +ninja==1.11.1.4 +msgpack==1.1.0 +einops==0.8.1 +docker-pycreds==0.4.0 +gitdb==4.0.12 +GitPython==3.1.44 +wandb==0.19.11 +transformers==4.52.0.dev0 +deepspeed==0.16.7 +accelerate==1.8.0.dev0 +peft==0.15.2.dev0 +trl==0.17.0 +flash_attn==2.7.4.post1 +APScheduler==3.10.4 +Authlib==1.3.1 +Deprecated==1.2.18 +Flask-Cors==4.0.1 +Mako==1.3.8 +Markdown==3.6 +PyJWT==2.8.0 +PyMySQL==1.1.1 +PyPika==0.48.9 +RTFDE==0.1.2 +SQLAlchemy==2.0.31 +XlsxWriter==3.2.2 +aiohttp==3.9.5 +alembic==1.13.2 +annotated-types==0.7.0 +anthropic==0.45.2 +asgiref==3.8.1 +async-timeout==4.0.3 +av==12.3.0 +backoff==2.2.1 +bcrypt==4.1.3 +beautifulsoup4==4.12.3 +bidict==0.23.1 +black==24.8.0 +blinker==1.9.0 +boto3==1.34.153 +botocore==1.34.162 +build==1.2.2.post1 +cachetools==5.5.1 +chardet==5.2.0 
+chroma-hnswlib==0.7.5 +chromadb==0.5.4 +click==8.1.8 +colorclass==2.2.2 +coloredlogs==15.0.1 +compressed-rtf==1.0.6 +cryptography==44.0.0 +ctranslate2==4.5.0 +dataclasses-json==0.6.7 +deepdiff==8.1.1 +distro==1.9.0 +dnspython==2.7.0 +docker==7.1.0 +docx2txt==0.8 +duckduckgo_search==6.2.13 +durationpy==0.9 +easygui==0.98.3 +ebcdic==1.1.1 +ecdsa==0.19.0 +email_validator==2.2.0 +emoji==2.14.1 +extract-msg==0.52.0 +fake-useragent==1.5.1 +fastapi==0.111.0 +fastapi-cli==0.0.7 +faster-whisper==1.0.2 +filetype==1.2.0 +Flask==3.0.3 +flatbuffers==25.1.24 +fonttools==4.55.8 +fpdf2==2.7.9 +google-ai-generativelanguage==0.6.6 +google-api-core==2.24.1 +google-api-python-client==2.160.0 +google-auth==2.38.0 +google-auth-httplib2==0.2.0 +google-generativeai==0.7.2 +googleapis-common-protos==1.66.0 +greenlet==3.1.1 +grpcio==1.70.0 +grpcio-status==1.62.3 +httplib2==0.22.0 +httptools==0.6.4 +humanfriendly==10.0 +importlib_metadata==8.4.0 +importlib_resources==6.5.2 +iniconfig==2.0.0 +itsdangerous==2.2.0 +jiter==0.8.2 +jmespath==1.0.1 +joblib==1.4.2 +jsonpatch==1.33 +jsonpath-python==1.0.6 +kubernetes==32.0.0 +langchain==0.2.11 +langchain-chroma==0.1.2 +langchain-community==0.2.10 +langchain-core==0.2.43 +langchain-text-splitters==0.2.4 +langdetect==1.0.9 +langfuse==2.39.2 +langsmith==0.1.147 +lark==1.1.9 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.26.0 +mdurl==0.1.2 +mmh3==5.1.0 +monotonic==1.6 +msoffcrypto-tool==5.4.2 +mypy-extensions==1.0.0 +nltk==3.9.1 +numpy==1.26.4 +oauthlib==3.2.2 +olefile==0.47 +oletools==0.60.2 +onnxruntime==1.20.1 +openai==1.61.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.10.0.84 +opentelemetry-api==1.27.0 +opentelemetry-exporter-otlp-proto-common==1.27.0 +opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-instrumentation==0.48b0 +opentelemetry-instrumentation-asgi==0.48b0 +opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-proto==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-semantic-conventions==0.48b0 
+opentelemetry-util-http==0.48b0 +orderly-set==5.2.3 +orjson==3.10.15 +packaging==23.2 +pandas==2.2.2 +passlib==1.7.4 +pathspec==0.12.1 +pcodedmp==1.2.6 +peewee==3.17.6 +peewee-migrate==1.12.2 +pillow==11.1.0 +pluggy==1.5.0 +posthog==3.11.0 +primp==0.11.0 +proto-plus==1.26.0 +protobuf==4.25.6 +psycopg2-binary==2.9.9 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pyclipper==1.3.0.post6 +pydantic==2.8.2 +pydantic_core==2.20.1 +pydub==0.25.1 +pymongo==4.11 +pypandoc==1.13 +pyparsing==3.2.1 +pypdf==4.3.1 +pyproject_hooks==1.2.0 +pytest==8.2.2 +pytest-docker==3.1.1 +python-dotenv==1.0.1 +python-engineio==4.11.2 +python-iso639==2025.1.28 +python-jose==3.3.0 +python-magic==0.4.27 +python-multipart==0.0.9 +python-pptx==1.0.0 +python-socketio==5.11.3 +pytube==15.0.0 +pyxlsb==1.0.10 +rank-bm25==0.2.2 +RapidFuzz==3.12.1 +rapidocr-onnxruntime==1.3.24 +red-black-tree-mod==1.20 +redis==5.2.1 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rich==13.9.4 +rich-toolkit==0.13.2 +rsa==4.9 +s3transfer==0.10.4 +scikit-learn==1.6.1 +scipy==1.15.1 +sentence-transformers==3.0.1 +shapely==2.0.7 +shellingham==1.5.4 +simple-websocket==1.1.0 +starlette==0.37.2 +tabulate==0.9.0 +tenacity==8.5.0 +threadpoolctl==3.5.0 +tiktoken==0.8.0 +typer==0.15.1 +typing-inspect==0.9.0 +tzlocal==5.2 +ujson==5.10.0 +unstructured==0.15.0 +unstructured-client==0.25.9 +uritemplate==4.1.1 +uvicorn==0.22.0 +uvloop==0.21.0 +validators==0.33.0 +watchfiles==1.0.4 +websockets==14.2 +Werkzeug==3.1.3 +wrapt==1.17.2 +wsproto==1.2.0 +xlrd==2.0.1 +youtube-transcript-api==0.6.2 +zipp==3.21.0 +aiohappyeyeballs==2.4.4 +aiosignal==1.3.2 +datasets==3.2.0 +dill==0.3.8 +et_xmlfile==2.0.0 +evaluate==0.4.3 +filelock==3.17.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +networkx==3.4.2 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 
+nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openpyxl==3.1.5 +propcache==0.2.1 +pyarrow==19.0.0 +pytz==2025.1 +regex==2024.11.6 +safetensors==0.5.2 +sentencepiece==0.2.0 +sympy==1.13.1 +torch==2.6.0 +tqdm==4.67.1 +triton==3.2.0 +tzdata==2025.1 +xxhash==3.5.0 +yarl==1.18.3 +MarkupSafe==3.0.2 +PyYAML==6.0.2 +Send2Trash==1.8.3 +anyio==4.8.0 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==3.0.0 +async-lru==2.0.4 +attrs==25.1.0 +babel==2.17.0 +bleach==6.2.0 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +comm==0.2.2 +debugpy==1.8.12 +decorator==5.1.1 +defusedxml==0.7.1 +exceptiongroup==1.2.2 +executing==2.2.0 +fastjsonschema==2.21.1 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +idna==3.10 +ipykernel==6.29.5 +ipython==8.32.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.2 +Jinja2==3.1.5 +json5==0.10.0 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter==1.1.1 +jupyter_client==8.6.3 +jupyter-console==6.6.3 +jupyter_core==5.7.2 +jupyter-events==0.11.0 +jupyter-lsp==2.2.5 +jupyter_server==2.15.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.3.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +matplotlib-inline==0.1.7 +mistune==3.1.1 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook==7.3.2 +notebook_shim==0.2.4 +overrides==7.7.0 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.1 +prompt_toolkit==3.0.50 +psutil==6.1.1 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.19.1 +python-dateutil==2.9.0.post0 +python-json-logger==3.2.1 +pyzmq==26.2.1 +referencing==0.36.2 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.22.3 +six==1.17.0 
+sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.4.0 +tomli==2.2.1 +tornado==6.4.2 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20241206 +typing_extensions==4.12.2 +uri-template==1.3.0 +urllib3==2.3.0 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +pip==22.0.2 +setuptools==59.6.0 +wheel==0.37.1 diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/wandb-metadata.json b/wandb/offline-run-20250516_073747-jc2tz43q/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7097317b98a945cbc73910836d4d7816918309c1 --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/files/wandb-metadata.json @@ -0,0 +1,162 @@ +{ + "os": "Linux-5.10.236-228.935.amzn2.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.12", + "startedAt": "2025-05-16T07:37:47.236910Z", + "args": [ + "--model_name_or_path", + "codellama/CodeLlama-7b-Instruct-hf", + "--dataset_name", + "smangrul/hug_stack", + "--splits", + "train", + "--max_seq_len", + "2048", + "--max_steps", + "2000", + "--save_steps", + "500", + "--eval_steps", + "100", + "--logging_steps", + "5", + "--log_level", + "info", + "--logging_strategy", + "steps", + "--save_strategy", + "steps", + "--push_to_hub", + "--hub_private_repo", + "True", + "--hub_strategy", + "every_save", + "--bf16", + "True", + "--learning_rate", + "3e-4", + "--lr_scheduler_type", + "cosine", + "--weight_decay", + "0.1", + "--warmup_ratio", + "0.1", + "--max_grad_norm", + "1.0", + "--output_dir", + "codellama-hugcoder", + "--per_device_train_batch_size", + "4", + "--per_device_eval_batch_size", + "4", + "--gradient_accumulation_steps", + "4", + "--gradient_checkpointing", + "True", + "--use_reentrant", + "True", + "--dataset_text_field", + "text", + "--test_size", + "0.1", + "--fim_rate", + "0.5", + "--fim_spm_rate", + "0.5", + "--use_peft_lora", + "True", + "--lora_r", + "32", + "--lora_alpha", + "64", + 
"--lora_dropout", + "0.1", + "--lora_target_modules", + "all-linear", + "--use_4bit_quantization", + "True", + "--use_nested_quant", + "True", + "--bnb_4bit_compute_dtype", + "bfloat16", + "--use_flash_attn", + "True" + ], + "program": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/train.py", + "codePath": "personal_copilot/training/train.py", + "git": { + "remote": "https://github.com/pacman100/LLM-Workshop.git", + "commit": "0ba41561ce6ea16d3993069c03ec1dca3ab6769d" + }, + "root": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training", + "host": "project-finecode-65846d7984-mzlgr", + "executable": "/usr/bin/python3", + "codePathLocal": "train.py", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA L4", + "gpu_count": 8, + "disk": { + "/": { + "total": "161048670208", + "used": "79644385280" + } + }, + "memory": { + "total": "781916942336" + }, + "cpu": { + "count": 96, + "countLogical": 192 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + }, + { + "name": "NVIDIA L4", + "memoryTotal": "24152899584", + "cudaCores": 7424, + "architecture": "Ada" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file 
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-checkpoint.log new file mode 100644 index 0000000000000000000000000000000000000000..d90128eade86bb07359f810edbe2e6d6c84de48d --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-checkpoint.log @@ -0,0 +1,25 @@ +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Configure stats pid to 29365 +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():852] calling init triggers +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():893] 
starting backend +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():897] sending inform_init request +2025-05-16 07:37:47,236 INFO MainThread:29365 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-16 07:37:47,236 INFO MainThread:29365 [wandb_init.py:init():907] backend started and connected +2025-05-16 07:37:47,237 INFO MainThread:29365 [wandb_init.py:init():1005] updated telemetry +2025-05-16 07:37:47,244 INFO MainThread:29365 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-05-16 07:37:47,473 INFO MainThread:29365 [wandb_init.py:init():1104] starting run threads in backend +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_console_start():2573] atexit reg +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-05-16 07:37:47,857 INFO MainThread:29365 [wandb_init.py:init():1150] run started, returning control to user process +2025-05-16 07:37:47,859 INFO MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'down_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj', 'gate_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 
'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 
'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-36-11_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': 
None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False} +2025-05-16 07:37:47,866 INFO MainThread:29365 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - > +2025-05-16 07:37:47,866 INFO MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-core-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-core-checkpoint.log new file mode 100644 index 0000000000000000000000000000000000000000..d8651a3eaf5c4b99f83713269d923c485096f122 --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-core-checkpoint.log @@ -0,0 +1,6 @@ +{"time":"2025-05-16T07:37:38.620987457Z","level":"INFO","msg":"main: starting 
server","port-filename":"/tmp/tmp_3cqd41s/port-29365.txt","pid":29365,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-05-16T07:37:38.627281182Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":29365} +{"time":"2025-05-16T07:37:38.628474204Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43351,"Zone":""}} +{"time":"2025-05-16T07:37:38.725900233Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57302"} +{"time":"2025-05-16T07:37:47.242158486Z","level":"INFO","msg":"handleInformInit: received","streamId":"jc2tz43q","id":"127.0.0.1:57302"} +{"time":"2025-05-16T07:37:47.471081329Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"jc2tz43q","id":"127.0.0.1:57302"} diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-internal-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-internal-checkpoint.log new file mode 100644 index 0000000000000000000000000000000000000000..933f96ad797a96ef23bba6e0326388ce0cd85c7f --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-internal-checkpoint.log @@ -0,0 +1,8 @@ +{"time":"2025-05-16T07:37:47.253769219Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log"} +{"time":"2025-05-16T07:37:47.470171926Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"} +{"time":"2025-05-16T07:37:47.471043556Z","level":"INFO","msg":"created new stream","id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.471071988Z","level":"INFO","msg":"stream: started","id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.47110475Z","level":"INFO","msg":"writer: Do: 
started","stream_id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.471115831Z","level":"INFO","msg":"handler: started","stream_id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.471180815Z","level":"INFO","msg":"sender: started","stream_id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.477686926Z","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..a9fa499c890860ea95258f04260540e5177981f6 --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log @@ -0,0 +1,13 @@ +{"time":"2025-05-16T07:37:38.620987457Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp_3cqd41s/port-29365.txt","pid":29365,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-05-16T07:37:38.627281182Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":29365} +{"time":"2025-05-16T07:37:38.628474204Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43351,"Zone":""}} +{"time":"2025-05-16T07:37:38.725900233Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57302"} +{"time":"2025-05-16T07:37:47.242158486Z","level":"INFO","msg":"handleInformInit: received","streamId":"jc2tz43q","id":"127.0.0.1:57302"} +{"time":"2025-05-16T07:37:47.471081329Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"jc2tz43q","id":"127.0.0.1:57302"} +{"time":"2025-05-17T22:27:35.403652933Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:57302"} +{"time":"2025-05-17T22:27:35.403810773Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-17T22:27:35.403793492Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:57302"} 
+{"time":"2025-05-17T22:27:35.404006127Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:57302"} +{"time":"2025-05-17T22:27:35.405135844Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:57302"} +{"time":"2025-05-17T22:27:35.405151025Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:57302"} +{"time":"2025-05-17T22:27:35.405161186Z","level":"INFO","msg":"server is closed"} diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ee5bf05e14a33a7993620274bc5fffbfa49ca1f9 --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-05-16T07:37:47.253769219Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log"} +{"time":"2025-05-16T07:37:47.470171926Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"} +{"time":"2025-05-16T07:37:47.471043556Z","level":"INFO","msg":"created new stream","id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.471071988Z","level":"INFO","msg":"stream: started","id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.47110475Z","level":"INFO","msg":"writer: Do: started","stream_id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.471115831Z","level":"INFO","msg":"handler: started","stream_id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.471180815Z","level":"INFO","msg":"sender: started","stream_id":"jc2tz43q"} +{"time":"2025-05-16T07:37:47.477686926Z","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-17T22:27:35.403843016Z","level":"INFO","msg":"stream: closing","id":"jc2tz43q"} 
+{"time":"2025-05-17T22:27:35.404804572Z","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-17T22:27:35.404850555Z","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-17T22:27:35.40493109Z","level":"INFO","msg":"handler: closed","stream_id":"jc2tz43q"} +{"time":"2025-05-17T22:27:35.404943001Z","level":"INFO","msg":"writer: Close: closed","stream_id":"jc2tz43q"} +{"time":"2025-05-17T22:27:35.404963782Z","level":"INFO","msg":"sender: closed","stream_id":"jc2tz43q"} +{"time":"2025-05-17T22:27:35.405060219Z","level":"INFO","msg":"stream: closed","id":"jc2tz43q"} diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..067508921187c34e3dc0bd7885357818572fbf91 --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log @@ -0,0 +1,26 @@ +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Configure stats pid to 29365 +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:setup_run_log_directory():725] Logging 
internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():852] calling init triggers +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():893] starting backend +2025-05-16 07:37:47,232 INFO MainThread:29365 [wandb_init.py:init():897] sending inform_init request +2025-05-16 07:37:47,236 INFO MainThread:29365 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-16 07:37:47,236 INFO MainThread:29365 [wandb_init.py:init():907] backend started and connected +2025-05-16 07:37:47,237 INFO MainThread:29365 [wandb_init.py:init():1005] updated telemetry +2025-05-16 07:37:47,244 INFO MainThread:29365 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-05-16 07:37:47,473 INFO MainThread:29365 [wandb_init.py:init():1104] starting run threads in backend +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_console_start():2573] atexit reg +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-05-16 07:37:47,854 INFO MainThread:29365 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-05-16 07:37:47,857 INFO MainThread:29365 [wandb_init.py:init():1150] run started, returning control to user process +2025-05-16 07:37:47,859 INFO MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'down_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj', 'gate_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 
'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 
'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-36-11_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': 
None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False} +2025-05-16 07:37:47,866 INFO MainThread:29365 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - > +2025-05-16 07:37:47,866 INFO MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None +2025-05-17 22:27:35,403 INFO MsgRouterThr:29365 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles. 
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb b/wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb new file mode 100644 index 0000000000000000000000000000000000000000..be710865ae2cf80d025f72197b5b5bd35f92a90b --- /dev/null +++ b/wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a73eb777948b78d05faf1e2809a681696b5c56ce0a7a7ada14ae168f5f3438 +size 33685295