|
|
--- |
|
|
base_model: google/gemma-2b |
|
|
pipeline_tag: text-generation |
|
|
library_name: peft |
|
|
--- |
|
|
|
|
|
# Model Card for Gemma-2b Fine-Tuned on python-oasst |
|
|
|
|
|
Gemma-2b fine-tuned on Python data (the python-oasst dataset) with QLoRA, using the PEFT library. |
|
|
|
|
|
|
|
|
## Model Details |
|
|
|
|
|
### Model Description |
|
|
|
|
|
|
|
|
### Model Sources [optional] |
|
|
Gemma-2b fine-tuned on the python-oasst dataset. |
|
|
|
|
|
## Uses |
|
|
|
|
|
|
|
|
## Training Details |
|
|
|
|
|
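### Training Data |
|
|
The model was trained on the python-oasst dataset. The metrics below are the values logged to Weights & Biases at global step 232: |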
|
|
|
|
|
{ |
|
|
"_timestamp": 1711018613.433522, |
|
|
"train/grad_norm": 0.1240904619429259, |
|
|
"train/global_step": 232, |
|
|
"eval/steps_per_second": 2.894, |
|
|
"_step": 232, |
|
|
"_runtime": 2545.4226660728455, |
|
|
"eval/loss": 1.189491629600525, |
|
|
"eval/runtime": 1805.8574, |
|
|
"train/learning_rate": 0.000014800637958532697, |
|
|
"eval/samples_per_second": 23.152, |
|
|
"_wandb.runtime": 2547, |
|
|
"train/loss": 1.0436, |
|
|
"train/epoch": 0.01 |
|
|
} |
|
|
|
|
|
### Results |
|
|
|
|
|
|
|
|
#### Summary |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
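## Technical Specifications [optional] |
|
|
Training and model configuration as logged to Weights & Biases (each entry has a `desc` and a `value` field): |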
|
|
{ |
|
|
"bf16": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"fp16": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"fsdp": { |
|
|
"desc": null, |
|
|
"value": [] |
|
|
}, |
|
|
"seed": { |
|
|
"desc": null, |
|
|
"value": 42 |
|
|
}, |
|
|
"tf32": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"debug": { |
|
|
"desc": null, |
|
|
"value": [] |
|
|
}, |
|
|
"optim": { |
|
|
"desc": null, |
|
|
"value": "adamw_bnb_8bit" |
|
|
}, |
|
|
"qlora": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"top_k": { |
|
|
"desc": null, |
|
|
"value": 50 |
|
|
}, |
|
|
"top_p": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"_wandb": { |
|
|
"desc": null, |
|
|
"value": { |
|
|
"m": [ |
|
|
{ |
|
|
"1": "train/global_step", |
|
|
"6": [ |
|
|
3 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "train/loss", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "train/grad_norm", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "train/learning_rate", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "train/epoch", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "eval/loss", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "eval/runtime", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "eval/samples_per_second", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"1": "eval/steps_per_second", |
|
|
"5": 1, |
|
|
"6": [ |
|
|
1 |
|
|
] |
|
|
} |
|
|
], |
|
|
"t": { |
|
|
"1": [ |
|
|
1, |
|
|
5, |
|
|
11, |
|
|
49, |
|
|
51, |
|
|
53, |
|
|
55, |
|
|
71, |
|
|
84, |
|
|
98, |
|
|
99, |
|
|
100, |
|
|
105 |
|
|
], |
|
|
"2": [ |
|
|
1, |
|
|
5, |
|
|
11, |
|
|
49, |
|
|
51, |
|
|
53, |
|
|
55, |
|
|
71, |
|
|
84, |
|
|
98, |
|
|
99, |
|
|
100, |
|
|
105 |
|
|
], |
|
|
"3": [ |
|
|
3, |
|
|
7, |
|
|
23 |
|
|
], |
|
|
"4": "3.10.13", |
|
|
"5": "0.16.4", |
|
|
"6": "4.39.0.dev0", |
|
|
"8": [ |
|
|
5 |
|
|
], |
|
|
"9": { |
|
|
"1": "transformers_trainer" |
|
|
}, |
|
|
"13": "linux-x86_64" |
|
|
}, |
|
|
"framework": "huggingface", |
|
|
"start_time": 1711016068, |
|
|
"cli_version": "0.16.4", |
|
|
"is_jupyter_run": false, |
|
|
"python_version": "3.10.13", |
|
|
"is_kaggle_kernel": false, |
|
|
"huggingface_version": "4.39.0.dev0" |
|
|
} |
|
|
}, |
|
|
"prefix": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"do_eval": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"no_cuda": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"use_cpu": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"do_train": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"head_dim": { |
|
|
"desc": null, |
|
|
"value": 256 |
|
|
}, |
|
|
"id2label": { |
|
|
"desc": null, |
|
|
"value": { |
|
|
"0": "LABEL_0", |
|
|
"1": "LABEL_1" |
|
|
} |
|
|
}, |
|
|
"label2id": { |
|
|
"desc": null, |
|
|
"value": { |
|
|
"LABEL_0": 0, |
|
|
"LABEL_1": 1 |
|
|
} |
|
|
}, |
|
|
"run_name": { |
|
|
"desc": null, |
|
|
"value": "./out" |
|
|
}, |
|
|
"use_ipex": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"adafactor": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"data_seed": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"deepspeed": { |
|
|
"desc": null, |
|
|
"value": "deepspeed_configs/zero1.json" |
|
|
}, |
|
|
"do_sample": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"hub_token": { |
|
|
"desc": null, |
|
|
"value": "<HUB_TOKEN>" |
|
|
}, |
|
|
"log_level": { |
|
|
"desc": null, |
|
|
"value": "passive" |
|
|
}, |
|
|
"max_steps": { |
|
|
"desc": null, |
|
|
"value": -1 |
|
|
}, |
|
|
"num_beams": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"ray_scope": { |
|
|
"desc": null, |
|
|
"value": "last" |
|
|
}, |
|
|
"report_to": { |
|
|
"desc": null, |
|
|
"value": [ |
|
|
"wandb" |
|
|
] |
|
|
}, |
|
|
"typical_p": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"use_cache": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"adam_beta1": { |
|
|
"desc": null, |
|
|
"value": 0.9 |
|
|
}, |
|
|
"adam_beta2": { |
|
|
"desc": null, |
|
|
"value": 0.999 |
|
|
}, |
|
|
"do_predict": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"eval_delay": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"eval_steps": { |
|
|
"desc": null, |
|
|
"value": 0.03125 |
|
|
}, |
|
|
"hidden_act": { |
|
|
"desc": null, |
|
|
"value": "gelu" |
|
|
}, |
|
|
"is_decoder": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"local_rank": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"max_length": { |
|
|
"desc": null, |
|
|
"value": 20 |
|
|
}, |
|
|
"min_length": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"model_type": { |
|
|
"desc": null, |
|
|
"value": "gemma" |
|
|
}, |
|
|
"optim_args": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"orpo_alpha": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"output_dir": { |
|
|
"desc": null, |
|
|
"value": "./out" |
|
|
}, |
|
|
"past_index": { |
|
|
"desc": null, |
|
|
"value": -1 |
|
|
}, |
|
|
"rope_theta": { |
|
|
"desc": null, |
|
|
"value": 10000 |
|
|
}, |
|
|
"save_steps": { |
|
|
"desc": null, |
|
|
"value": 0.125 |
|
|
}, |
|
|
"vocab_size": { |
|
|
"desc": null, |
|
|
"value": 256000 |
|
|
}, |
|
|
"bench_split": { |
|
|
"desc": null, |
|
|
"value": "eval" |
|
|
}, |
|
|
"ddp_backend": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"ddp_timeout": { |
|
|
"desc": null, |
|
|
"value": 1800 |
|
|
}, |
|
|
"fsdp_config": { |
|
|
"desc": null, |
|
|
"value": { |
|
|
"xla": false, |
|
|
"xla_fsdp_v2": false, |
|
|
"min_num_params": 0, |
|
|
"xla_fsdp_grad_ckpt": false |
|
|
} |
|
|
}, |
|
|
"hidden_size": { |
|
|
"desc": null, |
|
|
"value": 2048 |
|
|
}, |
|
|
"label_names": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"logging_dir": { |
|
|
"desc": null, |
|
|
"value": "./out/runs/Mar21_10-14-24_8205afe3ecd2" |
|
|
}, |
|
|
"pretraining": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"push_to_hub": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"return_dict": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"temperature": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"torch_dtype": { |
|
|
"desc": null, |
|
|
"value": "bfloat16" |
|
|
}, |
|
|
"torchdynamo": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"torchscript": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"adam_epsilon": { |
|
|
"desc": null, |
|
|
"value": 1e-8 |
|
|
}, |
|
|
"bos_token_id": { |
|
|
"desc": null, |
|
|
"value": 2 |
|
|
}, |
|
|
"disable_tqdm": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"eos_token_id": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"fp16_backend": { |
|
|
"desc": null, |
|
|
"value": "auto" |
|
|
}, |
|
|
"hub_model_id": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"hub_strategy": { |
|
|
"desc": null, |
|
|
"value": "every_save" |
|
|
}, |
|
|
"pad_token_id": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"problem_type": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"pruned_heads": { |
|
|
"desc": null, |
|
|
"value": {} |
|
|
}, |
|
|
"relora_steps": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"rms_norm_eps": { |
|
|
"desc": null, |
|
|
"value": 0.000001 |
|
|
}, |
|
|
"rope_scaling": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"sep_token_id": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"use_bfloat16": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"warmup_ratio": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"warmup_steps": { |
|
|
"desc": null, |
|
|
"value": 3135 |
|
|
}, |
|
|
"weight_decay": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"_name_or_path": { |
|
|
"desc": null, |
|
|
"value": "dvdmrs09/gemma2b-train" |
|
|
}, |
|
|
"architectures": { |
|
|
"desc": null, |
|
|
"value": [ |
|
|
"GemmaForCausalLM" |
|
|
] |
|
|
}, |
|
|
"bad_words_ids": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"bench_dataset": { |
|
|
"desc": null, |
|
|
"value": "pharaouk/dharma-1/dharma_1_mini.json" |
|
|
}, |
|
|
"do_bench_eval": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"jit_mode_eval": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"learning_rate": { |
|
|
"desc": null, |
|
|
"value": 0.0002 |
|
|
}, |
|
|
"logging_steps": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"max_grad_norm": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"mp_parameters": { |
|
|
"desc": null, |
|
|
"value": "" |
|
|
}, |
|
|
"output_scores": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"save_strategy": { |
|
|
"desc": null, |
|
|
"value": "steps" |
|
|
}, |
|
|
"split_batches": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"torch_compile": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"tpu_num_cores": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"attention_bias": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"bf16_full_eval": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"early_stopping": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"fp16_full_eval": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"fp16_opt_level": { |
|
|
"desc": null, |
|
|
"value": "O1" |
|
|
}, |
|
|
"length_penalty": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"max_seq_length": { |
|
|
"desc": null, |
|
|
"value": 4096 |
|
|
}, |
|
|
"sample_packing": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"tf_legacy_loss": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"use_mps_device": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"finetuning_task": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"group_by_length": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"hub_always_push": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"num_beam_groups": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"save_only_model": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"suppress_tokens": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"tokenizer_class": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"dispatch_batches": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"full_determinism": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"hub_private_repo": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"ignore_data_skip": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"log_on_each_node": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"logging_strategy": { |
|
|
"desc": null, |
|
|
"value": "steps" |
|
|
}, |
|
|
"num_train_epochs": { |
|
|
"desc": null, |
|
|
"value": 8 |
|
|
}, |
|
|
"save_safetensors": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"save_total_limit": { |
|
|
"desc": null, |
|
|
"value": 4 |
|
|
}, |
|
|
"attention_dropout": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"ddp_bucket_cap_mb": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"diversity_penalty": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"do_causal_lm_eval": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"greater_is_better": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"initializer_range": { |
|
|
"desc": null, |
|
|
"value": 0.02 |
|
|
}, |
|
|
"intermediate_size": { |
|
|
"desc": null, |
|
|
"value": 16384 |
|
|
}, |
|
|
"log_level_replica": { |
|
|
"desc": null, |
|
|
"value": "warning" |
|
|
}, |
|
|
"loraplus_lr_ratio": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"lr_scheduler_type": { |
|
|
"desc": null, |
|
|
"value": "cosine" |
|
|
}, |
|
|
"max_bench_samples": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"num_hidden_layers": { |
|
|
"desc": null, |
|
|
"value": 18 |
|
|
}, |
|
|
"output_attentions": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"push_to_hub_token": { |
|
|
"desc": null, |
|
|
"value": "<PUSH_TO_HUB_TOKEN>" |
|
|
}, |
|
|
"save_on_each_node": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"tpu_metrics_debug": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"accelerator_config": { |
|
|
"desc": null, |
|
|
"value": { |
|
|
"even_batches": true, |
|
|
"split_batches": false, |
|
|
"dispatch_batches": null, |
|
|
"use_seedable_sampler": true |
|
|
} |
|
|
}, |
|
|
"is_encoder_decoder": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"length_column_name": { |
|
|
"desc": null, |
|
|
"value": "length" |
|
|
}, |
|
|
"logging_first_step": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"relora_prune_ratio": { |
|
|
"desc": null, |
|
|
"value": 0.9 |
|
|
}, |
|
|
"repetition_penalty": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"torch_compile_mode": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"add_cross_attention": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"cosine_min_lr_ratio": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"eval_sample_packing": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"evaluation_strategy": { |
|
|
"desc": null, |
|
|
"value": "steps" |
|
|
}, |
|
|
"forced_bos_token_id": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"forced_eos_token_id": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"fsdp_min_num_params": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"lr_quadratic_warmup": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"lr_scheduler_kwargs": { |
|
|
"desc": null, |
|
|
"value": {} |
|
|
}, |
|
|
"neftune_noise_alpha": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"num_attention_heads": { |
|
|
"desc": null, |
|
|
"value": 8 |
|
|
}, |
|
|
"num_key_value_heads": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"quantization_config": { |
|
|
"desc": null, |
|
|
"value": { |
|
|
"load_in_4bit": true, |
|
|
"load_in_8bit": false, |
|
|
"quant_method": "QuantizationMethod.BITS_AND_BYTES", |
|
|
"_load_in_4bit": true, |
|
|
"_load_in_8bit": false, |
|
|
"llm_int8_threshold": 6, |
|
|
"bnb_4bit_quant_type": "nf4", |
|
|
"llm_int8_skip_modules": null, |
|
|
"bnb_4bit_compute_dtype": "bfloat16", |
|
|
"bnb_4bit_quant_storage": "uint8", |
|
|
"llm_int8_has_fp16_weight": false, |
|
|
"bnb_4bit_use_double_quant": true, |
|
|
"llm_int8_enable_fp32_cpu_offload": false |
|
|
} |
|
|
}, |
|
|
"relora_anneal_steps": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"relora_warmup_steps": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"skip_memory_metrics": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"tie_encoder_decoder": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"tie_word_embeddings": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"auto_find_batch_size": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"bench_source_max_len": { |
|
|
"desc": null, |
|
|
"value": 2048 |
|
|
}, |
|
|
"dataloader_drop_last": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"no_repeat_ngram_size": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"num_return_sequences": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"optim_target_modules": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"output_hidden_states": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"overwrite_output_dir": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"prediction_loss_only": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"push_to_hub_model_id": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"task_specific_params": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"transformers_version": { |
|
|
"desc": null, |
|
|
"value": "4.39.0.dev0" |
|
|
}, |
|
|
"begin_suppress_tokens": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"dataloader_pin_memory": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"ddp_broadcast_buffers": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"loraplus_lr_embedding": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"metric_for_best_model": { |
|
|
"desc": null, |
|
|
"value": "loss" |
|
|
}, |
|
|
"remove_invalid_values": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"remove_unused_columns": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"torch_compile_backend": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"dataloader_num_workers": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"decoder_start_token_id": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"gradient_checkpointing": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"half_precision_backend": { |
|
|
"desc": null, |
|
|
"value": "auto" |
|
|
}, |
|
|
"label_smoothing_factor": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"load_best_model_at_end": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"logging_nan_inf_filter": { |
|
|
"desc": null, |
|
|
"value": true |
|
|
}, |
|
|
"multipack_real_batches": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"resume_from_checkpoint": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"chunk_size_feed_forward": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"eval_accumulation_steps": { |
|
|
"desc": null, |
|
|
"value": 3 |
|
|
}, |
|
|
"max_position_embeddings": { |
|
|
"desc": null, |
|
|
"value": 8192 |
|
|
}, |
|
|
"per_gpu_eval_batch_size": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"return_dict_in_generate": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"cosine_constant_lr_ratio": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"per_gpu_train_batch_size": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"push_to_hub_organization": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"include_tokens_per_second": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"sample_packing_efficiency": { |
|
|
"desc": null, |
|
|
"value": 1 |
|
|
}, |
|
|
"dataloader_prefetch_factor": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"ddp_find_unused_parameters": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"include_inputs_for_metrics": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"per_device_eval_batch_size": { |
|
|
"desc": null, |
|
|
"value": 2 |
|
|
}, |
|
|
"use_legacy_prediction_loop": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"cross_attention_hidden_size": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"gradient_accumulation_steps": { |
|
|
"desc": null, |
|
|
"value": 3 |
|
|
}, |
|
|
"per_device_train_batch_size": { |
|
|
"desc": null, |
|
|
"value": 2 |
|
|
}, |
|
|
"encoder_no_repeat_ngram_size": { |
|
|
"desc": null, |
|
|
"value": 0 |
|
|
}, |
|
|
"dataloader_persistent_workers": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"gradient_checkpointing_kwargs": { |
|
|
"desc": null, |
|
|
"value": { |
|
|
"use_reentrant": true |
|
|
} |
|
|
}, |
|
|
"include_num_input_tokens_seen": { |
|
|
"desc": null, |
|
|
"value": false |
|
|
}, |
|
|
"exponential_decay_length_penalty": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
}, |
|
|
"sample_packing_seq_len_multiplier": { |
|
|
"desc": null, |
|
|
"value": 2 |
|
|
}, |
|
|
"fsdp_transformer_layer_cls_to_wrap": { |
|
|
"desc": null, |
|
|
"value": null |
|
|
} |
|
|
} |
|
|
### Model Architecture and Objective |
|
|
|
|
|
## Citation [optional] |
|
|
|
|
|
|
|
|
## Glossary [optional] |
|
|
### Framework versions |
|
|
|
|
|
- PEFT 0.9.0 |