cpt core 4

scripts/cpt_core_model_4.py  CHANGED  (+17 -17)
@@ -32,28 +32,25 @@ model, tokenizer = FastLanguageModel.from_pretrained(
 
 model = FastLanguageModel.get_peft_model(
     model,
-    r
-
-    target_modules = [
+    r=256, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    target_modules=[
         "q_proj", "k_proj", "v_proj", "o_proj",
         "gate_proj",
         "up_proj", "down_proj",
         "embed_tokens", "lm_head",
     ],
-    lora_alpha
-    #
-
-    bias = "none", # Supports any, but = "none" is optimized
+    lora_alpha=32,
+    lora_dropout=0, # Supports any, but = 0 is optimized
+    bias="none", # Supports any, but = "none" is optimized
     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
-    # use_gradient_checkpointing
-    use_gradient_checkpointing
-    random_state
-    use_rslora
-    loftq_config
+    # use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
+    use_gradient_checkpointing=False,
+    random_state=23,
+    use_rslora=True, # We support rank stabilized LoRA
+    loftq_config=None, # And LoftQ
 )
 # print(f'{model=}')
 
-
 #
 # dataset
 #

@@ -89,21 +86,20 @@ trainer = UnslothTrainer(
     model=model,
     tokenizer=tokenizer,
     train_dataset=train_dataset,
-    # dataset_text_field='text',
     max_seq_length=max_seq_length,
     dataset_num_proc=32,
     max_steps=len(litgpt_streaming_dataset),
     packing=False, # Can make training 5x faster for short sequences.
 
-    args
+    args=UnslothTrainingArguments(
         per_device_train_batch_size=1,
         # gradient_accumulation_steps=8,
 
         warmup_ratio=0,
         num_train_epochs=1,
 
-        learning_rate
-        embedding_learning_rate
+        learning_rate=5e-5,
+        embedding_learning_rate=5e-5 / 10.0,
 
         fp16=not is_bfloat16_supported(),
         bf16=is_bfloat16_supported(),

@@ -115,6 +111,10 @@ trainer = UnslothTrainer(
         seed=23,
         output_dir=output_dir,
         report_to='wandb',
+
+        do_eval=True,
+        save_steps=100,
+        eval_steps=100,
     ),
 )
 
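Note on the adapter config: `use_rslora=True` switches the adapter scaling from the standard `lora_alpha / r` to `lora_alpha / sqrt(r)` (rank-stabilized LoRA), which keeps a rank as large as the new `r=256` from shrinking the update toward zero. A minimal sketch of the difference, using the values from this commit (illustration only, not part of the script):

# Illustration only: how rsLoRA changes the adapter scaling at r=256.
import math

r, lora_alpha = 256, 32                      # values set in this commit

standard_scaling = lora_alpha / r            # plain LoRA:  0.125
rslora_scaling = lora_alpha / math.sqrt(r)   # rsLoRA:      2.0

print(f"{standard_scaling=} {rslora_scaling=}")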
|
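On the training arguments: the commit sets `learning_rate=5e-5` and `embedding_learning_rate` 10x lower. Unsloth's continued-pretraining recipe recommends training `embed_tokens` and `lm_head` (which are in `target_modules` above) at a smaller learning rate than the LoRA projections. Roughly, a separate embedding learning rate amounts to two optimizer parameter groups; the sketch below is an assumption about the mechanics for illustration, not Unsloth's actual implementation:

# Illustration only: what a separate embedding_learning_rate amounts to --
# two optimizer parameter groups, with embed_tokens / lm_head stepped at a
# smaller rate than the rest of the trainable weights.
import torch
from torch import nn

learning_rate = 5e-5
embedding_learning_rate = learning_rate / 10.0   # 10x smaller, as in the commit

def build_param_groups(model):
    # Split trainable parameters: embed_tokens / lm_head get the smaller LR.
    embed, rest = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        (embed if ("embed_tokens" in name or "lm_head" in name) else rest).append(param)
    return [
        {"params": rest, "lr": learning_rate},
        {"params": embed, "lr": embedding_learning_rate},
    ]

# Tiny stand-in module just to demonstrate the grouping; in the script this
# would be the model returned by FastLanguageModel.get_peft_model above.
toy = nn.ModuleDict({
    "embed_tokens": nn.Embedding(10, 8),
    "q_proj": nn.Linear(8, 8),
    "lm_head": nn.Linear(8, 10),
})
optimizer = torch.optim.AdamW(build_param_groups(toy))
print([g["lr"] for g in optimizer.param_groups])   # [5e-05, 5e-06]

The usual rationale is that a gentler step size on the vocabulary matrices avoids destabilizing token representations while the model adapts to new domain text.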
|