cpt core 4
Browse files - scripts/cpt_core_model_4.py +3 -11
scripts/cpt_core_model_4.py
CHANGED
|
@@ -55,7 +55,7 @@ model = FastLanguageModel.get_peft_model(
|
|
| 55 |
|
| 56 |
|
| 57 |
#
|
| 58 |
-
#
|
| 59 |
#
|
| 60 |
from datasets import Dataset
|
| 61 |
from litdata import TokensLoader, StreamingDataset
|
|
@@ -74,11 +74,10 @@ def unlsoth_generator():
|
|
| 74 |
yield {'input_ids': batch}
|
| 75 |
|
| 76 |
|
| 77 |
-
# train_dataset = Dataset.from_generator(unlsoth_generator, streaming=True)
|
| 78 |
train_dataset = Dataset.from_generator(unlsoth_generator)
|
| 79 |
|
| 80 |
#
|
| 81 |
-
#
|
| 82 |
#
|
| 83 |
from trl import SFTTrainer
|
| 84 |
from transformers import TrainingArguments
|
|
@@ -97,18 +96,12 @@ trainer = UnslothTrainer(
|
|
| 97 |
packing=False, # Can make training 5x faster for short sequences.
|
| 98 |
|
| 99 |
args = UnslothTrainingArguments(
|
| 100 |
-
# per_device_train_batch_size=16,
|
| 101 |
-
# gradient_accumulation_steps=64,
|
| 102 |
-
# per_device_train_batch_size=16,
|
| 103 |
-
# gradient_accumulation_steps=16,
|
| 104 |
per_device_train_batch_size=1,
|
| 105 |
# gradient_accumulation_steps=8,
|
| 106 |
|
| 107 |
warmup_ratio=0,
|
| 108 |
num_train_epochs=1,
|
| 109 |
|
| 110 |
-
# learning_rate=5e-5,
|
| 111 |
-
# embedding_learning_rate=5e-6,
|
| 112 |
learning_rate = 5e-5,
|
| 113 |
embedding_learning_rate = 5e-5 / 10.0,
|
| 114 |
|
|
@@ -116,8 +109,7 @@ trainer = UnslothTrainer(
|
|
| 116 |
bf16=is_bfloat16_supported(),
|
| 117 |
logging_steps=1,
|
| 118 |
# optim='adamw_8bit',
|
| 119 |
-
optim='
|
| 120 |
-
# optim='adamw_torch_fused',
|
| 121 |
weight_decay=0.01,
|
| 122 |
lr_scheduler_type='cosine',
|
| 123 |
seed=23,
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
#
|
| 58 |
+
# dataset
|
| 59 |
#
|
| 60 |
from datasets import Dataset
|
| 61 |
from litdata import TokensLoader, StreamingDataset
|
|
|
|
| 74 |
yield {'input_ids': batch}
|
| 75 |
|
| 76 |
|
|
|
|
| 77 |
train_dataset = Dataset.from_generator(unlsoth_generator)
|
| 78 |
|
| 79 |
#
|
| 80 |
+
# trainer
|
| 81 |
#
|
| 82 |
from trl import SFTTrainer
|
| 83 |
from transformers import TrainingArguments
|
|
|
|
| 96 |
packing=False, # Can make training 5x faster for short sequences.
|
| 97 |
|
| 98 |
args = UnslothTrainingArguments(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
per_device_train_batch_size=1,
|
| 100 |
# gradient_accumulation_steps=8,
|
| 101 |
|
| 102 |
warmup_ratio=0,
|
| 103 |
num_train_epochs=1,
|
| 104 |
|
|
|
|
|
|
|
| 105 |
learning_rate = 5e-5,
|
| 106 |
embedding_learning_rate = 5e-5 / 10.0,
|
| 107 |
|
|
|
|
| 109 |
bf16=is_bfloat16_supported(),
|
| 110 |
logging_steps=1,
|
| 111 |
# optim='adamw_8bit',
|
| 112 |
+
optim='adamw_torch_fused',
|
|
|
|
| 113 |
weight_decay=0.01,
|
| 114 |
lr_scheduler_type='cosine',
|
| 115 |
seed=23,
|