import sys
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_qwen import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
from peft import get_peft_model, LoraConfig, TaskType

# Experiment configuration
perturbation = "shuffle_deterministic21"
train_set = "10M"
seed = 0
ckpt_path = "./checkpoints"
effective_bsz = 512

# Per-run directories for cached model files and training outputs
run_id = f"babylm_{perturbation}_{train_set}_seed{seed}"
cache_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "artifacts")
run_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "runs")
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(run_dir, exist_ok=True)

# Load the perturbed BabyLM training split via the local dataset script
dataset_name = f"babylm_{perturbation}_{train_set}_seed{seed}"
dataset = load_dataset('babylm_dataset.py', name=dataset_name, trust_remote_code=True)
train_dataset = dataset['train']

# Base model plus the perturbation-specific tokenizer defined in utils_qwen
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = PERTURBATIONS[perturbation]['qwen_tokenizer']
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
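
# Assumption: the perturbation tokenizer may define marker tokens beyond the base
# Qwen vocabulary (MARKER_TOKEN_IDS are imported above). If so, the embedding
# table has to grow before LoRA wrapping; the check makes this a no-op otherwise.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))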

# Attach LoRA adapters; only the low-rank adapter weights are trained
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)
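
# Optional sanity check: PEFT's print_trainable_parameters() reports how many
# parameters the LoRA wrapping leaves trainable versus frozen.
model.print_trainable_parameters()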

# Tokenize to fixed-length sequences for causal LM training
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)


tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Causal LM collator: mlm=False copies input_ids into labels (pad positions become -100)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
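
# A quick spot check of the collator output, assuming the tokenized rows carry
# input_ids and attention_mask (any other columns are filtered out here):
example_batch = data_collator(
    [{k: tokenized_train[i][k] for k in ("input_ids", "attention_mask")} for i in range(2)]
)
print(example_batch["input_ids"].shape, example_batch["labels"].shape)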

# Training configuration
training_args = TrainingArguments(
    output_dir=run_dir,
    evaluation_strategy="no",
    per_device_train_batch_size=1,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    learning_rate=5e-4,
    num_train_epochs=10,
    seed=seed,
    # Accumulate gradients so the effective batch size matches effective_bsz
    # (assumes a single device with per_device_train_batch_size=1)
    gradient_accumulation_steps=effective_bsz,
    fp16=True,
    warmup_ratio=0.1,
)

# Hugging Face Trainer handles the training loop, logging, and checkpointing
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
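
# A minimal sketch of persisting the result (the "final_adapter" subdirectory is
# an assumed layout, not part of the original script): save_pretrained on a PEFT
# model writes only the LoRA adapter weights, and the tokenizer is stored with it.
adapter_dir = os.path.join(run_dir, "final_adapter")
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)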