S-Dreamer committed
Commit 00ae6eb · verified · 1 parent: 33a206b

Rename trainer.py to src/train.py

Files changed (2)
  1. src/train.py +88 -0
  2. trainer.py +0 -58
src/train.py ADDED
@@ -0,0 +1,88 @@
+import os
+from typing import Optional
+
+import torch
+from datasets import load_dataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    Trainer,
+    TrainingArguments,
+)
+
+from peft import LoraConfig, TaskType, get_peft_model
+
+
+def finetune_lora(
+    base_model: str,
+    dataset_id: str,
+    text_column: str,
+    output_dir: str,
+    max_train_samples: int = 2000,
+    max_steps: int = 100,
+    learning_rate: float = 2e-4,
+    batch_size: int = 2,
+    lora_r: int = 8,
+    lora_alpha: int = 16,
+    lora_dropout: float = 0.05,
+) -> str:
+    ds = load_dataset(dataset_id, split="train")
+    if text_column not in ds.column_names:
+        return f"ERROR: column '{text_column}' not found. Available: {ds.column_names}"
+
+    if max_train_samples and max_train_samples > 0:
+        ds = ds.select(range(min(len(ds), int(max_train_samples))))
+
+    tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    def tok(batch):
+        return tokenizer(batch[text_column], truncation=True, max_length=256)
+
+    tokenized = ds.map(tok, batched=True, remove_columns=ds.column_names)
+
+    model = AutoModelForCausalLM.from_pretrained(base_model)
+    model.config.pad_token_id = tokenizer.pad_token_id
+
+    # LoRA target modules here are GPT-2-ish defaults.
+    # If you swap to a non-GPT2 architecture, you may need to change target_modules.
+    lora_cfg = LoraConfig(
+        task_type=TaskType.CAUSAL_LM,
+        r=int(lora_r),
+        lora_alpha=int(lora_alpha),
+        lora_dropout=float(lora_dropout),
+        bias="none",
+        target_modules=["c_attn", "c_proj"],
+    )
+    model = get_peft_model(model, lora_cfg)
+
+    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+    fp16 = torch.cuda.is_available()
+    args = TrainingArguments(
+        output_dir=output_dir,
+        per_device_train_batch_size=int(batch_size),
+        learning_rate=float(learning_rate),
+        max_steps=int(max_steps),
+        logging_steps=10,
+        save_steps=0,
+        report_to=[],
+        fp16=fp16,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=args,
+        train_dataset=tokenized,
+        data_collator=collator,
+    )
+
+    trainer.train()
+
+    adapter_dir = os.path.join(output_dir, "adapter")
+    model.save_pretrained(adapter_dir)
+    tokenizer.save_pretrained(adapter_dir)
+
+    return f"Saved LoRA adapter + tokenizer to {adapter_dir}"
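For reference, a minimal usage sketch of the new entry point, assuming a GPT-2-style checkpoint and a dataset that exposes a plain text column; the model name, dataset id, and output directory below are illustrative placeholders, not values from this commit:

# Hypothetical call into src/train.py's finetune_lora.
# "gpt2" matches the GPT-2-ish target_modules defaults above; other
# architectures would likely need different target_modules.
from src.train import finetune_lora

result = finetune_lora(
    base_model="gpt2",                  # assumed base checkpoint
    dataset_id="stas/openwebtext-10k",  # assumed: any dataset with a "text" column works
    text_column="text",
    output_dir="outputs/gpt2-lora",
    max_steps=50,
)
print(result)  # e.g. "Saved LoRA adapter + tokenizer to outputs/gpt2-lora/adapter"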
trainer.py DELETED
@@ -1,58 +0,0 @@
-from datasets import load_dataset
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-    TrainingArguments,
-    Trainer
-)
-from peft import LoraConfig, get_peft_model
-
-
-def run_finetune(base_model, dataset_path, epochs=3):
-
-    dataset = load_dataset("json", data_files=dataset_path)
-
-    tokenizer = AutoTokenizer.from_pretrained(base_model)
-
-    def tokenize(example):
-        return tokenizer(
-            example["text"],
-            truncation=True,
-            padding="max_length"
-        )
-
-    tokenized = dataset.map(tokenize)
-
-    model = AutoModelForSequenceClassification.from_pretrained(
-        base_model,
-        num_labels=2
-    )
-
-    lora_config = LoraConfig(
-        r=8,
-        lora_alpha=32,
-        target_modules=["query", "value"],
-        lora_dropout=0.05
-    )
-
-    model = get_peft_model(model, lora_config)
-
-    args = TrainingArguments(
-        output_dir="./results",
-        num_train_epochs=epochs,
-        per_device_train_batch_size=4,
-        save_steps=50,
-        logging_steps=10
-    )
-
-    trainer = Trainer(
-        model=model,
-        args=args,
-        train_dataset=tokenized["train"]
-    )
-
-    trainer.train()
-
-    model.save_pretrained("./finetuned")
-
-    return "Training complete. Model saved."