Mike369williams committed on
Commit 2848bde · verified · 1 Parent(s): d9cc9ba

Create training/train.py

Files changed (1)
  1. training/train.py +137 -0
training/train.py ADDED
@@ -0,0 +1,137 @@
+ # training/train.py
+ # Minimal training skeleton using the Hugging Face transformers Trainer.
+ # Designed to train Sanchari-S (200-350M) from scratch or fine-tune.
+ # Run: python training/train.py --config training/config_s.json --tokenizer_dir ../tokenizer
+
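+ # The config JSON is expected to look roughly like this (an assumed shape,
+ # inferred from the cfg[...] lookups below; the model sizes are illustrative):
+ #
+ # {
+ #   "block_size": 1024,
+ #   "model": {"n_embd": 960, "n_layer": 20, "n_head": 16},
+ #   "training": {"per_device_train_batch_size": 2,
+ #                "gradient_accumulation_steps": 8,
+ #                "num_train_epochs": 1,
+ #                "learning_rate": 2e-4,
+ #                "fp16": true}
+ # }
+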
+ import json
+ import argparse
+ import os
+ from datasets import load_dataset
+ from transformers import (
+     AutoTokenizer,
+     GPT2Config,
+     AutoModelForCausalLM,
+     DataCollatorForLanguageModeling,
+     TrainingArguments,
+     Trainer,
+ )
+
+ def load_config(path):
+     with open(path, "r") as f:
+         return json.load(f)
+
+ def group_texts(examples, block_size):
+     # Concatenate all sequences in the batch, then split the result into
+     # fixed-size chunks, dropping the ragged remainder.
+     concatenated = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated["input_ids"])
+     total_length = (total_length // block_size) * block_size
+     result = {
+         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated.items()
+     }
+     return result
+
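+ # For example, with block_size=4 the batch {"input_ids": [[1, 2], [3, 4, 5, 6, 7]]}
+ # becomes {"input_ids": [[1, 2, 3, 4]]}; the partial tail [5, 6, 7] is dropped.
+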
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--config", required=True, help="Path to config JSON")
+     parser.add_argument("--tokenizer_dir", required=True, help="Path to tokenizer folder (containing .model/.vocab)")
+     parser.add_argument("--data_file", default="../data/all_texts.txt", help="Newline-separated text file (one document per line)")
+     parser.add_argument("--output_dir", default="./outputs/sanchari-s", help="Output directory")
+     args = parser.parse_args()
+
+     cfg = load_config(args.config)
+
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir, use_fast=False)
+     # Make sure the tokenizer has a pad token (the data collator needs one)
+     if tokenizer.pad_token is None:
+         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+     block_size = cfg.get("block_size", 1024)
+
+     # Load the raw text dataset
+     if not os.path.exists(args.data_file):
+         raise FileNotFoundError(f"Data file not found: {args.data_file}")
+     raw_dsets = load_dataset("text", data_files={"train": args.data_file})
+
+     # Tokenize
+     def tokenize_fn(examples):
+         return tokenizer(examples["text"], return_special_tokens_mask=False)
+
+     tokenized = raw_dsets.map(
+         tokenize_fn,
+         batched=True,
+         remove_columns=["text"],
+         num_proc=1,
+     )
+
+     # Concatenate the tokenized sequences and split them into blocks of
+     # block_size, using the group_texts helper defined above (every resulting
+     # example is exactly block_size tokens; the partial tail is dropped).
+     dataset = tokenized["train"].map(
+         lambda examples: group_texts(examples, block_size),
+         batched=True,
+     )
+
+     # Build the model config and model
+     model_cfg = GPT2Config(
+         vocab_size=len(tokenizer),
+         n_positions=block_size,
+         n_embd=cfg["model"]["n_embd"],
+         n_layer=cfg["model"]["n_layer"],
+         n_head=cfg["model"]["n_head"],
+         # Fall back to the CLS token (if any), then to an arbitrary low id,
+         # when the tokenizer does not define BOS/EOS.
+         bos_token_id=(
+             tokenizer.bos_token_id
+             if tokenizer.bos_token_id is not None
+             else tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
+             if tokenizer.cls_token
+             else 1
+         ),
+         eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2,
+     )
+     model = AutoModelForCausalLM.from_config(model_cfg)
+     # Resize token embeddings in case the tokenizer added tokens (e.g. [PAD])
+     model.resize_token_embeddings(len(tokenizer))
+
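+ # Rough size check (illustrative numbers, not from the config): a GPT-2 style
+ # stack has about 12 * n_layer * n_embd**2 transformer parameters, plus
+ # vocab_size * n_embd for the tied embeddings. With n_embd=960, n_layer=20 and
+ # a 50k vocab that is ~221M + ~48M, i.e. ~270M, inside the 200-350M target.
+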
+     # Data collator: mlm=False gives causal-LM batches, with labels copied
+     # from input_ids (the model shifts them internally)
+     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+     # Training arguments from config
+     train_args = cfg["training"]
+     training_args = TrainingArguments(
+         output_dir=args.output_dir,
+         per_device_train_batch_size=train_args.get("per_device_train_batch_size", 2),
+         gradient_accumulation_steps=train_args.get("gradient_accumulation_steps", 8),
+         num_train_epochs=train_args.get("num_train_epochs", 1),
+         learning_rate=train_args.get("learning_rate", 2e-4),
+         weight_decay=train_args.get("weight_decay", 0.01),
+         fp16=train_args.get("fp16", True),
+         logging_steps=train_args.get("logging_steps", 100),
+         save_steps=train_args.get("save_steps", 1000),
+         evaluation_strategy="no",  # renamed to eval_strategy in newer transformers releases
+         save_total_limit=3,
+         remove_unused_columns=False,
+         report_to="none",  # disable wandb by default
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=dataset,
+         data_collator=data_collator,
+     )
+
+     trainer.train()
+     trainer.save_model(args.output_dir)
+     tokenizer.save_pretrained(args.output_dir)
+     print("Training complete. Model & tokenizer saved to", args.output_dir)
+
+ if __name__ == "__main__":
+     main()
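+
+ # Quick smoke test after training (a sketch; the path matches the --output_dir
+ # default above):
+ #   from transformers import AutoModelForCausalLM, AutoTokenizer
+ #   tok = AutoTokenizer.from_pretrained("./outputs/sanchari-s", use_fast=False)
+ #   model = AutoModelForCausalLM.from_pretrained("./outputs/sanchari-s")
+ #   ids = tok("Hello", return_tensors="pt").input_ids
+ #   print(tok.decode(model.generate(ids, max_new_tokens=20)[0], skip_special_tokens=True))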