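"""Train a GPT-2-style causal language model from scratch with Hugging Face Transformers.

Reads hyperparameters from a JSON config, tokenizes a newline-separated text
corpus with a tokenizer loaded from a local folder, packs the token stream into
fixed-size blocks, and trains with the HF Trainer.

Example invocation (the script filename here is a placeholder; adjust paths to
your layout):

    python train.py --config config.json --tokenizer_dir ./tokenizer \
        --data_file ../data/all_texts.txt --output_dir ./outputs/sanchari-s
"""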
import json
import argparse
import os
from pathlib import Path

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    GPT2Config,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)


def load_config(path):
    """Read the JSON training/model configuration."""
    with open(path, "r") as f:
        return json.load(f)


def group_texts(examples, block_size):
    """Concatenate tokenized examples and split them into fixed-length blocks.

    The trailing remainder shorter than block_size is dropped.
    """
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    return result
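
# For reference, a config this script can consume might look like the sketch
# below. The values are only illustrative: the "training" and "block_size"
# entries mirror the fallback defaults used in main(), while "model" has no
# fallbacks, so its three keys are required and must be chosen for your setup.
#
# {
#   "block_size": 1024,
#   "model": {"n_embd": 768, "n_layer": 12, "n_head": 12},
#   "training": {
#     "per_device_train_batch_size": 2,
#     "gradient_accumulation_steps": 8,
#     "num_train_epochs": 1,
#     "learning_rate": 2e-4,
#     "weight_decay": 0.01,
#     "fp16": true,
#     "logging_steps": 100,
#     "save_steps": 1000
#   }
# }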


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True, help="Path to config json")
    parser.add_argument("--tokenizer_dir", required=True, help="Path to tokenizer folder (containing .model/.vocab)")
    parser.add_argument("--data_file", default="../data/all_texts.txt", help="Plain-text training file; each line is treated as one example.")
    parser.add_argument("--output_dir", default="./outputs/sanchari-s", help="Output directory")
    args = parser.parse_args()

    cfg = load_config(args.config)

    # Load the tokenizer and make sure it defines a padding token, which
    # DataCollatorForLanguageModeling needs when batching.
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    block_size = cfg.get("block_size", 1024)

    if not os.path.exists(args.data_file):
        raise FileNotFoundError(f"Data file not found: {args.data_file}")
    raw_dsets = load_dataset("text", data_files={"train": args.data_file})

    def tokenize_fn(examples):
        return tokenizer(examples["text"], return_special_tokens_mask=False)

    tokenized = raw_dsets.map(
        tokenize_fn,
        batched=True,
        remove_columns=["text"],
        num_proc=1,
    )

    # Concatenate all tokenized lines and cut the stream into block_size-sized
    # training examples; group_texts drops the trailing partial block and keeps
    # the attention mask aligned with the input ids.
    dataset = tokenized["train"].map(
        lambda examples: group_texts(examples, block_size),
        batched=True,
    )

    # Fall back to reasonable ids when the tokenizer does not define BOS/EOS.
    bos_id = tokenizer.bos_token_id
    if bos_id is None:
        bos_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token) if tokenizer.cls_token else 1
    eos_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2

    model_cfg = GPT2Config(
        vocab_size=len(tokenizer),
        n_positions=block_size,
        n_ctx=block_size,
        n_embd=cfg["model"]["n_embd"],
        n_layer=cfg["model"]["n_layer"],
        n_head=cfg["model"]["n_head"],
        bos_token_id=bos_id,
        eos_token_id=eos_id,
    )
    model = AutoModelForCausalLM.from_config(model_cfg)

    # Keep the embedding matrix in sync with the tokenizer (no-op if it already
    # matches, e.g. when only [PAD] was added before vocab_size was read).
    model.resize_token_embeddings(len(tokenizer))

    # mlm=False gives causal-LM batches: the collator pads and copies input_ids
    # into labels, masking padding positions out of the loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    train_args = cfg["training"]
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=train_args.get("per_device_train_batch_size", 2),
        gradient_accumulation_steps=train_args.get("gradient_accumulation_steps", 8),
        num_train_epochs=train_args.get("num_train_epochs", 1),
        learning_rate=train_args.get("learning_rate", 2e-4),
        weight_decay=train_args.get("weight_decay", 0.01),
        fp16=train_args.get("fp16", True),
        logging_steps=train_args.get("logging_steps", 100),
        save_steps=train_args.get("save_steps", 1000),
        evaluation_strategy="no",
        save_total_limit=3,
        remove_unused_columns=False,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    print("Training complete. Model & tokenizer saved to", args.output_dir)


if __name__ == "__main__":
    main()