# myanmar-llm-train / train.py
# amkyawdev's picture
# Upload train.py with huggingface_hub
# a0d6b29 verified
"""
Myanmar LLM Training Script
Fine-tune Qwen2.5-0.5B-Instruct with Myanmar dataset (No license required!)
"""
import json
import os
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
)
import torch
# Config - Fully open model, no license needed!
# Hub id of the base checkpoint that gets fine-tuned
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# Directory for Trainer output and the final saved model/tokenizer
OUTPUT_DIR = "./myanmar-qwen-output"
# Dataset repository id passed to load_dataset
DATASET_PATH = "amkyawdev/AmkyawDev-Dataset"
def format_conversation(example):
    """Render one chat example as ChatML-style text for Qwen.

    Args:
        example: Mapping with a ``"messages"`` list. Each message is a mapping
            with ``"role"`` (``"system"``, ``"user"`` or ``"assistant"``) and
            ``"content"``. Messages with any other role are skipped.

    Returns:
        ``{"text": rendered}`` where each message is wrapped as
        ``<|im_start|>{role}\\n{content}<|im_end|>\\n`` and the string ends
        with an open ``<|im_start|>assistant\\n`` header.
    """
    known_roles = ("system", "user", "assistant")

    # Every turn opens and closes its own block. A system block is emitted
    # only when a system message exists, so a conversation without one no
    # longer starts with a dangling, never-closed "<|im_start|>system" header.
    turns = [
        f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
        for msg in example["messages"]
        if msg["role"] in known_roles
    ]

    # Add prompt for assistant to generate
    # NOTE(review): for training text that already ends with an assistant turn
    # this trailing header may be unwanted - confirm it is intentional.
    turns.append("<|im_start|>assistant\n")
    return {"text": "".join(turns)}
def _mask_padding_labels(token_ids, attention_mask):
    """Copy ``token_ids``, replacing positions whose mask is 0 with -100."""
    return [
        token if keep else -100
        for token, keep in zip(token_ids, attention_mask)
    ]


def preprocess_function(examples, tokenizer, max_length=2048):
    """Tokenize ``examples["text"]`` and attach causal-LM labels.

    Args:
        examples: Mapping with a ``"text"`` entry (a string or, when used with
            ``map(..., batched=True)``, a list of strings).
        tokenizer: Callable tokenizer; it is invoked with truncation and
            ``padding="max_length"``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` except at padding positions (attention mask 0), which
        are set to -100 so the loss ignores them.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )
    input_ids = encoded["input_ids"]
    attention_mask = (
        encoded["attention_mask"] if "attention_mask" in encoded else None
    )

    # A single string yields one flat id list; a batch yields one list per text.
    is_batch = bool(input_ids) and isinstance(input_ids[0], (list, tuple))

    if attention_mask is None:
        # No mask available: fall back to plain copies, made per sequence so
        # labels never alias the input_ids lists.
        labels = [list(ids) for ids in input_ids] if is_batch else list(input_ids)
    elif is_batch:
        labels = [
            _mask_padding_labels(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        labels = _mask_padding_labels(input_ids, attention_mask)

    # Labels same as input_ids (causal LM), minus the padded positions
    encoded["labels"] = labels
    return encoded
def compute_metrics(eval_pred):
    """Compute next-token perplexity from evaluation logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as
            (batch, sequence, vocab) and ``labels`` as (batch, sequence);
            label value -100 marks positions to ignore. If ``logits`` is a
            tuple, its first element is used.

    Returns:
        ``{"perplexity": float}`` - the exponential of the mean cross-entropy
        over all non-ignored positions.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # float32 keeps the softmax stable even if the logits arrive as float16;
    # cross_entropy requires integer (long) class targets.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # Shift along the SEQUENCE axis: the logits at position t score token t+1.
    shifted_logits = logits[:, :-1, :]
    shifted_labels = labels[:, 1:]

    # cross_entropy expects (N, C) scores with (N,) targets, so flatten the
    # batch and sequence axes together.
    loss = torch.nn.functional.cross_entropy(
        shifted_logits.reshape(-1, shifted_logits.size(-1)),
        shifted_labels.reshape(-1),
        ignore_index=-100,
    )
    return {"perplexity": torch.exp(loss).item()}
def load_data():
    """Fetch the train/validation/test splits of the Myanmar dataset.

    Returns:
        The object returned by ``load_dataset`` for ``DATASET_PATH``, with the
        splits ``"train"``, ``"validation"`` and ``"test"``.
    """
    print("๐Ÿ“‚ Loading dataset...")

    # One JSONL file per split (train.jsonl, validation.jsonl, test.jsonl)
    split_files = {
        name: f"{name}.jsonl" for name in ("train", "validation", "test")
    }
    splits = load_dataset(DATASET_PATH, data_files=split_files)

    # Report the size of each split
    for label, key in (
        ("Train", "train"),
        ("Validation", "validation"),
        ("Test", "test"),
    ):
        print(f" {label}: {len(splits[key])} samples")
    return splits
def main():
    """Fine-tune MODEL_NAME on the Myanmar chat dataset and save the result.

    Steps: report the available device, load the tokenizer and model, format
    and tokenize every dataset split, train with the Hugging Face ``Trainer``,
    evaluate on the test split, then write model and tokenizer to OUTPUT_DIR.
    """
    print("=" * 60)
    print("๐Ÿง  Myanmar LLM Training - Qwen2.5 0.5B (No License!)")
    print("=" * 60)

    # Check GPU
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        gpu_name = torch.cuda.get_device_name(0)
        vram = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"โœ… GPU: {gpu_name}")
        print(f" VRAM: {vram:.2f} GB")
    else:
        print("โš ๏ธ No GPU - will use CPU (very slow)")

    # Load tokenizer
    print(f"\n๐Ÿ“ฅ Loading model: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        padding_side="right",
    )
    # Only fall back to EOS when the tokenizer ships without a pad token.
    # The LM collator sets every label equal to pad_token_id to -100, so
    # overwriting an existing pad token with EOS would mask all genuine
    # end-of-turn tokens and the model would never learn to stop.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model in float32. Mixed precision is handled by the Trainer's fp16
    # AMP below, which needs float32 master weights; loading the weights in
    # float16 makes the gradient scaler fail with
    # "Attempting to unscale FP16 gradients".
    print("๐Ÿ”„ Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device_map="auto",
    )

    # Enable gradient checkpointing; the generation KV cache is incompatible
    # with it and is not needed while training.
    model.gradient_checkpointing_enable()
    model.config.use_cache = False

    # Load dataset
    dataset = load_data()

    # Format and tokenize
    print("โœ๏ธ Formatting data...")
    for split in dataset:
        dataset[split] = dataset[split].map(format_conversation)

    print("๐Ÿ”ง Tokenizing...")
    for split in dataset:
        dataset[split] = dataset[split].map(
            lambda x: preprocess_function(x, tokenizer),
            batched=True,
            # Drop the raw columns so only tokenizer outputs remain
            remove_columns=dataset[split].column_names,
        )

    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]
    test_dataset = dataset["test"]

    print(f"\n๐Ÿ“Š Dataset:")
    print(f" Train: {len(train_dataset)} samples")
    print(f" Validation: {len(eval_dataset)} samples")
    print(f" Test: {len(test_dataset)} samples")

    # Training args
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,  # effective batch of 16 per device
        learning_rate=2e-5,
        warmup_ratio=0.1,
        logging_steps=10,
        save_steps=100,
        eval_steps=100,
        save_total_limit=2,
        # fp16 AMP is CUDA-only; keep the advertised CPU fallback working
        fp16=has_cuda,
        remove_unused_columns=False,
        optim="adamw_torch",
        report_to="none",
        load_best_model_at_end=True,
        eval_strategy="steps",
        save_strategy="steps",
    )

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8,
    )

    # Trainer. No compute_metrics callback: with one, the Trainer accumulates
    # the full-vocabulary logits of the entire eval set in memory. Perplexity
    # is derived from the reported eval loss instead.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    # Train
    print("\n๐Ÿš€ Starting training...")
    trainer.train()

    # Evaluate on test set
    print("\n๐Ÿ“ Evaluating on test set...")
    test_results = trainer.evaluate(test_dataset)
    if "eval_loss" in test_results:
        # Perplexity is exp(mean cross-entropy)
        test_results["eval_perplexity"] = torch.exp(
            torch.tensor(test_results["eval_loss"])
        ).item()
    print(f"Test Results: {test_results}")

    # Save model
    print("\n๐Ÿ’พ Saving model...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print(f"\nโœ… Training complete!")
    print(f" Model: {OUTPUT_DIR}")
    print(f"\n๐Ÿ“ค Upload to HuggingFace:")
    print(f" cd {OUTPUT_DIR}")
    print(f" hf upload amkyawdev/my-myanmar-qwen . --repo-type model")
# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()