| """ |
| Myanmar LLM Training Script |
| Fine-tune Qwen2.5-0.5B-Instruct with Myanmar dataset (No license required!) |
| """ |
|
|
| import json |
| import os |
| from datasets import load_dataset |
| from transformers import ( |
| AutoModelForCausalLM, |
| AutoTokenizer, |
| TrainingArguments, |
| Trainer, |
| DataCollatorForLanguageModeling, |
| ) |
| import torch |
|
|
| |
# Model identifier passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# Directory used as the Trainer output_dir and as the final save location
# for the model and tokenizer.
OUTPUT_DIR = "./myanmar-qwen-output"
# Dataset identifier passed to datasets.load_dataset.
DATASET_PATH = "amkyawdev/AmkyawDev-Dataset"
|
|
def format_conversation(example):
    """Render one chat example as ChatML text for Qwen.

    Args:
        example: Mapping with a ``"messages"`` list. Each message is a
            mapping with ``"role"`` and ``"content"`` keys.

    Returns:
        ``{"text": rendered}``. Every system, user and assistant message is
        wrapped as ``<|im_start|>{role}\\n{content}<|im_end|>\\n``; messages
        with any other role are skipped. An open assistant header is appended
        only when the conversation does not already end with an assistant
        turn.
    """
    rendered_roles = ("system", "user", "assistant")
    turns = [msg for msg in example["messages"] if msg["role"] in rendered_roles]

    # Every turn carries its own header, so a conversation without a system
    # message never produces an unterminated system block, and a system
    # message that is not first is still well-formed.
    text = "".join(
        f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
        for msg in turns
    )

    # A completed conversation (ending with the assistant's answer) must not
    # be followed by a dangling, empty assistant turn in the training text.
    if not turns or turns[-1]["role"] != "assistant":
        text += "<|im_start|>assistant\n"

    return {"text": text}
|
|
def preprocess_function(examples, tokenizer, max_length=2048):
    """Tokenize formatted texts and attach causal-LM labels.

    Args:
        examples: Mapping with a ``"text"`` entry (a list of strings when
            used with ``Dataset.map(..., batched=True)``).
        tokenizer: Callable tokenizer. It is invoked with truncation and
            fixed-length padding to ``max_length``.
        max_length: Sequence length every example is truncated/padded to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which padded positions (``attention_mask == 0``)
        are replaced by ``-100`` so they are ignored by the loss.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )

    def _mask_padding(ids, mask):
        # -100 is the ignore index used by the cross-entropy loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokenized["input_ids"]
    attention_mask = (
        tokenized["attention_mask"] if "attention_mask" in tokenized else None
    )

    if attention_mask is None:
        # No mask available: fall back to a plain copy of the ids.
        labels = input_ids.copy()
    elif input_ids and isinstance(input_ids[0], int):
        # Single (non-batched) example: ids are a flat list.
        labels = _mask_padding(input_ids, attention_mask)
    else:
        labels = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]

    tokenized["labels"] = labels
    return tokenized
|
|
def compute_metrics(eval_pred):
    """Compute next-token perplexity from evaluation logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is array-like with
            shape ``(batch, seq_len, vocab)`` (or a tuple whose first item
            has that shape); ``labels`` is array-like with shape
            ``(batch, seq_len)`` where ``-100`` marks ignored positions.

    Returns:
        ``{"perplexity": float}``: the exponential of the mean cross-entropy
        over all non-ignored target tokens.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]

    # Cast to float32: half-precision logits are not reliable for the loss.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # The logits at position t predict the token at position t + 1, so the
    # shift must happen along the sequence axis, not the batch axis.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    # cross_entropy expects (N, C) scores and (N,) class indices.
    loss = torch.nn.functional.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
        ignore_index=-100,
    )
    return {"perplexity": torch.exp(loss).item()}
|
|
def load_data(dataset_path=None):
    """Load the train/validation/test splits of the Myanmar dataset.

    Args:
        dataset_path: Dataset identifier passed to ``load_dataset``.
            Defaults to the module-level ``DATASET_PATH``.

    Returns:
        The object returned by ``load_dataset`` with the splits ``"train"``,
        ``"validation"`` and ``"test"``, read from ``train.jsonl``,
        ``validation.jsonl`` and ``test.jsonl`` respectively.
    """
    if dataset_path is None:
        dataset_path = DATASET_PATH

    print("Loading dataset...")

    dataset = load_dataset(
        dataset_path,
        data_files={
            "train": "train.jsonl",
            "validation": "validation.jsonl",
            "test": "test.jsonl",
        },
    )

    for split, label in (
        ("train", "Train"),
        ("validation", "Validation"),
        ("test", "Test"),
    ):
        print(f" {label}: {len(dataset[split])} samples")

    return dataset
|
|
def main():
    """Fine-tune MODEL_NAME on the Myanmar dataset and save the result.

    Steps: report the available device, load the tokenizer and model, format
    and tokenize every dataset split, train with periodic evaluation on the
    validation split, evaluate on the test split, and save the model and
    tokenizer to OUTPUT_DIR.
    """
    print("=" * 60)
    print("Myanmar LLM Training - Qwen2.5 0.5B (No License!)")
    print("=" * 60)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        gpu_name = torch.cuda.get_device_name(0)
        vram = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU: {gpu_name}")
        print(f" VRAM: {vram:.2f} GB")
    else:
        print("WARNING: No GPU - will use CPU (very slow)")

    # Mixed precision only applies on CUDA. Prefer bf16 where the GPU
    # supports it, otherwise fp16; on CPU train in plain float32.
    use_bf16 = use_cuda and torch.cuda.is_bf16_supported()
    use_fp16 = use_cuda and not use_bf16

    print(f"\nLoading model: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        padding_side="right",
    )

    # Fall back to EOS only when the tokenizer has no pad token. The LM
    # collator sets the label of every token equal to pad_token_id to -100,
    # so reusing EOS as padding would remove all end-of-turn tokens from the
    # loss and the model would never learn to stop.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        # Keep float32 master weights: automatic mixed precision (enabled
        # below) does the low-precision compute, and the fp16 gradient
        # scaler cannot unscale gradients of fp16 parameters.
        torch_dtype=torch.float32,
        device_map="auto",
    )

    # Trade compute for memory during backpropagation.
    model.gradient_checkpointing_enable()

    dataset = load_data()

    print("Formatting data...")
    for split in dataset:
        dataset[split] = dataset[split].map(format_conversation)

    print("Tokenizing...")
    for split in dataset:
        dataset[split] = dataset[split].map(
            lambda batch: preprocess_function(batch, tokenizer),
            batched=True,
            # Drop the raw columns so only tokenizer outputs remain.
            remove_columns=dataset[split].column_names,
        )

    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]
    test_dataset = dataset["test"]

    print("\nDataset:")
    print(f" Train: {len(train_dataset)} samples")
    print(f" Validation: {len(eval_dataset)} samples")
    print(f" Test: {len(test_dataset)} samples")

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        warmup_ratio=0.1,
        logging_steps=10,
        save_steps=100,
        eval_steps=100,
        save_total_limit=2,
        fp16=use_fp16,
        bf16=use_bf16,
        remove_unused_columns=False,
        optim="adamw_torch",
        report_to="none",
        load_best_model_at_end=True,
        eval_strategy="steps",
        save_strategy="steps",
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8,
    )

    # No compute_metrics callback: with one, the Trainer keeps the
    # full-vocabulary logits of the entire evaluation set in memory.
    # Perplexity is derived from the evaluation loss instead.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    print("\nStarting training...")
    trainer.train()

    print("\nEvaluating on test set...")
    test_results = trainer.evaluate(test_dataset)
    if "eval_loss" in test_results:
        # exp(mean cross-entropy) is the perplexity; torch.exp yields inf
        # instead of raising when the loss is very large.
        test_results["eval_perplexity"] = torch.exp(
            torch.tensor(float(test_results["eval_loss"]))
        ).item()
    print(f"Test Results: {test_results}")

    print("\nSaving model...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print("\nTraining complete!")
    print(f" Model: {OUTPUT_DIR}")
    print("\nUpload to HuggingFace:")
    print(f" cd {OUTPUT_DIR}")
    print(" hf upload amkyawdev/my-myanmar-qwen . --repo-type model")
|
|
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()