""" Myanmar LLM Training Script Fine-tune Qwen2.5-0.5B-Instruct with Myanmar dataset (No license required!) """ import json import os from datasets import load_dataset from transformers import ( AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, ) import torch # Config - Fully open model, no license needed! MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct" OUTPUT_DIR = "./myanmar-qwen-output" DATASET_PATH = "amkyawdev/AmkyawDev-Dataset" def format_conversation(example): """Format conversation for Qwen chat template""" messages = example["messages"] text = "<|im_start|>system\n" for msg in messages: if msg["role"] == "system": text += msg["content"] + "<|im_end|>\n" elif msg["role"] == "user": text += f"<|im_start|>user\n{msg['content']}<|im_end|>\n" elif msg["role"] == "assistant": text += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n" # Add prompt for assistant to generate text += "<|im_start|>assistant\n" return {"text": text} def preprocess_function(examples, tokenizer, max_length=2048): """Tokenize the text""" texts = examples["text"] tokenized = tokenizer( texts, truncation=True, max_length=max_length, padding="max_length", return_tensors=None, ) # Labels same as input_ids (causal LM) tokenized["labels"] = tokenized["input_ids"].copy() return tokenized def compute_metrics(eval_pred): """Compute perplexity""" logits, labels = eval_pred logits = logits[:-1] labels = labels[1:] loss = torch.nn.functional.cross_entropy( torch.tensor(logits), torch.tensor(labels), ignore_index=-100 ) return {"perplexity": torch.exp(loss).item()} def load_data(): """Load and prepare Myanmar dataset""" print("šŸ“‚ Loading dataset...") # Load from JSONL files (train.jsonl, test.jsonl, validation.jsonl) dataset = load_dataset(DATASET_PATH, data_files={ "train": "train.jsonl", "validation": "validation.jsonl", "test": "test.jsonl" }) print(f" Train: {len(dataset['train'])} samples") print(f" Validation: {len(dataset['validation'])} samples") print(f" Test: {len(dataset['test'])} samples") return dataset def main(): print("=" * 60) print("🧠 Myanmar LLM Training - Qwen2.5 0.5B (No License!)") print("=" * 60) # Check GPU if torch.cuda.is_available(): gpu_name = torch.cuda.get_device_name(0) vram = torch.cuda.get_device_properties(0).total_memory / 1e9 print(f"āœ… GPU: {gpu_name}") print(f" VRAM: {vram:.2f} GB") else: print("āš ļø No GPU - will use CPU (very slow)") # Load tokenizer print(f"\nšŸ“„ Loading model: {MODEL_NAME}") tokenizer = AutoTokenizer.from_pretrained( MODEL_NAME, trust_remote_code=True, padding_side="right", ) tokenizer.pad_token = tokenizer.eos_token # Load model (FP16, no quantization needed for 0.5B) print("šŸ”„ Loading model...") model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto", ) # Enable gradient checkpointing model.gradient_checkpointing_enable() # Load dataset dataset = load_data() # Format and tokenize print("āœļø Formatting data...") for split in dataset: dataset[split] = dataset[split].map(format_conversation) print("šŸ”§ Tokenizing...") for split in dataset: dataset[split] = dataset[split].map( lambda x: preprocess_function(x, tokenizer), batched=True, remove_columns=dataset[split].column_names, ) train_dataset = dataset["train"] eval_dataset = dataset["validation"] test_dataset = dataset["test"] print(f"\nšŸ“Š Dataset:") print(f" Train: {len(train_dataset)} samples") print(f" Validation: {len(eval_dataset)} samples") print(f" Test: {len(test_dataset)} samples") # Training args training_args = TrainingArguments( output_dir=OUTPUT_DIR, num_train_epochs=3, per_device_train_batch_size=4, per_device_eval_batch_size=4, gradient_accumulation_steps=4, learning_rate=2e-5, warmup_ratio=0.1, logging_steps=10, save_steps=100, eval_steps=100, save_total_limit=2, fp16=True, remove_unused_columns=False, optim="adamw_torch", report_to="none", load_best_model_at_end=True, eval_strategy="steps", save_strategy="steps", ) # Data collator data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8, ) # Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator, compute_metrics=compute_metrics, ) # Train print("\nšŸš€ Starting training...") trainer.train() # Evaluate on test set print("\nšŸ“ Evaluating on test set...") test_results = trainer.evaluate(test_dataset) print(f"Test Results: {test_results}") # Save model print("\nšŸ’¾ Saving model...") trainer.save_model(OUTPUT_DIR) tokenizer.save_pretrained(OUTPUT_DIR) print(f"\nāœ… Training complete!") print(f" Model: {OUTPUT_DIR}") print(f"\nšŸ“¤ Upload to HuggingFace:") print(f" cd {OUTPUT_DIR}") print(f" hf upload amkyawdev/my-myanmar-qwen . --repo-type model") if __name__ == "__main__": main()