amkyawdev
/

myanmar-llm-train

Model card Files Files and versions

xet

Community

amkyawdev committed on Apr 5

Commit

a0d6b29

verified ·

1 Parent(s): bf64cbe

Upload train.py with huggingface_hub

Browse files

Files changed (1) hide show

train.py +43 -52

train.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 Myanmar LLM Training Script
-Fine-tune Llama-3.1-8B-Instruct with Myanmar dataset
 """
 import json
@@ -12,46 +12,32 @@ from transformers import (
     TrainingArguments,
     Trainer,
     DataCollatorForLanguageModeling,
-    EvalPrediction,
 )
-from transformers import BitsAndBytesConfig
 import torch
-from sklearn.metrics import accuracy_score
-# Config
-MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
-OUTPUT_DIR = "./myanmar-llama-output"
-DATASET_PATH = "amkyawdev/myanmar-llm-data"
-# Quantization config for low VRAM
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype="float16",
-    bnb_4bit_use_double_quant=True,
-)
 def format_conversation(example):
-    """Format conversation for Llama chat template"""
     messages = example["messages"]
-    text = ""
     for msg in messages:
-        role = msg["role"]
-        content = msg["content"]
-        if role == "system":
-            text += f"<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>"
-        elif role == "user":
-            text += f"<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>"
-        elif role == "assistant":
-            text += f"<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>"
-    # Add separator
-    text += "<|start_header_id|>assistant<|end_header_id|>\n\n"
     return {"text": text}
 def preprocess_function(examples, tokenizer, max_length=2048):
     """Tokenize the text"""
-    # Add prompt suffix for assistant response
-    texts = [text + "<|start_header_id|>assistant<|end_header_id|>\n\n" for text in examples["text"]]
     tokenized = tokenizer(
         texts,
@@ -66,13 +52,11 @@ def preprocess_function(examples, tokenizer, max_length=2048):
     return tokenized
 def compute_metrics(eval_pred):
-    """Compute perplexity as evaluation metric"""
     logits, labels = eval_pred
-    # Shift for causal LM
     logits = logits[:-1]
     labels = labels[1:]
-    # Calculate perplexity
     loss = torch.nn.functional.cross_entropy(
         torch.tensor(logits),
         torch.tensor(labels),
@@ -83,18 +67,23 @@ def compute_metrics(eval_pred):
 def load_data():
     """Load and prepare Myanmar dataset"""
     print("📂 Loading dataset...")
-    dataset = load_dataset(DATASET_PATH)
-    # Format data
-    print("✏️ Formatting data...")
-    for split in dataset:
-        dataset[split] = dataset[split].map(format_conversation)
     return dataset
 def main():
     print("=" * 60)
-    print("🧠 Myanmar LLM Training - Llama 3.1 8B")
     print("=" * 60)
     # Check GPU
@@ -114,25 +103,28 @@ def main():
         padding_side="right",
     )
-    # Set pad token
     tokenizer.pad_token = tokenizer.eos_token
-    # Load model with 4-bit quantization
-    print("🔄 Loading model with 4-bit quantization...")
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
-        quantization_config=bnb_config,
         trust_remote_code=True,
         device_map="auto",
     )
-    # Disable gradient checkpointing for stability
     model.gradient_checkpointing_enable()
     # Load dataset
     dataset = load_data()
-    # Preprocess
     print("🔧 Tokenizing...")
     for split in dataset:
         dataset[split] = dataset[split].map(
@@ -154,17 +146,16 @@ def main():
     training_args = TrainingArguments(
         output_dir=OUTPUT_DIR,
         num_train_epochs=3,
-        per_device_train_batch_size=2,
-        per_device_eval_batch_size=2,
-        gradient_accumulation_steps=8,
-        learning_rate=1e-5,
         warmup_ratio=0.1,
         logging_steps=10,
         save_steps=100,
         eval_steps=100,
         save_total_limit=2,
-        fp16=False,
-        bf16=True,
         remove_unused_columns=False,
         optim="adamw_torch",
         report_to="none",
@@ -208,7 +199,7 @@ def main():
     print(f"   Model: {OUTPUT_DIR}")
     print(f"\n📤 Upload to HuggingFace:")
     print(f"   cd {OUTPUT_DIR}")
-    print(f"   hf upload amkyawdev/my-myanmar-llama . --repo-type model")
 if __name__ == "__main__":
     main()

 """
 Myanmar LLM Training Script
+Fine-tune Qwen2.5-0.5B-Instruct with Myanmar dataset (No license required!)
 """
 import json
     TrainingArguments,
     Trainer,
     DataCollatorForLanguageModeling,
 )
 import torch
+# Config - Fully open model, no license needed!
+MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
+OUTPUT_DIR = "./myanmar-qwen-output"
+DATASET_PATH = "amkyawdev/AmkyawDev-Dataset"
 def format_conversation(example):
     """Format conversation for Qwen chat template.

     Args:
         example: Mapping with a "messages" list. Each message is a dict
             with a "role" key and a "content" key.

     Returns:
         A dict with a single "text" key. Every system, user and assistant
         message is rendered as its own ChatML turn (start marker, role,
         newline, content, end marker, newline), in the order given.
         Messages with any other role are skipped. An open assistant
         header is appended at the end.
     """
     known_roles = ("system", "user", "assistant")
     # Each turn carries its own header. Opening a system header up front
     # regardless of the data leaves an unterminated system turn whenever
     # the conversation has no system message (or it is not the first one).
     turns = [
         f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
         for msg in example["messages"]
         if msg["role"] in known_roles
     ]
     # Add prompt for assistant to generate.
     # NOTE(review): when the conversation already ends with an assistant
     # turn this leaves a dangling header in the training text - confirm
     # that is intended.
     turns.append("<|im_start|>assistant\n")
     return {"text": "".join(turns)}
 def preprocess_function(examples, tokenizer, max_length=2048):
     """Tokenize the text"""
+    texts = examples["text"]
     tokenized = tokenizer(
         texts,
     return tokenized
 def compute_metrics(eval_pred):
+    """Compute perplexity"""
     logits, labels = eval_pred
     logits = logits[:-1]
     labels = labels[1:]
     loss = torch.nn.functional.cross_entropy(
         torch.tensor(logits),
         torch.tensor(labels),
 def load_data():
     """Fetch the Myanmar dataset splits and print how many samples each has."""
     print("📂 Loading dataset...")
     # One JSONL file per split (train.jsonl, validation.jsonl, test.jsonl)
     split_files = {}
     split_files["train"] = "train.jsonl"
     split_files["validation"] = "validation.jsonl"
     split_files["test"] = "test.jsonl"
     dataset = load_dataset(DATASET_PATH, data_files=split_files)
     # Report split sizes in the same order the files were declared.
     for label, split_name in (
         ("Train", "train"),
         ("Validation", "validation"),
         ("Test", "test"),
     ):
         sample_count = len(dataset[split_name])
         print(f"   {label}: {sample_count} samples")
     return dataset
 def main():
     print("=" * 60)
+    print("🧠 Myanmar LLM Training - Qwen2.5 0.5B (No License!)")
     print("=" * 60)
     # Check GPU
         padding_side="right",
     )
     tokenizer.pad_token = tokenizer.eos_token
+    # Load model (FP16, no quantization needed for 0.5B)
+    print("🔄 Loading model...")
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
         trust_remote_code=True,
+        torch_dtype=torch.float16,
         device_map="auto",
     )
+    # Enable gradient checkpointing
     model.gradient_checkpointing_enable()
     # Load dataset
     dataset = load_data()
+    # Format and tokenize
+    print("✏️ Formatting data...")
+    for split in dataset:
+        dataset[split] = dataset[split].map(format_conversation)
     print("🔧 Tokenizing...")
     for split in dataset:
         dataset[split] = dataset[split].map(
     training_args = TrainingArguments(
         output_dir=OUTPUT_DIR,
         num_train_epochs=3,
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        gradient_accumulation_steps=4,
+        learning_rate=2e-5,
         warmup_ratio=0.1,
         logging_steps=10,
         save_steps=100,
         eval_steps=100,
         save_total_limit=2,
+        fp16=True,
         remove_unused_columns=False,
         optim="adamw_torch",
         report_to="none",
     print(f"   Model: {OUTPUT_DIR}")
     print(f"\n📤 Upload to HuggingFace:")
     print(f"   cd {OUTPUT_DIR}")
+    print(f"   hf upload amkyawdev/my-myanmar-qwen . --repo-type model")
 if __name__ == "__main__":
     main()