amkyawdev committed on
Commit
a0d6b29
·
verified ·
1 Parent(s): bf64cbe

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +43 -52
train.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Myanmar LLM Training Script
3
- Fine-tune Llama-3.1-8B-Instruct with Myanmar dataset
4
  """
5
 
6
  import json
@@ -12,46 +12,32 @@ from transformers import (
12
  TrainingArguments,
13
  Trainer,
14
  DataCollatorForLanguageModeling,
15
- EvalPrediction,
16
  )
17
- from transformers import BitsAndBytesConfig
18
  import torch
19
- from sklearn.metrics import accuracy_score
20
 
21
- # Config
22
- MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
23
- OUTPUT_DIR = "./myanmar-llama-output"
24
- DATASET_PATH = "amkyawdev/myanmar-llm-data"
25
-
26
- # Quantization config for low VRAM
27
- bnb_config = BitsAndBytesConfig(
28
- load_in_4bit=True,
29
- bnb_4bit_quant_type="nf4",
30
- bnb_4bit_compute_dtype="float16",
31
- bnb_4bit_use_double_quant=True,
32
- )
33
 
34
  def format_conversation(example):
35
- """Format conversation for Llama chat template"""
36
  messages = example["messages"]
37
- text = ""
38
  for msg in messages:
39
- role = msg["role"]
40
- content = msg["content"]
41
- if role == "system":
42
- text += f"<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>"
43
- elif role == "user":
44
- text += f"<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>"
45
- elif role == "assistant":
46
- text += f"<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>"
47
- # Add separator
48
- text += "<|start_header_id|>assistant<|end_header_id|>\n\n"
49
  return {"text": text}
50
 
51
  def preprocess_function(examples, tokenizer, max_length=2048):
52
  """Tokenize the text"""
53
- # Add prompt suffix for assistant response
54
- texts = [text + "<|start_header_id|>assistant<|end_header_id|>\n\n" for text in examples["text"]]
55
 
56
  tokenized = tokenizer(
57
  texts,
@@ -66,13 +52,11 @@ def preprocess_function(examples, tokenizer, max_length=2048):
66
  return tokenized
67
 
68
  def compute_metrics(eval_pred):
69
- """Compute perplexity as evaluation metric"""
70
  logits, labels = eval_pred
71
- # Shift for causal LM
72
  logits = logits[:-1]
73
  labels = labels[1:]
74
 
75
- # Calculate perplexity
76
  loss = torch.nn.functional.cross_entropy(
77
  torch.tensor(logits),
78
  torch.tensor(labels),
@@ -83,18 +67,23 @@ def compute_metrics(eval_pred):
83
  def load_data():
84
  """Load and prepare Myanmar dataset"""
85
  print("📂 Loading dataset...")
86
- dataset = load_dataset(DATASET_PATH)
87
 
88
- # Format data
89
- print("✏️ Formatting data...")
90
- for split in dataset:
91
- dataset[split] = dataset[split].map(format_conversation)
 
 
 
 
 
 
92
 
93
  return dataset
94
 
95
  def main():
96
  print("=" * 60)
97
- print("🧠 Myanmar LLM Training - Llama 3.1 8B")
98
  print("=" * 60)
99
 
100
  # Check GPU
@@ -114,25 +103,28 @@ def main():
114
  padding_side="right",
115
  )
116
 
117
- # Set pad token
118
  tokenizer.pad_token = tokenizer.eos_token
119
 
120
- # Load model with 4-bit quantization
121
- print("🔄 Loading model with 4-bit quantization...")
122
  model = AutoModelForCausalLM.from_pretrained(
123
  MODEL_NAME,
124
- quantization_config=bnb_config,
125
  trust_remote_code=True,
 
126
  device_map="auto",
127
  )
128
 
129
- # Disable gradient checkpointing for stability
130
  model.gradient_checkpointing_enable()
131
 
132
  # Load dataset
133
  dataset = load_data()
134
 
135
- # Preprocess
 
 
 
 
136
  print("🔧 Tokenizing...")
137
  for split in dataset:
138
  dataset[split] = dataset[split].map(
@@ -154,17 +146,16 @@ def main():
154
  training_args = TrainingArguments(
155
  output_dir=OUTPUT_DIR,
156
  num_train_epochs=3,
157
- per_device_train_batch_size=2,
158
- per_device_eval_batch_size=2,
159
- gradient_accumulation_steps=8,
160
- learning_rate=1e-5,
161
  warmup_ratio=0.1,
162
  logging_steps=10,
163
  save_steps=100,
164
  eval_steps=100,
165
  save_total_limit=2,
166
- fp16=False,
167
- bf16=True,
168
  remove_unused_columns=False,
169
  optim="adamw_torch",
170
  report_to="none",
@@ -208,7 +199,7 @@ def main():
208
  print(f" Model: {OUTPUT_DIR}")
209
  print(f"\n📤 Upload to HuggingFace:")
210
  print(f" cd {OUTPUT_DIR}")
211
- print(f" hf upload amkyawdev/my-myanmar-llama . --repo-type model")
212
 
213
  if __name__ == "__main__":
214
  main()
 
1
  """
2
  Myanmar LLM Training Script
3
+ Fine-tune Qwen2.5-0.5B-Instruct on a Myanmar dataset (Apache-2.0 model, ungated: no access request needed)
4
  """
5
 
6
  import json
 
12
  TrainingArguments,
13
  Trainer,
14
  DataCollatorForLanguageModeling,
 
15
  )
 
16
  import torch
 
17
 
18
+ # Config - openly available model (Apache-2.0), no gated-access approval needed
19
+ MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
20
+ OUTPUT_DIR = "./myanmar-qwen-output"
21
+ DATASET_PATH = "amkyawdev/AmkyawDev-Dataset"
 
 
 
 
 
 
 
 
22
 
23
  def format_conversation(example):
24
+ """Format conversation for Qwen chat template"""
25
  messages = example["messages"]
26
+ text = "<|im_start|>system\n"
27
  for msg in messages:
28
+ if msg["role"] == "system":
29
+ text += msg["content"] + "<|im_end|>\n"
30
+ elif msg["role"] == "user":
31
+ text += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
32
+ elif msg["role"] == "assistant":
33
+ text += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
34
+ # Add prompt for assistant to generate
35
+ text += "<|im_start|>assistant\n"
 
 
36
  return {"text": text}
37
 
38
  def preprocess_function(examples, tokenizer, max_length=2048):
39
  """Tokenize the text"""
40
+ texts = examples["text"]
 
41
 
42
  tokenized = tokenizer(
43
  texts,
 
52
  return tokenized
53
 
54
  def compute_metrics(eval_pred):
55
+ """Compute perplexity"""
56
  logits, labels = eval_pred
 
57
  logits = logits[:-1]
58
  labels = labels[1:]
59
 
 
60
  loss = torch.nn.functional.cross_entropy(
61
  torch.tensor(logits),
62
  torch.tensor(labels),
 
67
  def load_data():
68
  """Load and prepare Myanmar dataset"""
69
  print("📂 Loading dataset...")
 
70
 
71
+ # Load from JSONL files (train.jsonl, test.jsonl, validation.jsonl)
72
+ dataset = load_dataset(DATASET_PATH, data_files={
73
+ "train": "train.jsonl",
74
+ "validation": "validation.jsonl",
75
+ "test": "test.jsonl"
76
+ })
77
+
78
+ print(f" Train: {len(dataset['train'])} samples")
79
+ print(f" Validation: {len(dataset['validation'])} samples")
80
+ print(f" Test: {len(dataset['test'])} samples")
81
 
82
  return dataset
83
 
84
  def main():
85
  print("=" * 60)
86
+ print("🧠 Myanmar LLM Training - Qwen2.5 0.5B (No License!)")
87
  print("=" * 60)
88
 
89
  # Check GPU
 
103
  padding_side="right",
104
  )
105
 
 
106
  tokenizer.pad_token = tokenizer.eos_token
107
 
108
+ # Load model (FP16, no quantization needed for 0.5B)
109
+ print("🔄 Loading model...")
110
  model = AutoModelForCausalLM.from_pretrained(
111
  MODEL_NAME,
 
112
  trust_remote_code=True,
113
+ torch_dtype=torch.float16,
114
  device_map="auto",
115
  )
116
 
117
+ # Enable gradient checkpointing
118
  model.gradient_checkpointing_enable()
119
 
120
  # Load dataset
121
  dataset = load_data()
122
 
123
+ # Format and tokenize
124
+ print("✏️ Formatting data...")
125
+ for split in dataset:
126
+ dataset[split] = dataset[split].map(format_conversation)
127
+
128
  print("🔧 Tokenizing...")
129
  for split in dataset:
130
  dataset[split] = dataset[split].map(
 
146
  training_args = TrainingArguments(
147
  output_dir=OUTPUT_DIR,
148
  num_train_epochs=3,
149
+ per_device_train_batch_size=4,
150
+ per_device_eval_batch_size=4,
151
+ gradient_accumulation_steps=4,
152
+ learning_rate=2e-5,
153
  warmup_ratio=0.1,
154
  logging_steps=10,
155
  save_steps=100,
156
  eval_steps=100,
157
  save_total_limit=2,
158
+ fp16=True,
 
159
  remove_unused_columns=False,
160
  optim="adamw_torch",
161
  report_to="none",
 
199
  print(f" Model: {OUTPUT_DIR}")
200
  print(f"\n📤 Upload to HuggingFace:")
201
  print(f" cd {OUTPUT_DIR}")
202
+ print(f" hf upload amkyawdev/my-myanmar-qwen . --repo-type model")
203
 
204
  if __name__ == "__main__":
205
  main()