Mindigenous committed on
Commit
7a24ed3
·
1 Parent(s): 84d3b3f

Backup update: add latest checkpoints and train.py changes

Browse files
backup_step8000.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13c3e5c401a567493b92bf02f6d4040f5b6f578c4c413b33362a0009d7405237
3
+ size 84689731
backup_step8250.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dff8ef0900eeed1141b8aac59e1c45697ff3c804e4e2792568f4fdf5754e021
3
+ size 84688227
backup_step8500.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d35f24763676911a2f605ff63e56a62f521bde805757d51b2e356a004d479e2e
3
+ size 84695943
backup_step8750.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:007068138f8a165ff5a3fea9ed096a94bdf620d0007b013d8834d69bfc650628
3
+ size 84696682
backup_step9000.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a69b305a69b77ea66f9feeaaaa3bbd7c4a08f7111bbd6cdd3b90e2e59a5b2e7b
3
+ size 84704097
backup_step9250.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8724eceedfd4f8c4f87a14f1fa8c2019bcbfe9af6165e57aac020bb04c65fd5
3
+ size 84699876
backup_step9500.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a728fcf9931e37ae37a3db4044170a254473aa08f9a10e958ce88987f2575d8c
3
+ size 84705286
backup_step9750.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe8bbef08bb3ee21de186753bce613d4b050b4011d85378737d464e190db65a7
3
+ size 84703357
train.py CHANGED
@@ -1,6 +1,7 @@
1
  import argparse
2
  from pathlib import Path
3
- from typing import List
 
4
 
5
  import torch
6
  from peft import LoraConfig, TaskType, get_peft_model
@@ -9,20 +10,78 @@ from transformers import (
9
  AutoTokenizer,
10
  Trainer,
11
  TrainingArguments,
 
12
  set_seed,
13
  )
14
 
15
  from config import PATHS, TRAINING_CONFIG
16
- from dataset import LocalJsonlInstructionDataset, format_prompt
17
  from utils import ensure_dirs, setup_logger
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def _is_valid_hf_model_dir(path: Path) -> bool:
21
  if not path.exists():
22
  return False
23
- has_config = (path / "config.json").exists()
24
- has_weights = (path / "model.safetensors").exists() or (path / "pytorch_model.bin").exists()
25
- return has_config and has_weights
26
 
27
 
28
  def _resolve_model_path(logger) -> Path:
@@ -31,6 +90,7 @@ def _resolve_model_path(logger) -> Path:
31
 
32
  if _is_valid_hf_model_dir(primary):
33
  return primary
 
34
  if _is_valid_hf_model_dir(fallback):
35
  logger.warning(
36
  "Primary model path %s is missing HF files. Falling back to %s",
@@ -38,36 +98,19 @@ def _resolve_model_path(logger) -> Path:
38
  fallback.resolve(),
39
  )
40
  return fallback
41
- raise FileNotFoundError(
42
- "No valid HuggingFace model directory found.\n"
43
- f"Checked: {primary.resolve()} and {fallback.resolve()}.\n"
44
- "Expected files: config.json + model.safetensors (or pytorch_model.bin)."
45
- )
46
 
 
47
 
48
- def _build_model_and_tokenizer(model_path: Path, logger):
49
- try:
50
- tokenizer = AutoTokenizer.from_pretrained(
51
- model_path,
52
- trust_remote_code=True,
53
- local_files_only=True,
54
- use_fast=True,
55
- )
56
- except Exception as fast_exc:
57
- logger.warning("Fast tokenizer load failed: %s. Retrying with slow tokenizer.", fast_exc)
58
- try:
59
- tokenizer = AutoTokenizer.from_pretrained(
60
- model_path,
61
- trust_remote_code=True,
62
- local_files_only=True,
63
- use_fast=False,
64
- )
65
- except Exception as slow_exc:
66
- raise RuntimeError(
67
- "Tokenizer loading failed for both fast and slow modes. "
68
- "Ensure tokenizer files exist in the model folder and install "
69
- "`sentencepiece` (and optionally `tiktoken`) if required."
70
- ) from slow_exc
71
 
72
  if tokenizer.pad_token is None:
73
  tokenizer.pad_token = tokenizer.eos_token
@@ -87,84 +130,75 @@ def _build_model_and_tokenizer(model_path: Path, logger):
87
  task_type=TaskType.CAUSAL_LM,
88
  target_modules="all-linear",
89
  )
 
90
  model = get_peft_model(model, lora_cfg)
91
  return model, tokenizer
92
 
93
 
94
def _maybe_resume_train(trainer: Trainer, logger, resume_requested: bool) -> None:
    """Run training, resuming from a checkpoint when asked to.

    With ``resume_requested`` false, training starts directly. Otherwise
    ``trainer.train(resume_from_checkpoint=True)`` is attempted; if that
    raises ``ValueError`` or ``OSError`` a warning is logged and training
    is started without resuming.
    """
    if resume_requested:
        try:
            trainer.train(resume_from_checkpoint=True)
        except (ValueError, OSError) as resume_error:
            logger.warning(
                "Resume requested but no valid checkpoint found (%s). Starting fresh training.",
                resume_error,
            )
            trainer.train()
    else:
        trainer.train()
107
 
 
 
 
 
108
 
109
def _generate_predictions(model, tokenizer, prompts: List[str], logger) -> None:
    """Generate and print a completion for each prompt in *prompts*.

    The model is switched to eval mode, each prompt is wrapped with
    ``format_prompt`` (empty input and output fields), tokenized, moved to
    the model's device and passed to ``model.generate`` under
    ``torch.no_grad()``. The decoded result is printed between rulers.
    """
    model.eval()
    target_device = model.device
    logger.info("Running post-training evaluation prompts.")

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    for instruction in prompts:
        prompt_text = format_prompt(
            instruction=instruction,
            input_text="",
            output_text="",
        )
        encoded = tokenizer(prompt_text, return_tensors="pt").to(target_device)

        # Sampling is enabled, so output may differ between runs.
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_new_tokens=TRAINING_CONFIG.eval_max_new_tokens,
                do_sample=True,
                temperature=0.2,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
            )

        completion = tokenizer.decode(generated[0], skip_special_tokens=True)
        print("\n" + heavy_rule)
        print(f"PROMPT: {instruction}")
        print(light_rule)
        print(completion)
135
-
136
-
137
- def train(resume: bool) -> Path:
138
- ensure_dirs(
139
- [
140
- PATHS.data_dir,
141
- PATHS.output_dir,
142
- PATHS.logs_dir,
143
- PATHS.checkpoint_dir,
144
- PATHS.lora_output_dir,
145
- PATHS.tokenizer_output_dir,
146
- ]
147
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  logger = setup_logger("train", PATHS.logs_dir / "train.log")
149
  set_seed(42)
150
- if not torch.cuda.is_available():
151
- logger.warning(
152
- "CUDA is not available. Training will run on CPU, which is very slow and can limit practical model quality."
153
- )
154
-
155
- if not PATHS.train_jsonl.exists():
156
- raise FileNotFoundError(
157
- f"Training dataset not found: {PATHS.train_jsonl.resolve()}. "
158
- "Run data_fetch.py first."
159
- )
160
 
161
  model_path = _resolve_model_path(logger)
162
- logger.info("Loading model and tokenizer from %s", model_path.resolve())
163
- model, tokenizer = _build_model_and_tokenizer(model_path, logger)
 
164
  model.print_trainable_parameters()
165
 
166
- train_dataset = LocalJsonlInstructionDataset(tokenizer, max_length=TRAINING_CONFIG.max_length)
167
- logger.info("Loaded %d samples from %s", len(train_dataset), PATHS.train_jsonl.resolve())
 
 
168
 
169
  training_args = TrainingArguments(
170
  output_dir=str(PATHS.checkpoint_dir),
@@ -173,56 +207,37 @@ def train(resume: bool) -> Path:
173
  gradient_accumulation_steps=TRAINING_CONFIG.gradient_accumulation_steps,
174
  learning_rate=TRAINING_CONFIG.learning_rate,
175
  fp16=torch.cuda.is_available(),
176
- lr_scheduler_type="cosine",
177
- warmup_ratio=0.03,
178
- weight_decay=0.01,
179
- max_grad_norm=1.0,
180
- gradient_checkpointing=True,
181
- group_by_length=True,
182
- logging_steps=TRAINING_CONFIG.logging_steps,
183
- save_steps=TRAINING_CONFIG.save_steps,
184
- save_total_limit=4,
185
  report_to="none",
186
  remove_unused_columns=False,
187
- dataloader_num_workers=2,
188
- dataloader_pin_memory=torch.cuda.is_available(),
189
  )
190
 
191
  trainer = Trainer(
192
  model=model,
193
  args=training_args,
194
  train_dataset=train_dataset,
 
195
  )
196
 
197
- logger.info("Starting training. Resume mode: %s", resume)
198
- _maybe_resume_train(trainer, logger, resume_requested=resume)
 
199
 
200
- logger.info("Saving LoRA adapters to %s", PATHS.lora_output_dir.resolve())
201
  trainer.model.save_pretrained(str(PATHS.lora_output_dir))
202
  tokenizer.save_pretrained(str(PATHS.tokenizer_output_dir))
203
 
204
- prompts = [
205
- "Write a Python binary search function",
206
- "Fix this Python bug: list index out of range",
207
- "Create a FastAPI endpoint",
208
- ]
209
- _generate_predictions(model, tokenizer, prompts, logger)
210
-
211
- print(f"\nLoRA adapters saved to: {PATHS.lora_output_dir.resolve()}")
212
- print(f"Tokenizer saved to: {PATHS.tokenizer_output_dir.resolve()}")
213
- return PATHS.lora_output_dir
214
-
215
-
216
- def _build_arg_parser() -> argparse.ArgumentParser:
217
- parser = argparse.ArgumentParser(description="LoRA fine-tuning for MINDI Python coding tasks.")
218
- parser.add_argument(
219
- "--no-resume",
220
- action="store_true",
221
- help="Disable automatic resume_from_checkpoint=True behavior.",
222
- )
223
- return parser
224
 
225
 
 
 
 
226
if __name__ == "__main__":
    cli_args = _build_arg_parser().parse_args()
    # Resume only when the flag is absent and the config value is truthy.
    should_resume = not cli_args.no_resume and TRAINING_CONFIG.resume_training
    train(resume=should_resume)
 
 
 
 
1
  import argparse
2
  from pathlib import Path
3
+ import os
4
+ import subprocess
5
 
6
  import torch
7
  from peft import LoraConfig, TaskType, get_peft_model
 
10
  AutoTokenizer,
11
  Trainer,
12
  TrainingArguments,
13
+ TrainerCallback,
14
  set_seed,
15
  )
16
 
17
  from config import PATHS, TRAINING_CONFIG
18
+ from dataset import LocalJsonlInstructionDataset
19
  from utils import ensure_dirs, setup_logger
20
 
21
 
22
+ # ==============================
23
+ # 🔥 FIXED BACKUP CALLBACK
24
+ # ==============================
25
class BackupCallback(TrainerCallback):
    """Archive every saved checkpoint into ``backups/`` and prune old archives.

    After each checkpoint save, the matching ``checkpoint-<step>`` directory
    under ``args.output_dir`` is packed into
    ``backups/backup_step<step>.tar.gz`` with the external ``tar`` command.
    Only the ``MAX_BACKUPS`` archives with the highest step numbers are kept.

    Every failure is reported on stdout and swallowed: a broken backup must
    never abort a training run.
    """

    BACKUP_DIR = "backups"
    MAX_BACKUPS = 5
    _PREFIX = "backup_step"
    _SUFFIX = ".tar.gz"

    def on_save(self, args, state, control, **kwargs):
        """Create a backup archive for the checkpoint at ``state.global_step``.

        Args:
            args: Training arguments; only ``output_dir`` is read.
            state: Trainer state; ``global_step`` names the checkpoint.
            control: Unused.
            **kwargs: Unused.
        """
        # NOTE(review): in multi-process runs the callback is presumably
        # invoked on every rank; archive from the main process only so ranks
        # do not write the same file concurrently. Defaults to True when the
        # attribute is absent.
        if not getattr(state, "is_world_process_zero", True):
            return

        try:
            checkpoint_dir = os.path.join(
                args.output_dir,
                f"checkpoint-{state.global_step}",
            )
            if not os.path.isdir(checkpoint_dir):
                return

            os.makedirs(self.BACKUP_DIR, exist_ok=True)

            backup_name = f"{self._PREFIX}{state.global_step}{self._SUFFIX}"
            backup_path = os.path.join(self.BACKUP_DIR, backup_name)
            # Write under a temporary name so an interrupted or failed tar
            # never leaves a truncated file that looks like a valid backup.
            partial_path = backup_path + ".partial"

            print(f"\n[BACKUP] Creating backup for step {state.global_step}...")
            try:
                subprocess.run(
                    ["tar", "-czf", partial_path, checkpoint_dir],
                    check=True,
                )
                os.replace(partial_path, backup_path)
            finally:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            print(f"[BACKUP] Saved: {backup_path}")

            self._prune_old_backups()
        except Exception as e:  # deliberate best-effort boundary
            print(f"[BACKUP ERROR] {e}")
            # Never crash training

    @classmethod
    def _backup_step(cls, filename):
        """Return the step encoded in *filename*, or None if it is not one of ours."""
        if not (filename.startswith(cls._PREFIX) and filename.endswith(cls._SUFFIX)):
            return None
        digits = filename[len(cls._PREFIX):-len(cls._SUFFIX)]
        return int(digits) if digits.isdecimal() else None

    def _prune_old_backups(self):
        """Delete all but the ``MAX_BACKUPS`` highest-step archives."""
        # Files that do not follow the backup naming scheme are ignored, so
        # an unrelated archive can neither break the sort nor be deleted.
        steps_by_name = {}
        for name in os.listdir(self.BACKUP_DIR):
            step = self._backup_step(name)
            if step is not None:
                steps_by_name[name] = step

        ordered = sorted(steps_by_name, key=steps_by_name.get)
        surplus = max(len(ordered) - self.MAX_BACKUPS, 0)
        # Remove every surplus archive, not just one, so a backlog left by
        # earlier failures is cleared in a single pass.
        for old_backup in ordered[:surplus]:
            old_path = os.path.join(self.BACKUP_DIR, old_backup)
            if os.path.isfile(old_path):
                os.remove(old_path)
                print(f"[BACKUP] Removed old backup: {old_backup}")
76
+
77
+
78
+ # ==============================
79
+ # MODEL PATH RESOLUTION
80
+ # ==============================
81
  def _is_valid_hf_model_dir(path: Path) -> bool:
82
  if not path.exists():
83
  return False
84
+ return (path / "config.json").exists()
 
 
85
 
86
 
87
  def _resolve_model_path(logger) -> Path:
 
90
 
91
  if _is_valid_hf_model_dir(primary):
92
  return primary
93
+
94
  if _is_valid_hf_model_dir(fallback):
95
  logger.warning(
96
  "Primary model path %s is missing HF files. Falling back to %s",
 
98
  fallback.resolve(),
99
  )
100
  return fallback
 
 
 
 
 
101
 
102
+ raise FileNotFoundError("No valid model directory found.")
103
 
104
+
105
+ # ==============================
106
+ # BUILD MODEL
107
+ # ==============================
108
+ def _build_model_and_tokenizer(model_path: Path):
109
+ tokenizer = AutoTokenizer.from_pretrained(
110
+ model_path,
111
+ trust_remote_code=True,
112
+ local_files_only=True,
113
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  if tokenizer.pad_token is None:
116
  tokenizer.pad_token = tokenizer.eos_token
 
130
  task_type=TaskType.CAUSAL_LM,
131
  target_modules="all-linear",
132
  )
133
+
134
  model = get_peft_model(model, lora_cfg)
135
  return model, tokenizer
136
 
137
 
138
+ # ==============================
139
+ # SMART RESUME
140
+ # ==============================
141
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        checkpoint_dir: Directory to scan for ``checkpoint-<step>`` entries.

    Returns:
        ``os.path.join(checkpoint_dir, name)`` for the entry with the largest
        numeric step, or ``None`` when ``checkpoint_dir`` is not a directory
        or contains no such entry.
    """
    if not os.path.isdir(checkpoint_dir):
        return None

    prefix = "checkpoint-"
    latest_step = -1
    latest_name = None
    for entry in os.listdir(checkpoint_dir):
        if not entry.startswith(prefix):
            continue
        step_text = entry[len(prefix):]
        # Skip names such as "checkpoint-tmp": a non-numeric suffix would
        # otherwise raise ValueError and abort the whole lookup.
        if not step_text.isdecimal():
            continue
        # Only directories count; a stray file with a matching name does not.
        if not os.path.isdir(os.path.join(checkpoint_dir, entry)):
            continue
        step = int(step_text)
        if step > latest_step:
            latest_step, latest_name = step, entry

    if latest_name is None:
        return None
    return os.path.join(checkpoint_dir, latest_name)
159
+
160
+
161
def safe_train(trainer, checkpoint_dir, logger):
    """Train, resuming from the newest checkpoint in *checkpoint_dir* if any.

    Args:
        trainer: Object exposing ``train(resume_from_checkpoint=...)``.
        checkpoint_dir: Directory passed to ``get_latest_checkpoint``.
        logger: Logger used for progress and warning messages.

    Raises:
        Any exception from ``trainer.train`` other than a ``ValueError`` or
        ``OSError`` raised by the resume attempt.
    """
    latest_checkpoint = get_latest_checkpoint(checkpoint_dir)

    if latest_checkpoint:
        logger.info("Resuming from checkpoint: %s", latest_checkpoint)
        try:
            trainer.train(resume_from_checkpoint=latest_checkpoint)
        except (ValueError, OSError) as exc:
            # Only these are treated as "checkpoint unusable". Catching every
            # Exception here would turn any mid-run failure (e.g. running out
            # of memory) into a silent restart from step 0.
            # NOTE(review): presumably these are what the trainer raises for
            # a missing or unreadable checkpoint — confirm.
            logger.warning("Resume failed: %s", exc)
        else:
            return

    logger.warning("No valid checkpoint → starting fresh training")
    trainer.train()
174
+
175
+
176
+ # ==============================
177
+ # MAIN TRAIN FUNCTION
178
+ # ==============================
179
+ def train(resume: bool):
180
+ ensure_dirs([
181
+ PATHS.data_dir,
182
+ PATHS.output_dir,
183
+ PATHS.logs_dir,
184
+ PATHS.checkpoint_dir,
185
+ PATHS.lora_output_dir,
186
+ PATHS.tokenizer_output_dir,
187
+ ])
188
+
189
  logger = setup_logger("train", PATHS.logs_dir / "train.log")
190
  set_seed(42)
 
 
 
 
 
 
 
 
 
 
191
 
192
  model_path = _resolve_model_path(logger)
193
+ logger.info("Loading model from %s", model_path)
194
+
195
+ model, tokenizer = _build_model_and_tokenizer(model_path)
196
  model.print_trainable_parameters()
197
 
198
+ train_dataset = LocalJsonlInstructionDataset(
199
+ tokenizer,
200
+ max_length=TRAINING_CONFIG.max_length
201
+ )
202
 
203
  training_args = TrainingArguments(
204
  output_dir=str(PATHS.checkpoint_dir),
 
207
  gradient_accumulation_steps=TRAINING_CONFIG.gradient_accumulation_steps,
208
  learning_rate=TRAINING_CONFIG.learning_rate,
209
  fp16=torch.cuda.is_available(),
210
+ logging_steps=50,
211
+ save_steps=250,
212
+ save_total_limit=3,
213
+ gradient_checkpointing=False,
 
 
 
 
 
214
  report_to="none",
215
  remove_unused_columns=False,
 
 
216
  )
217
 
218
  trainer = Trainer(
219
  model=model,
220
  args=training_args,
221
  train_dataset=train_dataset,
222
+ callbacks=[BackupCallback()],
223
  )
224
 
225
+ logger.info("Starting training...")
226
+
227
+ safe_train(trainer, str(PATHS.checkpoint_dir), logger)
228
 
 
229
  trainer.model.save_pretrained(str(PATHS.lora_output_dir))
230
  tokenizer.save_pretrained(str(PATHS.tokenizer_output_dir))
231
 
232
+ print("\n✅ Training complete. Model saved.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
 
235
+ # ==============================
236
+ # ENTRY POINT
237
+ # ==============================
238
if __name__ == "__main__":
    # Script entry point: parse the command line and start training.
    parser = argparse.ArgumentParser(
        description="LoRA fine-tuning for MINDI Python coding tasks.",
    )
    parser.add_argument(
        "--no-resume",
        action="store_true",
        help="Request a fresh run instead of resuming from the latest checkpoint.",
    )
    args = parser.parse_args()

    # NOTE(review): the visible part of train() does not appear to read its
    # `resume` argument — confirm that --no-resume is actually honored.
    train(resume=not args.no_resume)