"""Codette LoRA Adapter Training v3 - Remaining 6 Adapters

The Newton and Davinci adapters are already trained and uploaded.
This script trains ONLY the remaining 6 adapters to save GPU credits.
Robust error handling: upload failures won't kill the job.
"""

import subprocess
import sys

# Bootstrap step: install the runtime packages with the pip that belongs to
# the interpreter running this script, before the third-party imports below.
_PIP_PACKAGES = (
    "torch", "transformers", "peft", "trl", "datasets",
    "bitsandbytes", "accelerate", "huggingface_hub", "sentencepiece",
)

print("Installing dependencies...")
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-q", *_PIP_PACKAGES]
)
print("Dependencies installed.\n")

import gc
import json
import os
import time
import traceback
from pathlib import Path

import torch
from datasets import Dataset
from huggingface_hub import HfApi, hf_hub_download
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Pick the TRL configuration API that this installation provides.
# USE_NEW_TRL is True when ``SFTConfig`` can be imported from trl; otherwise
# the script falls back to ``transformers.TrainingArguments``.
try:
    from trl import SFTConfig, SFTTrainer
except ImportError:
    from transformers import TrainingArguments
    from trl import SFTTrainer

    USE_NEW_TRL = False
else:
    USE_NEW_TRL = True

# Base model, Hub repositories, and the access token read from the environment
# (None when HF_TOKEN is not set).
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
DATASET_REPO = "Raiff1982/codette-training-data"
OUTPUT_REPO = "Raiff1982/codette-lora-adapters"
HF_TOKEN = os.getenv("HF_TOKEN")

# Each entry is (adapter name, dataset filename, number of epochs).
# Every adapter reads "<name>_reasoning.jsonl" and trains for the same
# number of epochs.
_EPOCHS_PER_ADAPTER = 3
ADAPTERS = [
    (adapter, f"{adapter}_reasoning.jsonl", _EPOCHS_PER_ADAPTER)
    for adapter in (
        "empathy",
        "philosophy",
        "quantum",
        "consciousness",
        "multi_perspective",
        "systems_architecture",
    )
]

# Startup banner: what will be trained and what hardware/credentials are visible.
_banner_rule = "=" * 60
print(_banner_rule)
print("Codette LoRA Training v3 - Remaining 6 Adapters")
print(_banner_rule)
print("SKIPPING: newton (done), davinci (done)")
_adapter_names = [entry[0] for entry in ADAPTERS]
print(f"TRAINING: {', '.join(_adapter_names)}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Report the first CUDA device only.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _vram_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"VRAM: {_vram_bytes/1024**3:.1f} GB")
print(f"HF Token present: {bool(HF_TOKEN)}")
print(f"USE_NEW_TRL: {USE_NEW_TRL}")

# Hub client shared by every upload below.
api = HfApi(token=HF_TOKEN)

# Best-effort check that the output repository exists. Any failure of
# repo_info() leads to a create attempt, and a failed create is only reported,
# never raised.
try:
    api.repo_info(OUTPUT_REPO, token=HF_TOKEN)
except Exception:
    try:
        api.create_repo(OUTPUT_REPO, private=True, token=HF_TOKEN)
    except Exception as create_err:
        print(f"Output repo status: {create_err}")
    else:
        print(f"Created output repo: {OUTPUT_REPO}")
else:
    print(f"Output repo verified: {OUTPUT_REPO}")

# Fetch every adapter's JSONL file into a local directory up front.
# A failed download is reported and then re-raised, so the script stops here
# before any model is loaded.
print("\nDownloading datasets...")
dataset_dir = Path("/tmp/datasets")
dataset_dir.mkdir(exist_ok=True)
_download_target = str(dataset_dir)
for adapter, jsonl_name, _ in ADAPTERS:
    try:
        hf_hub_download(
            DATASET_REPO,
            jsonl_name,
            repo_type="dataset",
            local_dir=_download_target,
            token=HF_TOKEN,
        )
    except Exception as download_err:
        print(f" FAILED to download {adapter}: {download_err}")
        raise
    else:
        print(f" done: {adapter}")

# Tokenizer: reuse the EOS token for padding when no pad token is defined.
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model: loaded once in 4-bit NF4 with double quantization and bfloat16
# compute; the same instance is reused for every adapter.
print("Loading model with 4-bit QLoRA...")
_quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**_quantization_settings)

_model_load_options = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "dtype": torch.bfloat16,
    "trust_remote_code": True,
    "use_cache": False,
    "token": HF_TOKEN,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_model_load_options)
model.gradient_checkpointing_enable()
print(f"Model loaded! GPU: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

def _load_examples(jsonl_path):
    """Return the JSON objects stored one per line in ``jsonl_path``.

    The file is read as UTF-8 and blank lines are skipped, so a trailing
    newline or an empty separator line does not abort the adapter.
    """
    with open(jsonl_path, encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def format_example(ex):
    """Render ``ex["messages"]`` with the tokenizer's chat template.

    Returns a dict with a single ``"text"`` key holding the untokenized string.
    """
    return {"text": tokenizer.apply_chat_template(ex["messages"], tokenize=False)}


# Bookkeeping shared with the summary, retry and results sections below.
results = {}          # adapter name -> metrics dict, or {"error": ..., "time_seconds": ...}
failed_uploads = []   # adapters that trained and saved but whose upload raised
completed = []        # adapters that trained and saved successfully
total_start = time.time()

for adapter_idx, (adapter_name, dataset_file, epochs) in enumerate(ADAPTERS):
    print(f"\n{'=' * 60}")
    print(f"TRAINING [{adapter_idx+1}/{len(ADAPTERS)}]: {adapter_name} ({epochs} epochs)")
    print(f"{'=' * 60}")
    start = time.time()

    # Pre-bind the per-adapter objects so the ``finally`` cleanup is well
    # defined even when the failure happens before they are created.
    peft_model = trainer = dataset = None

    try:
        # --- Dataset -------------------------------------------------------
        examples = _load_examples(dataset_dir / dataset_file)
        if not examples:
            raise ValueError(f"{dataset_file} contains no training examples")

        dataset = Dataset.from_list(examples).map(format_example, remove_columns=["messages"])
        print(f" Dataset: {len(dataset)} examples")

        # --- LoRA wrapper on the attention projections ---------------------
        lora_config = LoraConfig(
            r=16, lora_alpha=32, lora_dropout=0.05,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
            task_type=TaskType.CAUSAL_LM, bias="none",
        )
        peft_model = get_peft_model(model, lora_config)
        trainable = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in peft_model.parameters())
        print(f" LoRA: {trainable:,}/{total_params:,} trainable")

        output_dir = f"/tmp/adapters/{adapter_name}"

        # --- Trainer -------------------------------------------------------
        # Arguments common to both TRL code paths; only the place where the
        # text field and sequence length are configured differs.
        shared_args = dict(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            warmup_ratio=0.03,
            logging_steps=10,
            save_steps=500,
            bf16=True,
            report_to="none",
        )
        if USE_NEW_TRL:
            training_args = SFTConfig(
                **shared_args,
                dataset_text_field="text",
                max_length=2048,
            )
            trainer = SFTTrainer(
                model=peft_model,
                args=training_args,
                train_dataset=dataset,
                processing_class=tokenizer,
            )
        else:
            training_args = TrainingArguments(**shared_args)
            trainer = SFTTrainer(
                model=peft_model,
                args=training_args,
                train_dataset=dataset,
                tokenizer=tokenizer,
                dataset_text_field="text",
                max_seq_length=2048,
            )

        # --- Train ---------------------------------------------------------
        print(" Training...")
        result = trainer.train()
        # Reported time covers dataset prep and training, not save/upload.
        elapsed = time.time() - start
        print(f" DONE! Loss: {result.training_loss:.4f}, Steps: {result.global_step}, Time: {elapsed:.0f}s")

        # --- Save locally --------------------------------------------------
        peft_model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f" Saved locally to {output_dir}")

        # --- Upload (best effort) ------------------------------------------
        # An upload failure is recorded for the retry pass after the loop and
        # does not mark the adapter as failed.
        try:
            api.upload_folder(
                folder_path=output_dir,
                path_in_repo=adapter_name,
                repo_id=OUTPUT_REPO,
                token=HF_TOKEN,
            )
            print(f" Uploaded to {OUTPUT_REPO}/{adapter_name}")
        except Exception as e:
            print(f" WARNING: Upload failed for {adapter_name}: {e}")
            failed_uploads.append(adapter_name)

        results[adapter_name] = {
            "loss": result.training_loss,
            "steps": result.global_step,
            "time_seconds": elapsed,
        }
        completed.append(adapter_name)

    except Exception as e:
        # Record the failure and continue with the next adapter.
        elapsed = time.time() - start
        print(f" TRAINING FAILED for {adapter_name}: {e}")
        print(traceback.format_exc())
        results[adapter_name] = {
            "error": str(e),
            "time_seconds": elapsed,
        }
    finally:
        # Recover the base model for the next adapter. Only ``Exception`` is
        # caught here so Ctrl-C / SystemExit are never swallowed by cleanup.
        if peft_model is not None:
            try:
                model = peft_model.unload()
            except Exception as unload_err:
                print(f" WARNING: unload() failed for {adapter_name}: {unload_err}")
                try:
                    # NOTE(review): this fallback presumably still carries the
                    # injected LoRA layers - confirm against the PEFT docs.
                    model = peft_model.base_model.model
                except Exception as fallback_err:
                    print(f" WARNING: could not recover base model: {fallback_err}")

        # Drop the per-adapter references before collecting garbage and
        # emptying the CUDA cache.
        peft_model = trainer = dataset = None
        gc.collect()
        torch.cuda.empty_cache()
        print(f" GPU after cleanup: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# Per-run summary: one line per adapter attempted in this run.
total_elapsed = time.time() - total_start
_summary_rule = "=" * 60
print(f"\n{_summary_rule}")
print(f"TRAINING COMPLETE: {len(completed)}/{len(ADAPTERS)} adapters")
print(f"Total time: {total_elapsed/60:.1f} minutes")
print(f"{_summary_rule}")
print(" Previously completed: newton, davinci")
for trained_name, outcome in results.items():
    # Failed adapters carry an "error" key; successful ones carry metrics.
    if "error" not in outcome:
        status = (
            f"loss={outcome['loss']:.4f}, steps={outcome['steps']}, "
            f"time={outcome['time_seconds']:.0f}s"
        )
    else:
        status = f"FAILED - {outcome['error']}"
    print(f" {trained_name}: {status}")

# Second chance for uploads that failed during training. Adapters that upload
# successfully are removed from ``failed_uploads``; the rest stay listed for
# the final status report.
if failed_uploads:
    print(f"\nRetrying {len(failed_uploads)} failed uploads...")
    # Iterate over a snapshot because the list is mutated inside the loop.
    for pending in tuple(failed_uploads):
        try:
            api.upload_folder(
                folder_path=f"/tmp/adapters/{pending}",
                path_in_repo=pending,
                repo_id=OUTPUT_REPO,
                token=HF_TOKEN,
            )
        except Exception as retry_err:
            print(f" Retry FAILED: {pending}: {retry_err}")
        else:
            print(f" Retry SUCCESS: {pending}")
            failed_uploads.remove(pending)

# Merge this run's results into the training_results.json stored in the
# output repo, then upload the combined file. Any failure falls back to
# printing this run's results so they are not lost.
try:
    existing_results = {}
    try:
        existing_path = hf_hub_download(
            OUTPUT_REPO, "training_results.json",
            repo_type="model", token=HF_TOKEN
        )
        with open(existing_path, encoding="utf-8") as f:
            existing_results = json.load(f)
        print(f"Loaded existing results: {list(existing_results.keys())}")
    except Exception as load_err:
        # Best effort: the file may simply not exist yet. Report the reason
        # instead of hiding it.
        # NOTE(review): after any failure here the file uploaded below holds
        # only this run's entries.
        print(f"No existing results merged ({type(load_err).__name__}: {load_err})")

    # Entries from this run override older entries with the same adapter name.
    existing_results.update(results)

    with open("/tmp/training_results.json", "w", encoding="utf-8") as f:
        json.dump(existing_results, f, indent=2)
    api.upload_file(
        path_or_fileobj="/tmp/training_results.json",
        path_in_repo="training_results.json",
        repo_id=OUTPUT_REPO,
        token=HF_TOKEN,
    )
    print("Combined results uploaded.")
except Exception as e:
    print(f"Results upload failed: {e}")
    print("Results JSON:")
    print(json.dumps(results, indent=2))

# Final status across all 8 adapters: the two trained earlier plus this run.
all_done = ["newton", "davinci", *completed]
remaining = [
    adapter for adapter, _file, _epochs in ADAPTERS if adapter not in completed
]
_status_rule = "=" * 60
print(f"\n{_status_rule}")
print("OVERALL STATUS")
print(f"{_status_rule}")
print(f" Completed ({len(all_done)}/8): {', '.join(all_done)}")
if remaining:
    print(f" Remaining ({len(remaining)}/8): {', '.join(remaining)}")
if failed_uploads:
    print(f" Failed uploads: {', '.join(failed_uploads)}")
print(f"\nAdapters: https://huggingface.co/{OUTPUT_REPO}")