"""Codette LoRA Adapter Training - HuggingFace Jobs (A10G GPU)

Trains all 8 LoRA adapters on Llama 3.1 8B Instruct with QLoRA.
Robust error handling: upload failures won't kill the job.
"""
|
|
import gc
import json
import os
import time
import traceback  # kept: may be used by code outside this chunk
from pathlib import Path

import torch
from datasets import Dataset
from huggingface_hub import HfApi, hf_hub_download
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Newer TRL ships SFTConfig (replaces TrainingArguments for SFTTrainer);
# probe for it so the script runs on both old and new TRL releases.
try:
    from trl import SFTConfig, SFTTrainer

    USE_NEW_TRL = True
except ImportError:  # older TRL: SFTTrainer is configured via TrainingArguments
    from trl import SFTTrainer
    from transformers import TrainingArguments

    USE_NEW_TRL = False
|
|
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
DATASET_REPO = "Raiff1982/codette-training-data"
OUTPUT_REPO = "Raiff1982/codette-lora-adapters"
HF_TOKEN = os.environ.get("HF_TOKEN")

# Each adapter trains for 3 epochs on its matching "<name>_reasoning.jsonl"
# dataset; ADAPTERS is a list of (adapter_name, dataset_file, epochs) tuples.
_ADAPTER_NAMES = (
    "newton",
    "davinci",
    "empathy",
    "philosophy",
    "quantum",
    "consciousness",
    "multi_perspective",
    "systems_architecture",
)
ADAPTERS = [(name, f"{name}_reasoning.jsonl", 3) for name in _ADAPTER_NAMES]
|
|
| print("=" * 60) |
| print("Codette LoRA Training - HuggingFace Jobs (A10G GPU)") |
| print("=" * 60) |
| print(f"CUDA available: {torch.cuda.is_available()}") |
| if torch.cuda.is_available(): |
| print(f"GPU: {torch.cuda.get_device_name(0)}") |
| print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB") |
| print(f"HF Token present: {bool(HF_TOKEN)}") |
| print(f"USE_NEW_TRL: {USE_NEW_TRL}") |
|
|
| |
# Hub client reused for every upload below.
api = HfApi(token=HF_TOKEN)
try:
    # exist_ok=True: re-runs of the job must not fail (or rely on an
    # exception) just because the output repo already exists.
    api.create_repo(OUTPUT_REPO, private=True, token=HF_TOKEN, exist_ok=True)
    print(f"Created output repo: {OUTPUT_REPO}")
except Exception as e:
    # Non-fatal: adapters are uploaded (and retried) later; log and continue.
    print(f"Output repo status: {e}")
|
|
| |
| print("\nDownloading datasets...") |
| dataset_dir = Path("/tmp/datasets") |
| dataset_dir.mkdir(exist_ok=True) |
| for name, filename, _ in ADAPTERS: |
| hf_hub_download(DATASET_REPO, filename, repo_type="dataset", local_dir=str(dataset_dir)) |
| print(f" done: {name}") |
|
|
| |
| print("\nLoading tokenizer...") |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
|
|
| |
| print("Loading model with 4-bit QLoRA...") |
| bnb_config = BitsAndBytesConfig( |
| load_in_4bit=True, |
| bnb_4bit_quant_type="nf4", |
| bnb_4bit_compute_dtype=torch.bfloat16, |
| bnb_4bit_use_double_quant=True, |
| ) |
|
|
| model = AutoModelForCausalLM.from_pretrained( |
| MODEL_NAME, |
| quantization_config=bnb_config, |
| device_map="auto", |
| dtype=torch.bfloat16, |
| trust_remote_code=True, |
| use_cache=False, |
| token=HF_TOKEN, |
| ) |
| model.gradient_checkpointing_enable() |
| print(f"Model loaded! GPU: {torch.cuda.memory_allocated()/1024**3:.2f} GB") |
|
|
| |
# Per-adapter metrics and any uploads that need retrying after training.
results = {}
failed_uploads = []
total_start = time.time()


def _format_example(ex):
    """Render one {"messages": [...]} chat record into a flat "text" field."""
    return {"text": tokenizer.apply_chat_template(ex["messages"], tokenize=False)}


for adapter_name, dataset_file, epochs in ADAPTERS:
    print(f"\n{'=' * 60}")
    print(f"TRAINING: {adapter_name} ({epochs} epochs)")
    print(f"{'=' * 60}")
    start = time.time()

    # --- Load the adapter's JSONL dataset (skip blank/trailing lines,
    # which would otherwise crash json.loads) ---
    dataset_path = dataset_dir / dataset_file
    with open(dataset_path) as f:
        examples = [json.loads(line) for line in f if line.strip()]

    dataset = Dataset.from_list(examples).map(
        _format_example, remove_columns=["messages"]
    )
    print(f" Dataset: {len(dataset)} examples")

    # --- Fresh LoRA adapter on the attention projections ---
    lora_config = LoraConfig(
        r=16, lora_alpha=32, lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type=TaskType.CAUSAL_LM, bias="none",
    )
    peft_model = get_peft_model(model, lora_config)
    trainable = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in peft_model.parameters())
    print(f" LoRA: {trainable:,}/{total_params:,} trainable")

    output_dir = f"/tmp/adapters/{adapter_name}"

    # --- Build the trainer (constructor API differs across TRL versions) ---
    if USE_NEW_TRL:
        training_args = SFTConfig(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,  # effective batch size 8
            learning_rate=2e-4,
            warmup_ratio=0.03,
            logging_steps=10,
            save_steps=500,
            bf16=True,
            report_to="none",
            dataset_text_field="text",
            max_length=2048,
        )
        trainer = SFTTrainer(
            model=peft_model,
            args=training_args,
            train_dataset=dataset,
            processing_class=tokenizer,
        )
    else:
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,  # effective batch size 8
            learning_rate=2e-4,
            warmup_ratio=0.03,
            logging_steps=10,
            save_steps=500,
            bf16=True,
            report_to="none",
        )
        trainer = SFTTrainer(
            model=peft_model,
            args=training_args,
            train_dataset=dataset,
            tokenizer=tokenizer,
            dataset_text_field="text",
            max_seq_length=2048,
        )

    print(" Training...")
    result = trainer.train()
    elapsed = time.time() - start
    print(f" DONE! Loss: {result.training_loss:.4f}, Steps: {result.global_step}, Time: {elapsed:.0f}s")

    # --- Save adapter weights locally, then best-effort upload ---
    peft_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f" Saved locally to {output_dir}")

    try:
        api.upload_folder(
            folder_path=output_dir,
            path_in_repo=adapter_name,
            repo_id=OUTPUT_REPO,
            token=HF_TOKEN,
        )
        print(f" Uploaded to {OUTPUT_REPO}/{adapter_name}")
    except Exception as e:
        # Deliberate best-effort: an upload failure must not kill the job;
        # record the adapter for the retry pass at the end of the script.
        print(f" WARNING: Upload failed for {adapter_name}: {e}")
        failed_uploads.append(adapter_name)

    results[adapter_name] = {
        "loss": result.training_loss,
        "steps": result.global_step,
        "time_seconds": elapsed,
    }

    # --- Detach the adapter so the next iteration wraps the bare base model ---
    try:
        model = peft_model.unload()
    except Exception:
        # Fallback for PEFT versions without unload(): reach into the wrapper.
        model = peft_model.base_model.model
    del peft_model, trainer, dataset
    gc.collect()
    torch.cuda.empty_cache()
|
|
| |
# Final per-adapter summary for the job log.
total_elapsed = time.time() - total_start
_sep = "=" * 60
print(f"\n{_sep}")
print("ALL 8 ADAPTERS TRAINED!")
print(f"Total time: {total_elapsed/60:.1f} minutes")
print(_sep)
for adapter, stats in results.items():
    print(f" {adapter}: loss={stats['loss']:.4f}, steps={stats['steps']}, time={stats['time_seconds']:.0f}s")
|
|
| |
# Second chance for any adapter whose upload failed during the training loop.
if failed_uploads:
    print(f"\nRetrying {len(failed_uploads)} failed uploads...")
    for adapter_name in failed_uploads:
        adapter_dir = f"/tmp/adapters/{adapter_name}"
        try:
            api.upload_folder(
                folder_path=adapter_dir,
                path_in_repo=adapter_name,
                repo_id=OUTPUT_REPO,
                token=HF_TOKEN,
            )
        except Exception as e:
            print(f" Retry FAILED: {adapter_name}: {e}")
        else:
            print(f" Retry SUCCESS: {adapter_name}")
|
|
| |
# Persist the metrics JSON to the Hub; if that fails, dump it to the log
# so the numbers are never lost.
_results_path = "/tmp/training_results.json"
try:
    with open(_results_path, "w") as fh:
        json.dump(results, fh, indent=2)
    api.upload_file(
        path_or_fileobj=_results_path,
        path_in_repo="training_results.json",
        repo_id=OUTPUT_REPO,
        token=HF_TOKEN,
    )
except Exception as e:
    print(f"Results upload failed: {e}")
    print("Results JSON:")
    print(json.dumps(results, indent=2))
else:
    print("Results uploaded.")

print(f"\nAdapters: https://huggingface.co/{OUTPUT_REPO}")
|