#!/usr/bin/env python3
| """ | |
| Remote LLM Training (DistilGPT2 on Wikitext) | |
| ============================================ | |
| Allena un piccolo LLM (DistilGPT2) su un dataset di testo (Wikitext-2) | |
| direttamente sulla GPU remota e salva il modello persistente. | |
| """ | |
from antigravity_sdk import RemoteGPU
TRAINING_CODE = r'''
import os
import sys
| print("π§ Setting up Environment...") | |
# Pin compatible versions for PyTorch 2.1.2
os.system(f"{sys.executable} -m pip install transformers==4.37.2 datasets==2.17.0 accelerate==0.27.2 --quiet")
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
print("🚀 Starting LLM Training...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Using device: {device}")
# 1. Configuration
MODEL_NAME = "distilgpt2"
STORAGE_DIR = "/home/user/app/storage/my_llm"
os.makedirs(STORAGE_DIR, exist_ok=True)
# 2. Load Tokenizer & Model
print(f" Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Fix for GPT-2, which has no pad token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
# 3. Prepare Dataset (small Wikitext-2 subset for speed)
print(" Loading dataset (wikitext-2)...")
# Load only 1% of the raw training split to keep the demo fast
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
print(f" Dataset loaded. Rows: {len(dataset)}")
# Helper to tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
print(" Tokenizing...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
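# Note: with mlm=False the collator prepares batches for causal language modeling,
# copying input_ids into labels (with padding positions masked out) so the Trainer
# can compute the next-token prediction loss.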
# 4. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=1,
    report_to="none",
    disable_tqdm=True,  # Cleaner output logs
)
# 5. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)
# 6. Train
print(" Starting Fine-Tuning...")
trainer.train()
# 7. Save Persistently
print(f" 💾 Saving model to {STORAGE_DIR}...")
model.save_pretrained(STORAGE_DIR)
tokenizer.save_pretrained(STORAGE_DIR)
# 8. Test Generation
print(" Testing generation...")
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt").to(device)
# Pass pad_token_id explicitly to silence generate()'s "no pad token" warning
output = model.generate(**inputs, max_length=50, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("-" * 40)
print(f"Input: {input_text}")
print(f"Output: {generated_text}")
print("-" * 40)
print("✅ LLM Training Complete & Model Saved.")
'''
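
# Illustrative sketch (not part of the original script and not invoked by main()): how a
# later remote session could reload the model persisted to STORAGE_DIR and generate text,
# using only standard transformers APIs. The storage path is assumed to be the one above.
INFERENCE_CODE = r'''
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

STORAGE_DIR = "/home/user/app/storage/my_llm"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the fine-tuned weights and tokenizer saved by the training run
tokenizer = AutoTokenizer.from_pretrained(STORAGE_DIR)
model = AutoModelForCausalLM.from_pretrained(STORAGE_DIR).to(device)

prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**inputs, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True))
'''
# Hypothetical usage: result = RemoteGPU().run(INFERENCE_CODE)
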
def main():
    print("📡 Connecting to Remote GPU for LLM Training...")
    gpu = RemoteGPU()
    # Run in standard mode (blocking)
    result = gpu.run(TRAINING_CODE)
| if "Training Complete" in result.output: | |
| print("\nπ LLM Addestrato e Salvato sul Server!") | |
| else: | |
| print("\nβ οΈ Qualcosa Γ¨ andato storto (controlla i log sopra).") | |
if __name__ == "__main__":
    main()