#!/usr/bin/env python3
"""
Remote LLM Training (DistilGPT2 on Wikitext)
============================================
Trains a small LLM (DistilGPT2) on a text dataset (Wikitext-2) directly on the
remote GPU and saves the resulting model to persistent storage.
"""
from antigravity_sdk import RemoteGPU

TRAINING_CODE = r'''
import os
import sys

print("🔧 Setting up Environment...")
# Pin compatible versions for PyTorch 2.1.2
os.system(f"{sys.executable} -m pip install transformers==4.37.2 datasets==2.17.0 accelerate==0.27.2 --quiet")

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

print("🚀 Starting LLM Training...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Using device: {device}")

# 1. Configuration
MODEL_NAME = "distilgpt2"
STORAGE_DIR = "/home/user/app/storage/my_llm"
os.makedirs(STORAGE_DIR, exist_ok=True)

# 2. Load Tokenizer & Model
print(f" Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

# 3. Prepare Dataset (small Wikitext-2 subset for speed)
print(" Loading dataset (wikitext-2)...")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")  # 1% slice just for demo speed
# Drop empty/whitespace-only rows so no batch consists entirely of padding
# (an all-padding batch has every label masked and yields a NaN loss).
dataset = dataset.filter(lambda row: len(row["text"].strip()) > 0)
print(f" Dataset loaded. Rows: {len(dataset)}")

# Helper to tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

print(" Tokenizing...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 4. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=1,
    report_to="none",
    disable_tqdm=True,  # Cleaner output logs
)

# 5. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# 6. Train
print(" Starting Fine-Tuning...")
trainer.train()

# 7. Save Persistently
print(f" 💾 Saving model to {STORAGE_DIR}...")
model.save_pretrained(STORAGE_DIR)
tokenizer.save_pretrained(STORAGE_DIR)

# 8. Test Generation
print(" Testing generation...")
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt").to(device)
output = model.generate(**inputs, max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("-" * 40)
print(f"Input: {input_text}")
print(f"Output: {generated_text}")
print("-" * 40)

print("✅ LLM Training Complete & Model Saved.")
'''


def main():
    print("📡 Connecting to Remote GPU for LLM Training...")
    gpu = RemoteGPU()

    # Run in standard mode (blocking)
    result = gpu.run(TRAINING_CODE)

    if "Training Complete" in result.output:
        print("\n🏆 LLM trained and saved on the server!")
    else:
        print("\n⚠️ Something went wrong (check the logs above).")


if __name__ == "__main__":
    main()
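
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the training flow above and not called by
# main()): how the model persisted under /home/user/app/storage/my_llm could
# be reloaded for inference in a later session. It reuses only the
# RemoteGPU().run(...) pattern shown above and the standard transformers
# from_pretrained API; the names INFERENCE_CODE and run_inference are
# hypothetical helpers introduced here for illustration.
# ---------------------------------------------------------------------------
INFERENCE_CODE = r'''
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

STORAGE_DIR = "/home/user/app/storage/my_llm"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned weights saved by the training script
tokenizer = AutoTokenizer.from_pretrained(STORAGE_DIR)
model = AutoModelForCausalLM.from_pretrained(STORAGE_DIR).to(device)

inputs = tokenizer("The future of AI is", return_tensors="pt").to(device)
output = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(output[0], skip_special_tokens=True))
'''


def run_inference():
    """Run the saved model remotely and print its output (example helper)."""
    gpu = RemoteGPU()
    result = gpu.run(INFERENCE_CODE)
    print(result.output)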