"""Train on Modal serverless GPU.

Modal lets you rent GPUs by the second. Cheaper than RunPod for short jobs.

Setup:
    pip install modal
    modal setup                                              # login
    modal secret create huggingface HF_TOKEN=YOUR_HF_TOKEN

Run:
    modal run train_modal.py --model EleutherAI/pythia-1.4b
    modal run train_modal.py --model EleutherAI/pythia-1.4b --output-repo my-model
"""

import os
import subprocess
import sys
from typing import Optional

import modal

# Hub namespace used when the caller gives no owner for the output repo.
HF_NAMESPACE = "Melofhell00"
DEFAULT_MODEL = "EleutherAI/pythia-1.4b"
DEFAULT_BRIDGE_REPO = "Melofhell00/claude-bridge"

# Everything the remote function reads or writes lives under this mount point.
WORKSPACE = "/workspace"
PACKAGE_DIR = f"{WORKSPACE}/training_package"

app = modal.App("mel-corpus-training")

image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install([
        "torch>=2.0.0",
        "transformers>=4.40.0",
        "peft>=0.10.0",
        "accelerate>=0.30.0",
        "datasets>=2.18.0",
        "bitsandbytes>=0.43.0",
        "huggingface_hub>=0.22.0",
    ])
    .apt_install("git")
)

volume = modal.Volume.from_name("mel-training", create_if_missing=True)


def _resolve_output_target(model_id: str, output_repo: Optional[str]) -> tuple:
    """Return ``(local_name, hub_repo_id)`` for the trained model.

    Args:
        model_id: Base model id; its last path component seeds the default name.
        output_repo: ``None``/empty for the default ``mel-<model>`` name, a bare
            repo name (placed under ``HF_NAMESPACE``), or a full ``owner/name``
            id (used as given).

    Raises:
        ValueError: If ``output_repo`` has an empty name part or more than one
            ``/`` separator.
    """
    target = (output_repo or "").strip()
    if not target:
        local_name = f"mel-{model_id.split('/')[-1]}"
        return local_name, f"{HF_NAMESPACE}/{local_name}"

    owner, _, local_name = target.rpartition("/")
    if not local_name or "/" in owner:
        raise ValueError(
            f"output_repo must be 'name' or 'owner/name', got {output_repo!r}"
        )
    # Previously the namespace was prepended unconditionally, so an explicit
    # "owner/name" turned into "<namespace>/owner/name" and the local output
    # directory picked up a stray sub-directory.
    return local_name, f"{owner or HF_NAMESPACE}/{local_name}"


@app.function(
    image=image,
    gpu="A100-40GB",  # change to T4, A10, A100-80GB as needed
    timeout=60 * 60 * 12,  # 12 hour max
    volumes={"/workspace": volume},
    secrets=[modal.Secret.from_name("huggingface")],
)
def train(
    model_id: str = "EleutherAI/pythia-1.4b",
    bridge_repo: str = "Melofhell00/claude-bridge",
    output_repo: str = None,
    epochs: int = 3,
):
    """Download the corpus and training scripts, prepare data, then train.

    Args:
        model_id: Base model id, passed to both the data-prep and train scripts.
        bridge_repo: Dataset repo holding the unified corpus text file.
        output_repo: Optional output repo, either ``name`` or ``owner/name``.
            ``None`` derives ``mel-<model>`` under ``HF_NAMESPACE``.
        epochs: Value forwarded to the training script's ``--epochs`` flag.

    Returns:
        The hub repo id that was passed to the training script as ``--hf-repo``.

    Raises:
        ValueError: If ``output_repo`` is malformed.
        subprocess.CalledProcessError: If either child script exits non-zero.
    """
    # Imported here rather than at module level: the package is installed in
    # the container image above, and this body only runs inside that image.
    from huggingface_hub import hf_hub_download, snapshot_download

    # Validate the target before spending time on downloads.
    output_name, hf_repo = _resolve_output_target(model_id, output_repo)

    os.chdir(WORKSPACE)

    # Pull unified corpus from bridge
    print(f"Downloading corpus from {bridge_repo}...")
    corpus_path = hf_hub_download(
        repo_id=bridge_repo,
        filename="unified_corpus_2026_05_12/unified_corpus.txt",
        repo_type="dataset",
    )
    print(f"Corpus: {corpus_path}")

    # Pull training scripts from this repo (uploaded separately)
    snapshot_download(
        repo_id="Melofhell00/mel-training-package",
        repo_type="model",
        local_dir=PACKAGE_DIR,
    )

    train_file = f"{WORKSPACE}/train.jsonl"

    # Prepare data
    print("Preparing data...")
    subprocess.run(
        [
            # sys.executable keeps the child on the interpreter that has the
            # image's packages, instead of whichever "python" is first on PATH.
            sys.executable, f"{PACKAGE_DIR}/prepare_data.py",
            "--corpus", corpus_path,
            "--output", train_file,
            "--tokenizer", model_id,
        ],
        check=True,
    )

    # Train
    print("Training...")
    subprocess.run(
        [
            sys.executable, f"{PACKAGE_DIR}/train.py",
            "--model", model_id,
            "--data", train_file,
            "--output", f"{WORKSPACE}/{output_name}",
            "--epochs", str(epochs),
            "--use-4bit",
            "--hf-repo", hf_repo,
        ],
        check=True,
    )

    print(f"Done. Pushed to {hf_repo}")
    return hf_repo


@app.local_entrypoint()
def main(
    model: str = DEFAULT_MODEL,
    epochs: int = 3,
    output_repo: str = "",
    bridge_repo: str = DEFAULT_BRIDGE_REPO,
):
    """Run ``train`` remotely and print the repo id it returns.

    Args:
        model: Base model id forwarded as ``model_id``.
        epochs: Number of epochs forwarded to ``train``.
        output_repo: Optional ``name`` or ``owner/name``; empty means "derive
            from the model id".
        bridge_repo: Dataset repo to read the corpus from.
    """
    result = train.remote(
        model_id=model,
        bridge_repo=bridge_repo,
        # Empty string is the "not given" marker here; train() expects None.
        output_repo=output_repo or None,
        epochs=epochs,
    )
    print(f"\nResult: {result}")