"""Train on Modal serverless GPU.

Modal lets you rent GPUs by the second, which is cheaper than RunPod for
short jobs.

Setup:
    pip install modal
    modal setup  # login
    modal secret create huggingface HF_TOKEN=<your_token>

Run:
    modal run train_modal.py --model EleutherAI/pythia-1.4b
"""
import modal
# Modal application object; the remote function and the local entrypoint in
# this file are registered on it.
app = modal.App("mel-corpus-training")

# Container image: Debian slim with Python 3.11, the pip-installed training
# stack, and finally git from apt (same layer order as before).
image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "peft>=0.10.0",
    "accelerate>=0.30.0",
    "datasets>=2.18.0",
    "bitsandbytes>=0.43.0",
    "huggingface_hub>=0.22.0",
).apt_install("git")

# Named Modal volume; it is created on first use if it does not exist yet.
volume = modal.Volume.from_name("mel-training", create_if_missing=True)
@app.function(
    image=image,
    gpu="A100-40GB",  # change to T4, A10, A100-80GB as needed
    timeout=60 * 60 * 12,  # 12 hour max
    volumes={"/workspace": volume},
    secrets=[modal.Secret.from_name("huggingface")],
)
def train(
    model_id: str = "EleutherAI/pythia-1.4b",
    bridge_repo: str = "Melofhell00/claude-bridge",
    output_repo: str = None,
    epochs: int = 3,
):
    """Download the corpus and training scripts, then prepare data and train.

    Args:
        model_id: Hugging Face model id, passed to ``prepare_data.py`` as the
            tokenizer and to ``train.py`` as the base model.
        bridge_repo: Hugging Face dataset repo the unified corpus file is
            downloaded from.
        output_repo: Optional output name. A bare name (``"my-model"``) is
            placed under the ``Melofhell00`` namespace, as before. A value
            containing ``/`` (``"owner/my-model"``) is treated as the complete
            Hub repo id, and its last segment names the local output
            directory. When omitted, ``mel-<model name>`` is used.
        epochs: Value forwarded to ``train.py`` via ``--epochs``.

    Returns:
        The Hub repo id handed to ``train.py`` through ``--hf-repo``.

    Raises:
        subprocess.CalledProcessError: If either child script exits non-zero.
    """
    import os
    import subprocess
    import sys
    from huggingface_hub import hf_hub_download, snapshot_download

    os.chdir("/workspace")

    # Pull unified corpus from bridge
    print(f"Downloading corpus from {bridge_repo}...")
    corpus_path = hf_hub_download(
        repo_id=bridge_repo,
        filename="unified_corpus_2026_05_12/unified_corpus.txt",
        repo_type="dataset",
    )
    print(f"Corpus: {corpus_path}")

    # Pull training scripts from this repo (uploaded separately)
    snapshot_download(
        repo_id="Melofhell00/mel-training-package",
        repo_type="model",
        local_dir="/workspace/training_package",
    )

    # Resolve the local output directory name and the Hub repo id separately.
    # Previously a fully-qualified output_repo ("owner/name") produced the
    # invalid repo id "Melofhell00/owner/name" and a nested output directory.
    if output_repo and "/" in output_repo:
        hub_repo = output_repo.strip("/")
        output_name = hub_repo.split("/")[-1]
    else:
        # rstrip guards against a trailing slash yielding an empty model name.
        output_name = output_repo or f"mel-{model_id.rstrip('/').split('/')[-1]}"
        hub_repo = f"Melofhell00/{output_name}"

    # Prepare data. sys.executable runs the child scripts under the same
    # interpreter (and installed packages) as this function.
    print("Preparing data...")
    subprocess.run([
        sys.executable, "/workspace/training_package/prepare_data.py",
        "--corpus", corpus_path,
        "--output", "/workspace/train.jsonl",
        "--tokenizer", model_id,
    ], check=True)

    # Train
    print("Training...")
    cmd = [
        sys.executable, "/workspace/training_package/train.py",
        "--model", model_id,
        "--data", "/workspace/train.jsonl",
        "--output", f"/workspace/{output_name}",
        "--epochs", str(epochs),
        "--use-4bit",
        "--hf-repo", hub_repo,
    ]
    subprocess.run(cmd, check=True)

    # NOTE(review): the upload itself is presumably done by train.py via
    # --hf-repo; this function only reports the target repo.
    print(f"Done. Pushed to {hub_repo}")
    return hub_repo
@app.local_entrypoint()
def main(model: str = "EleutherAI/pythia-1.4b", epochs: int = 3):
    """Local entrypoint: run ``train`` remotely and print what it returns.

    Args:
        model: Hugging Face model id, forwarded as ``model_id``.
        epochs: Epoch count, forwarded unchanged.
    """
    pushed_repo = train.remote(model_id=model, epochs=epochs)
    print(f"\nResult: {pushed_repo}")