mel-training-package / train_modal.py
Melofhell00's picture
Complete training pipeline for unified corpus on uncontaminated base models
fde73f3 verified
"""Train on Modal serverless GPU.

Modal lets you rent GPUs by the second, which is cheaper than RunPod for short jobs.

Setup:
    pip install modal
    modal setup  # login
    modal secret create huggingface HF_TOKEN=<your_token>

Run:
    modal run train_modal.py --model EleutherAI/pythia-1.4b
"""
import modal
# Modal application object; the functions below register themselves on it.
app = modal.App("mel-corpus-training")

# Python packages installed into the container image.
_PIP_REQUIREMENTS = [
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "peft>=0.10.0",
    "accelerate>=0.30.0",
    "datasets>=2.18.0",
    "bitsandbytes>=0.43.0",
    "huggingface_hub>=0.22.0",
]

# Container image: Debian slim with Python 3.11, the pip packages above,
# and git from apt.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(_PIP_REQUIREMENTS)
    .apt_install("git")
)

# Named Modal volume, created on first use if it does not exist yet.
volume = modal.Volume.from_name("mel-training", create_if_missing=True)
def _resolve_output_target(model_id, output_repo):
    """Return ``(local_name, hub_repo_id)`` for the trained model.

    * ``output_repo`` empty/None: the name is ``mel-<last component of
      model_id>`` and the repo is ``Melofhell00/<name>``.
    * ``output_repo`` is a bare name: the repo is ``Melofhell00/<name>``.
    * ``output_repo`` already contains a namespace (``user/name``): it is used
      as the repo id unchanged and only its last component names the local
      output directory.
    """
    name = (output_repo or "").strip("/")
    if "/" in name:
        # Already a full "<namespace>/<name>" id; prefixing it again would
        # yield an invalid three-part repo id and a nested output directory.
        return name.split("/")[-1], name
    if not name:
        name = f"mel-{model_id.rstrip('/').split('/')[-1]}"
    return name, f"Melofhell00/{name}"


@app.function(
    image=image,
    gpu="A100-40GB",  # change to T4, A10, A100-80GB as needed
    timeout=60 * 60 * 12,  # 12 hour max
    volumes={"/workspace": volume},
    secrets=[modal.Secret.from_name("huggingface")],
)
def train(
    model_id: str = "EleutherAI/pythia-1.4b",
    bridge_repo: str = "Melofhell00/claude-bridge",
    output_repo: str = None,
    epochs: int = 3,
):
    """Download the corpus and training scripts, then prepare data and train.

    Steps, all executed with ``/workspace`` as the working directory:

    1. Download ``unified_corpus_2026_05_12/unified_corpus.txt`` from the
       ``bridge_repo`` dataset repository.
    2. Snapshot the ``Melofhell00/mel-training-package`` repository into
       ``/workspace/training_package``.
    3. Run ``prepare_data.py`` to write ``/workspace/train.jsonl``.
    4. Run ``train.py`` with ``--use-4bit`` and the resolved ``--hf-repo``.

    Args:
        model_id: Base model identifier, passed to both scripts.
        bridge_repo: Dataset repository that holds the unified corpus.
        output_repo: Target repository. A bare name is placed under the
            ``Melofhell00`` namespace; a ``namespace/name`` value is used
            unchanged. Defaults to ``mel-<model name>``.
        epochs: Value passed to ``train.py`` as ``--epochs``.

    Returns:
        The repository id that was passed to ``train.py`` as ``--hf-repo``.

    Raises:
        subprocess.CalledProcessError: If either script exits non-zero.
    """
    import os
    import subprocess
    import sys

    from huggingface_hub import hf_hub_download, snapshot_download

    os.chdir("/workspace")

    # Run the helper scripts with the interpreter executing this function, so
    # they see the same installed packages; fall back to PATH lookup if the
    # interpreter path is unavailable.
    python_bin = sys.executable or "python"

    # Pull unified corpus from bridge
    print(f"Downloading corpus from {bridge_repo}...")
    corpus_path = hf_hub_download(
        repo_id=bridge_repo,
        filename="unified_corpus_2026_05_12/unified_corpus.txt",
        repo_type="dataset",
    )
    print(f"Corpus: {corpus_path}")

    # Pull training scripts from this repo (uploaded separately)
    snapshot_download(
        repo_id="Melofhell00/mel-training-package",
        repo_type="model",
        local_dir="/workspace/training_package",
    )

    # Prepare data
    print("Preparing data...")
    subprocess.run(
        [
            python_bin, "/workspace/training_package/prepare_data.py",
            "--corpus", corpus_path,
            "--output", "/workspace/train.jsonl",
            "--tokenizer", model_id,
        ],
        check=True,
    )

    # Train
    print("Training...")
    output_name, hub_repo = _resolve_output_target(model_id, output_repo)
    cmd = [
        python_bin, "/workspace/training_package/train.py",
        "--model", model_id,
        "--data", "/workspace/train.jsonl",
        "--output", f"/workspace/{output_name}",
        "--epochs", str(epochs),
        "--use-4bit",
        "--hf-repo", hub_repo,
    ]
    subprocess.run(cmd, check=True)

    # NOTE(review): the upload itself presumably happens inside train.py via
    # --hf-repo; this function only reports the target.
    print(f"Done. Pushed to {hub_repo}")
    return hub_repo
@app.local_entrypoint()
def main(
    model: str = "EleutherAI/pythia-1.4b",
    epochs: int = 3,
    output_repo: str = None,
):
    """Local entrypoint: run ``train`` remotely and print what it returns.

    Args:
        model: Base model identifier, forwarded to ``train`` as ``model_id``.
        epochs: Number of epochs, forwarded to ``train``.
        output_repo: Optional output repository, forwarded to ``train``
            unchanged; ``None`` keeps ``train``'s default naming.
    """
    result = train.remote(model_id=model, epochs=epochs, output_repo=output_repo)
    print(f"\nResult: {result}")