car-detection / training /modal_train.py
socks22's picture
first
f3f6f5d
"""Run RF-DETR training on Modal with an L4 GPU."""
import os
from pathlib import Path
import modal
# Root directory under which everything lives inside the container.
_REMOTE_ROOT = "/root"

# Client-side locations (the machine that launches the run).
DATASET_LOCAL_PATH = "./car_data"
TRAINING_LOCAL_PATH = "."
LOCAL_OUTPUT_DIR = Path("./output")

# Container-side locations.
DATASET_REMOTE_PATH = f"{_REMOTE_ROOT}/car_data"
TRAINING_REMOTE_PATH = f"{_REMOTE_ROOT}/training"
OUTPUT_REMOTE_PATH = f"{_REMOTE_ROOT}/output"
# TRAINING_LOCAL_PATH is "." and therefore also contains the dataset directory
# (which is already added on its own under DATASET_REMOTE_PATH) and the local
# output directory that download_outputs() fills with checkpoints. Without an
# ignore list both would be added to the image a second time under
# TRAINING_REMOTE_PATH on every run. Patterns are relative to the added
# directory and use dockerignore-style syntax.
_TRAINING_DIR_IGNORE = [
    f"{Path(DATASET_LOCAL_PATH).as_posix()}/**",
    f"{LOCAL_OUTPUT_DIR.as_posix()}/**",
]

# Container image used by the app: Debian slim with Python 3.10, two system
# libraries installed through apt (presumably needed by the OpenCV dependency
# of the pip packages -- confirm), the training packages, and the local dataset
# and training sources.
image = (
    modal.Image.debian_slim(python_version="3.10")
    .apt_install("libgl1", "libglib2.0-0")
    .pip_install(
        "rfdetr",
        "supervision",
        "pydantic",
        gpu="L4",
    )
    .add_local_dir(
        DATASET_LOCAL_PATH,
        remote_path=DATASET_REMOTE_PATH,
    )
    .add_local_dir(
        TRAINING_LOCAL_PATH,
        remote_path=TRAINING_REMOTE_PATH,
        # Skip the dataset and downloaded checkpoints; see note above.
        ignore=_TRAINING_DIR_IGNORE,
    )
)
# Modal application object; `image` is passed as the app-level image.
app = modal.App(
    "rf-detr-training",
    image=image,
)

# Named volume that the functions below mount at OUTPUT_REMOTE_PATH.
# It is created on first use if it does not exist yet.
output_volume = modal.Volume.from_name(
    "rf-detr-output",
    create_if_missing=True,
)
@app.function(
    gpu="L4",
    timeout=3600,
    volumes={OUTPUT_REMOTE_PATH: output_volume},
)
def train(
    epochs: int = 50,
    batch_size: int = 4,
    lr: float = 1e-4,
    resolution: int = 640,
    model_size: str = "base",
    grad_accum_steps: int = 1,
) -> None:
    """Run one training job and commit its outputs to the output volume.

    Prints a banner with the run settings, forwards every argument to
    ``training.train.run_training`` together with the remote dataset and
    output directories, then commits ``output_volume``.

    Args:
        epochs: Forwarded to ``run_training`` as ``epochs``.
        batch_size: Forwarded to ``run_training`` as ``batch_size``.
        lr: Forwarded to ``run_training`` as ``lr``.
        resolution: Forwarded to ``run_training`` as ``resolution``.
        model_size: Forwarded to ``run_training`` as ``model_size``.
        grad_accum_steps: Forwarded to ``run_training`` as
            ``grad_accum_steps``.
    """
    import sys

    # The training sources are placed at /root/training, so /root has to be on
    # the import path for the `training.train` import below to resolve.
    sys.path.insert(0, "/root")
    from training.train import run_training

    rule = "=" * 60
    banner = [
        rule,
        "Training RF-DETR model on Modal (L4 GPU)",
        f" dataset: {DATASET_REMOTE_PATH}",
        f" model_size: {model_size}",
        f" epochs: {epochs}",
        f" batch_size: {batch_size}",
        f" lr: {lr}",
        f" resolution: {resolution}",
        rule,
    ]
    for banner_line in banner:
        print(banner_line)

    training_kwargs = {
        "dataset_dir": DATASET_REMOTE_PATH,
        "epochs": epochs,
        "batch_size": batch_size,
        "lr": lr,
        "resolution": resolution,
        "output_dir": OUTPUT_REMOTE_PATH,
        "model_size": model_size,
        "grad_accum_steps": grad_accum_steps,
    }
    run_training(**training_kwargs)

    # Persist whatever run_training wrote under OUTPUT_REMOTE_PATH.
    output_volume.commit()

    print()
    for closing_line in (
        rule,
        "Training complete. Checkpoints saved to volume 'rf-detr-output'.",
        rule,
    ):
        print(closing_line)
@app.function(volumes={OUTPUT_REMOTE_PATH: output_volume})
def list_output_files() -> list[str]:
    """Collect the path of every file found under the mounted output volume.

    Returns:
        One path per file, each built by joining the directory reported by
        ``os.walk(OUTPUT_REMOTE_PATH)`` with the file name, in walk order.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(OUTPUT_REMOTE_PATH)
        for filename in filenames
    ]
@app.function(volumes={OUTPUT_REMOTE_PATH: output_volume})
def read_output_file(remote_path: str) -> bytes:
    """Return the full binary contents of one file in the container.

    Args:
        remote_path: Path of the file as seen inside the container, e.g. an
            entry returned by ``list_output_files``.

    Returns:
        The file's bytes, read in a single call.
    """
    return Path(remote_path).read_bytes()
def download_outputs() -> None:
    """Copy every file of the output volume into ``LOCAL_OUTPUT_DIR``.

    The remote file list comes from ``list_output_files``; each file is then
    fetched with ``read_output_file`` and written locally under the same
    path relative to ``OUTPUT_REMOTE_PATH``. Missing local directories are
    created as needed.
    """
    print("Listing remote output files...")
    remote_files = list_output_files.remote()

    if not remote_files:
        print("No output files found in volume.")
        return

    print(f"Downloading {len(remote_files)} file(s) to {LOCAL_OUTPUT_DIR}/")
    for source in remote_files:
        # Mirror the volume layout below the local output directory.
        relative = os.path.relpath(source, OUTPUT_REMOTE_PATH)
        destination = LOCAL_OUTPUT_DIR / relative
        destination.parent.mkdir(parents=True, exist_ok=True)

        payload = read_output_file.remote(source)
        destination.write_bytes(payload)
        print(f" {relative} ({len(payload)} bytes)")

    print("Download complete.")
@app.local_entrypoint()
def main(
    epochs: int = 50,
    batch_size: int = 4,
    lr: float = 1e-4,
    resolution: int = 640,
    model_size: str = "base",
    grad_accum_steps: int = 1,
) -> None:
    """Launch remote training, then download the resulting files.

    All arguments are passed unchanged to ``train``; once the remote call
    returns, ``download_outputs`` copies the output volume to local disk.
    """
    settings = {
        "epochs": epochs,
        "batch_size": batch_size,
        "lr": lr,
        "resolution": resolution,
        "model_size": model_size,
        "grad_accum_steps": grad_accum_steps,
    }
    train.remote(**settings)
    download_outputs()