"""Run RF-DETR training on Modal with an L4 GPU.""" import os from pathlib import Path import modal DATASET_LOCAL_PATH = "./car_data" DATASET_REMOTE_PATH = "/root/car_data" TRAINING_LOCAL_PATH = "." TRAINING_REMOTE_PATH = "/root/training" OUTPUT_REMOTE_PATH = "/root/output" LOCAL_OUTPUT_DIR = Path("./output") image = ( modal.Image.debian_slim(python_version="3.10") .apt_install("libgl1", "libglib2.0-0") .pip_install( "rfdetr", "supervision", "pydantic", gpu="L4", ) .add_local_dir( DATASET_LOCAL_PATH, remote_path=DATASET_REMOTE_PATH, ) .add_local_dir( TRAINING_LOCAL_PATH, remote_path=TRAINING_REMOTE_PATH, ) ) app = modal.App("rf-detr-training", image=image) output_volume = modal.Volume.from_name("rf-detr-output", create_if_missing=True) @app.function( gpu="L4", timeout=3600, volumes={OUTPUT_REMOTE_PATH: output_volume}, ) def train( epochs: int = 50, batch_size: int = 4, lr: float = 1e-4, resolution: int = 640, model_size: str = "base", grad_accum_steps: int = 1, ) -> None: import sys sys.path.insert(0, "/root") from training.train import run_training print("=" * 60) print("Training RF-DETR model on Modal (L4 GPU)") print(f" dataset: {DATASET_REMOTE_PATH}") print(f" model_size: {model_size}") print(f" epochs: {epochs}") print(f" batch_size: {batch_size}") print(f" lr: {lr}") print(f" resolution: {resolution}") print("=" * 60) run_training( dataset_dir=DATASET_REMOTE_PATH, epochs=epochs, batch_size=batch_size, lr=lr, resolution=resolution, output_dir=OUTPUT_REMOTE_PATH, model_size=model_size, grad_accum_steps=grad_accum_steps, ) output_volume.commit() print() print("=" * 60) print("Training complete. Checkpoints saved to volume 'rf-detr-output'.") print("=" * 60) @app.function(volumes={OUTPUT_REMOTE_PATH: output_volume}) def list_output_files() -> list[str]: """Return all file paths in the output volume.""" paths: list[str] = [] for root, _dirs, files in os.walk(OUTPUT_REMOTE_PATH): for f in files: paths.append(os.path.join(root, f)) return paths @app.function(volumes={OUTPUT_REMOTE_PATH: output_volume}) def read_output_file(remote_path: str) -> bytes: """Read a single file from the output volume.""" with open(remote_path, "rb") as f: return f.read() def download_outputs() -> None: """Download all files from the output volume to local disk.""" print("Listing remote output files...") remote_files = list_output_files.remote() if not remote_files: print("No output files found in volume.") return print(f"Downloading {len(remote_files)} file(s) to {LOCAL_OUTPUT_DIR}/") for remote_path in remote_files: rel = os.path.relpath(remote_path, OUTPUT_REMOTE_PATH) local_path = LOCAL_OUTPUT_DIR / rel local_path.parent.mkdir(parents=True, exist_ok=True) data = read_output_file.remote(remote_path) local_path.write_bytes(data) print(f" {rel} ({len(data)} bytes)") print("Download complete.") @app.local_entrypoint() def main( epochs: int = 50, batch_size: int = 4, lr: float = 1e-4, resolution: int = 640, model_size: str = "base", grad_accum_steps: int = 1, ) -> None: train.remote( epochs=epochs, batch_size=batch_size, lr=lr, resolution=resolution, model_size=model_size, grad_accum_steps=grad_accum_steps, ) download_outputs()