Spaces:
Sleeping
Sleeping
| """Run RF-DETR training on Modal with an L4 GPU.""" | |
| import os | |
| from pathlib import Path | |
| import modal | |
# Local dataset directory and the path it is placed at inside the image.
DATASET_LOCAL_PATH = "./car_data"
DATASET_REMOTE_PATH = "/root/car_data"
# Local directory copied into the image (the current directory) and its
# destination; train() imports `training.train` from under /root.
TRAINING_LOCAL_PATH = "."
TRAINING_REMOTE_PATH = "/root/training"
# Directory inside the container that is passed to run_training as output_dir
# and walked by list_output_files().
OUTPUT_REMOTE_PATH = "/root/output"
# Local directory that download_outputs() writes the fetched files into.
LOCAL_OUTPUT_DIR = Path("./output")
# Container image for the app: Debian slim with Python 3.10, the RF-DETR
# dependencies, the dataset, and the local project directory.
image = (
    modal.Image.debian_slim(python_version="3.10")
    # System shared libraries; presumably required by OpenCV, which the
    # vision packages below pull in -- TODO confirm.
    .apt_install("libgl1", "libglib2.0-0")
    .pip_install(
        "rfdetr",
        "supervision",
        "pydantic",
        gpu="L4",
    )
    .add_local_dir(
        DATASET_LOCAL_PATH,
        remote_path=DATASET_REMOTE_PATH,
    )
    .add_local_dir(
        TRAINING_LOCAL_PATH,
        remote_path=TRAINING_REMOTE_PATH,
        # TRAINING_LOCAL_PATH is the whole working directory, which also
        # contains the dataset (already added above) and previously
        # downloaded outputs. Skip them so they are not uploaded into the
        # image a second time.
        ignore=[
            Path(DATASET_LOCAL_PATH).as_posix(),
            LOCAL_OUTPUT_DIR.as_posix(),
            ".git",
            "**/__pycache__",
        ],
    )
)
# Modal application object; `image` is supplied as the app-level image.
app = modal.App("rf-detr-training", image=image)
# Named Modal volume (created if it does not exist yet); train() commits to
# it once training has finished.
output_volume = modal.Volume.from_name("rf-detr-output", create_if_missing=True)
@app.function(
    gpu="L4",
    # Mount the persistent volume at the directory training writes to, so
    # that the output_volume.commit() below has something to persist.
    volumes={OUTPUT_REMOTE_PATH: output_volume},
    # Modal's default function timeout is only a few minutes, far shorter
    # than a multi-epoch training run. Adjust to the expected run length.
    timeout=6 * 60 * 60,
)
def train(
    epochs: int = 50,
    batch_size: int = 4,
    lr: float = 1e-4,
    resolution: int = 640,
    model_size: str = "base",
    grad_accum_steps: int = 1,
) -> None:
    """Run RF-DETR training remotely and persist the results to the volume.

    Imports ``run_training`` from the project code baked into the image,
    prints a banner with the run configuration, trains with
    ``OUTPUT_REMOTE_PATH`` as the output directory, and commits the output
    volume afterwards.

    Args:
        epochs: Forwarded to ``run_training``.
        batch_size: Forwarded to ``run_training``.
        lr: Forwarded to ``run_training``.
        resolution: Forwarded to ``run_training``.
        model_size: Forwarded to ``run_training``.
        grad_accum_steps: Forwarded to ``run_training``.
    """
    import sys

    # The project directory is copied to /root/training in the image; putting
    # its parent on sys.path makes it importable as the `training` package.
    sys.path.insert(0, "/root")
    from training.train import run_training

    print("=" * 60)
    print("Training RF-DETR model on Modal (L4 GPU)")
    print(f" dataset: {DATASET_REMOTE_PATH}")
    print(f" model_size: {model_size}")
    print(f" epochs: {epochs}")
    print(f" batch_size: {batch_size}")
    print(f" lr: {lr}")
    print(f" resolution: {resolution}")
    print(f" grad_accum_steps: {grad_accum_steps}")
    print("=" * 60)

    run_training(
        dataset_dir=DATASET_REMOTE_PATH,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        resolution=resolution,
        output_dir=OUTPUT_REMOTE_PATH,
        model_size=model_size,
        grad_accum_steps=grad_accum_steps,
    )

    # Persist everything written under the mount so later containers (and the
    # download step) can see it.
    output_volume.commit()

    print()
    print("=" * 60)
    print("Training complete. Checkpoints saved to volume 'rf-detr-output'.")
    print("=" * 60)
@app.function(volumes={OUTPUT_REMOTE_PATH: output_volume})
def list_output_files() -> list[str]:
    """Return the paths of all files under the output volume mount.

    Runs remotely with the output volume mounted at ``OUTPUT_REMOTE_PATH``
    and walks that directory recursively.

    Returns:
        Sorted container paths, one per regular file found. Empty when the
        directory has no files.
    """
    return sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(OUTPUT_REMOTE_PATH)
        for name in names
    )
@app.function(volumes={OUTPUT_REMOTE_PATH: output_volume})
def read_output_file(remote_path: str) -> bytes:
    """Read a single file from the output volume and return its contents.

    Runs remotely with the output volume mounted at ``OUTPUT_REMOTE_PATH``.

    Args:
        remote_path: Path of the file inside the container, as returned by
            ``list_output_files``.

    Returns:
        The complete file contents.
    """
    # NOTE(review): the whole file is loaded into memory and shipped back as
    # one return value; confirm this is acceptable for large checkpoints.
    with open(remote_path, "rb") as f:
        return f.read()
def download_outputs() -> None:
    """Mirror the contents of the output volume into ``LOCAL_OUTPUT_DIR``.

    Asks the remote side for the list of files, then fetches them one by one
    and writes each to the same relative location under the local output
    directory, creating parent directories as needed.
    """
    print("Listing remote output files...")
    volume_paths = list_output_files.remote()

    if not volume_paths:
        print("No output files found in volume.")
        return

    print(f"Downloading {len(volume_paths)} file(s) to {LOCAL_OUTPUT_DIR}/")
    for volume_path in volume_paths:
        # Path of the file relative to the volume mount point; reused as the
        # relative location on local disk.
        relative = os.path.relpath(volume_path, OUTPUT_REMOTE_PATH)
        target = LOCAL_OUTPUT_DIR.joinpath(relative)
        os.makedirs(target.parent, exist_ok=True)

        payload = read_output_file.remote(volume_path)
        with open(target, "wb") as sink:
            sink.write(payload)
        print(f" {relative} ({len(payload)} bytes)")

    print("Download complete.")
@app.local_entrypoint()
def main(
    epochs: int = 50,
    batch_size: int = 4,
    lr: float = 1e-4,
    resolution: int = 640,
    model_size: str = "base",
    grad_accum_steps: int = 1,
) -> None:
    """Local entrypoint: train remotely, then download the results.

    Registered as the app's local entrypoint so that the remote calls below
    run inside an active app. All arguments are forwarded unchanged to
    ``train``.

    Args:
        epochs: Forwarded to ``train``.
        batch_size: Forwarded to ``train``.
        lr: Forwarded to ``train``.
        resolution: Forwarded to ``train``.
        model_size: Forwarded to ``train``.
        grad_accum_steps: Forwarded to ``train``.
    """
    # Blocks until the remote training function returns.
    train.remote(
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        resolution=resolution,
        model_size=model_size,
        grad_accum_steps=grad_accum_steps,
    )
    download_outputs()