Spaces:

socks22
/

car-detection

Sleeping

App Files Files Community

car-detection / training /modal_train.py

socks22

first

f3f6f5d 26 days ago

raw

history blame contribute delete

3.71 kB

	"""Run RF-DETR training on Modal with an L4 GPU."""

	import os
	from pathlib import Path

	import modal

	# --- Paths on the machine that launches the run --------------------------
	# These are relative paths; presumably they resolve against the directory
	# the script is launched from -- confirm before running from elsewhere.
	DATASET_LOCAL_PATH = "./car_data"
	TRAINING_LOCAL_PATH = "."
	LOCAL_OUTPUT_DIR = Path("./output")

	# --- Paths inside the remote container -----------------------------------
	DATASET_REMOTE_PATH = "/root/car_data"
	TRAINING_REMOTE_PATH = "/root/training"
	OUTPUT_REMOTE_PATH = "/root/output"

	# Container image used by every function of the app, built one layer at a
	# time: Debian slim base -> system libraries -> Python packages -> local
	# directories copied in.
	image = modal.Image.debian_slim(python_version="3.10")

	# System shared libraries (presumably needed by the imaging stack pulled in
	# by the Python packages below -- confirm).
	image = image.apt_install("libgl1", "libglib2.0-0")

	# `gpu="L4"` is passed to the build step itself, not to a function.
	image = image.pip_install("rfdetr", "supervision", "pydantic", gpu="L4")

	# Ship the dataset and the training code into the image.
	image = image.add_local_dir(DATASET_LOCAL_PATH, remote_path=DATASET_REMOTE_PATH)
	image = image.add_local_dir(TRAINING_LOCAL_PATH, remote_path=TRAINING_REMOTE_PATH)

	# Modal application; `image` becomes the default image of its functions.
	app = modal.App(
	    "rf-detr-training",
	    image=image,
	)

	# Named volume that receives the training outputs. It is created on first
	# use when it does not exist yet.
	output_volume = modal.Volume.from_name(
	    "rf-detr-output",
	    create_if_missing=True,
	)


	@app.function(
	    gpu="L4",
	    timeout=3600,
	    volumes={OUTPUT_REMOTE_PATH: output_volume},
	)
	def train(
	    epochs: int = 50,
	    batch_size: int = 4,
	    lr: float = 1e-4,
	    resolution: int = 640,
	    model_size: str = "base",
	    grad_accum_steps: int = 1,
	) -> None:
	    """Run the training job remotely and commit its outputs to the volume.

	    The function is registered with an L4 GPU, a 3600-second timeout and the
	    output volume mounted at ``OUTPUT_REMOTE_PATH``. All arguments are
	    forwarded unchanged to ``training.train.run_training`` together with the
	    remote dataset and output directories.

	    Args:
	        epochs: Forwarded to ``run_training`` as ``epochs``.
	        batch_size: Forwarded to ``run_training`` as ``batch_size``.
	        lr: Forwarded to ``run_training`` as ``lr``.
	        resolution: Forwarded to ``run_training`` as ``resolution``.
	        model_size: Forwarded to ``run_training`` as ``model_size``.
	        grad_accum_steps: Forwarded to ``run_training`` as
	            ``grad_accum_steps`` (not shown in the printed banner).
	    """
	    import sys

	    # Make "/root" importable so that the `training` package resolves
	    # (presumably to the code copied to /root/training -- confirm).
	    sys.path.insert(0, "/root")

	    from training.train import run_training

	    rule = "=" * 60
	    banner = (
	        rule,
	        "Training RF-DETR model on Modal (L4 GPU)",
	        f" dataset: {DATASET_REMOTE_PATH}",
	        f" model_size: {model_size}",
	        f" epochs: {epochs}",
	        f" batch_size: {batch_size}",
	        f" lr: {lr}",
	        f" resolution: {resolution}",
	        rule,
	    )
	    for line in banner:
	        print(line)

	    training_kwargs = {
	        "dataset_dir": DATASET_REMOTE_PATH,
	        "epochs": epochs,
	        "batch_size": batch_size,
	        "lr": lr,
	        "resolution": resolution,
	        "output_dir": OUTPUT_REMOTE_PATH,
	        "model_size": model_size,
	        "grad_accum_steps": grad_accum_steps,
	    }
	    run_training(**training_kwargs)

	    # Persist what was written under OUTPUT_REMOTE_PATH to the volume.
	    output_volume.commit()

	    print()
	    print(rule)
	    print("Training complete. Checkpoints saved to volume 'rf-detr-output'.")
	    print(rule)


	@app.function(volumes={OUTPUT_REMOTE_PATH: output_volume})
	def list_output_files() -> list[str]:
	    """Collect the path of every file found under ``OUTPUT_REMOTE_PATH``.

	    Returns:
	        Full paths (directory joined with file name) in ``os.walk`` order;
	        an empty list when the walk yields no files.
	    """
	    return [
	        os.path.join(dirpath, filename)
	        for dirpath, _subdirs, filenames in os.walk(OUTPUT_REMOTE_PATH)
	        for filename in filenames
	    ]


	@app.function(volumes={OUTPUT_REMOTE_PATH: output_volume})
	def read_output_file(remote_path: str) -> bytes:
	    """Return the raw contents of one file, opened in binary mode.

	    Args:
	        remote_path: Path of the file to read, as seen by the container
	            (the output volume is mounted at ``OUTPUT_REMOTE_PATH``).

	    Returns:
	        The complete file contents as bytes.
	    """
	    with open(remote_path, mode="rb") as handle:
	        contents = handle.read()
	    return contents


	def download_outputs() -> None:
	    """Copy every file of the output volume into ``LOCAL_OUTPUT_DIR``.

	    The remote file list comes from ``list_output_files``; each file is then
	    fetched with ``read_output_file`` and written locally under the same
	    path relative to ``OUTPUT_REMOTE_PATH``. Progress is printed per file.
	    """
	    print("Listing remote output files...")
	    remote_files = list_output_files.remote()

	    # Nothing to fetch: report it and stop.
	    if not remote_files:
	        print("No output files found in volume.")
	        return

	    file_count = len(remote_files)
	    print(f"Downloading {file_count} file(s) to {LOCAL_OUTPUT_DIR}/")

	    for source in remote_files:
	        # Mirror the remote layout below the local output directory.
	        rel = os.path.relpath(source, OUTPUT_REMOTE_PATH)
	        destination = LOCAL_OUTPUT_DIR / rel
	        destination.parent.mkdir(parents=True, exist_ok=True)

	        payload = read_output_file.remote(source)
	        destination.write_bytes(payload)
	        print(f" {rel} ({len(payload)} bytes)")

	    print("Download complete.")


	@app.local_entrypoint()
	def main(
	    epochs: int = 50,
	    batch_size: int = 4,
	    lr: float = 1e-4,
	    resolution: int = 640,
	    model_size: str = "base",
	    grad_accum_steps: int = 1,
	) -> None:
	    """Local entrypoint: run remote training, then download the outputs.

	    Every argument is forwarded unchanged to ``train``; the defaults match
	    those of ``train``. Once the remote call returns, the contents of the
	    output volume are downloaded with ``download_outputs``.
	    """
	    hyperparameters = {
	        "epochs": epochs,
	        "batch_size": batch_size,
	        "lr": lr,
	        "resolution": resolution,
	        "model_size": model_size,
	        "grad_accum_steps": grad_accum_steps,
	    }
	    train.remote(**hyperparameters)
	    download_outputs()