Spaces:
Runtime error
Runtime error
File size: 1,815 Bytes
54fa103 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | """
Vertex AI Job Submission Script for SciMLx.
Launches a Custom Container training job on a GPU-enabled worker.
"""
import argparse
from google.cloud import aiplatform
def submit_job(
    project_id: str,
    region: str,
    image_uri: str,
    display_name: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_T4",
    accelerator_count: int = 1,
    args: list = None
):
    """Create and run a single-replica custom-container training job.

    Args:
        project_id: Forwarded to ``aiplatform.init`` as ``project``.
        region: Forwarded to ``aiplatform.init`` as ``location``.
        image_uri: Container image for the job (``container_uri``).
        display_name: Display name given to the training job.
        machine_type: Machine type forwarded to the job's ``run`` call.
        accelerator_type: Accelerator type forwarded to ``run``.
        accelerator_count: Accelerator count forwarded to ``run``.
        args: Arguments forwarded to the container. ``None`` or an empty
            list both result in an empty argument list.

    Returns:
        Whatever ``CustomContainerTrainingJob.run`` returns.
    """
    # Any falsy value (None, empty list) collapses to a fresh empty list.
    container_args = args if args else []

    aiplatform.init(project=project_id, location=region)

    training_job = aiplatform.CustomContainerTrainingJob(
        display_name=display_name,
        container_uri=image_uri,
    )

    run_options = {
        "args": container_args,
        "machine_type": machine_type,
        "accelerator_type": accelerator_type,
        "accelerator_count": accelerator_count,
        "replica_count": 1,
    }
    return training_job.run(**run_options)
if __name__ == "__main__":
    # Command-line entry point: parse the submission options, then forward
    # every unrecognised argument to the training container.
    #
    # allow_abbrev=False is required because unknown flags are passed through
    # to train.py. With argparse's default prefix matching, a training flag
    # such as "--reg 0.1" or "--n 5" would be consumed here as --region /
    # --name instead of reaching the container.
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument("--project", required=True, help="GCP Project ID")
    parser.add_argument("--region", default="us-central1", help="GCP Region")
    parser.add_argument("--image", required=True, help="Artifact Registry image URI")
    parser.add_argument("--name", default="scimlx-training-cuda", help="Job display name")
    # The machine type must be able to host the requested accelerator; the
    # default matches submit_job's own default so existing invocations
    # behave exactly as before.
    parser.add_argument(
        "--machine-type",
        default="n1-standard-8",
        help=(
            "Worker machine type; must be compatible with --gpu-type "
            "(e.g. g2-standard-8 for NVIDIA_L4, a2-highgpu-1g for "
            "NVIDIA_TESLA_A100_40GB)"
        ),
    )
    parser.add_argument("--gpu-type", default="NVIDIA_TESLA_T4", help="e.g. NVIDIA_L4, NVIDIA_TESLA_A100_40GB")
    parser.add_argument("--gpu-count", type=int, default=1)

    # Capture all remaining args to pass to train.py
    parsed, unknown = parser.parse_known_args()

    print(f"Submitting job '{parsed.name}' to Vertex AI...")
    submit_job(
        project_id=parsed.project,
        region=parsed.region,
        image_uri=parsed.image,
        display_name=parsed.name,
        machine_type=parsed.machine_type,
        accelerator_type=parsed.gpu_type,
        accelerator_count=parsed.gpu_count,
        args=unknown,
    )
|