# SciMLx_Production/scripts/gcp/submit_vertex.py
# Moatasim Farooque
# Remove problematic files
# 54fa103
"""
Vertex AI Job Submission Script for SciMLx.
Launches a Custom Container training job on a GPU-enabled worker.
"""
import argparse
from typing import List, Optional, Tuple

from google.cloud import aiplatform
def submit_job(
    project_id: str,
    region: str,
    image_uri: str,
    display_name: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_T4",
    accelerator_count: int = 1,
    args: Optional[List[str]] = None,
    staging_bucket: Optional[str] = None,
):
    """Launch a single-replica Vertex AI custom-container training job.

    Args:
        project_id: GCP project ID passed to ``aiplatform.init``.
        region: Vertex AI location passed to ``aiplatform.init``.
        image_uri: URI of the training container image.
        display_name: Display name given to the training job.
        machine_type: Machine type of the worker.
        accelerator_type: Accelerator type attached to the worker.
        accelerator_count: Number of accelerators attached to the worker.
        args: Command-line arguments forwarded to the container. ``None``
            (or an empty list) results in an empty argument list.
        staging_bucket: Optional bucket forwarded to
            ``CustomContainerTrainingJob``. ``None`` leaves the SDK's own
            configuration in effect.
            NOTE(review): the Vertex AI SDK is expected to refuse custom
            training jobs when no staging bucket is configured anywhere;
            confirm against the SDK version in use.

    Returns:
        The value returned by ``CustomContainerTrainingJob.run``.
    """
    aiplatform.init(project=project_id, location=region)

    job = aiplatform.CustomContainerTrainingJob(
        display_name=display_name,
        container_uri=image_uri,
        staging_bucket=staging_bucket,
    )

    return job.run(
        args=args or [],
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        replica_count=1,
    )


def parse_cli(
    argv: Optional[List[str]] = None,
) -> Tuple[argparse.Namespace, List[str]]:
    """Parse the submitter's own options and collect everything else.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        A ``(namespace, passthrough)`` pair where ``passthrough`` holds the
        arguments this script does not recognise, in their original order.
    """
    # allow_abbrev=False: unrecognised flags are forwarded to the training
    # container, so a trainer flag such as ``--n`` must not be silently
    # consumed as a prefix of ``--name`` (or rejected as ambiguous).
    parser = argparse.ArgumentParser(
        description="Submit a SciMLx custom-container training job to Vertex AI.",
        allow_abbrev=False,
    )
    parser.add_argument("--project", required=True, help="GCP Project ID")
    parser.add_argument("--region", default="us-central1", help="GCP Region")
    parser.add_argument("--image", required=True, help="Artifact Registry image URI")
    parser.add_argument("--name", default="scimlx-training-cuda", help="Job display name")
    parser.add_argument(
        "--machine-type",
        default="n1-standard-8",
        help="Worker machine type; must be compatible with --gpu-type "
        "(e.g. g2-standard-8 for NVIDIA_L4)",
    )
    parser.add_argument("--gpu-type", default="NVIDIA_TESLA_T4", help="e.g. NVIDIA_L4, NVIDIA_TESLA_A100_40GB")
    parser.add_argument("--gpu-count", type=int, default=1, help="Number of GPUs on the worker")
    parser.add_argument(
        "--staging-bucket",
        default=None,
        help="Cloud Storage bucket (gs://...) Vertex AI uses to stage job artifacts",
    )
    # Capture all remaining args to pass to train.py
    return parser.parse_known_args(argv)


def main(argv: Optional[List[str]] = None) -> None:
    """Parse the command line and submit the training job."""
    parsed, passthrough = parse_cli(argv)
    print(f"Submitting job '{parsed.name}' to Vertex AI...")
    submit_job(
        project_id=parsed.project,
        region=parsed.region,
        image_uri=parsed.image,
        display_name=parsed.name,
        machine_type=parsed.machine_type,
        accelerator_type=parsed.gpu_type,
        accelerator_count=parsed.gpu_count,
        args=passthrough,
        staging_bucket=parsed.staging_bucket,
    )


if __name__ == "__main__":
    main()