Spaces:
Runtime error
Runtime error
| """ | |
| Vertex AI Job Submission Script for SciMLx. | |
| Launches a Custom Container training job on a GPU-enabled worker. | |
| """ | |
| import argparse | |
| from google.cloud import aiplatform | |
def submit_job(
    project_id: str,
    region: str,
    image_uri: str,
    display_name: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_T4",
    accelerator_count: int = 1,
    args: list = None
):
    """Submit a single-replica custom-container training job to Vertex AI.

    Initialises the ``aiplatform`` SDK for the given project and region,
    wraps ``image_uri`` in a ``CustomContainerTrainingJob`` and calls its
    ``run`` method on one worker with the requested machine and accelerator.

    Args:
        project_id: GCP project passed to ``aiplatform.init``.
        region: GCP location passed to ``aiplatform.init``.
        image_uri: Container image used as the job's ``container_uri``.
        display_name: Display name given to the training job.
        machine_type: Machine type of the worker.
        accelerator_type: Accelerator attached to the worker.
        accelerator_count: Number of accelerators attached to the worker.
        args: Command-line arguments forwarded to the container; ``None``
            (or an empty list) results in an empty argument list.

    Returns:
        Whatever ``CustomContainerTrainingJob.run`` returns.
        NOTE(review): presumably a Model resource or ``None`` when the job
        registers no model -- confirm against the SDK documentation.
    """
    aiplatform.init(project=project_id, location=region)

    # Normalise a missing/empty argument list to a fresh empty list.
    container_args = args if args else []

    run_options = {
        "args": container_args,
        "machine_type": machine_type,
        "accelerator_type": accelerator_type,
        "accelerator_count": accelerator_count,
        "replica_count": 1,  # always a single worker
    }

    training_job = aiplatform.CustomContainerTrainingJob(
        container_uri=image_uri,
        display_name=display_name,
    )
    return training_job.run(**run_options)
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--project", required=True, help="GCP Project ID") | |
| parser.add_argument("--region", default="us-central1", help="GCP Region") | |
| parser.add_argument("--image", required=True, help="Artifact Registry image URI") | |
| parser.add_argument("--name", default="scimlx-training-cuda", help="Job display name") | |
| parser.add_argument("--gpu-type", default="NVIDIA_TESLA_T4", help="e.g. NVIDIA_L4, NVIDIA_TESLA_A100_40GB") | |
| parser.add_argument("--gpu-count", type=int, default=1) | |
| # Capture all remaining args to pass to train.py | |
| parsed, unknown = parser.parse_known_args() | |
| print(f"Submitting job '{parsed.name}' to Vertex AI...") | |
| submit_job( | |
| project_id=parsed.project, | |
| region=parsed.region, | |
| image_uri=parsed.image, | |
| display_name=parsed.name, | |
| accelerator_type=parsed.gpu_type, | |
| accelerator_count=parsed.gpu_count, | |
| args=unknown | |
| ) | |