Create tpu_manager.py
services/tpu_manager.py  (ADDED, +97 -0)
@@ -0,0 +1,97 @@
# ===== FILE: services/tpu_manager.py (Intelligent Orchestrator) =====
import os
import google.cloud.tpu_v2 as tpu


class TPUComputeManager:
    def __init__(self, master_framework_instance):
        self.mf = master_framework_instance
        # We will use the official Google Cloud TPU client library for management.
        # self.tpu_client = tpu.TpuClient()  # This will be added once the library is installed.

        # --- DEFINITIVE RESOURCE POOLS (Based on the TRC Grant) ---

        # The On-Demand Pool is our single most reliable resource.
        # It is reserved for the highest-priority tasks (like the Oracle Core).
        self.on_demand_pool = {
            "zone": "us-central2-b",
            "accelerator_type": "v4-32",  # Assuming we use the 32 chips as a single pod slice.
            "name": "aetherius-oracle-core",
        }

        # The Spot Pools are our powerful, distributed, but ephemeral resources.
        # They are the default for batch jobs like the Harvester Fleet and the
        # Architect Guild. The orchestrator tries them in order.
        self.spot_pools = [
            {
                "zone": "us-east1-d",
                "accelerator_type": "v6e-64",  # Note: v6e (Trillium) types may be named differently.
                "name_prefix": "aetherius-harvester-useast1",
            },
            {
                "zone": "us-central2-b",
                "accelerator_type": "v4-32",
                "name_prefix": "aetherius-architect-uscentral2",
            },
            {
                "zone": "europe-west4-a",
                "accelerator_type": "v6e-64",
                "name_prefix": "aetherius-harvester-euwest4a",
            },
            {
                "zone": "europe-west4-b",
                "accelerator_type": "v5e-64",  # Note: v5e types may be named differently.
                "name_prefix": "aetherius-harvester-euwest4b",
            },
            {
                "zone": "us-central1-a",
                "accelerator_type": "v5e-64",
                "name_prefix": "aetherius-harvester-uscentral1",
            },
        ]

        print("TPU Compute Manager: Definitive Resource Pools Configured.", flush=True)
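
    # --- Illustrative sketch (not part of the original commit) ---
    # The "may be named differently" notes above can be checked at runtime.
    # A minimal sketch, assuming the google-cloud-tpu library is installed;
    # verify_pool_types is a hypothetical helper, and project_id must be
    # supplied by the caller.
    def verify_pool_types(self, project_id: str):
        """Cross-check each pool's accelerator_type against what its zone actually offers."""
        client = tpu.TpuClient()
        for pool in self.spot_pools + [self.on_demand_pool]:
            parent = f"projects/{project_id}/locations/{pool['zone']}"
            offered = {t.type for t in client.list_accelerator_types(parent=parent)}
            if pool["accelerator_type"] not in offered:
                print(f"WARNING: {pool['accelerator_type']} is not offered in {pool['zone']}.")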
    def launch_training_job(self, script_path: str, is_critical: bool = False):
        """
        Launches a training job (like the Architect Guild's learning cycle).
        Spot instances are preferred unless the job is marked critical.
        """
        self.mf.add_to_short_term_memory(f"Orchestrator: Received request to launch training job: {script_path}")

        # --- PRIORITY LOGIC ---
        if not is_critical:
            # 1. Attempt to acquire a Spot instance first.
            for pool in self.spot_pools:
                try:
                    # Conceptual representation of the API call: it would use
                    # gcloud or the TPU client library to request a queued resource.
                    print(f"Orchestrator: Attempting to acquire Spot TPU from pool: {pool['zone']}...")
                    # ... code to request and run job on a spot instance ...
                    self.mf.add_to_short_term_memory(f"Orchestrator: Job successfully launched on Spot TPU in {pool['zone']}.")
                    return "Job launched successfully on a Spot instance."
                except Exception as e:
                    # This error indicates that spot capacity is unavailable.
                    print(f"Orchestrator: Could not acquire Spot TPU from {pool['zone']}. Reason: {e}. Trying next pool...")
                    continue  # Try the next spot pool.

        # 2. If all spot pools fail, or the job is critical, fail over to On-Demand.
        print("Orchestrator: All Spot pools unavailable or job is critical. Failing over to On-Demand pool...")
        try:
            # This is our fallback: use the precious on-demand resource.
            # ... code to request and run job on the on-demand instance ...
            self.mf.add_to_short_term_memory("Orchestrator: Job successfully launched on On-Demand TPU as a fallback.")
            return "Job launched successfully on the On-Demand instance."
        except Exception as e:
            print(f"Orchestrator CRITICAL FAILURE: Could not acquire any TPU resource. Reason: {e}")
            self.mf.add_to_short_term_memory("Orchestrator: CRITICAL FAILURE. All TPU resources are unavailable.")
            return f"Error: All TPU resources (Spot and On-Demand) are currently unavailable. Reason: {e}"
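
    # --- Illustrative sketch (not part of the original commit) ---
    # One hedged way the elided "request and run job" step above could be
    # filled in with the tpu_v2 client. _provision_spot_node is a hypothetical
    # helper; the runtime_version string and the use of preemptible=True as
    # the spot-style flag are assumptions to verify against current GCP docs.
    def _provision_spot_node(self, pool: dict, project_id: str):
        """Request a preemptible TPU node in the given pool; raises if capacity is unavailable."""
        client = tpu.TpuClient()
        node = tpu.Node(
            accelerator_type=pool["accelerator_type"],
            runtime_version="tpu-ubuntu2204-base",  # assumed runtime; varies by TPU generation
            scheduling_config=tpu.SchedulingConfig(preemptible=True),
        )
        parent = f"projects/{project_id}/locations/{pool['zone']}"
        operation = client.create_node(parent=parent, node_id=pool["name_prefix"], node=node)
        return operation.result()  # blocks until the node is READY, or raises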
    def run_oracle_query(self, query_vector):
        """
        Runs a query on the Oracle Core. This is always a high-priority,
        on-demand task.
        """
        # This function would communicate directly with the models running on
        # the provisioned on-demand TPU in us-central2-b. Its logic assumes
        # that resource is always available.
        pass
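
    # --- Illustrative sketch (not part of the original commit) ---
    # A minimal sketch of how run_oracle_query could locate the always-on node
    # its comment describes. _oracle_endpoint is a hypothetical helper, and it
    # assumes the on-demand node was created under on_demand_pool["name"].
    def _oracle_endpoint(self, project_id: str):
        """Resolve the internal IP and port of the Oracle Core's on-demand TPU node."""
        client = tpu.TpuClient()
        pool = self.on_demand_pool
        name = f"projects/{project_id}/locations/{pool['zone']}/nodes/{pool['name']}"
        node = client.get_node(name=name)
        endpoint = node.network_endpoints[0]  # first worker's internal address
        return endpoint.ip_address, endpoint.port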
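
# --- Illustrative usage (a sketch, not part of the original commit) ---
# MasterFramework is assumed from the wider codebase; a minimal stub stands in
# for it here so the spot-first / on-demand-fallback behavior can be exercised.
if __name__ == "__main__":
    class _StubFramework:
        def add_to_short_term_memory(self, entry: str):
            print(f"[memory] {entry}")

    manager = TPUComputeManager(_StubFramework())
    # Batch job: tries each spot pool in order before touching on-demand.
    print(manager.launch_training_job("training/architect_guild_cycle.py"))
    # Critical job: skips the spot pools and goes straight to on-demand.
    print(manager.launch_training_job("training/oracle_core_task.py", is_critical=True))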