Spaces:
Running
Running
| from __future__ import annotations | |
| import random | |
| import unittest | |
| from gpu_pool import GPUPool, GPUState | |
| class GPUPoolTests(unittest.TestCase): | |
| def test_allocation_overload_preempt_and_false_report(self) -> None: | |
| pool = GPUPool(num_gpus=2, memory_per_gpu=80) | |
| self.assertTrue(pool.allocate("JOB-001", "GPU-00", 60)) | |
| self.assertTrue(pool.allocate("JOB-002", "GPU-00", 30)) | |
| hidden = pool.snapshot(include_hidden=True)[0] | |
| self.assertEqual(hidden["state"], GPUState.OVERLOADED.value) | |
| self.assertEqual(hidden["memory_used"], 90) | |
| pool.inject_false_report("GPU-00", {"state": "IDLE", "memory_free": 40}) | |
| visible = pool.snapshot(include_hidden=False)[0] | |
| self.assertEqual(visible["state"], "IDLE") | |
| self.assertEqual(visible["memory_free"], 40) | |
| self.assertTrue(visible["report_tampered"]) | |
| self.assertTrue(pool.preempt("JOB-002")) | |
| hidden = pool.snapshot(include_hidden=True)[0] | |
| self.assertEqual(hidden["state"], GPUState.ALLOCATED.value) | |
| self.assertEqual(hidden["memory_free"], 20) | |
| def test_failure_and_recovery_cycle(self) -> None: | |
| pool = GPUPool(num_gpus=1, memory_per_gpu=80, failure_probability=1.0, recovery_steps=2) | |
| pool.allocate("JOB-001", "GPU-00", 20) | |
| failed = pool.tick(rng=random.Random(0)) | |
| self.assertEqual(failed, ["GPU-00"]) | |
| self.assertEqual(pool.snapshot(include_hidden=True)[0]["state"], GPUState.FAILED.value) | |
| self.assertEqual(pool.cluster_health_score(), 0.0) | |
| pool.tick(rng=random.Random(0)) | |
| self.assertEqual(pool.snapshot(include_hidden=True)[0]["state"], GPUState.RECOVERING.value) | |
| pool.tick(rng=random.Random(0)) | |
| pool.tick(rng=random.Random(0)) | |
| snapshot = pool.snapshot(include_hidden=True)[0] | |
| self.assertEqual(snapshot["state"], GPUState.IDLE.value) | |
| self.assertEqual(snapshot["jobs"], []) | |
| if __name__ == "__main__": | |
| unittest.main() | |