Spaces:
Paused
Paused
AdriBat1
Add Deep-NanoGPT experiment (Phase 1 & 2): resumable training, inference, 72-layer models
671ce97 | import os | |
| import sys | |
| import time | |
| from antigravity_sdk.client import RemoteGPU | |
# Config
# Path of the training script whose source text main() reads and submits.
SCRIPT_PATH = "examples/deep_nanogpt_resumable.py"
# Upper bound on how many times main() submits the script before giving up.
MAX_LOOPS = 20
def main() -> None:
    """Run the resumable training script remotely until it reports completion.

    Reads the script at ``SCRIPT_PATH`` and submits its source through
    ``RemoteGPU.run`` at most ``MAX_LOOPS`` times. After every run the
    captured output is scanned for a status marker:

    * ``TRAINING_COMPLETE``  -- stop; training is done.
    * ``CONTINUE_TRAINING``  -- pause briefly, then submit the script again.
    * ``FATAL SCRIPT ERROR`` -- stop.
    * anything else          -- stop.

    If every allowed run asks to continue, a message is printed so the
    incomplete state is not silent. Finally, the presence of
    ``comparison_loss_v2.png`` in the working directory is reported.

    Exits the process with status 1 when the script file does not exist.
    """
    # NOTE(review): the leading glyphs in the messages below look like
    # mis-decoded emoji; they are kept as found -- confirm against the
    # original file before changing them.
    if not os.path.exists(SCRIPT_PATH):
        print(f"β Script not found: {SCRIPT_PATH}")
        sys.exit(1)

    # Explicit encoding so the script text is read the same way regardless
    # of the platform's default locale.
    with open(SCRIPT_PATH, "r", encoding="utf-8") as f:
        code = f.read()

    print("π Launching Deep-NanoGPT Phase 2 (Resumable Training)...")
    gpu = RemoteGPU()

    for i in range(MAX_LOOPS):
        print(f"\nπ Loop {i+1}/{MAX_LOOPS}...")

        # Downloads are requested on every run; per the original author's
        # note, the remote script only copies its files to cwd at the end.
        result = gpu.run(code, download_files=True, verbose=True)
        # Presumably ``output`` is a str; guard against None so the marker
        # checks below cannot raise TypeError -- TODO confirm SDK contract.
        output = result.output or ""

        if "TRAINING_COMPLETE" in output:
            print("\nβ Training Finished!")
            break
        elif "CONTINUE_TRAINING" in output:
            print("β³ Chunk complete. Resuming next chunk...")
            time.sleep(2)  # Breathing room
        elif "FATAL SCRIPT ERROR" in output:
            print("β Fatal Error on server. Stopping.")
            break
        else:
            print("β οΈ Unknown status. Stopping safely.")
            break
    else:
        # Reached only when no iteration hit ``break``: every run asked to
        # continue and the loop budget ran out.
        print(
            f"\nReached MAX_LOOPS ({MAX_LOOPS}) without TRAINING_COMPLETE; "
            "training may be incomplete."
        )

    # Final check
    if os.path.exists("comparison_loss_v2.png"):
        print("\nβ Success! Saved comparison_loss_v2.png")
# Run the launcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()