Spaces:
Running
on
Zero
Running
on
Zero
| #!/usr/bin/env python3 | |
| """ | |
| Zero GPU Monitor - Keeps the Space alive and monitors health | |
| """ | |
| import os | |
| import time | |
| import requests | |
| import sys | |
| from datetime import datetime | |
# Configuration
# Base URL of the Space; the SPACE_URL environment variable takes precedence
# over the local default.
SPACE_URL = os.getenv('SPACE_URL', 'http://localhost:7860')
# Seconds to wait between monitoring passes (3 minutes).
CHECK_INTERVAL = 3 * 60
# NOTE(review): not referenced by the code visible in this file -- confirm
# whether the health check was meant to use it.
HEALTH_ENDPOINT = '/health'
# Number of consecutive failed health checks before the "down" message prints.
MAX_FAILURES = 3
def check_space_health():
    """Report whether the Space answers a GET on SPACE_URL with HTTP 200.

    The request uses a 10 second timeout. Any exception raised while making
    the request is printed and treated as an unhealthy result.

    Returns:
        True if the response status code is exactly 200, otherwise False.
    """
    try:
        status = requests.get(SPACE_URL, timeout=10).status_code
    except Exception as err:
        print(f"Health check failed: {err}")
        return False
    return status == 200
def keep_space_warm():
    """Post a fixed sample request to the Space's /api/predict route.

    The request uses a 30 second timeout. Any exception raised while making
    the request is printed and treated as a failed warm-up.

    Returns:
        True if the response status code is 200 or 202, otherwise False.
    """
    # Fixed sample inputs sent on every warm-up. The values after the sequence
    # string are passed through unchanged -- presumably they line up with the
    # inputs of the Space's function 0; verify against the app definition.
    sample_inputs = [
        "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK",
        1,
        10,
        "Small",
        False,
        10,
    ]
    try:
        status = requests.post(
            f"{SPACE_URL}/api/predict",
            json={"fn_index": 0, "data": sample_inputs},
            timeout=30,
        ).status_code
    except Exception as err:
        print(f"Keep-warm request failed: {err}")
        return False
    return status in (200, 202)
def monitor_loop():
    """Run the health-check / keep-warm cycle until interrupted.

    Each pass prints a timestamp and calls check_space_health(). When the
    Space is healthy and at least CHECK_INTERVAL seconds have elapsed since
    the last successful warm-up (or since startup), keep_space_warm() is
    called. When it is unhealthy, a consecutive-failure counter is bumped and
    an error message is printed once it reaches MAX_FAILURES. The loop then
    sleeps CHECK_INTERVAL seconds.

    Unexpected exceptions inside a pass are printed and followed by a 60
    second pause before retrying. Ctrl-C (KeyboardInterrupt) at any point,
    including during either sleep, stops the loop with a message.

    Returns:
        None, once the user interrupts the loop.
    """
    print("Starting Zero GPU Space monitor...")
    print(f"Space URL: {SPACE_URL}")
    print(f"Check interval: {CHECK_INTERVAL} seconds")
    consecutive_failures = 0
    # Elapsed time is tracked on the monotonic clock so wall-clock adjustments
    # (NTP, DST, manual changes) cannot suppress or distort keep-warm timing.
    last_warm_up = time.monotonic()
    try:
        while True:
            try:
                # Wall-clock time is used for display only.
                print(f"\n[{datetime.now()}] Performing health check...")
                pass_started = time.monotonic()
                if check_space_health():
                    print("✓ Space is healthy")
                    consecutive_failures = 0
                    # Send a keep-warm request once a full interval has passed.
                    # ">=" (not ">") so the decision does not rely on request
                    # latency pushing the elapsed time past the threshold.
                    if pass_started - last_warm_up >= CHECK_INTERVAL:
                        print("Sending keep-warm request...")
                        if keep_space_warm():
                            print("✓ Keep-warm successful")
                            last_warm_up = pass_started
                        else:
                            print("✗ Keep-warm failed")
                else:
                    consecutive_failures += 1
                    print(f"✗ Space is not responding (failure {consecutive_failures}/{MAX_FAILURES})")
                    if consecutive_failures >= MAX_FAILURES:
                        print("ERROR: Space appears to be down!")
                        print("Please restart the Space from Hugging Face interface")
                        # Could add notification logic here
                # Wait before next check
                time.sleep(CHECK_INTERVAL)
            except Exception as e:
                print(f"Monitor error: {e}")
                time.sleep(60)  # Wait a minute before retrying
    except KeyboardInterrupt:
        # Handled outside the loop so an interrupt during the retry sleep
        # above also exits cleanly instead of surfacing a traceback.
        print("\nMonitor stopped by user")
if __name__ == "__main__":
    # A first command-line argument, when present, replaces the SPACE_URL
    # value that was read from the environment at import time.
    cli_args = sys.argv[1:]
    if cli_args:
        SPACE_URL = cli_args[0]
    monitor_loop()