Spaces:
Runtime error
Runtime error
da03
committed on
Commit
·
c74f490
1
Parent(s):
c3d464b
- dispatcher.py +7 -1
- start_system.sh +25 -15
dispatcher.py
CHANGED
|
@@ -438,4 +438,10 @@ async def startup_event():
|
|
| 438 |
|
| 439 |
if __name__ == "__main__":
|
| 440 |
import uvicorn
|
| 441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
if __name__ == "__main__":
|
| 440 |
import uvicorn
|
| 441 |
+
import argparse
|
| 442 |
+
|
| 443 |
+
parser = argparse.ArgumentParser(description="Dispatcher for Neural OS")
|
| 444 |
+
parser.add_argument("--port", type=int, default=8000, help="Port to run the dispatcher on")
|
| 445 |
+
args = parser.parse_args()
|
| 446 |
+
|
| 447 |
+
uvicorn.run(app, host="0.0.0.0", port=args.port)
|
start_system.sh
CHANGED
|
@@ -42,12 +42,12 @@ cleanup() {
|
|
| 42 |
wait $DISPATCHER_PID 2>/dev/null
|
| 43 |
fi
|
| 44 |
|
| 45 |
-
# Kill workers
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
|
| 52 |
echo "✅ System stopped"
|
| 53 |
exit 0
|
|
@@ -97,20 +97,28 @@ echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"
|
|
| 97 |
|
| 98 |
# Start workers
|
| 99 |
echo "🔧 Starting $NUM_GPUS GPU workers..."
|
| 100 |
-
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
| 101 |
-
|
| 102 |
|
| 103 |
-
# Wait a bit for workers to
|
| 104 |
-
sleep
|
| 105 |
|
| 106 |
-
# Check if workers started successfully
|
| 107 |
-
if
|
| 108 |
echo "❌ Failed to start workers. Check workers.log for errors."
|
| 109 |
cleanup
|
| 110 |
exit 1
|
| 111 |
fi
|
| 112 |
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
echo ""
|
| 115 |
echo "🎉 System is ready!"
|
| 116 |
echo "================================"
|
|
@@ -137,8 +145,10 @@ while true; do
|
|
| 137 |
exit 1
|
| 138 |
fi
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
| 142 |
cleanup
|
| 143 |
exit 1
|
| 144 |
fi
|
|
|
|
| 42 |
wait $DISPATCHER_PID 2>/dev/null
|
| 43 |
fi
|
| 44 |
|
| 45 |
+
# Kill workers by finding their processes
|
| 46 |
+
echo "Stopping workers..."
|
| 47 |
+
pkill -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
|
| 48 |
+
sleep 2
|
| 49 |
+
# Force kill if any are still running
|
| 50 |
+
pkill -9 -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
|
| 51 |
|
| 52 |
echo "✅ System stopped"
|
| 53 |
exit 0
|
|
|
|
| 97 |
|
| 98 |
# Start workers
|
| 99 |
echo "🔧 Starting $NUM_GPUS GPU workers..."
|
| 100 |
+
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
| 101 |
+
WORKER_START_EXIT_CODE=$?
|
| 102 |
|
| 103 |
+
# Wait a bit for workers to register
|
| 104 |
+
sleep 3
|
| 105 |
|
| 106 |
+
# Check if workers started successfully by checking the exit code and log
|
| 107 |
+
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
| 108 |
echo "❌ Failed to start workers. Check workers.log for errors."
|
| 109 |
cleanup
|
| 110 |
exit 1
|
| 111 |
fi
|
| 112 |
|
| 113 |
+
# Check if workers are actually running by looking for their processes
|
| 114 |
+
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
|
| 115 |
+
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
|
| 116 |
+
echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
|
| 117 |
+
cleanup
|
| 118 |
+
exit 1
|
| 119 |
+
fi
|
| 120 |
+
|
| 121 |
+
echo "✅ Workers started successfully ($RUNNING_WORKERS workers running)"
|
| 122 |
echo ""
|
| 123 |
echo "🎉 System is ready!"
|
| 124 |
echo "================================"
|
|
|
|
| 145 |
exit 1
|
| 146 |
fi
|
| 147 |
|
| 148 |
+
# Check if workers are still running
|
| 149 |
+
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
|
| 150 |
+
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
| 151 |
+
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
| 152 |
cleanup
|
| 153 |
exit 1
|
| 154 |
fi
|