Spaces:
Runtime error
Runtime error
da03
committed on
Commit
·
c686855
1
Parent(s):
1173f9e
- start_system.sh +8 -7
start_system.sh
CHANGED
|
@@ -44,10 +44,10 @@ cleanup() {
|
|
| 44 |
|
| 45 |
# Kill workers by finding their processes
|
| 46 |
echo "Stopping workers..."
|
| 47 |
-
pkill -f "python.*worker.py.*--
|
| 48 |
sleep 2
|
| 49 |
# Force kill if any are still running
|
| 50 |
-
pkill -9 -f "python.*worker.py.*--
|
| 51 |
|
| 52 |
echo "✅ System stopped"
|
| 53 |
exit 0
|
|
@@ -101,8 +101,9 @@ echo "🔧 Starting $NUM_GPUS GPU workers..."
|
|
| 101 |
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
| 102 |
WORKER_START_EXIT_CODE=$?
|
| 103 |
|
| 104 |
-
# Wait
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
# Check if workers started successfully by checking the exit code and log
|
| 108 |
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
|
@@ -111,8 +112,8 @@ if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
|
| 111 |
exit 1
|
| 112 |
fi
|
| 113 |
|
| 114 |
-
# Check if workers are actually running by looking for their processes
|
| 115 |
-
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--
|
| 116 |
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
|
| 117 |
echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
|
| 118 |
cleanup
|
|
@@ -164,7 +165,7 @@ while true; do
|
|
| 164 |
fi
|
| 165 |
|
| 166 |
# Check if workers are still running
|
| 167 |
-
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--
|
| 168 |
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
| 169 |
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
| 170 |
cleanup
|
|
|
|
| 44 |
|
| 45 |
# Kill workers by finding their processes
|
| 46 |
echo "Stopping workers..."
|
| 47 |
+
pkill -f "python.*worker.py.*--worker-address" 2>/dev/null || true
|
| 48 |
sleep 2
|
| 49 |
# Force kill if any are still running
|
| 50 |
+
pkill -9 -f "python.*worker.py.*--worker-address" 2>/dev/null || true
|
| 51 |
|
| 52 |
echo "✅ System stopped"
|
| 53 |
exit 0
|
|
|
|
| 101 |
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
| 102 |
WORKER_START_EXIT_CODE=$?
|
| 103 |
|
| 104 |
+
# Wait for workers to fully load models and register (60 seconds)
|
| 105 |
+
echo "⏳ Waiting 60 seconds for workers to load models and register..."
|
| 106 |
+
sleep 60
|
| 107 |
|
| 108 |
# Check if workers started successfully by checking the exit code and log
|
| 109 |
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
|
|
|
| 112 |
exit 1
|
| 113 |
fi
|
| 114 |
|
| 115 |
+
# Check if workers are actually running by looking for their processes (updated for new --worker-address format)
|
| 116 |
+
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
|
| 117 |
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
|
| 118 |
echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
|
| 119 |
cleanup
|
|
|
|
| 165 |
fi
|
| 166 |
|
| 167 |
# Check if workers are still running
|
| 168 |
+
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
|
| 169 |
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
| 170 |
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
| 171 |
cleanup
|