Bc-AI committed (verified)
Commit 0f2ae04 · Parent(s): ffed02e

Update app.py

Files changed (1): app.py (+51, −37)
app.py CHANGED
@@ -46,7 +46,7 @@ worker_health = {
 }
 
 request_timestamps = deque(maxlen=100)
-current_load_mode = "light"
+current_load_mode = "light"  # "light", "medium", "heavy"
 cluster_stats = {
     "total_requests": 0,
     "successful_requests": 0,
@@ -94,11 +94,23 @@ def get_current_load() -> int:
 def update_load_mode():
     global current_load_mode
     load = get_current_load()
+    healthy_count = len(get_healthy_workers())
 
-    if load <= LIGHT_LOAD_THRESHOLD:
-        current_load_mode = "light"
-    elif load >= HEAVY_LOAD_THRESHOLD:
-        current_load_mode = "heavy"
+    # Adjust thresholds based on available workers
+    if healthy_count >= 5:
+        if load <= LIGHT_LOAD_THRESHOLD:
+            current_load_mode = "light"  # 1 gen + 4 decoders
+        elif load <= MEDIUM_LOAD_THRESHOLD:
+            current_load_mode = "medium"  # 2 gens + 3 decoders OR parallel requests
+        else:
+            current_load_mode = "heavy"  # all workers independent
+    elif healthy_count >= 3:
+        if load <= 2:
+            current_load_mode = "light"  # 1 gen + 2 decoders
+        else:
+            current_load_mode = "heavy"  # distribute requests
+    else:
+        current_load_mode = "heavy"  # fallback to simple distribution
 
     return current_load_mode, load
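
The rewritten update_load_mode() keys its thresholds to how many workers are healthy. Here is a pure restatement of that tiering, testable in isolation; the threshold values are assumptions, since the diff references LIGHT_LOAD_THRESHOLD and MEDIUM_LOAD_THRESHOLD without showing their definitions:

    # Same decision table as the new update_load_mode(), minus the globals.
    LIGHT_LOAD_THRESHOLD = 3   # assumed value
    MEDIUM_LOAD_THRESHOLD = 8  # assumed value

    def pick_load_mode(load: int, healthy_count: int) -> str:
        if healthy_count >= 5:
            if load <= LIGHT_LOAD_THRESHOLD:
                return "light"   # 1 gen + 4 decoders
            if load <= MEDIUM_LOAD_THRESHOLD:
                return "medium"  # 2 gens + 3 decoders, or parallel requests
            return "heavy"       # all workers independent
        if healthy_count >= 3:
            return "light" if load <= 2 else "heavy"
        return "heavy"           # too few workers for splitting

    assert pick_load_mode(load=2, healthy_count=5) == "light"
    assert pick_load_mode(load=4, healthy_count=3) == "heavy"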
 
@@ -937,25 +949,26 @@ async def generate(request: GenerateRequest):
         "stream": True
     }
 
-    print(f"🎯 {mode.upper()} | Load: {load} | Healthy: {len(healthy)}")
+    print(f"🎯 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
     try:
         if mode == "light" and len(healthy) >= 2:
-            # DISTRIBUTED MODE
-            generator, decoder1, decoder2 = select_distributed_workers()
-            cluster_stats["successful_requests"] += 1
-            return StreamingResponse(
-                distributed_generation(generator, decoder1, decoder2, request_data, "generate"),
-                media_type="text/event-stream"
-            )
-        else:
-            # HEAVY LOAD MODE
-            worker = get_least_busy_worker()
-            cluster_stats["successful_requests"] += 1
-            return StreamingResponse(
-                heavy_load_generation(worker, request_data, "generate"),
-                media_type="text/event-stream"
-            )
+            # DISTRIBUTED MODE - 1 gen + multiple decoders
+            generators, decoders = select_distributed_workers()
+            if decoders:
+                cluster_stats["successful_requests"] += 1
+                return StreamingResponse(
+                    distributed_generation(generators, decoders, request_data, "generate"),
+                    media_type="text/event-stream"
+                )
+
+        # HEAVY/FALLBACK - single worker
+        worker = get_least_busy_worker()
+        cluster_stats["successful_requests"] += 1
+        return StreamingResponse(
+            heavy_load_generation(worker, request_data, "generate"),
+            media_type="text/event-stream"
+        )
     except Exception as e:
         cluster_stats["failed_requests"] += 1
         raise
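
Both endpoints now share one shape: try the distributed path, and if no decoders come back (or the mode is not light), fall through to a single worker, always answering with an SSE stream. A self-contained sketch of that shape with stand-in generators; distributed_generation() and heavy_load_generation() themselves are not in this diff:

    # Stand-ins only: the real distributed_generation()/heavy_load_generation()
    # live elsewhere in app.py. This shows the try-distributed-then-fall-back
    # shape and the text/event-stream framing.
    from fastapi import FastAPI
    from fastapi.responses import StreamingResponse

    demo = FastAPI()

    async def stand_in_stream(label: str):
        for token in ("hello", "world"):
            yield f"data: {label}:{token}\n\n"  # one SSE frame per chunk

    @demo.post("/demo")
    async def route_like_generate():
        decoders = []  # pretend the selector found no free decoders
        if decoders:
            return StreamingResponse(stand_in_stream("distributed"),
                                     media_type="text/event-stream")
        # HEAVY/FALLBACK - single worker
        return StreamingResponse(stand_in_stream("single"),
                                 media_type="text/event-stream")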
@@ -981,25 +994,26 @@ async def chat(request: ChatRequest):
         "stream": True
     }
 
-    print(f"💬 {mode.upper()} | Load: {load} | Healthy: {len(healthy)}")
+    print(f"💬 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
    try:
         if mode == "light" and len(healthy) >= 2:
-            # DISTRIBUTED MODE
-            generator, decoder1, decoder2 = select_distributed_workers()
-            cluster_stats["successful_requests"] += 1
-            return StreamingResponse(
-                distributed_generation(generator, decoder1, decoder2, request_data, "chat"),
-                media_type="text/event-stream"
-            )
-        else:
-            # HEAVY LOAD MODE
-            worker = get_least_busy_worker()
-            cluster_stats["successful_requests"] += 1
-            return StreamingResponse(
-                heavy_load_generation(worker, request_data, "chat"),
-                media_type="text/event-stream"
-            )
+            # DISTRIBUTED MODE - 1 gen + multiple decoders
+            generators, decoders = select_distributed_workers()
+            if decoders:
+                cluster_stats["successful_requests"] += 1
+                return StreamingResponse(
+                    distributed_generation(generators, decoders, request_data, "chat"),
+                    media_type="text/event-stream"
+                )
+
+        # HEAVY/FALLBACK - single worker
+        worker = get_least_busy_worker()
+        cluster_stats["successful_requests"] += 1
+        return StreamingResponse(
+            heavy_load_generation(worker, request_data, "chat"),
+            media_type="text/event-stream"
+        )
     except Exception as e:
         cluster_stats["failed_requests"] += 1
         raise
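
The call sites now unpack select_distributed_workers() as (generators, decoders) instead of the old fixed 3-tuple. Its body is outside this diff; the following is a purely hypothetical selector matching the comments in update_load_mode() ("1 gen + 4 decoders", "2 gens + 3 decoders"), where an empty decoders list triggers the single-worker fallback above:

    # Hypothetical split of the healthy pool; not the selector from app.py.
    def select_distributed_workers_sketch(healthy: list, mode: str):
        if mode == "medium" and len(healthy) >= 5:
            return healthy[:2], healthy[2:5]  # 2 gens + 3 decoders
        if len(healthy) >= 2:
            return healthy[:1], healthy[1:5]  # 1 gen + up to 4 decoders
        return healthy[:1], []                # no decoders: caller falls back

    gens, decs = select_distributed_workers_sketch(["w1", "w2", "w3"], "light")
    assert gens == ["w1"] and decs == ["w2", "w3"]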
 