Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -46,7 +46,7 @@ worker_health = {
|
|
| 46 |
}
|
| 47 |
|
| 48 |
request_timestamps = deque(maxlen=100)
|
| 49 |
-
current_load_mode = "light"
|
| 50 |
cluster_stats = {
|
| 51 |
"total_requests": 0,
|
| 52 |
"successful_requests": 0,
|
|
@@ -94,11 +94,23 @@ def get_current_load() -> int:
|
|
| 94 |
def update_load_mode():
|
| 95 |
global current_load_mode
|
| 96 |
load = get_current_load()
|
|
|
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
return current_load_mode, load
|
| 104 |
|
|
@@ -937,25 +949,26 @@ async def generate(request: GenerateRequest):
|
|
| 937 |
"stream": True
|
| 938 |
}
|
| 939 |
|
| 940 |
-
print(f"π― {mode.upper()} | Load: {load} |
|
| 941 |
|
| 942 |
try:
|
| 943 |
if mode == "light" and len(healthy) >= 2:
|
| 944 |
-
# DISTRIBUTED MODE
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
|
|
|
| 959 |
except Exception as e:
|
| 960 |
cluster_stats["failed_requests"] += 1
|
| 961 |
raise
|
|
@@ -981,25 +994,26 @@ async def chat(request: ChatRequest):
|
|
| 981 |
"stream": True
|
| 982 |
}
|
| 983 |
|
| 984 |
-
print(f"π¬ {mode.upper()} | Load: {load} |
|
| 985 |
|
| 986 |
try:
|
| 987 |
if mode == "light" and len(healthy) >= 2:
|
| 988 |
-
# DISTRIBUTED MODE
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
|
|
|
| 1003 |
except Exception as e:
|
| 1004 |
cluster_stats["failed_requests"] += 1
|
| 1005 |
raise
|
|
|
|
| 46 |
}
|
| 47 |
|
| 48 |
request_timestamps = deque(maxlen=100)
|
| 49 |
+
current_load_mode = "light" # "light", "medium", "heavy"
|
| 50 |
cluster_stats = {
|
| 51 |
"total_requests": 0,
|
| 52 |
"successful_requests": 0,
|
|
|
|
| 94 |
def update_load_mode():
    """Recompute the cluster load mode and return it with the current load.

    Chooses between "light", "medium" and "heavy" operating modes based on
    the current request load and how many workers are healthy right now.
    Mutates the module-level ``current_load_mode`` and returns a
    ``(mode, load)`` tuple.
    """
    global current_load_mode

    load = get_current_load()
    healthy_count = len(get_healthy_workers())

    # Thresholds scale with the number of available workers: a large pool
    # can afford the cooperative (distributed) topologies, a small pool
    # falls back to plain per-request distribution.
    if healthy_count >= 5:
        if load <= LIGHT_LOAD_THRESHOLD:
            current_load_mode = "light"    # 1 gen + 4 decoders
        elif load <= MEDIUM_LOAD_THRESHOLD:
            current_load_mode = "medium"   # 2 gens + 3 decoders OR parallel requests
        else:
            current_load_mode = "heavy"    # all workers independent
    elif healthy_count >= 3:
        # assumes the literal 2 is the intended light-mode cutoff for a
        # 3-4 worker pool — TODO confirm against LIGHT_LOAD_THRESHOLD
        if load <= 2:
            current_load_mode = "light"    # 1 gen + 2 decoders
        else:
            current_load_mode = "heavy"    # distribute requests
    else:
        current_load_mode = "heavy"        # fallback to simple distribution

    return current_load_mode, load
|
| 116 |
|
|
|
|
| 949 |
"stream": True
|
| 950 |
}
|
| 951 |
|
| 952 |
+
print(f"π― {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
|
| 953 |
|
| 954 |
try:
|
| 955 |
if mode == "light" and len(healthy) >= 2:
|
| 956 |
+
# DISTRIBUTED MODE - 1 gen + multiple decoders
|
| 957 |
+
generators, decoders = select_distributed_workers()
|
| 958 |
+
if decoders:
|
| 959 |
+
cluster_stats["successful_requests"] += 1
|
| 960 |
+
return StreamingResponse(
|
| 961 |
+
distributed_generation(generators, decoders, request_data, "generate"),
|
| 962 |
+
media_type="text/event-stream"
|
| 963 |
+
)
|
| 964 |
+
|
| 965 |
+
# HEAVY/FALLBACK - single worker
|
| 966 |
+
worker = get_least_busy_worker()
|
| 967 |
+
cluster_stats["successful_requests"] += 1
|
| 968 |
+
return StreamingResponse(
|
| 969 |
+
heavy_load_generation(worker, request_data, "generate"),
|
| 970 |
+
media_type="text/event-stream"
|
| 971 |
+
)
|
| 972 |
except Exception as e:
|
| 973 |
cluster_stats["failed_requests"] += 1
|
| 974 |
raise
|
|
|
|
| 994 |
"stream": True
|
| 995 |
}
|
| 996 |
|
| 997 |
+
print(f"π¬ {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
|
| 998 |
|
| 999 |
try:
|
| 1000 |
if mode == "light" and len(healthy) >= 2:
|
| 1001 |
+
# DISTRIBUTED MODE - 1 gen + multiple decoders
|
| 1002 |
+
generators, decoders = select_distributed_workers()
|
| 1003 |
+
if decoders:
|
| 1004 |
+
cluster_stats["successful_requests"] += 1
|
| 1005 |
+
return StreamingResponse(
|
| 1006 |
+
distributed_generation(generators, decoders, request_data, "chat"),
|
| 1007 |
+
media_type="text/event-stream"
|
| 1008 |
+
)
|
| 1009 |
+
|
| 1010 |
+
# HEAVY/FALLBACK - single worker
|
| 1011 |
+
worker = get_least_busy_worker()
|
| 1012 |
+
cluster_stats["successful_requests"] += 1
|
| 1013 |
+
return StreamingResponse(
|
| 1014 |
+
heavy_load_generation(worker, request_data, "chat"),
|
| 1015 |
+
media_type="text/event-stream"
|
| 1016 |
+
)
|
| 1017 |
except Exception as e:
|
| 1018 |
cluster_stats["failed_requests"] += 1
|
| 1019 |
raise
|