Update main.py
Browse files
main.py
CHANGED
|
@@ -124,17 +124,13 @@ def _condense_messages(messages: list[Message], max_tokens: int) -> str:
|
|
| 124 |
system_msgs = [m for m in messages if m.role == "system"]
|
| 125 |
user_assistant = [m for m in messages if m.role in ("user", "assistant")]
|
| 126 |
|
| 127 |
-
# Budujemy prompt w kolejności: system + user/assistant
|
| 128 |
condensed_parts = []
|
| 129 |
|
| 130 |
-
# system zawsze pełny
|
| 131 |
for m in system_msgs:
|
| 132 |
condensed_parts.append(_content_str(m))
|
| 133 |
|
| 134 |
-
# dynamiczne skracanie starszych user/assistant
|
| 135 |
tokens_so_far = sum(_token_count(part) for part in condensed_parts)
|
| 136 |
|
| 137 |
-
# jeśli wchodzimy w limity
|
| 138 |
for m in user_assistant:
|
| 139 |
text = _content_str(m)
|
| 140 |
tcount = _token_count(text)
|
|
@@ -144,7 +140,7 @@ def _condense_messages(messages: list[Message], max_tokens: int) -> str:
|
|
| 144 |
if remaining_tokens <= 0:
|
| 145 |
continue
|
| 146 |
approx_chars = remaining_tokens * AVG_CHARS_PER_TOKEN
|
| 147 |
-
text = text[-approx_chars:]
|
| 148 |
tcount = _token_count(text)
|
| 149 |
|
| 150 |
condensed_parts.append(text)
|
|
@@ -228,17 +224,18 @@ async def _call_falcon_once(prompt: str, req: ChatCompletionRequest) -> str:
|
|
| 228 |
"top_p": req.top_p,
|
| 229 |
}
|
| 230 |
|
| 231 |
-
# inicjalizacja nowego chatu z promptem
|
| 232 |
await asyncio.to_thread(
|
| 233 |
client.predict,
|
| 234 |
-
|
| 235 |
settings_form_value=settings,
|
| 236 |
api_name="/new_chat",
|
| 237 |
)
|
| 238 |
|
|
|
|
| 239 |
result = await asyncio.to_thread(
|
| 240 |
client.predict,
|
| 241 |
-
|
| 242 |
settings_form_value=settings,
|
| 243 |
api_name="/add_message",
|
| 244 |
)
|
|
@@ -254,12 +251,22 @@ async def _stream_sse(text: str, req: ChatCompletionRequest) -> AsyncGenerator[s
|
|
| 254 |
cid = f"chatcmpl-{uuid.uuid4().hex}"
|
| 255 |
created = int(time.time())
|
| 256 |
for i in range(0, len(text), 8):
|
| 257 |
-
chunk = {
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
yield f"data: {json.dumps(chunk)}\n\n"
|
| 260 |
await asyncio.sleep(0.01)
|
| 261 |
-
final_chunk = {
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
yield f"data: {json.dumps(final_chunk)}\n\n"
|
| 264 |
yield "data: [DONE]\n\n"
|
| 265 |
|
|
@@ -291,7 +298,15 @@ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True,
|
|
| 291 |
|
| 292 |
@app.get("/")
|
| 293 |
async def root():
|
| 294 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
|
| 297 |
@app.get("/health")
|
|
|
|
| 124 |
system_msgs = [m for m in messages if m.role == "system"]
|
| 125 |
user_assistant = [m for m in messages if m.role in ("user", "assistant")]
|
| 126 |
|
|
|
|
| 127 |
condensed_parts = []
|
| 128 |
|
|
|
|
| 129 |
for m in system_msgs:
|
| 130 |
condensed_parts.append(_content_str(m))
|
| 131 |
|
|
|
|
| 132 |
tokens_so_far = sum(_token_count(part) for part in condensed_parts)
|
| 133 |
|
|
|
|
| 134 |
for m in user_assistant:
|
| 135 |
text = _content_str(m)
|
| 136 |
tcount = _token_count(text)
|
|
|
|
| 140 |
if remaining_tokens <= 0:
|
| 141 |
continue
|
| 142 |
approx_chars = remaining_tokens * AVG_CHARS_PER_TOKEN
|
| 143 |
+
text = text[-approx_chars:]
|
| 144 |
tcount = _token_count(text)
|
| 145 |
|
| 146 |
condensed_parts.append(text)
|
|
|
|
| 224 |
"top_p": req.top_p,
|
| 225 |
}
|
| 226 |
|
| 227 |
+
# inicjalizacja nowego chatu z promptem (jeśli endpoint wymaga)
|
| 228 |
await asyncio.to_thread(
|
| 229 |
client.predict,
|
| 230 |
+
prompt, # pierwszy argument podawany pozycyjnie
|
| 231 |
settings_form_value=settings,
|
| 232 |
api_name="/new_chat",
|
| 233 |
)
|
| 234 |
|
| 235 |
+
# dodanie wiadomości
|
| 236 |
result = await asyncio.to_thread(
|
| 237 |
client.predict,
|
| 238 |
+
prompt, # pierwszy argument podawany pozycyjnie
|
| 239 |
settings_form_value=settings,
|
| 240 |
api_name="/add_message",
|
| 241 |
)
|
|
|
|
| 251 |
cid = f"chatcmpl-{uuid.uuid4().hex}"
|
| 252 |
created = int(time.time())
|
| 253 |
for i in range(0, len(text), 8):
|
| 254 |
+
chunk = {
|
| 255 |
+
"id": cid,
|
| 256 |
+
"object": "chat.completion.chunk",
|
| 257 |
+
"created": created,
|
| 258 |
+
"model": req.model,
|
| 259 |
+
"choices": [{"index": 0, "delta": {"content": text[i:i+8]}, "finish_reason": None}]
|
| 260 |
+
}
|
| 261 |
yield f"data: {json.dumps(chunk)}\n\n"
|
| 262 |
await asyncio.sleep(0.01)
|
| 263 |
+
final_chunk = {
|
| 264 |
+
"id": cid,
|
| 265 |
+
"object": "chat.completion.chunk",
|
| 266 |
+
"created": created,
|
| 267 |
+
"model": req.model,
|
| 268 |
+
"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
|
| 269 |
+
}
|
| 270 |
yield f"data: {json.dumps(final_chunk)}\n\n"
|
| 271 |
yield "data: [DONE]\n\n"
|
| 272 |
|
|
|
|
| 298 |
|
| 299 |
@app.get("/")
|
| 300 |
async def root():
    """Service metadata endpoint: reports the API name, version, and routes."""
    # Route map advertised to clients; values mirror the registered handlers.
    endpoints = {
        "health": "/health",
        "models": "/v1/models",
        "chat": "/v1/chat/completions",
    }
    return {"service": "FOC API", "version": "5.0.0", "endpoints": endpoints}
|
| 310 |
|
| 311 |
|
| 312 |
@app.get("/health")
|