Spaces:
Running
Running
Dmitry Beresnev committed on
Commit ·
677456b
1
Parent(s): 6379bd0
Fix 400 for llama.cpp web UI completion requests
Browse files- cpp/server.cpp +101 -11
cpp/server.cpp
CHANGED
|
@@ -21,6 +21,95 @@ static std::string build_sse_event(const json &payload) {
|
|
| 21 |
return "data: " + payload.dump() + "\n\n";
|
| 22 |
}
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
static std::string build_buffered_stream_response(const std::string &completion_body) {
|
| 25 |
json completion = json::parse(completion_body, nullptr, false);
|
| 26 |
if (completion.is_discarded() || !completion.is_object()) {
|
|
@@ -31,16 +120,7 @@ static std::string build_buffered_stream_response(const std::string &completion_
|
|
| 31 |
const std::string model = completion.value("model", "");
|
| 32 |
const auto created = completion.value("created", 0);
|
| 33 |
|
| 34 |
-
std::string assistant_content;
|
| 35 |
-
if (completion.contains("choices") && completion["choices"].is_array() && !completion["choices"].empty()) {
|
| 36 |
-
const auto &choice = completion["choices"][0];
|
| 37 |
-
if (choice.is_object() && choice.contains("message") && choice["message"].is_object()) {
|
| 38 |
-
const auto &message = choice["message"];
|
| 39 |
-
if (message.contains("content") && message["content"].is_string()) {
|
| 40 |
-
assistant_content = message["content"].get<std::string>();
|
| 41 |
-
}
|
| 42 |
-
}
|
| 43 |
-
}
|
| 44 |
|
| 45 |
std::ostringstream oss;
|
| 46 |
oss << build_sse_event({
|
|
@@ -258,18 +338,25 @@ http::response<http::string_body> handle_request(
|
|
| 258 |
}
|
| 259 |
}
|
| 260 |
|
| 261 |
-
if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
|
| 262 |
if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
|
| 263 |
json payload = json::parse(req.body(), nullptr, false);
|
| 264 |
if (payload.is_discarded()) {
|
| 265 |
return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
|
| 266 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
const bool stream_requested = request_stream_enabled(payload);
|
| 268 |
if (stream_requested) {
|
| 269 |
payload["stream"] = false;
|
| 270 |
log_line("request_id=" + request_id +
|
| 271 |
" stream_requested=true mode=buffered_sse_fallback");
|
| 272 |
}
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
std::string token_error;
|
| 275 |
auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
|
|
@@ -338,6 +425,9 @@ http::response<http::string_body> handle_request(
|
|
| 338 |
if (stream_requested && result.status >= 200 && result.status < 300) {
|
| 339 |
result.body = build_buffered_stream_response(result.body);
|
| 340 |
result.content_type = "text/event-stream; charset=utf-8";
|
|
|
|
|
|
|
|
|
|
| 341 |
}
|
| 342 |
res.set(http::field::content_type, result.content_type);
|
| 343 |
res.set(http::field::server, "llm-manager");
|
|
|
|
| 21 |
return "data: " + payload.dump() + "\n\n";
|
| 22 |
}
|
| 23 |
|
| 24 |
+
static std::string extract_chat_text(const json &completion) {
|
| 25 |
+
if (!completion.is_object()) return "";
|
| 26 |
+
if (!completion.contains("choices") || !completion["choices"].is_array() || completion["choices"].empty()) {
|
| 27 |
+
return "";
|
| 28 |
+
}
|
| 29 |
+
const auto &choice = completion["choices"][0];
|
| 30 |
+
if (!choice.is_object()) return "";
|
| 31 |
+
if (choice.contains("message") && choice["message"].is_object()) {
|
| 32 |
+
const auto &message = choice["message"];
|
| 33 |
+
if (message.contains("content") && message["content"].is_string()) {
|
| 34 |
+
return message["content"].get<std::string>();
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
if (choice.contains("text") && choice["text"].is_string()) {
|
| 38 |
+
return choice["text"].get<std::string>();
|
| 39 |
+
}
|
| 40 |
+
return "";
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
// Translates a llama.cpp /completion request body into the /v1/chat/completions
// schema used by the rest of the pipeline. The prompt (string, or array of
// strings joined with newlines) becomes one user message; n_predict (native)
// takes precedence over max_tokens (OpenAI-style), with non-positive values
// normalized to limits.default_max_tokens. Sampling knobs pass through as-is.
static json completion_payload_to_chat_payload(const json &payload, const LimitsConfig &limits) {
    json chat = json::object();
    chat["messages"] = json::array();

    const auto push_user_message = [&chat](const std::string &content) {
        chat["messages"].push_back({
            {"role", "user"},
            {"content", content}
        });
    };

    if (payload.contains("prompt")) {
        const json &prompt = payload["prompt"];
        if (prompt.is_string()) {
            push_user_message(prompt.get<std::string>());
        } else if (prompt.is_array()) {
            // Join string elements with "\n"; non-string entries are skipped.
            std::string merged;
            bool need_separator = false;
            for (const auto &part : prompt) {
                if (!part.is_string()) continue;
                if (need_separator) merged += "\n";
                merged += part.get<std::string>();
                need_separator = true;
            }
            push_user_message(merged);
        }
    }

    int max_tokens = limits.default_max_tokens;
    if (payload.contains("n_predict") && payload["n_predict"].is_number_integer()) {
        max_tokens = payload["n_predict"].get<int>();
    } else if (payload.contains("max_tokens") && payload["max_tokens"].is_number_integer()) {
        max_tokens = payload["max_tokens"].get<int>();
    }
    // Guard against 0 / negative (llama.cpp uses -1 for "unbounded").
    chat["max_tokens"] = max_tokens > 0 ? max_tokens : limits.default_max_tokens;

    // Downstream always expects at least one message, even with no usable prompt.
    if (chat["messages"].empty()) {
        push_user_message("");
    }

    // Forward sampling / control parameters untouched when the client sent them.
    for (const char *key : {"temperature", "top_p", "top_k", "stop", "stream"}) {
        if (payload.contains(key)) chat[key] = payload[key];
    }
    return chat;
}
|
| 94 |
+
|
| 95 |
+
static std::string build_completion_compat_response(const std::string &completion_body) {
|
| 96 |
+
json completion = json::parse(completion_body, nullptr, false);
|
| 97 |
+
if (completion.is_discarded() || !completion.is_object()) {
|
| 98 |
+
return completion_body;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
json out = {
|
| 102 |
+
{"content", extract_chat_text(completion)}
|
| 103 |
+
};
|
| 104 |
+
|
| 105 |
+
if (completion.contains("stop")) out["stop"] = completion["stop"];
|
| 106 |
+
if (completion.contains("stopped_eos")) out["stopped_eos"] = completion["stopped_eos"];
|
| 107 |
+
if (completion.contains("stopped_limit")) out["stopped_limit"] = completion["stopped_limit"];
|
| 108 |
+
if (completion.contains("tokens_predicted")) out["tokens_predicted"] = completion["tokens_predicted"];
|
| 109 |
+
if (completion.contains("tokens_evaluated")) out["tokens_evaluated"] = completion["tokens_evaluated"];
|
| 110 |
+
return out.dump();
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
static std::string build_buffered_stream_response(const std::string &completion_body) {
|
| 114 |
json completion = json::parse(completion_body, nullptr, false);
|
| 115 |
if (completion.is_discarded() || !completion.is_object()) {
|
|
|
|
| 120 |
const std::string model = completion.value("model", "");
|
| 121 |
const auto created = completion.value("created", 0);
|
| 122 |
|
| 123 |
+
const std::string assistant_content = extract_chat_text(completion);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
std::ostringstream oss;
|
| 126 |
oss << build_sse_event({
|
|
|
|
| 338 |
}
|
| 339 |
}
|
| 340 |
|
| 341 |
+
if ((path == "/v1/chat/completions" || path == "/completion") && req.method() == http::verb::post) {
|
| 342 |
if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
|
| 343 |
json payload = json::parse(req.body(), nullptr, false);
|
| 344 |
if (payload.is_discarded()) {
|
| 345 |
return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
|
| 346 |
}
|
| 347 |
+
const bool completion_compat_mode = path == "/completion";
|
| 348 |
+
if (completion_compat_mode) {
|
| 349 |
+
payload = completion_payload_to_chat_payload(payload, config.limits);
|
| 350 |
+
}
|
| 351 |
const bool stream_requested = request_stream_enabled(payload);
|
| 352 |
if (stream_requested) {
|
| 353 |
payload["stream"] = false;
|
| 354 |
log_line("request_id=" + request_id +
|
| 355 |
" stream_requested=true mode=buffered_sse_fallback");
|
| 356 |
}
|
| 357 |
+
if (completion_compat_mode) {
|
| 358 |
+
log_line("request_id=" + request_id + " completion_compat_mode=true");
|
| 359 |
+
}
|
| 360 |
|
| 361 |
std::string token_error;
|
| 362 |
auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
|
|
|
|
| 425 |
if (stream_requested && result.status >= 200 && result.status < 300) {
|
| 426 |
result.body = build_buffered_stream_response(result.body);
|
| 427 |
result.content_type = "text/event-stream; charset=utf-8";
|
| 428 |
+
} else if (completion_compat_mode && result.status >= 200 && result.status < 300) {
|
| 429 |
+
result.body = build_completion_compat_response(result.body);
|
| 430 |
+
result.content_type = "application/json";
|
| 431 |
}
|
| 432 |
res.set(http::field::content_type, result.content_type);
|
| 433 |
res.set(http::field::server, "llm-manager");
|