Dmitry Beresnev committed on
Commit
e8080f5
·
1 Parent(s): 3634ca6

add token generation speed to ui

Browse files
Files changed (1) hide show
  1. cpp/server.cpp +58 -1
cpp/server.cpp CHANGED
@@ -107,9 +107,63 @@ static std::string build_completion_compat_response(const std::string &completio
107
  if (completion.contains("stopped_limit")) out["stopped_limit"] = completion["stopped_limit"];
108
  if (completion.contains("tokens_predicted")) out["tokens_predicted"] = completion["tokens_predicted"];
109
  if (completion.contains("tokens_evaluated")) out["tokens_evaluated"] = completion["tokens_evaluated"];
 
 
 
 
 
 
 
 
 
 
 
110
  return out.dump();
111
  }
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  static std::string build_buffered_stream_response(const std::string &completion_body) {
114
  json completion = json::parse(completion_body, nullptr, false);
115
  if (completion.is_discarded() || !completion.is_object()) {
@@ -427,7 +481,10 @@ http::response<http::string_body> handle_request(
427
 
428
  http::response<http::string_body> res{
429
  static_cast<http::status>(result.status), req.version()};
430
- if (stream_requested && result.status >= 200 && result.status < 300) {
 
 
 
431
  result.body = build_buffered_stream_response(result.body);
432
  result.content_type = "text/event-stream; charset=utf-8";
433
  } else if (completion_compat_mode && result.status >= 200 && result.status < 300) {
 
107
  if (completion.contains("stopped_limit")) out["stopped_limit"] = completion["stopped_limit"];
108
  if (completion.contains("tokens_predicted")) out["tokens_predicted"] = completion["tokens_predicted"];
109
  if (completion.contains("tokens_evaluated")) out["tokens_evaluated"] = completion["tokens_evaluated"];
110
+ if (completion.contains("timings")) out["timings"] = completion["timings"];
111
+
112
+ // Map OpenAI usage fields to native llama.cpp field names if not already set
113
+ if (completion.contains("usage") && completion["usage"].is_object()) {
114
+ const auto &usage = completion["usage"];
115
+ if (!out.contains("tokens_predicted") && usage.contains("completion_tokens"))
116
+ out["tokens_predicted"] = usage["completion_tokens"];
117
+ if (!out.contains("tokens_evaluated") && usage.contains("prompt_tokens"))
118
+ out["tokens_evaluated"] = usage["prompt_tokens"];
119
+ }
120
+
121
  return out.dump();
122
  }
123
 
124
+ // Emits native llama.cpp SSE format for the /completion + stream=true case.
125
+ // The web UI expects {"content":"...","stop":false} chunks followed by a final
126
+ // stop=true chunk that includes timings so it can display generation stats.
127
+ static std::string build_completion_buffered_stream_response(const std::string &completion_body) {
128
+ json completion = json::parse(completion_body, nullptr, false);
129
+ if (completion.is_discarded() || !completion.is_object()) {
130
+ return "data: [DONE]\n\n";
131
+ }
132
+
133
+ const std::string assistant_content = extract_chat_text(completion);
134
+ std::ostringstream oss;
135
+
136
+ if (!assistant_content.empty()) {
137
+ oss << "data: " << json({{"content", assistant_content}, {"stop", false}}).dump() << "\n\n";
138
+ }
139
+
140
+ // Final event — includes timings and token counts so the UI can render stats
141
+ json final_event = {{"content", ""}, {"stop", true}};
142
+
143
+ if (completion.contains("timings"))
144
+ final_event["timings"] = completion["timings"];
145
+
146
+ if (completion.contains("usage") && completion["usage"].is_object()) {
147
+ const auto &usage = completion["usage"];
148
+ if (usage.contains("completion_tokens")) final_event["tokens_predicted"] = usage["completion_tokens"];
149
+ if (usage.contains("prompt_tokens")) final_event["tokens_evaluated"] = usage["prompt_tokens"];
150
+ }
151
+ if (completion.contains("tokens_predicted")) final_event["tokens_predicted"] = completion["tokens_predicted"];
152
+ if (completion.contains("tokens_evaluated")) final_event["tokens_evaluated"] = completion["tokens_evaluated"];
153
+
154
+ if (completion.contains("choices") && completion["choices"].is_array() && !completion["choices"].empty()) {
155
+ const auto &choice = completion["choices"][0];
156
+ if (choice.contains("finish_reason") && choice["finish_reason"].is_string()) {
157
+ const std::string reason = choice["finish_reason"].get<std::string>();
158
+ if (reason == "stop") final_event["stop_type"] = "eos";
159
+ else if (reason == "length") final_event["stop_type"] = "limit";
160
+ }
161
+ }
162
+
163
+ oss << "data: " << final_event.dump() << "\n\n";
164
+ return oss.str();
165
+ }
166
+
167
  static std::string build_buffered_stream_response(const std::string &completion_body) {
168
  json completion = json::parse(completion_body, nullptr, false);
169
  if (completion.is_discarded() || !completion.is_object()) {
 
481
 
482
  http::response<http::string_body> res{
483
  static_cast<http::status>(result.status), req.version()};
484
+ if (stream_requested && completion_compat_mode && result.status >= 200 && result.status < 300) {
485
+ result.body = build_completion_buffered_stream_response(result.body);
486
+ result.content_type = "text/event-stream; charset=utf-8";
487
+ } else if (stream_requested && result.status >= 200 && result.status < 300) {
488
  result.body = build_buffered_stream_response(result.body);
489
  result.content_type = "text/event-stream; charset=utf-8";
490
  } else if (completion_compat_mode && result.status >= 200 && result.status < 300) {