jnjj committed · verified
Commit d4f31e1 · 1 Parent(s): c64a3c0

Update app.py

Files changed (1)
  1. app.py +41 -57
app.py CHANGED
@@ -31,7 +31,14 @@ class JsonFormatter(logging.Formatter):
         skip_keys = {'message', 'asctime', 'levelname', 'levelno', 'pathname', 'filename', 'module', 'funcName', 'lineno', 'created', 'msecs', 'relativeCreated', 'thread', 'threadName', 'process', 'processName', 'exc_info', 'exc_text', 'stack_info', 'request_id'}
         for key, value in record.__dict__.items():
             if not key.startswith('_') and key not in log_record and key not in skip_keys:
-                log_record[key] = value
+                # Ensure value is JSON serializable
+                try:
+                    json.dumps(value)
+                    log_record[key] = value
+                except TypeError:
+                    log_record[key] = str(value) # Convert non-serializable types to string
+                except Exception:
+                    log_record[key] = "[Unserializable Value]"
         return json.dumps(log_record)
 
 def setup_logging():
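The guard added here keeps a single non-serializable `extra` field from raising inside the formatter and losing the whole log line. A standalone sketch of the same pattern (function and variable names are illustrative, not from app.py):

import json
from datetime import datetime

def safe_extras(extras: dict) -> str:
    # Keep JSON-friendly values as-is; stringify anything json.dumps rejects.
    record = {}
    for key, value in extras.items():
        try:
            json.dumps(value)
            record[key] = value
        except TypeError:
            record[key] = str(value)
        except Exception:
            record[key] = "[Unserializable Value]"
    return json.dumps(record)

# datetime and set are not JSON types, so both are stringified instead of crashing the formatter.
print(safe_extras({"msg": "ok", "when": datetime.now(), "tags": {"a", "b"}}))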
@@ -52,8 +59,8 @@ MODEL_REPO = os.getenv("MODEL_REPO", "jnjj/vcvcvcv")
 MODEL_FILE = os.getenv("MODEL_FILE", "gemma-3-4b-it-q4_0.gguf")
 N_CTX_CONFIG = int(os.getenv("N_CTX", "2048"))
 N_BATCH = int(os.getenv("N_BATCH", "512"))
-N_GPU_LAYERS_CONFIG = int(os.getenv("N_GPU_LAYERS", "0")) # Allow configuring GPU layers, but default to 0 as per original
-MAX_CONTINUATIONS = int(os.getenv("MAX_CONTINUATIONS", "-1")) # -1 for unlimited
+N_GPU_LAYERS_CONFIG = int(os.getenv("N_GPU_LAYERS", "0"))
+MAX_CONTINUATIONS = int(os.getenv("MAX_CONTINUATIONS", "-1"))
 
 FIXED_REPEAT_PENALTY = float(os.getenv("FIXED_REPEAT_PENALTY", "1.1"))
 FIXED_SEED = int(os.getenv("FIXED_SEED", "-1"))
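Only the trailing comments change here; both settings stay environment-driven. A quick sketch of overriding them (example values; they must be set before app.py is imported, since the module reads them at import time):

import os

os.environ["N_GPU_LAYERS"] = "20"      # example: offload 20 layers if a GPU build of llama-cpp-python is installed
os.environ["MAX_CONTINUATIONS"] = "3"  # example: cap automatic continuations; -1 keeps them unlimited

import app  # the int(os.getenv(...)) calls above pick these values up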
@@ -121,11 +128,10 @@ def prepare_messages(data: Dict, format: Optional[str] = None, request_id: str =
             content = str(content)
 
             if role == "system":
-                # If a system message is provided in the list and is the first message, override the default
                 if i == 0 and final_messages and final_messages[0]["role"] == "system":
                     logger.info("Replacing default system prompt with user-provided system message.", extra={'request_id': request_id})
-                    final_messages[0]["content"] = content # Update content of existing system message
-                elif i == 0 and not final_messages: # Should not happen if effective_system_prompt_content is not empty, but handle defensively
+                    final_messages[0]["content"] = content
+                elif i == 0 and not final_messages:
                     final_messages.append({"role": "system", "content": content})
                 else:
                     logger.warning(f"Ignoring additional system message at index {i} as system prompt is already set or should be at the start.", extra={'request_id': request_id, 'message_index': i})
@@ -153,15 +159,13 @@ def estimate_token_count(messages: List[Dict[str, str]], request_id: str = 'N/A'
         return -1
 
     try:
-        # Use add_generation_prompt=True to include the final assistant turn marker if the last message is not assistant
         chat_prompt_string = llm.apply_chat_template(messages, add_generation_prompt=True)
         tokens = llm.tokenize(chat_prompt_string.encode('utf-8', errors='ignore'), add_bos=True)
         return len(tokens)
     except Exception as e:
         logger.error(f"Could not estimate token count using apply_chat_template: {e}", exc_info=True, extra={'request_id': request_id})
-        # Fallback to a simple character-based estimation if template fails
         char_count = sum(len(m.get('content', '')) for m in messages)
-        estimated_tokens = char_count // 4 # Common rough estimate
+        estimated_tokens = char_count // 4
         logger.warning(f"Falling back to character-based token estimation (~{estimated_tokens})", extra={'request_id': request_id, 'estimated_tokens': estimated_tokens, 'char_count': char_count})
         return estimated_tokens
 
@@ -175,7 +179,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     truncated_messages: List[Dict[str, str]] = []
     system_prompt: Optional[Dict[str, str]] = None
 
-    # Preserve system prompt if it exists at the beginning
     if messages and messages[0].get("role") == "system":
         system_prompt = messages[0]
         truncated_messages.append(system_prompt)
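estimate_token_count prefers the model's own tokenizer and only falls back to the ~4-characters-per-token heuristic when templating fails. A standalone sketch of that estimate-or-fall-back shape, with the llama.cpp tokenizer replaced by a stub so it runs anywhere:

from typing import Callable, Dict, List, Optional

def estimate_tokens(messages: List[Dict[str, str]], tokenize: Optional[Callable[[str], list]] = None) -> int:
    if tokenize is not None:
        try:
            text = "\n".join(m.get("content", "") for m in messages)
            return len(tokenize(text))          # preferred: a real tokenizer
        except Exception:
            pass                                # fall through to the heuristic, as app.py does
    char_count = sum(len(m.get("content", "")) for m in messages)
    return char_count // 4                      # rough estimate: ~4 characters per token

msgs = [{"role": "user", "content": "Hello there, how are you today?"}]
print(estimate_tokens(msgs))             # heuristic only
print(estimate_tokens(msgs, str.split))  # crude whitespace "tokenizer", just for the demo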
@@ -189,9 +192,7 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     current_token_count = sum(len(m.get('content', '')) for m in truncated_messages) // 4
 
     messages_to_add = []
-    # Add non-system messages from newest to oldest
     for msg in reversed(remaining_messages):
-        # Tentative list including the current message and already selected messages
         potential_list = ([system_prompt] if system_prompt else []) + [msg] + messages_to_add
 
         next_token_count = estimate_token_count(potential_list, request_id=request_id)
@@ -200,7 +201,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
             messages_to_add.insert(0, msg)
            current_token_count = next_token_count
         elif next_token_count == -1:
-            # If estimation fails mid-truncation, stop adding but keep what we have
             logger.warning(f"Token estimation failed while adding message: {msg}. Stopping truncation early.", extra={'request_id': request_id})
             break
         else:
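The loop above walks the non-system messages newest-to-oldest and keeps prepending them while the token estimate stays inside the budget. A compact sketch of that strategy (the estimator is the same 4-chars-per-token stand-in, and the budget is an arbitrary example):

from typing import Dict, List, Optional

def truncate(messages: List[Dict[str, str]], budget_tokens: int) -> List[Dict[str, str]]:
    est = lambda msgs: sum(len(m.get("content", "")) for m in msgs) // 4   # crude estimator

    system: Optional[Dict[str, str]] = messages[0] if messages and messages[0]["role"] == "system" else None
    rest = messages[1:] if system else messages

    kept: List[Dict[str, str]] = []
    for msg in reversed(rest):                          # newest first
        candidate = ([system] if system else []) + [msg] + kept
        if est(candidate) <= budget_tokens:
            kept.insert(0, msg)                         # keep chronological order
        else:
            break                                       # budget exhausted
    return ([system] if system else []) + kept

history = [{"role": "system", "content": "be brief"}] + [
    {"role": "user", "content": f"question {i} " * 30} for i in range(10)
]
print(len(truncate(history, budget_tokens=200)))        # 3: system prompt plus the two newest turns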
@@ -213,7 +213,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     final_count = len(final_truncated_list)
 
     if not final_truncated_list or all(m.get("role") == "system" for m in final_truncated_list):
-        # If truncation results in only system message or empty, try to keep at least the last user message
         if any(m.get("role") == "user" for m in messages):
             last_user_message = next((m for m in reversed(messages) if m.get("role") == "user"), None)
             if last_user_message:
@@ -222,6 +221,7 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
                 final_count = len(final_truncated_list)
                 current_token_count = estimate_token_count(final_truncated_list, request_id=request_id)
 
+
     if final_count < original_count:
         logger.warning(f"Context truncated: Kept {final_count}/{original_count} messages. Estimated tokens: ~{current_token_count}/{target_token_limit} (target).",
                        extra={'request_id': request_id, 'kept': final_count, 'original': original_count, 'estimated_tokens': current_token_count, 'target_limit': target_token_limit})
@@ -229,13 +229,27 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
         logger.debug(f"Context truncation check complete. Kept all {final_count} messages. Estimated tokens: ~{current_token_count}.",
                      extra={'request_id': request_id, 'kept': final_count, 'estimated_tokens': current_token_count})
 
+
     if not final_truncated_list:
         logger.error("Context truncation resulted in an empty message list!", extra={'request_id': request_id})
-        # This should ideally not happen with the last user message fallback, but defensive programming
         return []
 
     return final_truncated_list
 
+def get_property_or_method_value(obj: Any, prop_name: str, default: Any = None) -> Any:
+    """Safely get property value or call method if callable."""
+    if hasattr(obj, prop_name):
+        prop = getattr(obj, prop_name)
+        if callable(prop):
+            try:
+                return prop()
+            except Exception:
+                logger.warning(f"Error calling method {prop_name} on {type(obj)}", exc_info=True)
+                return default
+        else:
+            return prop
+    return default
+
 def load_model():
     global llm, ACTUAL_N_CTX, ACTUAL_N_BATCH, ACTUAL_N_GPU_LAYERS
     logger.info(f"Attempting to load model: {MODEL_REPO}/{MODEL_FILE}")
@@ -253,9 +267,9 @@ def load_model():
         )
         logger.info("Model loaded successfully.")
         if llm:
-            ACTUAL_N_CTX = llm.n_ctx if hasattr(llm, 'n_ctx') else N_CTX_CONFIG
-            ACTUAL_N_BATCH = llm.n_batch if hasattr(llm, 'n_batch') else N_BATCH
-            ACTUAL_N_GPU_LAYERS = llm.n_gpu_layers if hasattr(llm, 'n_gpu_layers') else 0
+            ACTUAL_N_CTX = get_property_or_method_value(llm, 'n_ctx', N_CTX_CONFIG)
+            ACTUAL_N_BATCH = get_property_or_method_value(llm, 'n_batch', N_BATCH)
+            ACTUAL_N_GPU_LAYERS = get_property_or_method_value(llm, 'n_gpu_layers', 0)
 
             if ACTUAL_N_CTX != N_CTX_CONFIG:
                 logger.warning(f"Model's actual context size ({ACTUAL_N_CTX}) differs from config ({N_CTX_CONFIG}). Using actual.", extra={'actual_n_ctx': ACTUAL_N_CTX, 'configured_n_ctx': N_CTX_CONFIG})
@@ -266,7 +280,6 @@ def load_model():
             logger.info(f"Actual Model Batch Size (n_batch): {ACTUAL_N_BATCH}")
             logger.info(f"Actual Model GPU Layers (n_gpu_layers): {ACTUAL_N_GPU_LAYERS}")
 
-
             try:
                 test_tokens = llm.tokenize(b"Test sentence.")
                 logger.info(f"Tokenizer test successful. 'Test sentence.' -> {len(test_tokens)} tokens.")
@@ -487,7 +500,6 @@ html_code = """
                 const { done, value } = await reader.read();
                 if (done) {
                     finished = true;
-                    // Only update status if not already showing an error or final state
                     if (!generationStatus.textContent.includes("finished") && !generationStatus.textContent.includes("stopped") && !generationStatus.textContent.includes("Error") && !generationStatus.textContent.includes("Max continuations")) {
                         generationStatus.textContent = `Streaming finished. Total continuations: ${continuationCount}.`;
                         generationStatus.className = 'info';
@@ -497,7 +509,6 @@ html_code = """
                 }
 
                 const chunk = decoder.decode(value, { stream: true });
-                // Check for specific server markers first
                 const continueMatch = chunk.match(/\n\[CONTINUING (\d+) - TRUNCATING CONTEXT\.\.\.\]\n/);
                 const errorMatch = chunk.match(/\n\[ERROR\](.*)/);
                 const infoMatch = chunk.match(/\n\[INFO\](.*)/);
@@ -512,19 +523,18 @@ html_code = """
                     generationOutput.textContent += chunk;
                     generationStatus.textContent = `Error during generation: ${errorMatch[1]}`;
                     generationStatus.className = 'error';
-                    finished = true; // Stop on error marker
+                    finished = true;
                 } else if (infoMatch) {
                     generationOutput.textContent += chunk;
                     generationStatus.textContent = `Generation info: ${infoMatch[1]}. Total continuations: ${continuationCount}.`;
                     generationStatus.className = 'info';
                     if (infoMatch[1].includes("stopped") || infoMatch[1].includes("finished") || infoMatch[1].includes("Max continuations")) {
-                        finished = true; // Stop on explicit stop/finish info
+                        finished = true;
                     }
                     lastStatusUpdate = Date.now();
                 }
                 else {
                     generationOutput.textContent += chunk;
-                    // Update status periodically if no marker received
                     if (Date.now() - lastStatusUpdate > 1000 && !generationStatus.className.includes('warning') && !generationStatus.className.includes('error')) {
                         generationStatus.textContent = `Streaming... (Continuation #${continuationCount})`;
                         generationStatus.className = 'info';
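The browser code above watches the raw stream for the server's inline markers. The same markers can be handled from Python; a possible client sketch (the request body fields, URL, and port are assumptions, not a documented contract; requires the third-party requests package):

import re
import requests

def stream_generate(base_url, messages):
    payload = {"messages": messages, "stream": True}   # field names assumed
    with requests.post(f"{base_url}/generate", json=payload, stream=True, timeout=600) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            if not chunk:
                continue
            if re.search(r"\n\[CONTINUING (\d+) - TRUNCATING CONTEXT\.\.\.\]\n", chunk):
                print("\n-- server is continuing with a truncated context --")
            elif "\n[ERROR]" in chunk:
                print(chunk, end="")
                break                                  # stop on the error marker, as the JS does
            elif "\n[INFO]" in chunk:
                print(chunk, end="")                   # e.g. stopped / finished / max continuations
            else:
                print(chunk, end="")

# stream_generate("http://localhost:7860", [{"role": "user", "content": "Hello"}])  # placeholder URL/port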
@@ -534,7 +544,7 @@ html_code = """
                 generationOutput.scrollTop = generationOutput.scrollHeight;
             }
 
-        } else { // Non-streaming
+        } else {
             const text = await response.text();
             const finishReason = response.headers.get('X-Finish-Reason');
             const continuations = response.headers.get('X-Continuations');
@@ -552,12 +562,9 @@ html_code = """
             }
             if (usageCompletionTokens && usageCompletionTokens !== 'N/A') statusText += ` Completion Tokens: ~${usageCompletionTokens}.`;
 
-            // Check for explicit error/info markers in the non-streaming output text as well
             if (text.includes("[ERROR]")) {
                 statusText = "Generation finished with errors. See output."
                 generationStatus.className = 'error';
-            } else if (text.includes("[INFO]")) {
-                // Status is likely already set by headers, but might add more context from text
             }
 
 
@@ -577,7 +584,6 @@ html_code = """
         apiInfoBtn.addEventListener('click', getApiInfo);
         generateBtn.addEventListener('click', generateText);
 
-        // Initial checks on page load
         checkHealth();
         getApiInfo();
     </script>
@@ -614,12 +620,7 @@ def model_info():
 
     model_details: Union[Dict[str, Any], str] = "Model details unavailable"
     try:
-        n_embd = 'N/A'
-        if hasattr(llm, '_model') and hasattr(llm._model, 'n_embd') and callable(llm._model.n_embd):
-            try:
-                n_embd = llm._model.n_embd()
-            except Exception as embd_e:
-                logger.warning(f"Could not get n_embd: {embd_e}", extra={'request_id': request_id})
+        n_embd = get_property_or_method_value(get_property_or_method_value(llm, '_model'), 'n_embd', 'N/A')
 
         model_details = {
             "n_embd": n_embd,
@@ -661,7 +662,6 @@ def model_info():
     return jsonify(info), 200
 
 def _generate_single_cycle(messages: List[Dict[str, str]], params: Dict, stream: bool, request_id: str) -> Union[Generator[Dict, None, None], Dict]:
-    """Performs one call to llm.create_chat_completion."""
     try:
         logger.debug(f"Starting llama.cpp chat completion call. Stream: {stream}. Messages: {len(messages)}. Params summary: temp={params.get('temperature')}, top_p={params.get('top_p')}, top_k={params.get('top_k')}, stop={params.get('stop')}", extra={'request_id': request_id, 'stream': stream, 'message_count': len(messages)})
         result = llm.create_chat_completion(
@@ -733,7 +733,6 @@ def generate():
     effective_n_ctx = get_effective_n_ctx()
     input_token_count = estimate_token_count(initial_messages, request_id=request_id)
 
-    # Check initial input size against a buffered context window
     if input_token_count != -1 and input_token_count > effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO:
         logger.warning(f"Initial input (~{input_token_count} tokens) likely exceeds safe context window ({int(effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO)}). Attempting truncation.", extra={'request_id': request_id, 'initial_tokens': input_token_count, 'n_ctx': effective_n_ctx, 'buffer_ratio': CONTEXT_TRUNCATION_BUFFER_RATIO})
         truncated_initial = truncate_messages_for_context(initial_messages, effective_n_ctx, CONTEXT_TRUNCATION_BUFFER_RATIO, request_id=request_id)
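The pre-flight check above compares the input estimate against a buffered fraction of the context window. A small worked example with this file's default N_CTX of 2048 and a hypothetical buffer ratio of 0.9 (the real CONTEXT_TRUNCATION_BUFFER_RATIO is defined outside this diff):

effective_n_ctx = 2048                  # default N_CTX in this file
CONTEXT_TRUNCATION_BUFFER_RATIO = 0.9   # assumed value for illustration only

safe_limit = int(effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO)
print(safe_limit)                       # 1843 tokens

input_token_count = 1900                # example estimate
if input_token_count > effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO:
    print("input would be truncated before generation")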
@@ -809,7 +808,7 @@ def generate():
                     if chunk_finish_reason:
                         finish_reason = chunk_finish_reason
                         usage_this_cycle = chunk_usage
-                        final_usage = usage_this_cycle # Keep usage from the chunk that signaled finish
+                        final_usage = usage_this_cycle
                         break
 
                 if not finish_reason and generated_this_cycle_content:
@@ -837,9 +836,7 @@ def generate():
                 else:
                     current_messages[-1]['content'] += generated_this_cycle_content
 
-                # Sum completion tokens based on usage reported in chunks if available,
-                # otherwise this sum will be an underestimate or rely on final usage.
-                total_completion_tokens_generated += usage_this_cycle.get("completion_tokens", 0) # This is likely 0 for most chunks
+                total_completion_tokens_generated += usage_this_cycle.get("completion_tokens", 0)
 
                 if finish_reason == 'stop' or finish_reason == 'end_of_stream':
                     logger.info(f"Streaming generation stopped naturally in cycle {cycle_number}. Reason: {finish_reason}", extra={'request_id': req_id, 'cycle': cycle_number, 'finish_reason': finish_reason})
@@ -875,11 +872,9 @@ def generate():
             "X-Accel-Buffering": "no",
             "X-Request-ID": request_id
         }
-        # Note: X- headers like usage can't reliably reflect accumulated values in SSE until stream ends.
-        # We rely on the JS client or final log message for total details.
         return Response(stream_with_context(streaming_generator(request_id)), headers=headers)
 
-    else: # Non-streaming
+    else:
         while True:
             if MAX_CONTINUATIONS >= 0 and continuations > MAX_CONTINUATIONS:
                 logger.info(f"Max continuations ({MAX_CONTINUATIONS}) reached. Stopping non-streaming.", extra={'request_id': request_id})
@@ -986,8 +981,6 @@ def generate():
         response.headers["X-Finish-Reason"] = final_finish_reason
         response.headers["X-Continuations"] = str(continuations)
         response.headers["X-Usage-Completion-Tokens"] = str(total_completion_tokens_generated)
-        # Note: Prompt tokens and total tokens usage from the LAST cycle might be misleading
-        # if significant truncation happened in previous cycles.
         response.headers["X-Usage-Prompt-Tokens-Last-Cycle"] = str(final_usage.get("prompt_tokens", "N/A"))
         response.headers["X-Usage-Total-Tokens-Last-Cycle"] = str(final_usage.get("total_tokens", "N/A"))
         return response
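On the non-streaming path the totals travel in response headers rather than the body. A small client sketch reading them (URL, port, and body fields are assumptions; requires requests):

import requests

resp = requests.post(
    "http://localhost:7860/generate",                                          # placeholder URL/port
    json={"messages": [{"role": "user", "content": "Hi"}], "stream": False},   # field names assumed
    timeout=600,
)
resp.raise_for_status()

print(resp.text)                                                # generated text
print("finish reason:  ", resp.headers.get("X-Finish-Reason"))
print("continuations:  ", resp.headers.get("X-Continuations"))
print("completion toks:", resp.headers.get("X-Usage-Completion-Tokens"))
# The next two reflect only the last generation cycle, as the removed comment warned.
print("prompt (last):  ", resp.headers.get("X-Usage-Prompt-Tokens-Last-Cycle"))
print("total (last):   ", resp.headers.get("X-Usage-Total-Tokens-Last-Cycle"))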
@@ -1006,14 +999,5 @@ if __name__ == "__main__":
     if not llm:
         logger.critical("MODEL FAILED TO LOAD. SERVER WILL START BUT '/generate' WILL FAIL.")
 
-    if not is_debug:
-        try:
-            from waitress import serve
-            logger.info("Running with Waitress production server.")
-            serve(app, host=host, port=port, threads=8)
-        except ImportError:
-            logger.warning("Waitress not found. Falling back to Flask development server. Install waitress for production.")
-            app.run(host=host, port=port, threaded=True, debug=is_debug)
-    else:
-        logger.info("Running with Flask development server (Debug=True).")
-        app.run(host=host, port=port, threaded=True, debug=is_debug, use_reloader=False)
+    logger.info("Running with Flask development server.")
+    app.run(host=host, port=port, threaded=True, debug=is_debug, use_reloader=False)
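This commit drops the Waitress branch, so the app now always starts under Flask's development server, which is not intended for production traffic. If a production WSGI server is still wanted, the removed call can be reproduced from outside app.py; a sketch, assuming waitress is installed and that importing app exposes the Flask instance as app:

from waitress import serve   # pip install waitress

from app import app          # note: importing app.py may trigger model loading as a side effect

# Mirrors the call this commit removed; host/port/threads are example values, adjust as needed.
serve(app, host="0.0.0.0", port=7860, threads=8)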
 