Update app.py
app.py CHANGED

@@ -31,7 +31,14 @@ class JsonFormatter(logging.Formatter):
         skip_keys = {'message', 'asctime', 'levelname', 'levelno', 'pathname', 'filename', 'module', 'funcName', 'lineno', 'created', 'msecs', 'relativeCreated', 'thread', 'threadName', 'process', 'processName', 'exc_info', 'exc_text', 'stack_info', 'request_id'}
         for key, value in record.__dict__.items():
             if not key.startswith('_') and key not in log_record and key not in skip_keys:
-
+                # Ensure value is JSON serializable
+                try:
+                    json.dumps(value)
+                    log_record[key] = value
+                except TypeError:
+                    log_record[key] = str(value)  # Convert non-serializable types to string
+                except Exception:
+                    log_record[key] = "[Unserializable Value]"
         return json.dumps(log_record)

 def setup_logging():
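The hunk above guards every extra log-record attribute with a json.dumps() probe before keeping it. A minimal standalone sketch of that guard, outside the formatter (names and sample values are illustrative, not taken from app.py):

    import json

    def safe_extra(value):
        # Values that json.dumps() rejects are stored as their string form instead.
        try:
            json.dumps(value)
            return value
        except TypeError:
            return str(value)
        except Exception:
            return "[Unserializable Value]"

    print(safe_extra({"ok": 1}))   # kept as-is
    print(safe_extra({1, 2, 3}))   # a set is not JSON-serializable -> "{1, 2, 3}" as a string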

@@ -52,8 +59,8 @@ MODEL_REPO = os.getenv("MODEL_REPO", "jnjj/vcvcvcv")
 MODEL_FILE = os.getenv("MODEL_FILE", "gemma-3-4b-it-q4_0.gguf")
 N_CTX_CONFIG = int(os.getenv("N_CTX", "2048"))
 N_BATCH = int(os.getenv("N_BATCH", "512"))
-N_GPU_LAYERS_CONFIG = int(os.getenv("N_GPU_LAYERS", "0"))
-MAX_CONTINUATIONS = int(os.getenv("MAX_CONTINUATIONS", "-1"))
+N_GPU_LAYERS_CONFIG = int(os.getenv("N_GPU_LAYERS", "0"))
+MAX_CONTINUATIONS = int(os.getenv("MAX_CONTINUATIONS", "-1"))

 FIXED_REPEAT_PENALTY = float(os.getenv("FIXED_REPEAT_PENALTY", "1.1"))
 FIXED_SEED = int(os.getenv("FIXED_SEED", "-1"))

@@ -121,11 +128,10 @@ def prepare_messages(data: Dict, format: Optional[str] = None, request_id: str =
             content = str(content)

             if role == "system":
-                # If a system message is provided in the list and is the first message, override the default
                 if i == 0 and final_messages and final_messages[0]["role"] == "system":
                     logger.info("Replacing default system prompt with user-provided system message.", extra={'request_id': request_id})
-                    final_messages[0]["content"] = content
-                elif i == 0 and not final_messages:
+                    final_messages[0]["content"] = content
+                elif i == 0 and not final_messages:
                     final_messages.append({"role": "system", "content": content})
                 else:
                     logger.warning(f"Ignoring additional system message at index {i} as system prompt is already set or should be at the start.", extra={'request_id': request_id, 'message_index': i})

@@ -153,15 +159,13 @@ def estimate_token_count(messages: List[Dict[str, str]], request_id: str = 'N/A'
         return -1

     try:
-        # Use add_generation_prompt=True to include the final assistant turn marker if the last message is not assistant
         chat_prompt_string = llm.apply_chat_template(messages, add_generation_prompt=True)
         tokens = llm.tokenize(chat_prompt_string.encode('utf-8', errors='ignore'), add_bos=True)
         return len(tokens)
     except Exception as e:
         logger.error(f"Could not estimate token count using apply_chat_template: {e}", exc_info=True, extra={'request_id': request_id})
-        # Fallback to a simple character-based estimation if template fails
         char_count = sum(len(m.get('content', '')) for m in messages)
-        estimated_tokens = char_count // 4
+        estimated_tokens = char_count // 4
         logger.warning(f"Falling back to character-based token estimation (~{estimated_tokens})", extra={'request_id': request_id, 'estimated_tokens': estimated_tokens, 'char_count': char_count})
         return estimated_tokens
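The fallback path above estimates tokens as total characters divided by four. A rough worked example of that arithmetic (sample messages are illustrative):

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},  # 28 characters
        {"role": "user", "content": "Hello!"},                          # 6 characters
    ]
    char_count = sum(len(m.get("content", "")) for m in messages)  # 34
    print(char_count // 4)  # ~8 estimated tokens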

@@ -175,7 +179,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     truncated_messages: List[Dict[str, str]] = []
     system_prompt: Optional[Dict[str, str]] = None

-    # Preserve system prompt if it exists at the beginning
     if messages and messages[0].get("role") == "system":
         system_prompt = messages[0]
         truncated_messages.append(system_prompt)

@@ -189,9 +192,7 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     current_token_count = sum(len(m.get('content', '')) for m in truncated_messages) // 4

     messages_to_add = []
-    # Add non-system messages from newest to oldest
     for msg in reversed(remaining_messages):
-        # Tentative list including the current message and already selected messages
         potential_list = ([system_prompt] if system_prompt else []) + [msg] + messages_to_add

         next_token_count = estimate_token_count(potential_list, request_id=request_id)

@@ -200,7 +201,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
             messages_to_add.insert(0, msg)
             current_token_count = next_token_count
         elif next_token_count == -1:
-            # If estimation fails mid-truncation, stop adding but keep what we have
             logger.warning(f"Token estimation failed while adding message: {msg}. Stopping truncation early.", extra={'request_id': request_id})
             break
         else:

@@ -213,7 +213,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     final_count = len(final_truncated_list)

     if not final_truncated_list or all(m.get("role") == "system" for m in final_truncated_list):
-        # If truncation results in only system message or empty, try to keep at least the last user message
         if any(m.get("role") == "user" for m in messages):
             last_user_message = next((m for m in reversed(messages) if m.get("role") == "user"), None)
             if last_user_message:

@@ -222,6 +221,7 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
             final_count = len(final_truncated_list)
     current_token_count = estimate_token_count(final_truncated_list, request_id=request_id)

+
     if final_count < original_count:
         logger.warning(f"Context truncated: Kept {final_count}/{original_count} messages. Estimated tokens: ~{current_token_count}/{target_token_limit} (target).",
                        extra={'request_id': request_id, 'kept': final_count, 'original': original_count, 'estimated_tokens': current_token_count, 'target_limit': target_token_limit})
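The hunks above only strip comments from truncate_messages_for_context, but the surrounding logic is easy to lose in diff form. A condensed, standalone sketch of the strategy (keep a leading system prompt, then add the remaining messages newest-first while an estimate stays under the target); this sketch uses the simple char // 4 estimate and stops at the first message that does not fit, whereas the real function also handles estimation failures and a last-user-message fallback:

    from typing import Dict, List

    def sketch_truncate(messages: List[Dict[str, str]], target_tokens: int) -> List[Dict[str, str]]:
        def estimate(msgs: List[Dict[str, str]]) -> int:
            # Crude character-based estimate, mirroring the fallback used in the app.
            return sum(len(m.get("content", "")) for m in msgs) // 4

        system = [messages[0]] if messages and messages[0].get("role") == "system" else []
        rest = messages[len(system):]
        kept: List[Dict[str, str]] = []
        for msg in reversed(rest):  # newest to oldest
            if estimate(system + [msg] + kept) <= target_tokens:
                kept.insert(0, msg)  # prepend to preserve chronological order
            else:
                break
        return system + kept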

@@ -229,13 +229,27 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
         logger.debug(f"Context truncation check complete. Kept all {final_count} messages. Estimated tokens: ~{current_token_count}.",
                      extra={'request_id': request_id, 'kept': final_count, 'estimated_tokens': current_token_count})

+
     if not final_truncated_list:
         logger.error("Context truncation resulted in an empty message list!", extra={'request_id': request_id})
-        # This should ideally not happen with the last user message fallback, but defensive programming
         return []

     return final_truncated_list

+def get_property_or_method_value(obj: Any, prop_name: str, default: Any = None) -> Any:
+    """Safely get property value or call method if callable."""
+    if hasattr(obj, prop_name):
+        prop = getattr(obj, prop_name)
+        if callable(prop):
+            try:
+                return prop()
+            except Exception:
+                logger.warning(f"Error calling method {prop_name} on {type(obj)}", exc_info=True)
+                return default
+        else:
+            return prop
+    return default
+
 def load_model():
     global llm, ACTUAL_N_CTX, ACTUAL_N_BATCH, ACTUAL_N_GPU_LAYERS
     logger.info(f"Attempting to load model: {MODEL_REPO}/{MODEL_FILE}")
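A possible usage sketch for the new get_property_or_method_value helper. The fake object below is illustrative; the point is that llama-cpp-python exposes values such as n_ctx() as methods, so the helper calls them, while plain attributes are returned directly and anything missing falls back to the default:

    class FakeLlama:
        n_batch = 512              # plain attribute -> returned as-is
        def n_ctx(self) -> int:    # method -> called by the helper
            return 2048

    fake = FakeLlama()
    print(get_property_or_method_value(fake, "n_ctx", 1024))      # 2048
    print(get_property_or_method_value(fake, "n_batch", 256))     # 512
    print(get_property_or_method_value(fake, "n_gpu_layers", 0))  # 0 (default used)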

@@ -253,9 +267,9 @@ def load_model():
         )
         logger.info("Model loaded successfully.")
         if llm:
-            ACTUAL_N_CTX =
-            ACTUAL_N_BATCH =
-            ACTUAL_N_GPU_LAYERS =
+            ACTUAL_N_CTX = get_property_or_method_value(llm, 'n_ctx', N_CTX_CONFIG)
+            ACTUAL_N_BATCH = get_property_or_method_value(llm, 'n_batch', N_BATCH)
+            ACTUAL_N_GPU_LAYERS = get_property_or_method_value(llm, 'n_gpu_layers', 0)

             if ACTUAL_N_CTX != N_CTX_CONFIG:
                 logger.warning(f"Model's actual context size ({ACTUAL_N_CTX}) differs from config ({N_CTX_CONFIG}). Using actual.", extra={'actual_n_ctx': ACTUAL_N_CTX, 'configured_n_ctx': N_CTX_CONFIG})

@@ -266,7 +280,6 @@ def load_model():
             logger.info(f"Actual Model Batch Size (n_batch): {ACTUAL_N_BATCH}")
             logger.info(f"Actual Model GPU Layers (n_gpu_layers): {ACTUAL_N_GPU_LAYERS}")

-
             try:
                 test_tokens = llm.tokenize(b"Test sentence.")
                 logger.info(f"Tokenizer test successful. 'Test sentence.' -> {len(test_tokens)} tokens.")

@@ -487,7 +500,6 @@ html_code = """
                 const { done, value } = await reader.read();
                 if (done) {
                     finished = true;
-                    // Only update status if not already showing an error or final state
                     if (!generationStatus.textContent.includes("finished") && !generationStatus.textContent.includes("stopped") && !generationStatus.textContent.includes("Error") && !generationStatus.textContent.includes("Max continuations")) {
                         generationStatus.textContent = `Streaming finished. Total continuations: ${continuationCount}.`;
                         generationStatus.className = 'info';

@@ -497,7 +509,6 @@ html_code = """

                 const chunk = decoder.decode(value, { stream: true });

-                // Check for specific server markers first
                 const continueMatch = chunk.match(/\n\[CONTINUING (\d+) - TRUNCATING CONTEXT\.\.\.\]\n/);
                 const errorMatch = chunk.match(/\n\[ERROR\](.*)/);
                 const infoMatch = chunk.match(/\n\[INFO\](.*)/);

@@ -512,19 +523,18 @@ html_code = """
                     generationOutput.textContent += chunk;
                     generationStatus.textContent = `Error during generation: ${errorMatch[1]}`;
                     generationStatus.className = 'error';
-                    finished = true;
+                    finished = true;
                 } else if (infoMatch) {
                     generationOutput.textContent += chunk;
                     generationStatus.textContent = `Generation info: ${infoMatch[1]}. Total continuations: ${continuationCount}.`;
                     generationStatus.className = 'info';
                     if (infoMatch[1].includes("stopped") || infoMatch[1].includes("finished") || infoMatch[1].includes("Max continuations")) {
-                        finished = true;
+                        finished = true;
                     }
                     lastStatusUpdate = Date.now();
                 }
                 else {
                     generationOutput.textContent += chunk;
-                    // Update status periodically if no marker received
                     if (Date.now() - lastStatusUpdate > 1000 && !generationStatus.className.includes('warning') && !generationStatus.className.includes('error')) {
                         generationStatus.textContent = `Streaming... (Continuation #${continuationCount})`;
                         generationStatus.className = 'info';

@@ -534,7 +544,7 @@ html_code = """
                     generationOutput.scrollTop = generationOutput.scrollHeight;
                 }

-            } else {
+            } else {
                 const text = await response.text();
                 const finishReason = response.headers.get('X-Finish-Reason');
                 const continuations = response.headers.get('X-Continuations');

@@ -552,12 +562,9 @@ html_code = """
                 }
                 if (usageCompletionTokens && usageCompletionTokens !== 'N/A') statusText += ` Completion Tokens: ~${usageCompletionTokens}.`;

-                // Check for explicit error/info markers in the non-streaming output text as well
                 if (text.includes("[ERROR]")) {
                     statusText = "Generation finished with errors. See output."
                     generationStatus.className = 'error';
-                } else if (text.includes("[INFO]")) {
-                    // Status is likely already set by headers, but might add more context from text
                 }


@@ -577,7 +584,6 @@ html_code = """
         apiInfoBtn.addEventListener('click', getApiInfo);
         generateBtn.addEventListener('click', generateText);

-        // Initial checks on page load
         checkHealth();
         getApiInfo();
     </script>
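The JavaScript hunks above consume the streamed response chunk by chunk and watch for inline [CONTINUING ...], [ERROR], and [INFO] markers. A minimal Python client doing the same, assuming the app is reachable at the URL shown and accepts a JSON body with a messages list (the port and exact request schema are assumptions, not taken from this diff):

    import requests

    payload = {"messages": [{"role": "user", "content": "Hello"}], "stream": True}
    with requests.post("http://localhost:7860/generate", json=payload, stream=True) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            if "[ERROR]" in chunk:
                print("\nserver reported an error")
                break
            print(chunk, end="", flush=True)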

@@ -614,12 +620,7 @@ def model_info():

     model_details: Union[Dict[str, Any], str] = "Model details unavailable"
     try:
-        n_embd = 'N/A'
-        if hasattr(llm, '_model') and hasattr(llm._model, 'n_embd') and callable(llm._model.n_embd):
-            try:
-                n_embd = llm._model.n_embd()
-            except Exception as embd_e:
-                logger.warning(f"Could not get n_embd: {embd_e}", extra={'request_id': request_id})
+        n_embd = get_property_or_method_value(get_property_or_method_value(llm, '_model'), 'n_embd', 'N/A')

         model_details = {
             "n_embd": n_embd,

@@ -661,7 +662,6 @@ def model_info():
     return jsonify(info), 200

 def _generate_single_cycle(messages: List[Dict[str, str]], params: Dict, stream: bool, request_id: str) -> Union[Generator[Dict, None, None], Dict]:
-    """Performs one call to llm.create_chat_completion."""
     try:
         logger.debug(f"Starting llama.cpp chat completion call. Stream: {stream}. Messages: {len(messages)}. Params summary: temp={params.get('temperature')}, top_p={params.get('top_p')}, top_k={params.get('top_k')}, stop={params.get('stop')}", extra={'request_id': request_id, 'stream': stream, 'message_count': len(messages)})
         result = llm.create_chat_completion(

@@ -733,7 +733,6 @@ def generate():
     effective_n_ctx = get_effective_n_ctx()
     input_token_count = estimate_token_count(initial_messages, request_id=request_id)

-    # Check initial input size against a buffered context window
     if input_token_count != -1 and input_token_count > effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO:
         logger.warning(f"Initial input (~{input_token_count} tokens) likely exceeds safe context window ({int(effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO)}). Attempting truncation.", extra={'request_id': request_id, 'initial_tokens': input_token_count, 'n_ctx': effective_n_ctx, 'buffer_ratio': CONTEXT_TRUNCATION_BUFFER_RATIO})
         truncated_initial = truncate_messages_for_context(initial_messages, effective_n_ctx, CONTEXT_TRUNCATION_BUFFER_RATIO, request_id=request_id)

@@ -809,7 +808,7 @@ def generate():
                     if chunk_finish_reason:
                         finish_reason = chunk_finish_reason
                         usage_this_cycle = chunk_usage
-                        final_usage = usage_this_cycle
+                        final_usage = usage_this_cycle
                         break

                 if not finish_reason and generated_this_cycle_content:

@@ -837,9 +836,7 @@ def generate():
                 else:
                     current_messages[-1]['content'] += generated_this_cycle_content

-
-                # otherwise this sum will be an underestimate or rely on final usage.
-                total_completion_tokens_generated += usage_this_cycle.get("completion_tokens", 0) # This is likely 0 for most chunks
+                total_completion_tokens_generated += usage_this_cycle.get("completion_tokens", 0)

                 if finish_reason == 'stop' or finish_reason == 'end_of_stream':
                     logger.info(f"Streaming generation stopped naturally in cycle {cycle_number}. Reason: {finish_reason}", extra={'request_id': req_id, 'cycle': cycle_number, 'finish_reason': finish_reason})

@@ -875,11 +872,9 @@ def generate():
             "X-Accel-Buffering": "no",
             "X-Request-ID": request_id
         }
-        # Note: X- headers like usage can't reliably reflect accumulated values in SSE until stream ends.
-        # We rely on the JS client or final log message for total details.
         return Response(stream_with_context(streaming_generator(request_id)), headers=headers)

-    else:
+    else:
         while True:
             if MAX_CONTINUATIONS >= 0 and continuations > MAX_CONTINUATIONS:
                 logger.info(f"Max continuations ({MAX_CONTINUATIONS}) reached. Stopping non-streaming.", extra={'request_id': request_id})

@@ -986,8 +981,6 @@ def generate():
         response.headers["X-Finish-Reason"] = final_finish_reason
         response.headers["X-Continuations"] = str(continuations)
         response.headers["X-Usage-Completion-Tokens"] = str(total_completion_tokens_generated)
-        # Note: Prompt tokens and total tokens usage from the LAST cycle might be misleading
-        # if significant truncation happened in previous cycles.
         response.headers["X-Usage-Prompt-Tokens-Last-Cycle"] = str(final_usage.get("prompt_tokens", "N/A"))
        response.headers["X-Usage-Total-Tokens-Last-Cycle"] = str(final_usage.get("total_tokens", "N/A"))
         return response
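For the non-streaming path, the response carries the finish reason and usage estimates in headers. A small sketch of reading them from a client, under the same assumptions about the endpoint URL and payload shape as the streaming example above:

    import requests

    resp = requests.post(
        "http://localhost:7860/generate",
        json={"messages": [{"role": "user", "content": "Hi"}], "stream": False},
    )
    print(resp.headers.get("X-Finish-Reason"))
    print(resp.headers.get("X-Continuations"))
    print(resp.headers.get("X-Usage-Completion-Tokens"))
    print(resp.headers.get("X-Usage-Prompt-Tokens-Last-Cycle"))
    print(resp.headers.get("X-Usage-Total-Tokens-Last-Cycle"))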

@@ -1006,14 +999,5 @@ if __name__ == "__main__":
     if not llm:
         logger.critical("MODEL FAILED TO LOAD. SERVER WILL START BUT '/generate' WILL FAIL.")

-
-
-        from waitress import serve
-        logger.info("Running with Waitress production server.")
-        serve(app, host=host, port=port, threads=8)
-    except ImportError:
-        logger.warning("Waitress not found. Falling back to Flask development server. Install waitress for production.")
-        app.run(host=host, port=port, threaded=True, debug=is_debug)
-    else:
-        logger.info("Running with Flask development server (Debug=True).")
-        app.run(host=host, port=port, threaded=True, debug=is_debug, use_reloader=False)
+    logger.info("Running with Flask development server.")
+    app.run(host=host, port=port, threaded=True, debug=is_debug, use_reloader=False)