Update app.py
app.py CHANGED

@@ -31,7 +31,14 @@ class JsonFormatter(logging.Formatter):
         skip_keys = {'message', 'asctime', 'levelname', 'levelno', 'pathname', 'filename', 'module', 'funcName', 'lineno', 'created', 'msecs', 'relativeCreated', 'thread', 'threadName', 'process', 'processName', 'exc_info', 'exc_text', 'stack_info', 'request_id'}
         for key, value in record.__dict__.items():
             if not key.startswith('_') and key not in log_record and key not in skip_keys:
-
+                # Ensure value is JSON serializable
+                try:
+                    json.dumps(value)
+                    log_record[key] = value
+                except TypeError:
+                    log_record[key] = str(value)  # Convert non-serializable types to string
+                except Exception:
+                    log_record[key] = "[Unserializable Value]"
         return json.dumps(log_record)

 def setup_logging():
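The hunk above guards every extra log-record attribute with a json.dumps() probe before keeping it. A minimal standalone sketch of that guard, outside the formatter (names and sample values are illustrative, not taken from app.py):

    import json

    def safe_extra(value):
        # Values that json.dumps() rejects are stored as their string form instead.
        try:
            json.dumps(value)
            return value
        except TypeError:
            return str(value)
        except Exception:
            return "[Unserializable Value]"

    print(safe_extra({"ok": 1}))   # kept as-is
    print(safe_extra({1, 2, 3}))   # a set is not JSON-serializable -> "{1, 2, 3}" as a string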

@@ -52,8 +59,8 @@ MODEL_REPO = os.getenv("MODEL_REPO", "jnjj/vcvcvcv")
 MODEL_FILE = os.getenv("MODEL_FILE", "gemma-3-4b-it-q4_0.gguf")
 N_CTX_CONFIG = int(os.getenv("N_CTX", "2048"))
 N_BATCH = int(os.getenv("N_BATCH", "512"))
-N_GPU_LAYERS_CONFIG = int(os.getenv("N_GPU_LAYERS", "0"))
-MAX_CONTINUATIONS = int(os.getenv("MAX_CONTINUATIONS", "-1"))
+N_GPU_LAYERS_CONFIG = int(os.getenv("N_GPU_LAYERS", "0"))
+MAX_CONTINUATIONS = int(os.getenv("MAX_CONTINUATIONS", "-1"))

 FIXED_REPEAT_PENALTY = float(os.getenv("FIXED_REPEAT_PENALTY", "1.1"))
 FIXED_SEED = int(os.getenv("FIXED_SEED", "-1"))

@@ -121,11 +128,10 @@ def prepare_messages(data: Dict, format: Optional[str] = None, request_id: str =
             content = str(content)

             if role == "system":
-                # If a system message is provided in the list and is the first message, override the default
                 if i == 0 and final_messages and final_messages[0]["role"] == "system":
                     logger.info("Replacing default system prompt with user-provided system message.", extra={'request_id': request_id})
-                    final_messages[0]["content"] = content
-                elif i == 0 and not final_messages:
+                    final_messages[0]["content"] = content
+                elif i == 0 and not final_messages:
                     final_messages.append({"role": "system", "content": content})
                 else:
                     logger.warning(f"Ignoring additional system message at index {i} as system prompt is already set or should be at the start.", extra={'request_id': request_id, 'message_index': i})

@@ -153,15 +159,13 @@ def estimate_token_count(messages: List[Dict[str, str]], request_id: str = 'N/A'
         return -1

     try:
-        # Use add_generation_prompt=True to include the final assistant turn marker if the last message is not assistant
         chat_prompt_string = llm.apply_chat_template(messages, add_generation_prompt=True)
         tokens = llm.tokenize(chat_prompt_string.encode('utf-8', errors='ignore'), add_bos=True)
         return len(tokens)
     except Exception as e:
         logger.error(f"Could not estimate token count using apply_chat_template: {e}", exc_info=True, extra={'request_id': request_id})
-        # Fallback to a simple character-based estimation if template fails
         char_count = sum(len(m.get('content', '')) for m in messages)
-        estimated_tokens = char_count // 4
+        estimated_tokens = char_count // 4
         logger.warning(f"Falling back to character-based token estimation (~{estimated_tokens})", extra={'request_id': request_id, 'estimated_tokens': estimated_tokens, 'char_count': char_count})
         return estimated_tokens
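The fallback path above estimates tokens as total characters divided by four. A rough worked example of that arithmetic (sample messages are illustrative):

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},  # 28 characters
        {"role": "user", "content": "Hello!"},                          # 6 characters
    ]
    char_count = sum(len(m.get("content", "")) for m in messages)  # 34
    print(char_count // 4)  # ~8 estimated tokens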

@@ -175,7 +179,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     truncated_messages: List[Dict[str, str]] = []
     system_prompt: Optional[Dict[str, str]] = None

-    # Preserve system prompt if it exists at the beginning
     if messages and messages[0].get("role") == "system":
         system_prompt = messages[0]
         truncated_messages.append(system_prompt)

@@ -189,9 +192,7 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     current_token_count = sum(len(m.get('content', '')) for m in truncated_messages) // 4

     messages_to_add = []
-    # Add non-system messages from newest to oldest
     for msg in reversed(remaining_messages):
-        # Tentative list including the current message and already selected messages
         potential_list = ([system_prompt] if system_prompt else []) + [msg] + messages_to_add

         next_token_count = estimate_token_count(potential_list, request_id=request_id)

@@ -200,7 +201,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
             messages_to_add.insert(0, msg)
             current_token_count = next_token_count
         elif next_token_count == -1:
-            # If estimation fails mid-truncation, stop adding but keep what we have
             logger.warning(f"Token estimation failed while adding message: {msg}. Stopping truncation early.", extra={'request_id': request_id})
             break
         else:

@@ -213,7 +213,6 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
     final_count = len(final_truncated_list)

     if not final_truncated_list or all(m.get("role") == "system" for m in final_truncated_list):
-        # If truncation results in only system message or empty, try to keep at least the last user message
         if any(m.get("role") == "user" for m in messages):
             last_user_message = next((m for m in reversed(messages) if m.get("role") == "user"), None)
             if last_user_message:

@@ -222,6 +221,7 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
             final_count = len(final_truncated_list)
     current_token_count = estimate_token_count(final_truncated_list, request_id=request_id)

+
     if final_count < original_count:
         logger.warning(f"Context truncated: Kept {final_count}/{original_count} messages. Estimated tokens: ~{current_token_count}/{target_token_limit} (target).",
                        extra={'request_id': request_id, 'kept': final_count, 'original': original_count, 'estimated_tokens': current_token_count, 'target_limit': target_token_limit})
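The hunks above only strip comments from truncate_messages_for_context, but the surrounding logic is easy to lose in diff form. A condensed, standalone sketch of the strategy (keep a leading system prompt, then add the remaining messages newest-first while an estimate stays under the target); this sketch uses the simple char // 4 estimate and stops at the first message that does not fit, whereas the real function also handles estimation failures and a last-user-message fallback:

    from typing import Dict, List

    def sketch_truncate(messages: List[Dict[str, str]], target_tokens: int) -> List[Dict[str, str]]:
        def estimate(msgs: List[Dict[str, str]]) -> int:
            # Crude character-based estimate, mirroring the fallback used in the app.
            return sum(len(m.get("content", "")) for m in msgs) // 4

        system = [messages[0]] if messages and messages[0].get("role") == "system" else []
        rest = messages[len(system):]
        kept: List[Dict[str, str]] = []
        for msg in reversed(rest):  # newest to oldest
            if estimate(system + [msg] + kept) <= target_tokens:
                kept.insert(0, msg)  # prepend to preserve chronological order
            else:
                break
        return system + kept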

@@ -229,13 +229,27 @@ def truncate_messages_for_context(messages: List[Dict[str, str]], max_tokens: in
         logger.debug(f"Context truncation check complete. Kept all {final_count} messages. Estimated tokens: ~{current_token_count}.",
                      extra={'request_id': request_id, 'kept': final_count, 'estimated_tokens': current_token_count})

+
     if not final_truncated_list:
         logger.error("Context truncation resulted in an empty message list!", extra={'request_id': request_id})
-        # This should ideally not happen with the last user message fallback, but defensive programming
         return []

     return final_truncated_list

+def get_property_or_method_value(obj: Any, prop_name: str, default: Any = None) -> Any:
+    """Safely get property value or call method if callable."""
+    if hasattr(obj, prop_name):
+        prop = getattr(obj, prop_name)
+        if callable(prop):
+            try:
+                return prop()
+            except Exception:
+                logger.warning(f"Error calling method {prop_name} on {type(obj)}", exc_info=True)
+                return default
+        else:
+            return prop
+    return default
+
 def load_model():
     global llm, ACTUAL_N_CTX, ACTUAL_N_BATCH, ACTUAL_N_GPU_LAYERS
     logger.info(f"Attempting to load model: {MODEL_REPO}/{MODEL_FILE}")
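A possible usage sketch for the new get_property_or_method_value helper. The fake object below is illustrative; the point is that llama-cpp-python exposes values such as n_ctx() as methods, so the helper calls them, while plain attributes are returned directly and anything missing falls back to the default:

    class FakeLlama:
        n_batch = 512              # plain attribute -> returned as-is
        def n_ctx(self) -> int:    # method -> called by the helper
            return 2048

    fake = FakeLlama()
    print(get_property_or_method_value(fake, "n_ctx", 1024))      # 2048
    print(get_property_or_method_value(fake, "n_batch", 256))     # 512
    print(get_property_or_method_value(fake, "n_gpu_layers", 0))  # 0 (default used)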

@@ -253,9 +267,9 @@ def load_model():
         )
         logger.info("Model loaded successfully.")
         if llm:
-            ACTUAL_N_CTX =
-            ACTUAL_N_BATCH =
-            ACTUAL_N_GPU_LAYERS =
+            ACTUAL_N_CTX = get_property_or_method_value(llm, 'n_ctx', N_CTX_CONFIG)
+            ACTUAL_N_BATCH = get_property_or_method_value(llm, 'n_batch', N_BATCH)
+            ACTUAL_N_GPU_LAYERS = get_property_or_method_value(llm, 'n_gpu_layers', 0)

             if ACTUAL_N_CTX != N_CTX_CONFIG:
                 logger.warning(f"Model's actual context size ({ACTUAL_N_CTX}) differs from config ({N_CTX_CONFIG}). Using actual.", extra={'actual_n_ctx': ACTUAL_N_CTX, 'configured_n_ctx': N_CTX_CONFIG})

@@ -266,7 +280,6 @@ def load_model():
             logger.info(f"Actual Model Batch Size (n_batch): {ACTUAL_N_BATCH}")
             logger.info(f"Actual Model GPU Layers (n_gpu_layers): {ACTUAL_N_GPU_LAYERS}")

-
             try:
                 test_tokens = llm.tokenize(b"Test sentence.")
                 logger.info(f"Tokenizer test successful. 'Test sentence.' -> {len(test_tokens)} tokens.")

@@ -487,7 +500,6 @@ html_code = """
                 const { done, value } = await reader.read();
                 if (done) {
                     finished = true;
-                    // Only update status if not already showing an error or final state
                     if (!generationStatus.textContent.includes("finished") && !generationStatus.textContent.includes("stopped") && !generationStatus.textContent.includes("Error") && !generationStatus.textContent.includes("Max continuations")) {
                         generationStatus.textContent = `Streaming finished. Total continuations: ${continuationCount}.`;
                         generationStatus.className = 'info';

@@ -497,7 +509,6 @@ html_code = """

                 const chunk = decoder.decode(value, { stream: true });

-                // Check for specific server markers first
                 const continueMatch = chunk.match(/\n\[CONTINUING (\d+) - TRUNCATING CONTEXT\.\.\.\]\n/);
                 const errorMatch = chunk.match(/\n\[ERROR\](.*)/);
                 const infoMatch = chunk.match(/\n\[INFO\](.*)/);

@@ -512,19 +523,18 @@ html_code = """
                     generationOutput.textContent += chunk;
                     generationStatus.textContent = `Error during generation: ${errorMatch[1]}`;
                     generationStatus.className = 'error';
-                    finished = true;
+                    finished = true;
                 } else if (infoMatch) {
                     generationOutput.textContent += chunk;
                     generationStatus.textContent = `Generation info: ${infoMatch[1]}. Total continuations: ${continuationCount}.`;
                     generationStatus.className = 'info';
                     if (infoMatch[1].includes("stopped") || infoMatch[1].includes("finished") || infoMatch[1].includes("Max continuations")) {
-                        finished = true;
+                        finished = true;
                     }
                     lastStatusUpdate = Date.now();
                 }
                 else {
                     generationOutput.textContent += chunk;
-                    // Update status periodically if no marker received
                     if (Date.now() - lastStatusUpdate > 1000 && !generationStatus.className.includes('warning') && !generationStatus.className.includes('error')) {
                         generationStatus.textContent = `Streaming... (Continuation #${continuationCount})`;
                         generationStatus.className = 'info';

@@ -534,7 +544,7 @@ html_code = """
                     generationOutput.scrollTop = generationOutput.scrollHeight;
                 }

-            } else {
+            } else {
                 const text = await response.text();
                 const finishReason = response.headers.get('X-Finish-Reason');
                 const continuations = response.headers.get('X-Continuations');

@@ -552,12 +562,9 @@ html_code = """
                 }
                 if (usageCompletionTokens && usageCompletionTokens !== 'N/A') statusText += ` Completion Tokens: ~${usageCompletionTokens}.`;

-                // Check for explicit error/info markers in the non-streaming output text as well
                 if (text.includes("[ERROR]")) {
                     statusText = "Generation finished with errors. See output."
                     generationStatus.className = 'error';
-                } else if (text.includes("[INFO]")) {
-                    // Status is likely already set by headers, but might add more context from text
                 }


@@ -577,7 +584,6 @@ html_code = """
         apiInfoBtn.addEventListener('click', getApiInfo);
         generateBtn.addEventListener('click', generateText);

-        // Initial checks on page load
         checkHealth();
         getApiInfo();
     </script>
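The JavaScript hunks above consume the streamed response chunk by chunk and watch for inline [CONTINUING ...], [ERROR], and [INFO] markers. A minimal Python client doing the same, assuming the app is reachable at the URL shown and accepts a JSON body with a messages list (the port and exact request schema are assumptions, not taken from this diff):

    import requests

    payload = {"messages": [{"role": "user", "content": "Hello"}], "stream": True}
    with requests.post("http://localhost:7860/generate", json=payload, stream=True) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            if "[ERROR]" in chunk:
                print("\nserver reported an error")
                break
            print(chunk, end="", flush=True)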

@@ -614,12 +620,7 @@ def model_info():

     model_details: Union[Dict[str, Any], str] = "Model details unavailable"
     try:
-        n_embd = 'N/A'
-        if hasattr(llm, '_model') and hasattr(llm._model, 'n_embd') and callable(llm._model.n_embd):
-            try:
-                n_embd = llm._model.n_embd()
-            except Exception as embd_e:
-                logger.warning(f"Could not get n_embd: {embd_e}", extra={'request_id': request_id})
+        n_embd = get_property_or_method_value(get_property_or_method_value(llm, '_model'), 'n_embd', 'N/A')

         model_details = {
             "n_embd": n_embd,

@@ -661,7 +662,6 @@ def model_info():
     return jsonify(info), 200

 def _generate_single_cycle(messages: List[Dict[str, str]], params: Dict, stream: bool, request_id: str) -> Union[Generator[Dict, None, None], Dict]:
-    """Performs one call to llm.create_chat_completion."""
     try:
         logger.debug(f"Starting llama.cpp chat completion call. Stream: {stream}. Messages: {len(messages)}. Params summary: temp={params.get('temperature')}, top_p={params.get('top_p')}, top_k={params.get('top_k')}, stop={params.get('stop')}", extra={'request_id': request_id, 'stream': stream, 'message_count': len(messages)})
         result = llm.create_chat_completion(

@@ -733,7 +733,6 @@ def generate():
     effective_n_ctx = get_effective_n_ctx()
     input_token_count = estimate_token_count(initial_messages, request_id=request_id)

-    # Check initial input size against a buffered context window
     if input_token_count != -1 and input_token_count > effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO:
         logger.warning(f"Initial input (~{input_token_count} tokens) likely exceeds safe context window ({int(effective_n_ctx * CONTEXT_TRUNCATION_BUFFER_RATIO)}). Attempting truncation.", extra={'request_id': request_id, 'initial_tokens': input_token_count, 'n_ctx': effective_n_ctx, 'buffer_ratio': CONTEXT_TRUNCATION_BUFFER_RATIO})
         truncated_initial = truncate_messages_for_context(initial_messages, effective_n_ctx, CONTEXT_TRUNCATION_BUFFER_RATIO, request_id=request_id)

@@ -809,7 +808,7 @@ def generate():
                     if chunk_finish_reason:
                         finish_reason = chunk_finish_reason
                         usage_this_cycle = chunk_usage
-                        final_usage = usage_this_cycle
+                        final_usage = usage_this_cycle
                         break

                 if not finish_reason and generated_this_cycle_content:

@@ -837,9 +836,7 @@ def generate():
                 else:
                     current_messages[-1]['content'] += generated_this_cycle_content

-
-                # otherwise this sum will be an underestimate or rely on final usage.
-                total_completion_tokens_generated += usage_this_cycle.get("completion_tokens", 0) # This is likely 0 for most chunks
+                total_completion_tokens_generated += usage_this_cycle.get("completion_tokens", 0)

                 if finish_reason == 'stop' or finish_reason == 'end_of_stream':
                     logger.info(f"Streaming generation stopped naturally in cycle {cycle_number}. Reason: {finish_reason}", extra={'request_id': req_id, 'cycle': cycle_number, 'finish_reason': finish_reason})

@@ -875,11 +872,9 @@ def generate():
             "X-Accel-Buffering": "no",
             "X-Request-ID": request_id
         }
-        # Note: X- headers like usage can't reliably reflect accumulated values in SSE until stream ends.
-        # We rely on the JS client or final log message for total details.
         return Response(stream_with_context(streaming_generator(request_id)), headers=headers)

-    else:
+    else:
         while True:
             if MAX_CONTINUATIONS >= 0 and continuations > MAX_CONTINUATIONS:
                 logger.info(f"Max continuations ({MAX_CONTINUATIONS}) reached. Stopping non-streaming.", extra={'request_id': request_id})

@@ -986,8 +981,6 @@ def generate():
         response.headers["X-Finish-Reason"] = final_finish_reason
         response.headers["X-Continuations"] = str(continuations)
         response.headers["X-Usage-Completion-Tokens"] = str(total_completion_tokens_generated)
-        # Note: Prompt tokens and total tokens usage from the LAST cycle might be misleading
-        # if significant truncation happened in previous cycles.
         response.headers["X-Usage-Prompt-Tokens-Last-Cycle"] = str(final_usage.get("prompt_tokens", "N/A"))
        response.headers["X-Usage-Total-Tokens-Last-Cycle"] = str(final_usage.get("total_tokens", "N/A"))
         return response
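For the non-streaming path, the response carries the finish reason and usage estimates in headers. A small sketch of reading them from a client, under the same assumptions about the endpoint URL and payload shape as the streaming example above:

    import requests

    resp = requests.post(
        "http://localhost:7860/generate",
        json={"messages": [{"role": "user", "content": "Hi"}], "stream": False},
    )
    print(resp.headers.get("X-Finish-Reason"))
    print(resp.headers.get("X-Continuations"))
    print(resp.headers.get("X-Usage-Completion-Tokens"))
    print(resp.headers.get("X-Usage-Prompt-Tokens-Last-Cycle"))
    print(resp.headers.get("X-Usage-Total-Tokens-Last-Cycle"))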

@@ -1006,14 +999,5 @@ if __name__ == "__main__":
     if not llm:
         logger.critical("MODEL FAILED TO LOAD. SERVER WILL START BUT '/generate' WILL FAIL.")

-
-
-        from waitress import serve
-        logger.info("Running with Waitress production server.")
-        serve(app, host=host, port=port, threads=8)
-    except ImportError:
-        logger.warning("Waitress not found. Falling back to Flask development server. Install waitress for production.")
-        app.run(host=host, port=port, threaded=True, debug=is_debug)
-    else:
-        logger.info("Running with Flask development server (Debug=True).")
-        app.run(host=host, port=port, threaded=True, debug=is_debug, use_reloader=False)
+    logger.info("Running with Flask development server.")
+    app.run(host=host, port=port, threaded=True, debug=is_debug, use_reloader=False)