odai0 committed on
Commit af0f57d · 1 Parent(s): f9b06f5
Files changed (1)
  1. app.py +16 -13
app.py CHANGED
@@ -9,7 +9,6 @@ import requests
 
 app = FastAPI()
 MODE = os.environ.get("MODE", "LLM")
-input_limit = 2048
 
 
 class MockLLM:
@@ -25,10 +24,12 @@ print(f"Running in {MODE} mode")
 
 if MODE == "MOCK":
     # llm = MockLLM()
+    input_limit = 512
     context_length = 1024
     llm = Llama(model_path="./model/SILMA-9B-Instruct-v1.0-Q2_K_2.gguf",
                 n_ctx=context_length, n_gpu_layers=10, n_patch=256)
 else:
+    input_limit = 2048
     context_length = 4096
     llm = Llama.from_pretrained(
         repo_id="bartowski/SILMA-9B-Instruct-v1.0-GGUF",
@@ -199,25 +200,27 @@ def generate_text(request: PromptRequest):
         {
             "role": "system",
             "content": (
-                """You are an assistant for an accessibility browser extension. "
-                "Your only task is to return a **valid JSON object** based on the user's request. "
-                "The JSON must have this format:"
-                "{ "signal": string, "message": string }"
-                "Valid signal codes:"
+                """You are an assistant for an accessibility browser extension.
+                Your only task is to return a **valid JSON object** based on the user's request.
+                The JSON must have this format:
+                { "signal": string, "message": string }
+                Valid signal codes:
                 """ + codes + """
-                "Rules:"
-                "1. Always return JSON, never plain text"
-                "2. Do not include extra keys."
-                "3. Do not escape JSON unnecessarily."
-                "4. Request chunking using valid signal if user asks for analysis, summarization, or possible actions."
-                "5. If unsure, default to {"signal": "m0", "message": "I did not understand the request."}"""
+                Rules:
+                1. Always return JSON, never plain text
+                2. Do not include extra keys.
+                3. Do not escape JSON unnecessarily.
+                4. Request chunking using valid signal if user asks for analysis, summarization, or possible actions.
+                5. If unsure, default to {"signal": "m0", "message": "I did not understand the request."}"""
             )
         },
         {"role": "user", "content": request.prompt}
     ]
 
     token_count = count_tokens(format_messages(messages))
-    print(token_count)
+    if token_count > input_limit:
+        return {"signal": "e0", "message": "Input exceeds token limit."}
+
     output = llm.create_chat_completion(
         messages=messages,
         max_tokens=1024,
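
Note for reviewers: the guard added here leans on format_messages and count_tokens, which are defined earlier in app.py and fall outside this diff. A minimal sketch of what they plausibly look like, assuming the llama-cpp-python tokenizer exposed on the llm instance (hypothetical reconstruction, not the file's actual definitions):

def format_messages(messages):
    # Flatten the chat messages into one string so the token count
    # approximates what the model will actually be prompted with.
    return "\n".join(f"{m['role']}: {m['content']}" for m in messages)

def count_tokens(text):
    # Llama.tokenize() in llama-cpp-python takes bytes and returns token ids.
    return len(llm.tokenize(text.encode("utf-8")))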
 
9
 
10
  app = FastAPI()
11
  MODE = os.environ.get("MODE", "LLM")
 
12
 
13
 
14
  class MockLLM:
 
24
 
25
  if MODE == "MOCK":
26
  # llm = MockLLM()
27
+ input_limit = 512
28
  context_length = 1024
29
  llm = Llama(model_path="./model/SILMA-9B-Instruct-v1.0-Q2_K_2.gguf",
30
  n_ctx=context_length, n_gpu_layers=10, n_patch=256)
31
  else:
32
+ input_limit = 2048
33
  context_length = 4096
34
  llm = Llama.from_pretrained(
35
  repo_id="bartowski/SILMA-9B-Instruct-v1.0-GGUF",
 
200
  {
201
  "role": "system",
202
  "content": (
203
+ """You are an assistant for an accessibility browser extension.
204
+ Your only task is to return a **valid JSON object** based on the user's request.
205
+ The JSON must have this format:
206
+ { "signal": string, "message": string }
207
+ Valid signal codes:
208
  """ + codes + """
209
+ Rules:
210
+ 1. Always return JSON, never plain text
211
+ 2. Do not include extra keys.
212
+ 3. Do not escape JSON unnecessarily.
213
+ 4. Request chunking using valid signal if user asks for analysis, summarization, or possible actions.
214
+ 5. If unsure, default to {"signal": "m0", "message": "I did not understand the request."}"""
215
  )
216
  },
217
  {"role": "user", "content": request.prompt}
218
  ]
219
 
220
  token_count = count_tokens(format_messages(messages))
221
+ if token_count > input_limit:
222
+ return {"signal": "e0", "message": "Input exceeds token limit."}
223
+
224
  output = llm.create_chat_completion(
225
  messages=messages,
226
  max_tokens=1024,
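
With the limit in place, an oversized request now short-circuits with a structured error instead of overflowing the context window. A quick way to exercise the new path, assuming the app runs under uvicorn on port 8000 and the route is mounted at /generate (the decorator sits outside this diff, so both are assumptions):

import requests

# Hypothetical client call; the endpoint path and port are assumed.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "word " * 5000},  # well past either input_limit
)
print(resp.json())  # expected: {"signal": "e0", "message": "Input exceeds token limit."}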