cleaning
app.py CHANGED

@@ -9,7 +9,6 @@ import requests
 
 app = FastAPI()
 MODE = os.environ.get("MODE", "LLM")
-input_limit = 2048
 
 
 class MockLLM:
@@ -25,10 +24,12 @@ print(f"Running in {MODE} mode")
 
 if MODE == "MOCK":
     # llm = MockLLM()
+    input_limit = 512
     context_length = 1024
     llm = Llama(model_path="./model/SILMA-9B-Instruct-v1.0-Q2_K_2.gguf",
                 n_ctx=context_length, n_gpu_layers=10, n_patch=256)
 else:
+    input_limit = 2048
     context_length = 4096
     llm = Llama.from_pretrained(
         repo_id="bartowski/SILMA-9B-Instruct-v1.0-GGUF",
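
The per-mode input_limit is sized against the context window configured next to it: 512 of the 1024-token mock context and 2048 of the 4096-token full context, which leaves headroom for the max_tokens=1024 completion requested later in generate_text. The limit is enforced through count_tokens(format_messages(messages)), two helpers this commit does not touch. A minimal sketch of what they might look like on top of llama-cpp-python's tokenizer; the names are called in app.py, but these bodies are assumptions, not the repository's implementation, and they ignore chat-template overhead:

# Hypothetical helpers behind the guard added below; not part of this commit.
def format_messages(messages: list[dict]) -> str:
    # Flatten the chat messages into one string so the whole prompt is counted.
    return "\n".join(f"{m['role']}: {m['content']}" for m in messages)


def count_tokens(text: str) -> int:
    # llama-cpp-python exposes the model tokenizer; tokenize() expects bytes.
    return len(llm.tokenize(text.encode("utf-8")))
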
@@ -199,25 +200,27 @@ def generate_text(request: PromptRequest):
         {
             "role": "system",
             "content": (
-                """You are an assistant for an accessibility browser extension.
-
-
-
-
+                """You are an assistant for an accessibility browser extension.
+                Your only task is to return a **valid JSON object** based on the user's request.
+                The JSON must have this format:
+                { "signal": string, "message": string }
+                Valid signal codes:
                 """ + codes + """
-
-
-
-
-
-
+                Rules:
+                1. Always return JSON, never plain text
+                2. Do not include extra keys.
+                3. Do not escape JSON unnecessarily.
+                4. Request chunking using valid signal if user asks for analysis, summarization, or possible actions.
+                5. If unsure, default to {"signal": "m0", "message": "I did not understand the request."}"""
             )
         },
         {"role": "user", "content": request.prompt}
     ]
 
     token_count = count_tokens(format_messages(messages))
-
+    if token_count > input_limit:
+        return {"signal": "e0", "message": "Input exceeds token limit."}
+
     output = llm.create_chat_completion(
         messages=messages,
         max_tokens=1024,
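
With the guard in place, an oversized prompt is answered with the e0 signal before llm.create_chat_completion ever runs, using the same {"signal", "message"} shape the system prompt demands from the model. A quick way to exercise both paths, assuming the handler is exposed as POST /generate with a JSON field named prompt on a local uvicorn instance; the route and port do not appear in this diff and are placeholders:

import requests

BASE = "http://localhost:8000"  # assumed local address, not taken from this diff

# Normal request: the model should answer with {"signal": ..., "message": ...}.
ok = requests.post(f"{BASE}/generate", json={"prompt": "Summarize this page for a screen reader"})
print(ok.json())

# Oversized request: the new guard replies with signal "e0" without calling the model.
huge = requests.post(f"{BASE}/generate", json={"prompt": "word " * 5000})
print(huge.json())  # {"signal": "e0", "message": "Input exceeds token limit."}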