odai0 committed on
Commit af0f57d · 1 Parent(s): f9b06f5
Files changed (1)
  1. app.py +16 -13
app.py CHANGED
@@ -9,7 +9,6 @@ import requests
 
 app = FastAPI()
 MODE = os.environ.get("MODE", "LLM")
-input_limit = 2048
 
 
 class MockLLM:
@@ -25,10 +24,12 @@ print(f"Running in {MODE} mode")
 
 if MODE == "MOCK":
     # llm = MockLLM()
+    input_limit = 512
     context_length = 1024
     llm = Llama(model_path="./model/SILMA-9B-Instruct-v1.0-Q2_K_2.gguf",
                 n_ctx=context_length, n_gpu_layers=10, n_patch=256)
 else:
+    input_limit = 2048
     context_length = 4096
     llm = Llama.from_pretrained(
         repo_id="bartowski/SILMA-9B-Instruct-v1.0-GGUF",
@@ -199,25 +200,27 @@ def generate_text(request: PromptRequest):
         {
             "role": "system",
             "content": (
-                """You are an assistant for an accessibility browser extension. "
-                "Your only task is to return a **valid JSON object** based on the user's request. "
-                "The JSON must have this format:"
-                "{ "signal": string, "message": string }"
-                "Valid signal codes:"
+                """You are an assistant for an accessibility browser extension.
+                Your only task is to return a **valid JSON object** based on the user's request.
+                The JSON must have this format:
+                { "signal": string, "message": string }
+                Valid signal codes:
                 """ + codes + """
-                "Rules:"
-                "1. Always return JSON, never plain text"
-                "2. Do not include extra keys."
-                "3. Do not escape JSON unnecessarily."
-                "4. Request chunking using valid signal if user asks for analysis, summarization, or possible actions."
-                "5. If unsure, default to {"signal": "m0", "message": "I did not understand the request."}"""
+                Rules:
+                1. Always return JSON, never plain text
+                2. Do not include extra keys.
+                3. Do not escape JSON unnecessarily.
+                4. Request chunking using valid signal if user asks for analysis, summarization, or possible actions.
+                5. If unsure, default to {"signal": "m0", "message": "I did not understand the request."}"""
             )
         },
         {"role": "user", "content": request.prompt}
     ]
 
     token_count = count_tokens(format_messages(messages))
-    print(token_count)
+    if token_count > input_limit:
+        return {"signal": "e0", "message": "Input exceeds token limit."}
+
     output = llm.create_chat_completion(
         messages=messages,
         max_tokens=1024,
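
Note for reviewers: the guard added here leans on format_messages and count_tokens, which are defined earlier in app.py and fall outside this diff. A minimal sketch of what they plausibly look like, assuming the llama-cpp-python tokenizer exposed on the llm instance (hypothetical reconstruction, not the file's actual definitions):

def format_messages(messages):
    # Flatten the chat messages into one string so the token count
    # approximates what the model will actually be prompted with.
    return "\n".join(f"{m['role']}: {m['content']}" for m in messages)

def count_tokens(text):
    # Llama.tokenize() in llama-cpp-python takes bytes and returns token ids.
    return len(llm.tokenize(text.encode("utf-8")))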
 
9
 
10
  app = FastAPI()
11
  MODE = os.environ.get("MODE", "LLM")
 
12
 
13
 
14
  class MockLLM:
 
24
 
25
  if MODE == "MOCK":
26
  # llm = MockLLM()
27
+ input_limit = 512
28
  context_length = 1024
29
  llm = Llama(model_path="./model/SILMA-9B-Instruct-v1.0-Q2_K_2.gguf",
30
  n_ctx=context_length, n_gpu_layers=10, n_patch=256)
31
  else:
32
+ input_limit = 2048
33
  context_length = 4096
34
  llm = Llama.from_pretrained(
35
  repo_id="bartowski/SILMA-9B-Instruct-v1.0-GGUF",
 
200
  {
201
  "role": "system",
202
  "content": (
203
+ """You are an assistant for an accessibility browser extension.
204
+ Your only task is to return a **valid JSON object** based on the user's request.
205
+ The JSON must have this format:
206
+ { "signal": string, "message": string }
207
+ Valid signal codes:
208
  """ + codes + """
209
+ Rules:
210
+ 1. Always return JSON, never plain text
211
+ 2. Do not include extra keys.
212
+ 3. Do not escape JSON unnecessarily.
213
+ 4. Request chunking using valid signal if user asks for analysis, summarization, or possible actions.
214
+ 5. If unsure, default to {"signal": "m0", "message": "I did not understand the request."}"""
215
  )
216
  },
217
  {"role": "user", "content": request.prompt}
218
  ]
219
 
220
  token_count = count_tokens(format_messages(messages))
221
+ if token_count > input_limit:
222
+ return {"signal": "e0", "message": "Input exceeds token limit."}
223
+
224
  output = llm.create_chat_completion(
225
  messages=messages,
226
  max_tokens=1024,
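
With the limit in place, an oversized request now short-circuits with a structured error instead of overflowing the context window. A quick way to exercise the new path, assuming the app runs under uvicorn on port 8000 and the route is mounted at /generate (the decorator sits outside this diff, so both are assumptions):

import requests

# Hypothetical client call; the endpoint path and port are assumed.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "word " * 5000},  # well past either input_limit
)
print(resp.json())  # expected: {"signal": "e0", "message": "Input exceeds token limit."}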