Update app.py
app.py CHANGED
@@ -253,7 +253,11 @@ def generate_qwen3_gguf(prompt: str) -> (str, str):
     messages = [
         {"role": "user", "content": prompt}
     ]
-
+    # Set max_tokens or max_new_tokens to keep total tokens <= 512
+    response = qwen3_gguf_llm.create_chat_completion(
+        messages=messages,
+        max_tokens=512  # or smaller, adjust to fit your use case
+    )
     generated_text = response['choices'][0]['message']['content']
 
     if "</think>" in generated_text:
@@ -264,6 +268,7 @@ def generate_qwen3_gguf(prompt: str) -> (str, str):
 
 
 
+
 @app.post("/generate/{model_name}", response_model=GenerateResponse)
 async def generate(
     request: PromptRequest,
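
For context, a minimal sketch of how the patched function could look as a whole. It assumes qwen3_gguf_llm is a llama_cpp.Llama instance loaded elsewhere in app.py, that the model path and n_ctx value shown are placeholders, and that the reasoning and final answer are returned as separate strings (the exact return order is not visible in this diff):

from llama_cpp import Llama

# Assumption: the Space loads the GGUF model once at startup; the path and
# n_ctx=512 are placeholders echoing the "total tokens <= 512" comment above.
qwen3_gguf_llm = Llama(model_path="qwen3.gguf", n_ctx=512)

def generate_qwen3_gguf(prompt: str) -> (str, str):
    messages = [
        {"role": "user", "content": prompt}
    ]
    # Set max_tokens or max_new_tokens to keep total tokens <= 512
    response = qwen3_gguf_llm.create_chat_completion(
        messages=messages,
        max_tokens=512  # or smaller, adjust to fit your use case
    )
    generated_text = response['choices'][0]['message']['content']

    # Qwen3 emits its reasoning inside <think>...</think> before the answer;
    # split it off so the caller receives the two parts separately.
    if "</think>" in generated_text:
        thinking, answer = generated_text.split("</think>", 1)
        return thinking.replace("<think>", "").strip(), answer.strip()
    return "", generated_text.strip()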