Update app.py
app.py
CHANGED
@@ -66,7 +66,7 @@ import multiprocessing
 
 import llama_cpp
 
-def
+def low_level():
 
 
 llama_cpp.llama_backend_init(numa=False)
@@ -283,6 +283,15 @@ def respond(
     top_k: int,
     repeat_penalty: float,
 ):
+    llama = Llama("models/madlad400-3b-mt-q8_0.gguf")
+    tokens = llama.tokenize(message.encode("utf-8"))
+    llama.encode(tokens)
+    tokens = [llama.decoder_start_token()]
+    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
+        yield (llama.detokenize([token]))
+        if token == llama.token_eos():
+            break
+
     """
     Respond to a message using the Gemma3 model via Llama.cpp.
 
@@ -320,7 +329,7 @@ def respond(
         llm_model = model
 
     trans(message)
-    yield "done"
+    #yield "done"
 
     provider = LlamaCppPythonProvider(llm)
 
@@ -367,7 +376,7 @@ def respond(
         outputs = ""
         for output in stream:
            outputs += output
-           yield outputs
+           #yield outputs
 
    # Handle exceptions that may occur during the process
    except Exception as e:
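For context, here is a self-contained sketch of the encoder-decoder translation loop that the new lines in respond() implement, built from the same llama-cpp-python calls that appear in the diff (tokenize, encode, decoder_start_token, generate, detokenize, token_eos). The translate() helper, the UTF-8 handling, and the "<2en>" target-language tag are illustrative assumptions, not part of the commit.

```python
from llama_cpp import Llama

# Load the MADLAD-400 3B MT model (an encoder-decoder translation model in GGUF
# format); same path as in the diff.
llama = Llama("models/madlad400-3b-mt-q8_0.gguf")

def translate(message: str) -> str:
    """Illustrative helper: run the encoder once, then decode token by token until EOS."""
    # The encoder consumes the source sentence as bytes.
    llama.encode(llama.tokenize(message.encode("utf-8")))

    # Decoding starts from the model's decoder-start token, not from the prompt.
    tokens = [llama.decoder_start_token()]
    pieces = []
    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
        if token == llama.token_eos():
            break
        pieces.append(llama.detokenize([token]))  # detokenize returns bytes
    return b"".join(pieces).decode("utf-8", errors="replace")

# Assumption: MADLAD-400 expects a target-language tag in front of the source
# text, e.g. "<2en>" to translate into English.
print(translate("<2en> Bonjour tout le monde"))
```

Unlike the diff, which yields each detokenized piece (including the final one) before checking for the end-of-sequence token, the sketch checks for EOS first so the end-of-sequence piece is never emitted.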