Update app.py
app.py CHANGED

@@ -82,8 +82,8 @@ def respond(
         if llama == None:
             llama = Llama("models/t5-query-reformulation-RL-q8_0.gguf",flash_attn=False,
                           n_gpu_layers=0,
-                          n_batch=
-                          n_ctx=
+                          n_batch=32,
+                          n_ctx=512,
                           n_threads=2,
                           n_threads_batch=2)
 
@@ -91,11 +91,14 @@ def respond(
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
         outputs =""
-
-
-
-
-
+        iteration = 5
+        for i in range(iteration):
+            for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+                outputs+= llama.detokenize([token]).decode()
+                yield outputs
+                if token == llama.token_eos():
+                    break
+            outputs+="\n"
         return outputs
     except Exception as e:
         # Custom exception handling
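For readers who want to try the updated path outside the Space, below is a minimal, self-contained sketch of the loop this commit adds, using llama-cpp-python's low-level Llama API. The helper name reformulate, the sampling defaults, and the explicit tokenize call are illustrative assumptions; the model path, the constructor arguments, and the encode/generate/detokenize sequence come from the diff itself.

# Hedged sketch, not the app's exact code: a standalone version of the
# decoding loop this commit adds, via llama-cpp-python's low-level Llama API.
from llama_cpp import Llama

llama = Llama(
    "models/t5-query-reformulation-RL-q8_0.gguf",
    flash_attn=False,
    n_gpu_layers=0,       # CPU-only inference
    n_batch=32,           # prompt batch size set by this commit
    n_ctx=512,            # context window set by this commit
    n_threads=2,
    n_threads_batch=2,
)

def reformulate(query, top_k=40, top_p=0.95, temperature=0.8, repeat_penalty=1.1):
    # Assumption: the query is tokenized here; in app.py the token list is
    # prepared earlier in respond(). Run the T5 encoder, then seed the decoder.
    llama.encode(llama.tokenize(query.encode()))
    tokens = [llama.decoder_start_token()]
    outputs = ""
    for _ in range(5):    # the commit emits five candidate reformulations
        for token in llama.generate(tokens, top_k=top_k, top_p=top_p,
                                    temp=temperature, repeat_penalty=repeat_penalty):
            outputs += llama.detokenize([token]).decode()
            yield outputs                  # stream the partial text, as the diff does
            if token == llama.token_eos():
                break
        outputs += "\n"                    # newline-separate the candidates

Because respond() now yields, the caller (presumably Gradio, given the respond() signature) can re-render the output on every partial string, and the trailing return outputs after a yield simply ends the generator. One practical caveat: detokenize() can split multibyte UTF-8 sequences across tokens, so decode() may need errors="ignore" in a robust version.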