Update app.py
app.py CHANGED
```diff
@@ -98,17 +98,35 @@ def predict_chat(message: str, history: list):
         prompt_input += f"Assistant: {msg['content']}\n"
     prompt_input += "Assistant:"
 
-    # FIXED: Use
-
-
-
-
-
-
-
-
-
-
+    # FIXED: Use the correct ctransformers method - call model() directly for streaming
+    try:
+        for token in model(
+            prompt_input,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            top_k=TOP_K,
+            top_p=TOP_P,
+            do_sample=DO_SAMPLE,
+            repetition_penalty=1.1,
+            stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
+            stream=True
+        ):
+            generated_text += token
+            yield generated_text
+    except Exception as e:
+        print(f"Error in GGUF generation: {e}")
+        # Fallback to non-streaming generation
+        output = model(
+            prompt_input,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            top_k=TOP_K,
+            top_p=TOP_P,
+            do_sample=DO_SAMPLE,
+            repetition_penalty=1.1,
+            stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
+        )
+        yield output
         generated_text += token
         yield generated_text
 
```
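For readers unfamiliar with the pattern this commit adopts: a ctransformers model object is callable, and passing `stream=True` makes the call yield text chunks as they are generated, which a Gradio generator function can re-yield as a growing string. The sketch below is a self-contained illustration, not this Space's actual code: the checkpoint name, model file, sampling constants, and prompt-building loop are assumptions, and `do_sample` is omitted because it is not a keyword argument I can confirm ctransformers accepts.

```python
# Minimal sketch of the ctransformers streaming pattern used in this commit.
from ctransformers import AutoModelForCausalLM

MAX_NEW_TOKENS = 256   # assumed values; the Space defines its own constants
TEMPERATURE = 0.7
TOP_K = 40
TOP_P = 0.95

# Hypothetical GGUF checkpoint; substitute the model this Space actually loads.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-GGUF",
    model_file="llama-2-7b-chat.Q4_K_M.gguf",
    model_type="llama",
)

def predict_chat(message: str, history: list):
    # Rebuild a plain-text prompt from the chat history
    # (assumed shape: list of {"role": ..., "content": ...} dicts).
    prompt_input = ""
    for msg in history:
        role = "User" if msg["role"] == "user" else "Assistant"
        prompt_input += f"{role}: {msg['content']}\n"
    prompt_input += f"User: {message}\n"
    prompt_input += "Assistant:"

    generated_text = ""
    # Calling the model directly with stream=True yields text chunks
    # as they are produced, instead of returning one final string.
    for token in model(
        prompt_input,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_k=TOP_K,
        top_p=TOP_P,
        repetition_penalty=1.1,
        stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
        stream=True,
    ):
        generated_text += token
        # Yield the accumulated reply so the UI shows it growing token by token.
        yield generated_text
```

Yielding the accumulated string rather than each individual chunk matches how `gr.ChatInterface` consumes generator functions: every yield replaces the displayed assistant message, so the reply appears to stream.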