Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -130,7 +130,23 @@ def answer(system: str, context: str, question: str, user_id="demo", history="No
|
|
| 130 |
context_list += store["texts"]
|
| 131 |
|
| 132 |
# 2. Build a Qwen-chat prompt (helper defined earlier)
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# 3. Generate and strip everything before the assistant tag
|
| 136 |
load_chat()
|
|
|
|
| 130 |
context_list += store["texts"]
|
| 131 |
|
| 132 |
# 2. Build a Qwen-chat prompt (helper defined earlier)
|
| 133 |
+
MAX_PROMPT_TOKENS = 8192 # 8 k is ~4 GB KV-cache
|
| 134 |
+
|
| 135 |
+
prompt = build_qwen_prompt(system, context_list, question)
|
| 136 |
+
tokens = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
|
| 137 |
+
|
| 138 |
+
if tokens.input_ids.size(1) > MAX_PROMPT_TOKENS:
|
| 139 |
+
# keep the last MAX_PROMPT_TOKENS tokens (most recent content)
|
| 140 |
+
tokens = {k: v[:, -MAX_PROMPT_TOKENS:] for k, v in tokens.items()}
|
| 141 |
+
|
| 142 |
+
tokens = {k: v.to(chat_model.device) for k, v in tokens.items()}
|
| 143 |
+
|
| 144 |
+
output = chat_model.generate(
|
| 145 |
+
**tokens,
|
| 146 |
+
max_new_tokens=512,
|
| 147 |
+
max_length=MAX_PROMPT_TOKENS + 512,
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
|
| 151 |
# 3. Generate and strip everything before the assistant tag
|
| 152 |
load_chat()
|