fsojni committed on
Commit
6947209
·
verified ·
1 Parent(s): d52709d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -1
app.py CHANGED
@@ -130,7 +130,23 @@ def answer(system: str, context: str, question: str, user_id="demo", history="No
130
  context_list += store["texts"]
131
 
132
  # 2. Build a Qwen-chat prompt (helper defined earlier)
133
- prompt = build_qwen_prompt(system, context_list, question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  # 3. Generate and strip everything before the assistant tag
136
  load_chat()
 
130
  context_list += store["texts"]
131
 
132
  # 2. Build a Qwen-chat prompt (helper defined earlier)
133
+ MAX_PROMPT_TOKENS = 8192 # 8 k is ~4 GB KV-cache
134
+
135
+ prompt = build_qwen_prompt(system, context_list, question)
136
+ tokens = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
137
+
138
+ if tokens.input_ids.size(1) > MAX_PROMPT_TOKENS:
139
+ # keep the last MAX_PROMPT_TOKENS tokens (most recent content)
140
+ tokens = {k: v[:, -MAX_PROMPT_TOKENS:] for k, v in tokens.items()}
141
+
142
+ tokens = {k: v.to(chat_model.device) for k, v in tokens.items()}
143
+
144
+ output = chat_model.generate(
145
+ **tokens,
146
+ max_new_tokens=512,
147
+ max_length=MAX_PROMPT_TOKENS + 512,
148
+ )
149
+
150
 
151
  # 3. Generate and strip everything before the assistant tag
152
  load_chat()