MarshallCN committed
Commit c9580d5 · 1 Parent(s): cdf40d3

fix max_tokens > n_ctx (512 for llama_cpp) issue

Files changed (2)
  1. README.md +1 -1
  2. ggufv2.py +127 -21
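
Background for the fix: the UI's max_new_tokens slider went up to 1024, but llama_cpp loads this model with a 512-token context window (per the commit message), so prompt tokens plus the requested budget could exceed n_ctx and generation would fail or come back truncated. A minimal sketch of the budget rule the patch enforces; the model path and pad value are illustrative, not from the repo:

```python
from llama_cpp import Llama

# Illustrative model path; llama-cpp-python defaults to a small n_ctx (512 here).
llm = Llama(model_path="model.gguf")

prompt = "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
used = len(llm.tokenize(prompt.encode("utf-8"), add_bos=True))

# Clamp the generation budget so prompt + new tokens stay inside the window.
pad = 8  # small safety margin, mirroring the patch
safe_max_new = max(1, llm.n_ctx() - used - pad)
out = llm.create_completion(prompt=prompt, max_tokens=safe_max_new)
```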
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: "🧠"
 colorFrom: "indigo"
 colorTo: "red"
 sdk: "gradio"
-sdk_version: "4.44.0"
+sdk_version: "5.49.1"
 app_file: "ggufv2.py"
 pinned: false
 Script path: "build.sh"
ggufv2.py CHANGED
@@ -10,7 +10,6 @@ from llama_cpp import Llama
 
 # Multi-session helpers from utils.py
 from utils import mk_msg_dir, _as_dir, persist_messages
-
 # ===================== Model =====================
 # You can swap to another GGUF by changing repo_id/filename.
 model = Llama.from_pretrained(
@@ -20,21 +19,62 @@ model = Llama.from_pretrained(
 
 assistant_name = "Nova"
 user_name = "Marshall"
-persona = f"""Your name is {assistant_name}. Address the user as "{user_name}". Use Markdown; put code in fenced blocks with a language tag. Be concise but never give empty feedback.""".strip()
+persona = f"""Your name is {assistant_name}. Address the user as "{user_name}". Use Markdown; put code in fenced blocks with a language tag.""".strip()
 
 # Where each conversation (session) persists its messages
 BASE_MSG_DIR = Path("./msgs/msgs_QwenGGUF")
 BASE_MSG_DIR.mkdir(parents=True, exist_ok=True)
 
 # ---------- Qwen chat template (no tools) ----------
-def render_qwen(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> str:
+# def render_qwen(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> str:
+#     """
+#     Convert OpenAI-style messages to Qwen2.5 Instruct format:
+#     <|im_start|>system ... <|im_end|>
+#     <|im_start|>user ... <|im_end|>
+#     <|im_start|>assistant (generation continues here)
+#     """
+#     # System prompt
+#     if messages and messages[0].get("role") == "system":
+#         sys_txt = messages[0]["content"]
+#         rest = messages[1:]
+#     else:
+#         sys_txt = persona
+#         rest = messages
+
+#     parts = [f"<|im_start|>system\n{sys_txt}<|im_end|>\n"]
+#     for m in rest:
+#         role = m.get("role")
+#         if role not in ("user", "assistant"):
+#             continue
+#         parts.append(f"<|im_start|>{role}\n{m['content']}<|im_end|>\n")
+
+#     if add_generation_prompt:
+#         parts.append("<|im_start|>assistant\n")
+#     return "".join(parts)
+
+def render_qwen_trim(
+    messages: List[Dict[str, str]],
+    model,                             # llama_cpp.Llama instance (used for token counting)
+    n_ctx: Optional[int] = None,       # defaults to model.n_ctx() when omitted
+    add_generation_prompt: bool = True,
+    persona: str = "",
+    reserve_new: int = 256,            # desired upper bound on newly generated tokens
+    pad: int = 8,                      # safety margin against overrunning the window
+    hard_user_tail_chars: int = 2000,  # hard character cut for the last user message as a last resort
+) -> Tuple[str, int]:
     """
-    Convert OpenAI-style messages to Qwen2.5 Instruct format:
-    <|im_start|>system ... <|im_end|>
-    <|im_start|>user ... <|im_end|>
-    <|im_start|>assistant (generation continues here)
+    - Keep system + only as many recent turns as fit: total_tokens + reserve_new + pad <= n_ctx.
+    - If that is still not enough, truncate the last user message.
+    - Returns (prompt, safe_max_new); safe_max_new is guaranteed to stay inside the window.
     """
-    # System prompt
+    def _tok_len(txt: str) -> int:
+        # Count tokens the same way llama_cpp does
+        return len(model.tokenize(txt.encode("utf-8"), add_bos=True))
+
+    if n_ctx is None:
+        n_ctx = getattr(model, "n_ctx")() if callable(getattr(model, "n_ctx", None)) else model.n_ctx
+
+    # 1) Split the system message from the rest
     if messages and messages[0].get("role") == "system":
         sys_txt = messages[0]["content"]
         rest = messages[1:]
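
For reference, both the retired render_qwen and the _render helper in the next hunk emit Qwen's ChatML-style layout. A sketch of the rendered string for a one-turn chat (contents invented):

```python
# What the renderer produces for one user turn (contents invented):
example_prompt = (
    "<|im_start|>system\nYour name is Nova. ...<|im_end|>\n"
    "<|im_start|>user\nHello!<|im_end|>\n"
    "<|im_start|>assistant\n"  # generation continues from here
)
```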
@@ -42,16 +82,72 @@ def render_qwen(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> str:
         sys_txt = persona
         rest = messages
 
-    parts = [f"<|im_start|>system\n{sys_txt}<|im_end|>\n"]
-    for m in rest:
-        role = m.get("role")
-        if role not in ("user", "assistant"):
-            continue
-        parts.append(f"<|im_start|>{role}\n{m['content']}<|im_end|>\n")
-
-    if add_generation_prompt:
-        parts.append("<|im_start|>assistant\n")
-    return "".join(parts)
+    # Keep only user / assistant messages
+    rest = [m for m in rest if m.get("role") in ("user", "assistant")]
+
+    # 2) Render helper: turn system + a list of turns into a Qwen prompt
+    def _render(sys_text: str, turns: List[Dict[str, str]], add_gen: bool) -> str:
+        parts = [f"<|im_start|>system\n{sys_text}<|im_end|>\n"]
+        for m in turns:
+            parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n")
+        if add_gen:
+            parts.append("<|im_start|>assistant\n")
+        return "".join(parts)
+
+    # 3) Start from all turns and drop the oldest until the prompt fits
+    kept = rest[:]  # shallow copy
+    while True:
+        prompt = _render(sys_txt, kept, add_generation_prompt)
+        used = _tok_len(prompt)
+
+        # How many tokens can still be generated safely
+        safe_max_new = max(1, n_ctx - used - pad)
+        # We want reserve_new, but cannot exceed safe_max_new
+        if used + reserve_new + pad <= n_ctx:
+            # There is headroom; return up to reserve_new
+            return prompt, min(reserve_new, safe_max_new)
+
+        # No headroom: history must be trimmed. If nothing is left to drop, fall back to hard truncation
+        if len(kept) <= 1:
+            break  # only the last message remains; hard-truncate it below
+
+        # Drop from the oldest; drop two at a time (user + assistant) to keep pairs intact,
+        # but fall back to dropping a single message if the head is not a pair.
+        drop_count = 2 if len(kept) >= 2 else 1
+        # Always keep at least one message (the last user turn) for context
+        while drop_count > 0 and len(kept) > 1:
+            kept.pop(0)
+            drop_count -= 1
+
+    # 4) Still too long: hard-truncate the tail of the last user message.
+    # Goal: keep the most recent semantics while freeing token space immediately.
+    if kept and kept[-1]["role"] == "user":
+        kept[-1] = {
+            "role": "user",
+            "content": kept[-1]["content"][-hard_user_tail_chars:]
+        }
+    elif kept:
+        # The last message is not a user turn (usually assistant); truncate it instead
+        kept[-1] = {
+            "role": kept[-1]["role"],
+            "content": kept[-1]["content"][-hard_user_tail_chars:]
+        }
+
+    # Re-render and compute the final safe budget
+    prompt = _render(sys_txt, kept, add_generation_prompt)
+    used = _tok_len(prompt)
+    safe_max_new = max(1, n_ctx - used - pad)
+
+    # If it still overflows (extremely long system prompt), trim the system text too
+    if used + pad > n_ctx:
+        trimmed_sys = sys_txt[-hard_user_tail_chars:]
+        prompt = _render(trimmed_sys, kept, add_generation_prompt)
+        used = _tok_len(prompt)
+        safe_max_new = max(1, n_ctx - used - pad)
+
+    # Never return a zero or negative budget
+    return prompt, max(1, safe_max_new)
 
 STOP_TOKENS = ["<|im_end|>", "<|endoftext|>"]
 
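A quick way to see the contract the new helper guarantees. This check is not in the commit, and the messages are invented:

```python
# Hypothetical sanity check of render_qwen_trim's (prompt, max_new) contract.
msgs = [{"role": "user", "content": "Summarise llama.cpp context limits."}]
prompt, max_new = render_qwen_trim(messages=msgs, model=model,
                                   persona=persona, reserve_new=256, pad=8)

used = len(model.tokenize(prompt.encode("utf-8"), add_bos=True))
# Either the budget fits inside n_ctx, or it was clamped to the floor of 1.
assert used + max_new + 8 <= model.n_ctx() or max_new == 1
```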
@@ -185,14 +281,24 @@ def on_send(user_text: str,
 
     # 3) append user, render, generate
     messages = messages + [{"role": "user", "content": user_text}]
-    prompt = render_qwen(messages, add_generation_prompt=True)
+    # prompt = render_qwen(messages, add_generation_prompt=True)
+    prompt, max_new = render_qwen_trim(
+        messages=messages,
+        model=model,                 # the llama_cpp.Llama instance
+        n_ctx=None,                  # None -> use model.n_ctx()
+        add_generation_prompt=True,
+        persona=persona,             # the persona defined above
+        reserve_new=max_new_tokens,  # requested generation length
+        pad=16
+    )
+
 
     try:
         result = model.create_completion(
            prompt=prompt,
            temperature=float(temperature),
            top_p=float(top_p),
-           max_tokens=int(max_new_tokens),
+           max_tokens=int(max_new),
            repeat_penalty=float(repetition_penalty),
            stop=STOP_TOKENS,
         )
@@ -202,7 +308,7 @@
            prompt,
            temperature=float(temperature),
            top_p=float(top_p),
-           max_tokens=int(max_new_tokens),
+           max_tokens=int(max_new),
            repeat_penalty=float(repetition_penalty),
            stop=STOP_TOKENS,
         )
@@ -236,7 +342,7 @@ with gr.Blocks(title="Qwen GGUF — multi-session") as demo:
     with gr.Accordion("Generation settings", open=False):
         temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="temperature")
         top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
-        max_new_tokens = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
+        max_new_tokens = gr.Slider(16, 512, value=256, step=16, label="max_new_tokens")
         repetition_penalty = gr.Slider(1.0, 2.0, value=1.07, step=0.01, label="repetition_penalty")
 
     session_list = gr.Radio(choices=[], value=None, label="Conversations", interactive=True)
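
The slider cap drops from 1024 to 512 so the UI can no longer request more than the model's default context window. A possible follow-up, not part of this commit, would be to derive the bound from the loaded model instead of hardcoding it (assuming `model` is in scope when the UI is built):

```python
# Hypothetical variant: tie the slider's ceiling to the actual context size.
ctx = model.n_ctx()  # 512 under the default llama_cpp settings
max_new_tokens = gr.Slider(16, ctx, value=min(256, ctx // 2), step=16,
                           label="max_new_tokens")
```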