MarshallCN committed
Commit c9580d5 · Parent(s): cdf40d3
fix max_tokens > n_ctx (512 for llama_cpp) issue
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: "🧠"
 colorFrom: "indigo"
 colorTo: "red"
 sdk: "gradio"
-sdk_version: "…"
+sdk_version: "5.49.1"
 app_file: "ggufv2.py"
 pinned: false
 Script path: "build.sh"
ggufv2.py CHANGED
@@ -10,7 +10,6 @@ from llama_cpp import Llama
 
 # Multi-session helpers from utils.py
 from utils import mk_msg_dir, _as_dir, persist_messages
-
 # ===================== Model =====================
 # You can swap to another GGUF by changing repo_id/filename.
 model = Llama.from_pretrained(
@@ -20,21 +19,62 @@ model = Llama.from_pretrained(
 
 assistant_name = "Nova"
 user_name = "Marshall"
-persona = f"""Your name is {assistant_name}. Address the user as "{user_name}". Use Markdown; put code in fenced blocks with a language tag."""
+persona = f"""Your name is {assistant_name}. Address the user as "{user_name}". Use Markdown; put code in fenced blocks with a language tag.""".strip()
 
 # Where each conversation (session) persists its messages
 BASE_MSG_DIR = Path("./msgs/msgs_QwenGGUF")
 BASE_MSG_DIR.mkdir(parents=True, exist_ok=True)
 
 # ---------- Qwen chat template (no tools) ----------
-def render_qwen(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> str:
-    """
-    Convert OpenAI-style messages to Qwen2.5 Instruct format:
-      <|im_start|>system ... <|im_end|>
-      <|im_start|>user ... <|im_end|>
-      <|im_start|>assistant (generation continues here)
-    """
-
+# def render_qwen(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> str:
+#     """
+#     Convert OpenAI-style messages to Qwen2.5 Instruct format:
+#       <|im_start|>system ... <|im_end|>
+#       <|im_start|>user ... <|im_end|>
+#       <|im_start|>assistant (generation continues here)
+#     """
+#     # System prompt
+#     if messages and messages[0].get("role") == "system":
+#         sys_txt = messages[0]["content"]
+#         rest = messages[1:]
+#     else:
+#         sys_txt = persona
+#         rest = messages
+
+#     parts = [f"<|im_start|>system\n{sys_txt}<|im_end|>\n"]
+#     for m in rest:
+#         role = m.get("role")
+#         if role not in ("user", "assistant"):
+#             continue
+#         parts.append(f"<|im_start|>{role}\n{m['content']}<|im_end|>\n")
+
+#     if add_generation_prompt:
+#         parts.append("<|im_start|>assistant\n")
+#     return "".join(parts)
+
+def render_qwen_trim(
+    messages: List[Dict[str, str]],
+    model,                               # llama_cpp.Llama instance (used for token counting)
+    n_ctx: Optional[int] = None,         # falls back to model.n_ctx() when omitted
+    add_generation_prompt: bool = True,
+    persona: str = "",
+    reserve_new: int = 256,              # desired budget (upper bound) for newly generated tokens
+    pad: int = 8,                        # safety margin to avoid overrunning the context
+    hard_user_tail_chars: int = 2000,    # hard character cut for the last user message as a last resort
+) -> Tuple[str, int]:
+    """
+    - Keep only system + the most recent turns so that total_tokens + reserve_new + pad <= n_ctx.
+    - If that is still not enough, truncate the last user message.
+    - Returns (prompt, safe_max_new); safe_max_new is guaranteed not to overrun the context.
+    """
+    def _tok_len(txt: str) -> int:
+        # Keep the count consistent with llama_cpp's own tokenizer
+        return len(model.tokenize(txt.encode("utf-8"), add_bos=True))
+
+    if n_ctx is None:
+        n_ctx = getattr(model, "n_ctx")() if callable(getattr(model, "n_ctx", None)) else model.n_ctx
+
+    # 1) Split the system message from the rest
     if messages and messages[0].get("role") == "system":
         sys_txt = messages[0]["content"]
         rest = messages[1:]
@@ -42,16 +82,72 @@ def render_qwen(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> str:
         sys_txt = persona
         rest = messages
 
-    parts = [f"<|im_start|>system\n{sys_txt}<|im_end|>\n"]
-    for m in rest:
-        role = m.get("role")
-        if role not in ("user", "assistant"):
-            continue
-        parts.append(f"<|im_start|>{role}\n{m['content']}<|im_end|>\n")
+    # Keep only user / assistant messages
+    rest = [m for m in rest if m.get("role") in ("user", "assistant")]
+
+    # 2) Renderer: turn system + a list of turns into a Qwen prompt
+    def _render(sys_text: str, turns: List[Dict[str, str]], add_gen: bool) -> str:
+        parts = [f"<|im_start|>system\n{sys_text}<|im_end|>\n"]
+        for m in turns:
+            parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n")
+        if add_gen:
+            parts.append("<|im_start|>assistant\n")
+        return "".join(parts)
+
+    # 3) Start with all turns and drop the oldest until the prompt fits
+    kept = rest[:]  # shallow copy is enough here
+    while True:
+        prompt = _render(sys_txt, kept, add_generation_prompt)
+        used = _tok_len(prompt)
+
+        # Tokens we can still generate safely
+        safe_max_new = max(1, n_ctx - used - pad)
+        # We want reserve_new, but never more than safe_max_new
+        if used + reserve_new + pad <= n_ctx:
+            # Enough headroom: cap generation at reserve_new
+            return prompt, min(reserve_new, safe_max_new)
+
+        # No headroom -- trim history; once nothing more can be dropped, fall through to the hard cut
+        if len(kept) <= 1:
+            break  # only the last message left; hard-truncate it below
+
+        # Drop from the oldest; drop two at a time (user+assistant) to keep pairs intact,
+        # falling back to a single drop when no pair is available.
+        drop_count = 2 if len(kept) >= 2 else 1
+        # Always keep at least one message (the last user turn) for context
+        while drop_count > 0 and len(kept) > 1:
+            kept.pop(0)
+            drop_count -= 1
 
-    if add_generation_prompt:
-        parts.append("<|im_start|>assistant\n")
-    return "".join(parts)
+    # 4) Still too long: hard-truncate the tail of the last user message.
+    #    Goal: keep the most recent context while freeing token space immediately.
+    if kept and kept[-1]["role"] == "user":
+        kept[-1] = {
+            "role": "user",
+            "content": kept[-1]["content"][-hard_user_tail_chars:]
+        }
+    elif kept:
+        # The last message is not a user turn (usually assistant); truncate it the same way
+        kept[-1] = {
+            "role": kept[-1]["role"],
+            "content": kept[-1]["content"][-hard_user_tail_chars:]
+        }
+
+    # Re-render and compute the final safe generation budget
+    prompt = _render(sys_txt, kept, add_generation_prompt)
+    used = _tok_len(prompt)
+    safe_max_new = max(1, n_ctx - used - pad)
+
+    # If it still does not fit (an extremely long system prompt), truncate the system text too
+    if used + pad > n_ctx:
+        trimmed_sys = sys_txt[-hard_user_tail_chars:]
+        prompt = _render(trimmed_sys, kept, add_generation_prompt)
+        used = _tok_len(prompt)
+        safe_max_new = max(1, n_ctx - used - pad)
+
+    # Never return a zero or negative budget
+    return prompt, max(1, safe_max_new)
 
 
 STOP_TOKENS = ["<|im_end|>", "<|endoftext|>"]
 
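The helper above is the heart of the fix: it guarantees that prompt tokens plus the returned generation budget (plus the pad) stay within `n_ctx`. A minimal, self-contained sketch of that behavior, using a fake model so nothing has to be downloaded — `FakeLlama` and its four-bytes-per-token rule are illustrative assumptions, not part of the commit:

```python
# Sketch only: exercises render_qwen_trim (defined above) without a real GGUF.
class FakeLlama:
    """Stand-in for llama_cpp.Llama; the helper only needs .tokenize and .n_ctx."""
    def __init__(self, n_ctx: int):
        self._n_ctx = n_ctx

    def n_ctx(self) -> int:
        return self._n_ctx

    def tokenize(self, data: bytes, add_bos: bool = True):
        # Crude stand-in tokenizer: roughly one token per four bytes, plus BOS.
        return [0] * (1 + len(data) // 4)

fake = FakeLlama(n_ctx=512)
history = [
    {"role": "user", "content": "x" * 2000},         # old turn, expected to be dropped
    {"role": "assistant", "content": "y" * 2000},    # old turn, expected to be dropped
    {"role": "user", "content": "final question?"},  # newest turn, must survive
]
prompt, max_new = render_qwen_trim(history, fake, reserve_new=256, pad=8)

used = len(fake.tokenize(prompt.encode("utf-8")))
assert "final question?" in prompt  # the most recent turn is kept
assert used + max_new + 8 <= 512    # the generation budget fits inside n_ctx
```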
@@ -185,14 +281,24 @@ def on_send(user_text: str,
 
     # 3) append user, render, generate
     messages = messages + [{"role": "user", "content": user_text}]
-    prompt = render_qwen(messages, add_generation_prompt=True)
+    # prompt = render_qwen(messages, add_generation_prompt=True)
+    prompt, max_new = render_qwen_trim(
+        messages=messages,
+        model=model,                 # the llama_cpp.Llama instance
+        n_ctx=None,                  # falls back to model.n_ctx()
+        add_generation_prompt=True,
+        persona=persona,             # the persona defined above
+        reserve_new=max_new_tokens,  # the requested generation length
+        pad=16
+    )
+
 
     try:
         result = model.create_completion(
            prompt=prompt,
             temperature=float(temperature),
             top_p=float(top_p),
-            max_tokens=int(max_new_tokens),
+            max_tokens=int(max_new),
             repeat_penalty=float(repetition_penalty),
             stop=STOP_TOKENS,
         )
@@ -202,7 +308,7 @@ def on_send(user_text: str,
             prompt,
             temperature=float(temperature),
             top_p=float(top_p),
-            max_tokens=int(max_new_tokens),
+            max_tokens=int(max_new),
             repeat_penalty=float(repetition_penalty),
             stop=STOP_TOKENS,
         )
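Both `create_completion` call sites now receive the clamped budget instead of the raw slider value, which is what triggered the original `max_tokens > n_ctx` error. The arithmetic, with illustrative numbers (512 is llama_cpp's default `n_ctx`; the rest are made up):

```python
# Illustrative numbers only; 512 is llama_cpp's default n_ctx.
n_ctx, used, pad = 512, 400, 16  # context window, prompt tokens, safety margin
slider_value = 512               # raw max_new_tokens from the UI

# Before the fix: the raw value went straight to create_completion.
assert used + slider_value > n_ctx            # 912 > 512 -> request rejected

# After the fix: render_qwen_trim returns a clamped budget.
safe_max_new = max(1, n_ctx - used - pad)     # only 96 tokens actually fit
max_tokens = min(slider_value, safe_max_new)  # what create_completion now gets
assert used + max_tokens + pad <= n_ctx       # 512 <= 512, always safe
```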
@@ -236,7 +342,7 @@ with gr.Blocks(title="Qwen GGUF — multi-session") as demo:
     with gr.Accordion("Generation settings", open=False):
         temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="temperature")
         top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
-        max_new_tokens = gr.Slider(16, …)
+        max_new_tokens = gr.Slider(16, 512, value=256, step=16, label="max_new_tokens")
         repetition_penalty = gr.Slider(1.0, 2.0, value=1.07, step=0.01, label="repetition_penalty")
 
     session_list = gr.Radio(choices=[], value=None, label="Conversations", interactive=True)
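The slider's new ceiling of 512 matches llama_cpp's default `n_ctx`, so the UI can no longer request more new tokens than the model's entire context. A hypothetical variant (not in this commit) would derive the ceiling from the loaded model so the two values can never drift apart:

```python
# Hypothetical alternative, assuming the module-level `model` from ggufv2.py:
# tie the slider's maximum to the model's real context window.
ctx = model.n_ctx()  # 512 under llama_cpp's default settings
max_new_tokens = gr.Slider(
    16, max(16, ctx), value=min(256, ctx // 2), step=16,
    label="max_new_tokens",
)
```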