Spaces:
Runtime error
Runtime error
Delete chip-space/chip/compressor.py
Browse files- chip-space/chip/compressor.py +0 -334
chip-space/chip/compressor.py
DELETED
|
@@ -1,334 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
chip.compressor
|
| 3 |
-
================
|
| 4 |
-
CHIP 主压缩器。设计原则:
|
| 5 |
-
1. 协议是文本,不是模型 — 不依赖 LLM 调用,纯规则可跑
|
| 6 |
-
2. 双轨 — Qwen 轨用中文方括号,cl100k 轨用 XML/Markdown
|
| 7 |
-
3. 可逆 — 保留命名实体、数字、代码、URL 不动
|
| 8 |
-
4. 可审计 — 每条改动可追溯到 rules.yaml 的某条规则
|
| 9 |
-
|
| 10 |
-
当前实现层级:
|
| 11 |
-
L1 (lex) — 词法替换:啰嗦套话 → 紧凑动宾,纯正则,~1.3-1.5x 压缩
|
| 12 |
-
L2 (syn) — 句法重排:虚词替换、列表化,需 jieba 分词,~2-3x
|
| 13 |
-
L3 (idiom) — 成语压缩(基于实测白名单),需 target 是国产 tokenizer
|
| 14 |
-
L4 (proto) — 协议层归一化,统一为 ### 标签
|
| 15 |
-
|
| 16 |
-
NP-aware 角色提取(可选):
|
| 17 |
-
L2-022 默认用正则,在含空格的复合 NP 上偶有截断。
|
| 18 |
-
设环境变量 CHIP_USE_JIEBA=1 启用 jieba 增强版。
|
| 19 |
-
"""
|
| 20 |
-
from __future__ import annotations
|
| 21 |
-
|
| 22 |
-
import os
|
| 23 |
-
import re
|
| 24 |
-
from dataclasses import dataclass, field
|
| 25 |
-
from pathlib import Path
|
| 26 |
-
from typing import Iterable
|
| 27 |
-
|
| 28 |
-
import yaml
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
# ============ 数据类 ============
|
| 32 |
-
@dataclass
class Rule:
    """A single CHIP rewrite rule: a regex pattern plus its replacement.

    The compiled pattern is built lazily by :meth:`compile` and cached on the
    instance.  The cache is excluded from ``==`` so that two rules with the
    same definition compare equal whether or not either has been compiled.
    """

    id: str
    layer: str  # "L1" | "L2" | "L3" | "L4"
    pattern: str  # regular-expression source
    replacement: str
    description: str = ""
    saves: int = 0  # estimated number of tokens saved on the reference tokenizer
    risk: str = "low"  # low | mid | high
    flags: int = 0  # ``re`` flags handed to ``re.compile``
    # Lazily filled cache; not part of the rule's identity (compare=False).
    _compiled: re.Pattern | None = field(default=None, repr=False, compare=False)

    def compile(self) -> re.Pattern:
        """Return the compiled pattern, compiling it on first use."""
        if self._compiled is None:
            self._compiled = re.compile(self.pattern, self.flags)
        return self._compiled
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
@dataclass
class CompressionResult:
    """Outcome of one compression run, including the audit trail."""

    original: str
    compressed: str
    applied_rules: list[str]  # ids of the rules that matched
    target: str  # tokenizer name
    layers: tuple

    @property
    def char_ratio(self) -> float:
        """Compressed length divided by original length (in characters)."""
        # Guard the denominator so an empty original does not divide by zero.
        denominator = max(len(self.original), 1)
        return len(self.compressed) / denominator

    def diff(self) -> str:
        """Return a three-line before / after / rules summary."""
        rule_list = ", ".join(self.applied_rules)
        if not rule_list:
            rule_list = "(none)"
        return "\n".join((
            f"原: {self.original}",
            f"压: {self.compressed}",
            f"规则: {rule_list}",
        ))
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
# ============ 规则加载 ============
|
| 70 |
-
DEFAULT_RULES_PATH = Path(__file__).parent / "rules" / "rules.yaml"


def load_rules(path: Path | str = DEFAULT_RULES_PATH) -> list[Rule]:
    """Load rewrite rules from a YAML file.

    The file is expected to hold a mapping with a ``rules`` list.  Each entry
    must provide ``id``, ``layer`` and ``pattern``; ``replacement``,
    ``description``, ``saves``, ``risk`` and ``flags`` are optional.

    ``flags`` may be a list of ``re`` flag names (case-insensitive) or a single
    name given as a scalar.  Names that are not ``re`` flags are skipped with a
    warning rather than silently ignored.

    Args:
        path: Location of the YAML file.

    Returns:
        The rules in file order.  An empty file, or one whose ``rules`` key is
        missing or null, yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If an entry lacks ``id``, ``layer`` or ``pattern``.
    """
    import warnings

    with open(Path(path), encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    rules = []
    for item in data.get("rules") or []:
        flag_names = item.get("flags") or []
        if isinstance(flag_names, str):
            # A scalar such as ``flags: IGNORECASE`` must not be iterated
            # character by character (that would set I, S, A, ...).
            flag_names = [flag_names]

        flags = 0
        for flag_name in flag_names:
            flag = getattr(re, str(flag_name).upper(), None)
            if isinstance(flag, re.RegexFlag):
                flags |= flag
            else:
                warnings.warn(
                    f"rule {item.get('id')!r}: unknown regex flag {flag_name!r} ignored",
                    stacklevel=2,
                )

        rules.append(Rule(
            id=item["id"],
            layer=item["layer"],
            pattern=item["pattern"],
            replacement=item.get("replacement", ""),
            description=item.get("description", ""),
            saves=item.get("saves", 0),
            risk=item.get("risk", "low"),
            flags=flags,
        ))
    return rules
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
# ============ 保护性 mask ============
|
| 98 |
-
# 这些 pattern 命中的子串会先被替换成占位符,跑完规则后再还原。
|
| 99 |
-
# 防止规则误改专有名词、URL、代码、数字。
|
| 100 |
-
PROTECT_PATTERNS = [
|
| 101 |
-
("URL", re.compile(r"https?://\S+")),
|
| 102 |
-
("CODE", re.compile(r"```[\s\S]*?```|`[^`\n]+`")),
|
| 103 |
-
("NUM", re.compile(r"\d+(?:\.\d+)?(?:%|km|kg|m|s|°C)?")),
|
| 104 |
-
("EMAIL", re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")),
|
| 105 |
-
# 双引号包裹的引文(用户原话)
|
| 106 |
-
("QUOTE", re.compile(r"[\"\u201c][^\"\u201d]+[\"\u201d]")),
|
| 107 |
-
]
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
# 占位符前缀用一个不会出现在自然中文里、且不会被 PROTECT_PATTERNS 命中的 token
|
| 111 |
-
_PH_OPEN = "\u2983" # ⦃
|
| 112 |
-
_PH_CLOSE = "\u2984" # ⦄
|
| 113 |
-
_PH_RE = re.compile(rf"{_PH_OPEN}\d+{_PH_CLOSE}")
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
def _mask(text: str) -> tuple[str, list[tuple[str, str]]]:
|
| 117 |
-
"""把不可压缩片段替换成 ⦃i⦄ 占位符,返回 (masked, mappings)。
|
| 118 |
-
|
| 119 |
-
关键:每次 sub 时跳过已经 mask 过的占位符,避免嵌套替换。
|
| 120 |
-
"""
|
| 121 |
-
mappings = []
|
| 122 |
-
masked = text
|
| 123 |
-
|
| 124 |
-
def make_sub():
|
| 125 |
-
def _sub(m):
|
| 126 |
-
# 如果 match 整体落在已有占位符内,跳过
|
| 127 |
-
content = m.group(0)
|
| 128 |
-
if _PH_RE.fullmatch(content):
|
| 129 |
-
return content
|
| 130 |
-
i = len(mappings)
|
| 131 |
-
placeholder = f"{_PH_OPEN}{i}{_PH_CLOSE}"
|
| 132 |
-
mappings.append((placeholder, content))
|
| 133 |
-
return placeholder
|
| 134 |
-
return _sub
|
| 135 |
-
|
| 136 |
-
for tag, pat in PROTECT_PATTERNS:
|
| 137 |
-
masked = pat.sub(make_sub(), masked)
|
| 138 |
-
return masked, mappings
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
def _unmask(text: str, mappings: list[tuple[str, str]]) -> str:
|
| 142 |
-
# 反向替换避免 ⦃1⦄ 误替换 ⦃10⦄
|
| 143 |
-
for placeholder, original in reversed(mappings):
|
| 144 |
-
text = text.replace(placeholder, original)
|
| 145 |
-
return text
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
# ============ 主类 ============
|
| 149 |
-
class Compressor:
    """Reusable compressor: loads the rule set once and applies it per call."""

    def __init__(self,
                 rules_path: Path | str = DEFAULT_RULES_PATH,
                 target: str = "qwen2.5",
                 layers: Iterable[str] = ("L1", "L2", "L4")):
        """
        Args:
            rules_path: YAML file to load the rules from.
            target: Target tokenizer name.  Stored and echoed in the result;
                intended for target-aware decisions such as idiom compression.
            layers: Enabled compression layers.
                - L1: lexical layer (boilerplate pruning); safe, on by default
                - L2: syntactic layer (pattern rewriting); safe, on by default
                - L3: idiom layer (semantic compression); only meaningful for
                  Chinese-vendor tokenizers, off by default
                - L4: protocol normalisation (unified ``###`` headings);
                  harmless, on by default
        """
        self.rules = load_rules(rules_path)
        self.target = target
        self.layers = tuple(layers)
        # Compile eagerly so pattern errors surface at construction time.
        for r in self.rules:
            r.compile()

    def compress(self, text: str) -> CompressionResult:
        """Compress ``text`` with every rule whose layer is enabled.

        Protected spans are masked first, rules run on the masked text in file
        order, whitespace and stray punctuation are tidied, and the protected
        spans are restored at the end.

        Args:
            text: Prompt text to compress.

        Returns:
            A ``CompressionResult`` holding the original text, the compressed
            text and an audit list of ``"<rule id>×<match count>"`` entries.
        """
        original = text
        applied = []

        # Optional jieba-based role extraction.  Runs as a pre-processing step
        # so it takes precedence over the regex-only L2-022 rule.
        if os.getenv("CHIP_USE_JIEBA") == "1" and "L2" in self.layers:
            text, jieba_applied = _jieba_role_extract(text)
            if jieba_applied:
                applied.append("L2-022J(jieba)")

        masked, mappings = _mask(text)

        for rule in self.rules:
            if rule.layer not in self.layers:
                continue
            # compile() returns the cached pattern; going through it (rather
            # than reading _compiled) keeps rules added after __init__ usable.
            new_text, n = rule.compile().subn(rule.replacement, masked)
            if n > 0:
                applied.append(f"{rule.id}×{n}")
                masked = new_text

        # Tidy-up: runs of spaces/tabs, then runs of blank lines.
        masked = re.sub(r"[ \t]+", " ", masked)
        masked = re.sub(r"\s*\n\s*\n\s*\n+", "\n\n", masked)

        # Remove orphaned punctuation left at line starts by rewrites
        # (L2-022 and similar leave "\n,xxx" behind).
        # NOTE(review): \s in this class also matches "\n", so blank lines kept
        # by the previous step are collapsed here — confirm that is intended.
        masked = re.sub(r"\n[,,;;。.\s]+", "\n", masked)
        masked = re.sub(r"^[,,;;]+\s*", "", masked, flags=re.MULTILINE)

        masked = masked.strip()
        compressed = _unmask(masked, mappings)

        return CompressionResult(
            original=original,
            compressed=compressed,
            applied_rules=applied,
            target=self.target,
            layers=self.layers,
        )
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
# ============ 便捷函数 ============
|
| 214 |
-
_default_compressor = None


def compress(text: str,
             target: str = "qwen2.5",
             layers: Iterable[str] = ("L1", "L2", "L4"),
             return_result: bool = False) -> str | CompressionResult:
    """Convenience entry point backed by a cached module-level ``Compressor``.

    The most recently used ``(target, layers)`` configuration is cached; a call
    with a different configuration builds a new ``Compressor`` (which reloads
    the default rule file).

    Args:
        text: Prompt text to compress.
        target: Target tokenizer name.
        layers: Enabled layers; any iterable, including a one-shot iterator.
        return_result: Return the full ``CompressionResult`` instead of only
            the compressed string.

    Examples:
        >>> compress("请帮我总结一下这段文字")
        '总结一下这段文字'

        Enable every layer (including idioms)::

            compress("...", layers=["L1", "L2", "L3", "L4"])

        Inspect the audit trail::

            r = compress("...", return_result=True)
            print(r.diff())
    """
    global _default_compressor
    # Materialise once: building the cache key must not exhaust an iterator
    # before the same layers are handed to Compressor.
    layers = tuple(layers)
    key = (target, layers)
    if _default_compressor is None or _default_compressor[0] != key:
        _default_compressor = (key, Compressor(target=target, layers=layers))
    result = _default_compressor[1].compress(text)
    return result if return_result else result.compressed
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
# ============ jieba NP 提取(可选增强) ============
|
| 240 |
-
_jieba_loaded = False


def _ensure_jieba():
    """Import jieba on first use; return whether it is available."""
    global _jieba_loaded
    if not _jieba_loaded:
        try:
            import jieba.posseg as pseg  # noqa: F401
        except ImportError:
            # jieba is optional; the caller is expected to fall back.
            return False
        _jieba_loaded = True
    return True
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
# 角色扮演的触发短语 — jieba 用它定位
|
| 257 |
-
_ROLE_PREFIX_RE = re.compile(
    r"请\s*(?:你)?\s*扮演\s*(?:一(?:个|位))?\s*"
)


def _jieba_role_extract(text: str) -> tuple[str, bool]:
    """Pull the role description after "请你扮演…" into a ``### 角色`` block.

    Uses jieba part-of-speech tagging to take the longest noun phrase that
    follows the trigger.  Intended as a replacement for the regex-lookahead
    rule L2-022, which per the original notes fails when the role description
    is very long with no closing punctuation, or is cut short by a
    conjunction in mid-sentence.

    Strategy:
        1. Find the trigger phrase (first occurrence only).
        2. Tag the text after it with ``jieba.posseg``.
        3. Greedily collect tokens until a hard stop is met *after* an NP core
           (noun, adjective or English token) has been seen.  Hard stops are
           conjunctions (``c``), prepositions (``p``), verbs other than ``vn``,
           and punctuation.
        4. Everything else (particles, spaces, numerals, ...) stays in the NP.

    Only punctuation directly at the junctions created here is removed.  This
    function runs on unmasked text, so a text-wide clean-up could alter code
    blocks or quotations that are supposed to stay byte-identical.

    Args:
        text: Raw prompt text.

    Returns:
        ``(new_text, True)`` when a role was extracted, otherwise
        ``(text, False)`` — also when jieba is not installed.
    """
    if not _ensure_jieba():
        return text, False

    import jieba.posseg as pseg

    m = _ROLE_PREFIX_RE.search(text)
    if not m:
        return text, False

    head = text[:m.start()]
    body = text[m.end():]
    if not body:
        return text, False

    # Tokens that end the NP regardless of the tag jieba assigns to them.
    stop_tokens = frozenset({",", ",", "。", ".", ";", ";", ":", ":", "、", "\n"})
    # Orphaned punctuation to drop right after an inserted newline.
    junction_punct = ",\uff0c;\uff1b\u3002."

    np_words = []
    consumed = 0           # characters of ``body`` absorbed into the NP
    found_np_core = False  # True once a noun / adjective / English token is seen

    for w, flag in pseg.cut(body):
        is_hard_stop = (
            flag == "w"                                  # punctuation tag
            or w in stop_tokens
            or flag == "c"                               # conjunction
            or flag == "p"                               # preposition
            or (flag.startswith("v") and flag != "vn")   # real verb, not verbal noun
        )
        # Before the core is found even stop tokens are absorbed, so leading
        # punctuation or fillers do not end the phrase prematurely.
        if is_hard_stop and found_np_core:
            break

        np_words.append(w)
        consumed += len(w)
        if flag.startswith(("n", "a")) or flag == "eng":
            found_np_core = True

    np_str = "".join(np_words).strip()
    if not np_str or len(np_str) < 2 or not found_np_core:
        return text, False

    np_str = np_str.lstrip(junction_punct)
    rest = body[consumed:].lstrip(junction_punct)
    new_text = f"{head}\n### 角色\n{np_str}\n{rest}".strip()
    return new_text, True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|