luancy1208 committed on
Commit
00add69
·
verified ·
1 Parent(s): e842dd8

Delete chip-space/chip/compressor.py

Browse files
Files changed (1) hide show
  1. chip-space/chip/compressor.py +0 -334
chip-space/chip/compressor.py DELETED
@@ -1,334 +0,0 @@
1
- """
2
- chip.compressor
3
- ================
4
- CHIP 主压缩器。设计原则:
5
- 1. 协议是文本,不是模型 — 不依赖 LLM 调用,纯规则可跑
6
- 2. 双轨 — Qwen 轨用中文方括号,cl100k 轨用 XML/Markdown
7
- 3. 可逆 — 保留命名实体、数字、代码、URL 不动
8
- 4. 可审计 — 每条改动可追溯到 rules.yaml 的某条规则
9
-
10
- 当前实现层级:
11
- L1 (lex) — 词法替换:啰嗦套话 → 紧凑动宾,纯正则,~1.3-1.5x 压缩
12
- L2 (syn) — 句法重排:虚词替换、列表化,需 jieba 分词,~2-3x
13
- L3 (idiom) — 成语压缩(基于实测白名单),需 target 是国产 tokenizer
14
- L4 (proto) — 协议层归一化,统一为 ### 标签
15
-
16
- NP-aware 角色提取(可选):
17
- L2-022 默认用正则,在含空格的复合 NP 上偶有截断。
18
- 设环境变量 CHIP_USE_JIEBA=1 启用 jieba 增强版。
19
- """
20
- from __future__ import annotations
21
-
22
- import os
23
- import re
24
- from dataclasses import dataclass, field
25
- from pathlib import Path
26
- from typing import Iterable
27
-
28
- import yaml
29
-
30
-
31
- # ============ 数据类 ============
32
@dataclass
class Rule:
    """A single CHIP rewrite rule: a regex pattern plus its replacement.

    The compiled pattern is cached on the instance by :meth:`compile`. The
    cache is excluded from equality so that two rules with identical
    definitions still compare equal after only one of them was compiled.
    """
    id: str
    layer: str  # "L1" | "L2" | "L3" | "L4"
    pattern: str  # regular-expression source
    replacement: str
    description: str = ""
    saves: int = 0  # estimated number of tokens saved on the reference tokenizer
    risk: str = "low"  # low | mid | high
    flags: int = 0  # flags passed to re.compile together with `pattern`
    # Lazily populated by compile(); None until the first call.
    _compiled: re.Pattern | None = field(default=None, repr=False, compare=False)

    def compile(self) -> re.Pattern:
        """Return the compiled pattern, compiling and caching it on first use."""
        if self._compiled is None:
            self._compiled = re.compile(self.pattern, self.flags)
        return self._compiled
49
-
50
-
51
@dataclass
class CompressionResult:
    """Outcome of one compression run, including an audit trail."""
    original: str
    compressed: str
    applied_rules: list[str]  # ids of the rules that matched
    target: str  # tokenizer name
    layers: tuple

    @property
    def char_ratio(self) -> float:
        """Compressed length divided by original length (in characters).

        An empty original is treated as length 1 to avoid dividing by zero.
        """
        denominator = len(self.original) or 1
        return len(self.compressed) / denominator

    def diff(self) -> str:
        """Return a three-line summary: original, compressed, applied rules."""
        rules_shown = ", ".join(self.applied_rules) or "(none)"
        return "\n".join((
            f"原: {self.original}",
            f"压: {self.compressed}",
            f"规则: {rules_shown}",
        ))
67
-
68
-
69
- # ============ 规则加载 ============
70
DEFAULT_RULES_PATH = Path(__file__).parent / "rules" / "rules.yaml"


def load_rules(path: Path | str = DEFAULT_RULES_PATH) -> list[Rule]:
    """Load rewrite rules from a YAML file.

    The document is expected to be a mapping with a ``rules`` list; each
    entry needs ``id``, ``layer`` and ``pattern`` and may carry
    ``replacement``, ``description``, ``saves``, ``risk`` and ``flags``.

    Args:
        path: location of the YAML rule file.

    Returns:
        The rules in file order. An empty document, or one without a
        ``rules`` list, yields an empty list.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if an entry lacks ``id``, ``layer`` or ``pattern``.
    """
    with open(Path(path), encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    rules = []
    for item in data.get("rules") or []:
        flag_names = item.get("flags") or []
        if isinstance(flag_names, str):
            # A scalar such as `flags: IGNORECASE` would otherwise be iterated
            # character by character and pick up unrelated one-letter flags.
            flag_names = [flag_names]

        flags = 0
        for flag_name in flag_names:
            flag = getattr(re, str(flag_name).upper(), 0)
            # Unknown names stay ignored; only genuine regex flags are OR-ed in.
            if isinstance(flag, re.RegexFlag):
                flags |= flag

        rules.append(Rule(
            id=item["id"],
            layer=item["layer"],
            pattern=item["pattern"],
            replacement=item.get("replacement", ""),
            description=item.get("description", ""),
            saves=item.get("saves", 0),
            risk=item.get("risk", "low"),
            flags=flags,
        ))
    return rules
95
-
96
-
97
- # ============ 保护性 mask ============
98
- # 这些 pattern 命中的子串会先被替换成占位符,跑完规则后再还原。
99
- # 防止规则误改专有名词、URL、代码、数字。
100
- PROTECT_PATTERNS = [
101
- ("URL", re.compile(r"https?://\S+")),
102
- ("CODE", re.compile(r"```[\s\S]*?```|`[^`\n]+`")),
103
- ("NUM", re.compile(r"\d+(?:\.\d+)?(?:%|km|kg|m|s|°C)?")),
104
- ("EMAIL", re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")),
105
- # 双引号包裹的引文(用户原话)
106
- ("QUOTE", re.compile(r"[\"\u201c][^\"\u201d]+[\"\u201d]")),
107
- ]
108
-
109
-
110
- # 占位符前缀用一个不会出现在自然中文里、且不会被 PROTECT_PATTERNS 命中的 token
111
- _PH_OPEN = "\u2983" # ⦃
112
- _PH_CLOSE = "\u2984" # ⦄
113
- _PH_RE = re.compile(rf"{_PH_OPEN}\d+{_PH_CLOSE}")
114
-
115
-
116
- def _mask(text: str) -> tuple[str, list[tuple[str, str]]]:
117
- """把不可压缩片段替换成 ⦃i⦄ 占位符,返回 (masked, mappings)。
118
-
119
- 关键:每次 sub 时跳过已经 mask 过的占位符,避免嵌套替换。
120
- """
121
- mappings = []
122
- masked = text
123
-
124
- def make_sub():
125
- def _sub(m):
126
- # 如果 match 整体落在已有占位符内,跳过
127
- content = m.group(0)
128
- if _PH_RE.fullmatch(content):
129
- return content
130
- i = len(mappings)
131
- placeholder = f"{_PH_OPEN}{i}{_PH_CLOSE}"
132
- mappings.append((placeholder, content))
133
- return placeholder
134
- return _sub
135
-
136
- for tag, pat in PROTECT_PATTERNS:
137
- masked = pat.sub(make_sub(), masked)
138
- return masked, mappings
139
-
140
-
141
- def _unmask(text: str, mappings: list[tuple[str, str]]) -> str:
142
- # 反向替换避免 ⦃1⦄ 误替换 ⦃10⦄
143
- for placeholder, original in reversed(mappings):
144
- text = text.replace(placeholder, original)
145
- return text
146
-
147
-
148
- # ============ 主类 ============
149
class Compressor:
    """Reusable compressor: loads the rule set once, then compresses many texts."""

    def __init__(self,
                 rules_path: Path | str = DEFAULT_RULES_PATH,
                 target: str = "qwen2.5",
                 layers: Iterable[str] = ("L1", "L2", "L4")):
        """
        Args:
            rules_path: YAML rule file passed to ``load_rules``.
            target: target tokenizer name. In this class it is only stored
                and copied into each result.
            layers: enabled compression layers. Per the author's notes:
                - L1: lexical layer (boilerplate pruning), on by default
                - L2: syntactic layer (pattern rewriting), on by default
                - L3: idiom layer, off by default
                - L4: protocol normalisation (``###`` headings), on by default
        """
        self.rules = load_rules(rules_path)
        self.target = target
        # A bare string would otherwise be split into single characters.
        self.layers = (layers,) if isinstance(layers, str) else tuple(layers)
        # Pre-compile so the first compress() call does not pay for it.
        for r in self.rules:
            r.compile()

    def compress(self, text: str) -> CompressionResult:
        """Apply every rule of the enabled layers to ``text``.

        Steps: optional jieba role extraction, masking of protected
        fragments, rule application in list order, whitespace and punctuation
        cleanup, unmasking.

        Returns:
            A ``CompressionResult`` whose ``applied_rules`` lists each rule
            that matched as ``"<id>×<count>"``.
        """
        original = text

        # Optional jieba-based role extraction, run before the regex rules.
        applied_pre = []
        if os.getenv("CHIP_USE_JIEBA") == "1" and "L2" in self.layers:
            text, jieba_applied = _jieba_role_extract(text)
            if jieba_applied:
                applied_pre.append("L2-022J(jieba)")

        masked, mappings = _mask(text)
        applied = list(applied_pre)

        for rule in self.rules:
            if rule.layer not in self.layers:
                continue
            # compile() returns the cached pattern and also covers rules that
            # were appended to self.rules after construction.
            new_text, n = rule.compile().subn(rule.replacement, masked)
            if n > 0:
                applied.append(f"{rule.id}×{n}")
                masked = new_text

        # Cleanup: collapse runs of spaces/tabs and 3+ consecutive newlines.
        masked = re.sub(r"[ \t]+", " ", masked)
        masked = re.sub(r"\s*\n\s*\n\s*\n+", "\n\n", masked)

        # Remove orphan punctuation left at the start of a line by the rules.
        # NOTE(review): `\s` in the first class also matches newlines, so this
        # collapses the blank lines kept by the previous step - confirm intent.
        masked = re.sub(r"\n[,,;;。.\s]+", "\n", masked)
        masked = re.sub(r"^[,,;;]+\s*", "", masked, flags=re.MULTILINE)

        masked = masked.strip()
        compressed = _unmask(masked, mappings)

        return CompressionResult(
            original=original,
            compressed=compressed,
            applied_rules=applied,
            target=self.target,
            layers=self.layers,
        )
211
-
212
-
213
- # ============ 便捷函数 ============
214
# Single-entry cache: ((target, layers), Compressor) or None.
_default_compressor = None


def compress(text: str,
             target: str = "qwen2.5",
             layers: Iterable[str] = ("L1", "L2", "L4"),
             return_result: bool = False) -> str | CompressionResult:
    """Convenience entry point backed by a cached ``Compressor``.

    The compressor is rebuilt only when ``target`` or ``layers`` differ from
    the previous call.

    >>> compress("请帮我总结一下这段文字")
    '总结一下这段文字'

    >>> compress("...", layers=["L1","L2","L3","L4"])  # enable every layer (idioms included)

    >>> r = compress("...", return_result=True)
    >>> print(r.diff())

    Args:
        text: the text to compress.
        target: target tokenizer name forwarded to ``Compressor``.
        layers: enabled layers; any iterable, including a one-shot generator.
        return_result: when true, return the full ``CompressionResult``
            instead of only the compressed string.
    """
    global _default_compressor
    # Materialise once: the same iterable feeds both the cache key and the
    # Compressor, and a generator would be empty on its second pass.
    layers = tuple(layers)
    key = (target, layers)
    if _default_compressor is None or _default_compressor[0] != key:
        _default_compressor = (key, Compressor(target=target, layers=layers))
    result = _default_compressor[1].compress(text)
    return result if return_result else result.compressed
237
-
238
-
239
- # ============ jieba NP 提取(可选增强) ============
240
- _jieba_loaded = False
241
-
242
-
243
- def _ensure_jieba():
244
- """懒加载 jieba。"""
245
- global _jieba_loaded
246
- if _jieba_loaded:
247
- return True
248
- try:
249
- import jieba.posseg as pseg # noqa: F401
250
- _jieba_loaded = True
251
- return True
252
- except ImportError:
253
- return False
254
-
255
-
256
- # 角色扮演的触发短语 — jieba 用它定位
257
_ROLE_PREFIX_RE = re.compile(
    r"请\s*(?:你)?\s*扮演\s*(?:一(?:个|位))?\s*"
)


def _jieba_role_extract(text: str) -> tuple[str, bool]:
    """Pull the role description after a "please play ..." trigger into a block.

    Strategy:
      1. Find the first match of ``_ROLE_PREFIX_RE``.
      2. Tag everything after it with ``jieba.posseg``.
      3. Collect tokens from the start. Once a noun, adjective or English
         token has been seen (the NP core), stop at the first hard stop:
         a token tagged ``w``, ``c`` or ``p``, a verb other than ``vn``, or
         one of the listed punctuation marks / a newline.
      4. Emit ``<head>\\n### 角色\\n<role>\\n<rest>``, strip punctuation that
         directly follows a newline, and trim the result.

    Returns:
        ``(new_text, True)`` on success. ``(text, False)`` when jieba is
        unavailable, no trigger is found, nothing follows the trigger, no NP
        core was seen, or the role is shorter than two characters.
    """
    if not _ensure_jieba():
        return text, False

    import jieba.posseg as posseg

    trigger = _ROLE_PREFIX_RE.search(text)
    if trigger is None:
        return text, False

    prefix, body = text[:trigger.start()], text[trigger.end():]
    if body == "":
        return text, False

    stop_punct = {",", ",", "。", ".", ";", ";", ":", ":", "、", "\n"}

    def ends_np(token, pos):
        # Punctuation / conjunction / preposition tags, or literal punctuation.
        if pos in ("w", "c", "p") or token in stop_punct:
            return True
        # Real verbs end the phrase; verbal nouns (vn) do not.
        return pos.startswith("v") and pos != "vn"

    pieces = []
    has_core = False  # set once a noun, adjective or English token was seen
    for token, pos in list(posseg.cut(body)):
        # Hard stops only end the phrase after its core has been found.
        if has_core and ends_np(token, pos):
            break
        pieces.append(token)
        if pos == "eng" or pos.startswith(("n", "a")):
            has_core = True

    raw_np = "".join(pieces)
    role = raw_np.strip()
    if not has_core or len(role) < 2:
        return text, False

    # Everything after the collected tokens is kept as-is.
    remainder = body[len(raw_np):]
    rewritten = f"{prefix}\n### 角色\n{role}\n{remainder}"
    # Drop orphan punctuation that directly follows a newline.
    rewritten = re.sub(r"\n[,,;;。.]+", "\n", rewritten)
    return rewritten.strip(), True