Spaces:
Runtime error
Runtime error
| # CHIP Compression Rules v0.2 (2025-05-01) | |
| # ========================================== | |
| # v0.2 vs v0.1 主要变化: | |
| # - 标签层从 [角:X] / 【任】 改为 ### 角色 (实测全 tokenizer 1 token,完爆方括号) | |
| # - 新增 L3 成语层(基于 idiom_whitelist.json 实测) | |
| # - 新增 L4 协议层(归一化用户已有的标签) | |
| rules: | |
| # ============================================================ | |
| # L1: lexical layer — prunes verbose politeness boilerplate | |
| # Each rule rewrites text matching `pattern` (a regex) to `replacement`. | |
| # `saves` is presumably the estimated token saving and `risk` the estimated | |
| # chance of changing meaning — TODO confirm against the rule engine. | |
| # Punctuation classes list both the ASCII and the full-width mark, because | |
| # Chinese prose normally uses the full-width one. | |
| # ============================================================ | |
| # Drops "请(你)帮(我)". The lookahead keeps 请帮忙 / 请帮助 intact, where 帮 is | |
| # the first half of the verb and deleting it would leave a stray 忙 / 助. | |
| - id: L1-001 | |
| layer: L1 | |
| pattern: "请你?帮(?![忙助])我?" | |
| replacement: "" | |
| saves: 2 | |
| risk: low | |
| description: "客套语 '请你帮我' / '请帮我' → 空" | |
| # NOTE(review): also deletes 麻烦 used as a noun/adjective (e.g. 很麻烦) — confirm | |
| # that this is acceptable for a risk: low rule. | |
| - id: L1-002 | |
| layer: L1 | |
| pattern: "麻烦你?" | |
| replacement: "" | |
| saves: 2 | |
| risk: low | |
| # Drops the hedge "如果可以的话" together with an optional trailing comma. | |
| - id: L1-003 | |
| layer: L1 | |
| pattern: "如果可以的话[,,]?" | |
| replacement: "" | |
| saves: 3 | |
| risk: low | |
| # Drops a polite modal directly before a request verb. The lookbehind keeps | |
| # compounds ending in 能 (性能分析, 功能解释, 可能帮, 智能分析 ...) and negated or | |
| # restrictive modals (不能帮, 不可以告诉, 只能告诉) intact. | |
| - id: L1-004 | |
| layer: L1 | |
| pattern: "(?<![不可功性技智效职才只])(?:能不能|可不可以|可以|能)(?=帮|告诉|解释|总结|分析)" | |
| replacement: "" | |
| saves: 2 | |
| risk: low | |
| # NOTE(review): also deletes 辛苦 used descriptively (e.g. 很辛苦) — confirm. | |
| - id: L1-005 | |
| layer: L1 | |
| pattern: "辛苦你?" | |
| replacement: "" | |
| saves: 2 | |
| risk: low | |
| # Drops a thank-you plus one optional closing punctuation mark. | |
| - id: L1-006 | |
| layer: L1 | |
| pattern: "(?:谢谢|感谢)(?:你|了)?[!!.。]?" | |
| replacement: "" | |
| saves: 2 | |
| risk: low | |
| # ---- 进行/做 + verbal noun → the bare verb ---- | |
| # Each rule strips the light verb (进行 / 做 / 给出 / 提供) and an optional | |
| # measure word (一个 / 一下 / 一次 ...), keeping the content word itself. | |
| - id: L1-010 | |
| layer: L1 | |
| pattern: "进行(?:一?(?:个|下|次)?)?分析" | |
| replacement: "分析" | |
| saves: 2 | |
| risk: low | |
| - id: L1-011 | |
| layer: L1 | |
| pattern: "进行(?:一?(?:个|下|次)?)?总结" | |
| replacement: "总结" | |
| saves: 2 | |
| risk: low | |
| - id: L1-012 | |
| layer: L1 | |
| pattern: "进行(?:一?(?:个|下|次)?)?处理" | |
| replacement: "处理" | |
| saves: 2 | |
| risk: low | |
| - id: L1-013 | |
| layer: L1 | |
| pattern: "进行(?:一?(?:个|下|次)?)?解释" | |
| replacement: "解释" | |
| saves: 2 | |
| risk: low | |
| # The replacement repeats the matched word (判断), like every sibling rule; | |
| # emitting a different word (判定) would silently change the user's wording. | |
| - id: L1-014 | |
| layer: L1 | |
| pattern: "做(?:一?(?:个|下|次)?)?判断" | |
| replacement: "判断" | |
| saves: 3 | |
| risk: low | |
| - id: L1-015 | |
| layer: L1 | |
| pattern: "做(?:一?(?:个|下|次)?)?解释" | |
| replacement: "解释" | |
| saves: 3 | |
| risk: low | |
| - id: L1-016 | |
| layer: L1 | |
| pattern: "给(?:出|我)(?:一些|几个)?建议" | |
| replacement: "建议" | |
| saves: 2 | |
| risk: low | |
| # Collapses "提供(一些/相关/相对)帮助" to the single character 助 (risk: mid). | |
| - id: L1-017 | |
| layer: L1 | |
| pattern: "提供(?:一些|相关|相对)?帮助" | |
| replacement: "助" | |
| saves: 2 | |
| risk: mid | |
| - id: L1-018 | |
| layer: L1 | |
| pattern: "进行(?:一?(?:个|下|次)?)?检查" | |
| replacement: "检查" | |
| saves: 2 | |
| risk: low | |
| - id: L1-019 | |
| layer: L1 | |
| pattern: "进行(?:一?(?:个|下|次)?)?优化" | |
| replacement: "优化" | |
| saves: 2 | |
| risk: low | |
| # ---- Connectives ---- | |
| # Long connective phrases are replaced by a short equivalent. The optional | |
| # trailing comma is matched in both ASCII and full-width form so that it is | |
| # consumed together with the phrase in ordinary Chinese prose. | |
| - id: L1-020 | |
| layer: L1 | |
| pattern: "也就是说[,,]?" | |
| replacement: "即" | |
| saves: 3 | |
| risk: low | |
| - id: L1-021 | |
| layer: L1 | |
| pattern: "换句话说[,,]?" | |
| replacement: "即" | |
| saves: 3 | |
| risk: low | |
| - id: L1-022 | |
| layer: L1 | |
| pattern: "与此同时[,,]?" | |
| replacement: "同时," | |
| saves: 2 | |
| risk: low | |
| - id: L1-023 | |
| layer: L1 | |
| pattern: "在这种情况下[,,]?" | |
| replacement: "此时," | |
| saves: 3 | |
| risk: low | |
| - id: L1-024 | |
| layer: L1 | |
| pattern: "由此可见[,,]?" | |
| replacement: "故" | |
| saves: 3 | |
| risk: low | |
| # Matches 因此 optionally followed by a comma or by 说 (因此说). | |
| - id: L1-025 | |
| layer: L1 | |
| pattern: "因此(?:[,,]|说)?" | |
| replacement: "故" | |
| saves: 1 | |
| risk: low | |
| - id: L1-026 | |
| layer: L1 | |
| pattern: "如果没有" | |
| replacement: "若无" | |
| saves: 2 | |
| risk: low | |
| # "通过 X 的方式" → "用 X"; X is captured lazily up to the first 的方式. | |
| - id: L1-027 | |
| layer: L1 | |
| pattern: "通过(.+?)的方式" | |
| replacement: "用\\1" | |
| saves: 2 | |
| risk: mid | |
| # Back-references to earlier text are normalized to 前述. | |
| - id: L1-028 | |
| layer: L1 | |
| pattern: "(?:如上所述|前面提到的|刚才说的)" | |
| replacement: "前述" | |
| saves: 3 | |
| risk: low | |
| # ---- Modifier adverbs ---- | |
| # NOTE(review): L1-030 and L1-031 delete the whole phrase, adjective included | |
| # (比较简洁地回答 → 回答), so the user's style constraint is lost — confirm that | |
| # this is intended for rules marked risk: low. | |
| - id: L1-030 | |
| layer: L1 | |
| pattern: "比较(?:简洁|清晰|详细)地?" | |
| replacement: "" | |
| saves: 3 | |
| risk: low | |
| - id: L1-031 | |
| layer: L1 | |
| pattern: "相对(?:简洁|详细|完整)地?" | |
| replacement: "" | |
| saves: 3 | |
| risk: low | |
| - id: L1-032 | |
| layer: L1 | |
| pattern: "尽可能(?:地)?" | |
| replacement: "尽量" | |
| saves: 1 | |
| risk: low | |
| # Drops the intensifier 非常 and the particle 地 but keeps the adjective the | |
| # user actually wrote: 全面 and 详尽 are not synonyms of 详细, so the captured | |
| # word is re-emitted instead of a fixed one. | |
| - id: L1-033 | |
| layer: L1 | |
| pattern: "非常(详细|详尽|全面)地?" | |
| replacement: "\\1" | |
| saves: 2 | |
| risk: low | |
| # ============================================================ | |
| # L2: syntactic layer — rewrites whole constructions using capture groups | |
| # ============================================================ | |
| # "对 X 进行 (modifiers) Y" → "Y X". | |
| # Y is captured lazily (1-4 CJK characters) and must end at punctuation, | |
| # whitespace, a conjunction (并/和/及/后) or end of input. A greedy capture | |
| # would swallow the text after the verb: 对代码进行检查并修复 would capture | |
| # 检查并修 and emit 检查并修代码复. When no such boundary follows within four | |
| # characters the rule simply does not fire. | |
| - id: L2-001 | |
| layer: L2 | |
| pattern: "对(.+?)进行(?:一?(?:个|下|次)?(?:全面|详细|简要|认真|深入)?的?)?([\\u4e00-\\u9fff]{1,4}?)(?=[,,。.;;\\s]|并|和|及|后|$)" | |
| replacement: "\\2\\1" | |
| saves: 2 | |
| risk: mid | |
| description: "'对 X 进行 Y' → 'Y X'" | |
| # "把 X 作为 Y" → "视 X 为 Y". Y ends at punctuation, whitespace or end of | |
| # input; without the end-of-input alternative a sentence lacking a final | |
| # punctuation mark would never match. | |
| - id: L2-002 | |
| layer: L2 | |
| pattern: "把(.+?)作为(.+?)(?=[,,。.\\s]|$)" | |
| replacement: "视\\1为\\2" | |
| saves: 2 | |
| risk: mid | |
| # Paired conjunctions: the opening word is dropped and the closing word is | |
| # replaced by a single character (故 / 然 / 且 / 则). | |
| - id: L2-003 | |
| layer: L2 | |
| pattern: "由于(.+?)所以" | |
| replacement: "\\1故" | |
| saves: 3 | |
| risk: low | |
| - id: L2-004 | |
| layer: L2 | |
| pattern: "虽然(.+?)但是" | |
| replacement: "\\1然" | |
| saves: 3 | |
| risk: mid | |
| - id: L2-005 | |
| layer: L2 | |
| pattern: "不仅(.+?)而且" | |
| replacement: "\\1且" | |
| saves: 3 | |
| risk: low | |
| - id: L2-006 | |
| layer: L2 | |
| pattern: "因为(.+?)所以" | |
| replacement: "\\1故" | |
| saves: 3 | |
| risk: low | |
| - id: L2-007 | |
| layer: L2 | |
| pattern: "如果(.+?)那么" | |
| replacement: "若\\1则" | |
| saves: 2 | |
| risk: low | |
| # ---- List conversion ---- | |
| # Ordinal words followed by a comma become numbered-list markers. The comma | |
| # is required and is matched in both ASCII and full-width form. | |
| - id: L2-010 | |
| layer: L2 | |
| pattern: "第一[,,]" | |
| replacement: "1. " | |
| saves: 1 | |
| risk: low | |
| - id: L2-011 | |
| layer: L2 | |
| pattern: "第二[,,]" | |
| replacement: "2. " | |
| saves: 1 | |
| risk: low | |
| - id: L2-012 | |
| layer: L2 | |
| pattern: "第三[,,]" | |
| replacement: "3. " | |
| saves: 1 | |
| risk: low | |
| - id: L2-013 | |
| layer: L2 | |
| pattern: "第四[,,]" | |
| replacement: "4. " | |
| saves: 1 | |
| risk: low | |
| # 首先 / 其次 are mapped to the fixed numbers 1 and 2. | |
| - id: L2-014 | |
| layer: L2 | |
| pattern: "首先[,,]" | |
| replacement: "1. " | |
| saves: 1 | |
| risk: low | |
| - id: L2-015 | |
| layer: L2 | |
| pattern: "其次[,,]" | |
| replacement: "2. " | |
| saves: 1 | |
| risk: low | |
| # ============================================================ | |
| # L2 protocol rewrites (revised in v0.2) | |
| # Measured: "###" is 1 token on all 9 tokenizers tested | |
| # ============================================================ | |
| # Output-format requests become a "### 输出" section on its own line. | |
| - id: L2-020 | |
| layer: L2 | |
| pattern: "请\\s*(?:用|以)?\\s*(?:JSON|json|Json)\\s*格式\\s*(?:输出|返回|回答)" | |
| replacement: "\n### 输出\nJSON" | |
| saves: 4 | |
| risk: low | |
| - id: L2-021 | |
| layer: L2 | |
| pattern: "请\\s*(?:用|以)?\\s*中文\\s*(?:回答|回复|输出)" | |
| replacement: "\n### 输出\n中文" | |
| saves: 3 | |
| risk: low | |
| # Role-play requests become a "### 角色" section. The role text is captured | |
| # lazily and ends at the first punctuation mark (ASCII or full-width comma, | |
| # either full stop, newline), at 的角色, or at end of input. | |
| - id: L2-022 | |
| layer: L2 | |
| pattern: "请\\s*(?:你)?\\s*扮演\\s*(?:一(?:个|位))?\\s*(.+?)(?=[,,。.\\n]|的角色|$)" | |
| replacement: "\n### 角色\n\\1\n" | |
| saves: 4 | |
| risk: high | |
| description: | | |
| '请你扮演一位 X' → '### 角色\nX' | |
| 已知问题:含空格的复合 NP 可能被截断,Day 3 用 jieba 修复 | |
| # ============================================================ | |
| # L3: idiom layer (default: the "universal" set of 11 core idioms; must be | |
| # enabled explicitly with layer=L3) | |
| # Each idiom is 1 token on >=3 Chinese-vendor tokenizers, as measured in | |
| # idiom_whitelist.json | |
| # NOTE(review): only 6 L3 rules are defined here versus the 11 idioms | |
| # mentioned above — confirm whether rules are missing or the count is stale. | |
| # ============================================================ | |
| # Each rule replaces a plain-language paraphrase with a four-character idiom. | |
| - id: L3-001 | |
| layer: L3 | |
| pattern: "(?:大家都知道|每个人都知道|众人皆知)" | |
| replacement: "众所周知" | |
| saves: 2 | |
| risk: mid | |
| - id: L3-002 | |
| layer: L3 | |
| pattern: "投入(?:全部|所有)?(?:精力|力量)(?:去做|做)?" | |
| replacement: "全力以赴" | |
| saves: 2 | |
| risk: mid | |
| - id: L3-003 | |
| layer: L3 | |
| pattern: "(?:根据|结合|按照)(?:当地|实际)情况" | |
| replacement: "因地制宜" | |
| saves: 2 | |
| risk: mid | |
| - id: L3-004 | |
| layer: L3 | |
| pattern: "(?:一步一步|一步步)(?:地)?(?:推进|进行)" | |
| replacement: "循序渐进" | |
| saves: 3 | |
| risk: mid | |
| - id: L3-005 | |
| layer: L3 | |
| pattern: "(?:不断|持续|一直)(?:坚持|努力做)" | |
| replacement: "持之以恒" | |
| saves: 2 | |
| risk: mid | |
| # NOTE(review): 认真对待 normally takes an object (认真对待每个问题) while 脚踏实地 | |
| # does not, so the rewritten sentence may be ungrammatical — confirm. | |
| - id: L3-006 | |
| layer: L3 | |
| pattern: "认真(?:仔细)?(?:地)?对待" | |
| replacement: "脚踏实地" | |
| saves: 1 | |
| risk: mid | |
| # ============================================================ | |
| # L4: protocol layer — normalizes section labels the user already wrote | |
| # ============================================================ | |
| # Each rule matches an optional run of "#", a label keyword, and a colon | |
| # (ASCII or full-width), and rewrites it to the canonical "### <label>" | |
| # heading followed by a newline. saves is 0: these rules normalize rather | |
| # than shorten. | |
| # NOTE(review): the patterns are not anchored to the start of a line, so a | |
| # keyword followed by a colon in mid-sentence is rewritten too — confirm. | |
| - id: L4-001 | |
| layer: L4 | |
| pattern: "(?:#+\\s*)?(?:任务|目标|Task|TASK)\\s*[::]\\s*" | |
| replacement: "### 任务\n" | |
| saves: 0 | |
| risk: low | |
| - id: L4-002 | |
| layer: L4 | |
| pattern: "(?:#+\\s*)?(?:角色|身份|Role|ROLE)\\s*[::]\\s*" | |
| replacement: "### 角色\n" | |
| saves: 0 | |
| risk: low | |
| - id: L4-003 | |
| layer: L4 | |
| pattern: "(?:#+\\s*)?(?:输出|返回|输出格式|Output|OUTPUT)\\s*[::]\\s*" | |
| replacement: "### 输出\n" | |
| saves: 0 | |
| risk: low | |
| - id: L4-004 | |
| layer: L4 | |
| pattern: "(?:#+\\s*)?(?:约束|限制|要求|规则|Constraints|CONSTRAINTS)\\s*[::]\\s*" | |
| replacement: "### 约束\n" | |
| saves: 0 | |
| risk: low | |