# Miaode's picture
# Add files using upload-large-folder tool
# e020674 verified
class KnowledgeCleanerPrompt:
    """Prompt generator for knowledge cleaning, with English/Chinese variants.

    The instance builds a rule header once (``prompt_header``) and then wraps
    raw content with it via :meth:`Classic_COT_Prompt`.  :meth:`post_process`
    applies a light normalisation pass to the model's output.

    Attributes:
        lang: ``"en"`` selects the English prompt; any other value selects
            the Chinese prompt.
        strict_mode: Toggles the stricter wording of the code-block rule in
            the prompt and enables the rewriting done by ``post_process``.
        prompt_header: The rule text plus worked example, built in
            ``_init_prompt_header``.
    """

    def __init__(self, lang: str = "en", strict_mode: bool = True):
        """Store the configuration and build the language-specific header.

        Args:
            lang: Prompt language; ``"en"`` for English, anything else for
                Chinese.
            strict_mode: Whether to use strict code-block wording and enable
                post-processing.
        """
        self.lang = lang
        self.strict_mode = strict_mode
        self._init_prompt_header()

    def _init_prompt_header(self):
        """Build ``self.prompt_header`` for the configured language.

        Both templates are f-strings: the code-block rule (item 6) is
        interpolated from ``self.strict_mode``, and ``{{`` / ``}}`` in the
        examples are escaped braces that render as literal ``{`` / ``}``.
        """
        # NOTE(review): in the Chinese template the rules describe 【】
        # markers while the worked example uses [] markers; the text is left
        # untouched here because it is runtime prompt content.
        if self.lang == "en":
            self.prompt_header = f"""
You are a meticulous Knowledge Refinement Engineer. Apply these rules STRICTLY:
1. Remove redundant tags but retain:
- Semantic tags like <table>, <code>
- Meaningful attributes
2. Normalize special characters:
- Standardize quotes and dashes
- Convert ellipsis (...)
3. URL handling:
- Preserve footnote URLs
- Extract display texts
4. Text structure:
- Maintain paragraph/list breaks
- Keep code indentation
- Limit empty lines (max=2)
5. Reference processing (NEW):
- Images → "[Image: alt_text]"
- Signatures → "[Signature]"
6. Code blocks: {"(strict)" if self.strict_mode else ""}
- {"Force closure" if self.strict_mode else "Preserve raw"}
- Mark fragments as /*...*/
7. Absolute fidelity:
- NO fact/number modifications
- NO term paraphrasing
- NO table structure changes
8. Security Processing (NEW):
- PII: Phone/ID/Email must be masked, e.g.
Original: phone 13800138000 → Processed: phone 138****8000
- Classified: Mark 【Confidential】as 〖SEC∶classified〗
- Illegal: Replace sensitive content with 〖ILLEGAL∶removed〗
- Encryption tags: Use 〖〗for encrypted sections
Example:
Input:
<div class="article">
<h1>Knowledge Cleaning™</h1>
<figure>
<img src="process.png" alt="Cleaning Flowchart" title="Three Phases">
<figcaption>Fig.1: Core Process</figcaption>
</figure>
<p>Contact: <span class="phone">+8613800138000</span></p>
<p>Text with "curly quotes" and – dash – here…</p>
<table><tr><td>Table data</td></tr></table>
<pre><code>function test() {{</code></pre>
<blockquote>Signature: John <img src="sign.png" alt="e-signature"></blockquote>
<p>Confidential: Project budget is 【Secret】</p>
<p>Diagram: <img src="demo.jpg" class="diagram"></p>
</div>
Output:
<cleaned_start>
Knowledge Cleaning™
[Image: Cleaning Flowchart (Three Phases) Fig.1: Core Process]
Contact: +86*****8000
Text with "straight quotes" and - dash - here...
<table><tr><td>Table data</td></tr></table>
<code>function test() {{ /*...*/ }}</code>
[Signature]Signature: John [Image: e-signature]
〖SEC∶classified content〗
Diagram: [Image: Diagram demo.jpg]
<cleaned_end>
"""
        else:
            self.prompt_header = f"""
你是一名严谨的知识清洗工程师。请严格按照以下规则处理原始内容:
1. 移除冗余HTML/XML标签,但保留:
- 语义化标签如 <table>、<code>、<formula>
- 所有携带意义的属性值
2. 规范化特殊字符:
- 将花引号(“ ” ‘ ’)转为标准引号(" ")
- 将长破折号(– —)替换为短横线(-)
- 中文省略号(…)转为英文省略号(...)
- 保留数学符号和技术记号(如<<、>>等操作符)
3. 链接处理:
- 脚注/参考文献中的URL保持原样
- 移除超链接包装但保留显示文本
示例:<a href="https://example.com">示例</a> → 示例
4. 文本结构:
- 保持原始段落/列表的换行
- 保留代码/引用的缩进层级
- 压缩连续空行为最多2行
5. 引用内容处理(新增):
- 图片引用转换为【引用图片:描述文本】
- 签名区块标记为【签名引用】
6. 代码块处理:{"(严格模式)" if self.strict_mode else ""}
- {"确保代码块闭合(如补全缺失的括号)" if self.strict_mode else "保持代码原样"}
- 标记不完整代码为/*...*/
7. 绝对保真:
- 禁止增删任何事实、数字或命名实体
- 禁止改写专业术语或专有名词
- 禁止修改表格数据结构
8. 安全处理(新增):
- 个人隐私:身份证号/手机号/邮箱等需脱敏,示例:
原文本:电话 13800138000 → 处理后:电话 138****8000
- 涉密内容:检测到【机密】【秘密】等关键词时,整句替换为【涉密内容已加密】
- 违规信息:政治敏感、暴恐等内容替换为【违规内容已屏蔽】
- 加密标记:使用〖〗包裹加密区域,示例:
〖PII∶身份证号〗〖SEC∶机密字段〗
示例:
输入:
<div class="article">
<h1>知识清洗®</h1>
<figure>
<img src="process.png" alt="清洗流程图" title="三阶段处理">
<figcaption>图1:核心流程</figcaption>
</figure>
<p>联系电话:<span class="phone">13800138000</span></p>
<p>这是包含"花引号"和—破折号—的文本…</p>
<table><tr><td>表格数据</td></tr></table>
<pre><code>function test() {{</code></pre>
<blockquote>签名:张三 <img src="sign.png" alt="电子签名"></blockquote>
<p>机密信息:本项目预算为【机密】</p>
<p>示意图:<img src="demo.jpg" class="diagram"></p>
</div>
输出:
<cleaned_start>
知识清洗®
[引用图片:清洗流程图(三阶段处理)图1:核心流程]
联系电话:138****8000
这是包含"花引号"和-破折号-的文本...
<table><tr><td>表格数据</td></tr></table>
<code>function test() {{ /*...*/ }}</code>
[签名引用]签名:张三 [引用图片:电子签名]
涉密内容已加密
示意图:[引用图片:示意图demo.jpg]
<cleaned_end>
"""

    def Classic_COT_Prompt(self, raw_content: str) -> str:
        """Build the full chain-of-thought cleaning prompt for ``raw_content``.

        The result is the rule header, the raw content, the list of
        processing steps and the output-format requirement, joined by
        newlines and stripped of leading/trailing whitespace.

        Args:
            raw_content: The uncleaned text to embed in the prompt.

        Returns:
            The assembled prompt string.
        """
        if self.lang == "en":
            processing_steps = """
Processing Steps:
1. [Tag Analysis] Classify markup tags
2. [Reference Extraction] Isolate images/tables
3. [Character Audit] Log special chars
4. [Structure Check] Validate hierarchy
5. [Final Output] Generate cleaned text
""".strip()
            output_requirement = 'Response must contain ONLY cleaned text between <cleaned_start> and <cleaned_end>.'
        else:
            processing_steps = """
处理步骤:
1. [标签分析] 识别并分类所有标记标签
2. [引用提取] 分离图片/表格/签名等引用内容
3. [字符审核] 记录特殊字符变更
4. [结构检查] 验证文本层级
5. [最终输出] 生成清洗后文本
""".strip()
            output_requirement = '响应必须只包含清洗后文本,以<cleaned_start>开头,<cleaned_end>结尾,无其他内容。'
        # NOTE(review): the "待清洗内容:" label is emitted for both languages,
        # including English mode; kept as-is because it is prompt content.
        return f"""
{self.prompt_header}
待清洗内容:
{raw_content}
{processing_steps}
{output_requirement}
""".strip()

    def post_process(self, cleaned_text: str) -> str:
        """Normalise leftover Markdown image links in the cleaned text.

        In strict mode, every ``![alt](url)`` whose alt text contains "图片"
        is rewritten to ``【引用图片:alt】``; other image links are left
        untouched.  Outside strict mode the text is returned unchanged.

        Args:
            cleaned_text: Model output to post-process.

        Returns:
            The text with matching image links rewritten.
        """
        # Local import: no module-level import block is visible in this file.
        import re

        if not self.strict_mode:
            return cleaned_text

        def _to_reference(match):
            marker = match.group(1)  # the "![alt]" part of the link
            if "图片" not in marker:
                return match.group(0)
            # [2:-1] drops the leading "![" and the trailing "]".
            return f"【引用图片:{marker[2:-1]}】"

        # The brackets and parentheses must be escaped; the previous pattern
        # used "$" anchors in their place and therefore could never match.
        return re.sub(r'(!\[.*?\])\(.+?\)', _to_reference, cleaned_text)