Add files using upload-large-folder tool

e020674 verified 5 months ago

7.24 kB

	class KnowledgeCleanerPrompt:
	    """Build LLM prompts for "knowledge cleaning" of raw (HTML-like) content.

	    The class holds a language-specific rule header (English when
	    ``lang == "en"``, Chinese for every other value) and can wrap a piece of
	    raw content into a complete step-by-step cleaning prompt.

	    Attributes set by ``__init__``:
	        lang: Language code selecting the prompt language.
	        strict_mode: Toggles the strict wording of the code-block rule in the
	            header and enables the rewrite done by ``post_process``.
	        prompt_header: Rule/example header built by ``_init_prompt_header``.
	    """

	    def __init__(self, lang: str = "en", strict_mode: bool = True):
	        """Store the configuration and build the language-specific header.

	        Args:
	            lang: ``"en"`` selects the English prompt; any other value
	                selects the Chinese prompt.
	            strict_mode: When true, the header asks for forced closure of
	                code blocks and ``post_process`` rewrites image references.
	        """
	        self.lang = lang
	        self.strict_mode = strict_mode
	        self._init_prompt_header()

	    def _init_prompt_header(self):
	        """Initialise ``self.prompt_header`` according to ``self.lang``.

	        The header is an f-string: rule 6 is interpolated from
	        ``self.strict_mode``, and literal braces in the embedded code sample
	        are written as ``{{`` / ``}}`` so they render as ``{`` / ``}``.
	        """
	        if self.lang == "en":
	            self.prompt_header = f"""
	You are a meticulous Knowledge Refinement Engineer. Apply these rules STRICTLY:

	1. Remove redundant tags but retain:
	- Semantic tags like <table>, <code>
	- Meaningful attributes

	2. Normalize special characters:
	- Standardize quotes and dashes
	- Convert ellipsis (...)

	3. URL handling:
	- Preserve footnote URLs
	- Extract display texts

	4. Text structure:
	- Maintain paragraph/list breaks
	- Keep code indentation
	- Limit empty lines (max=2)

	5. Reference processing (NEW):
	- Images → "[Image: alt_text]"
	- Signatures → "[Signature]"

	6. Code blocks: {"(strict)" if self.strict_mode else ""}
	- {"Force closure" if self.strict_mode else "Preserve raw"}
	- Mark fragments as /.../

	7. Absolute fidelity:
	- NO fact/number modifications
	- NO term paraphrasing
	- NO table structure changes

	8. Security Processing (NEW):
	- PII: Phone/ID/Email must be masked, e.g.
	Original: phone 13800138000 → Processed: phone 138****8000
	- Classified: Mark 【Confidential】as 〖SEC∶classified〗
	- Illegal: Replace sensitive content with 〖ILLEGAL∶removed〗
	- Encryption tags: Use 〖〗for encrypted sections

	Example:
	Input:
	<div class="article">
	<h1>Knowledge Cleaning™</h1>
	<figure>
	<img src="process.png" alt="Cleaning Flowchart" title="Three Phases">
	<figcaption>Fig.1: Core Process</figcaption>
	</figure>
	<p>Contact: <span class="phone">+8613800138000</span></p>
	<p>Text with "curly quotes" and – dash – here…</p>
	<table><tr><td>Table data</td></tr></table>
	<pre><code>function test() {{</code></pre>
	<blockquote>Signature: John <img src="sign.png" alt="e-signature"></blockquote>
	<p>Confidential: Project budget is 【Secret】</p>
	<p>Diagram: <img src="demo.jpg" class="diagram"></p>
	</div>

	Output:
	<cleaned_start>
	Knowledge Cleaning™

	[Image: Cleaning Flowchart (Three Phases) Fig.1: Core Process]

	Contact: +86*****8000

	Text with "straight quotes" and - dash - here...

	<table><tr><td>Table data</td></tr></table>

	<code>function test() {{ /.../ }}</code>

	[Signature]Signature: John [Image: e-signature]

	〖SEC∶classified content〗

	Diagram: [Image: Diagram demo.jpg]
	<cleaned_end>
	"""
	        else:
	            # Every language code other than "en" falls back to Chinese.
	            self.prompt_header = f"""
	你是一名严谨的知识清洗工程师。请严格按照以下规则处理原始内容：

	1. 移除冗余HTML/XML标签，但保留：
	- 语义化标签如 <table>、<code>、<formula>
	- 所有携带意义的属性值

	2. 规范化特殊字符：
	- 将花引号（“ ” ‘ ’）转为标准引号（" "）
	- 将长破折号（– —）替换为短横线（-）
	- 中文省略号（…）转为英文省略号（...）
	- 保留数学符号和技术记号（如<<、>>等操作符）

	3. 链接处理：
	- 脚注/参考文献中的URL保持原样
	- 移除超链接包装但保留显示文本
	示例：<a href="https://example.com">示例</a> → 示例

	4. 文本结构：
	- 保持原始段落/列表的换行
	- 保留代码/引用的缩进层级
	- 压缩连续空行为最多2行

	5. 引用内容处理（新增）：
	- 图片引用转换为【引用图片：描述文本】
	- 签名区块标记为【签名引用】

	6. 代码块处理：{"（严格模式）" if self.strict_mode else ""}
	- {"确保代码块闭合（如补全缺失的括号）" if self.strict_mode else "保持代码原样"}
	- 标记不完整代码为/.../

	7. 绝对保真：
	- 禁止增删任何事实、数字或命名实体
	- 禁止改写专业术语或专有名词
	- 禁止修改表格数据结构

	8. 安全处理（新增）：
	- 个人隐私：身份证号/手机号/邮箱等需脱敏，示例：
	原文本：电话 13800138000 → 处理后：电话 138****8000
	- 涉密内容：检测到【机密】【秘密】等关键词时，整句替换为【涉密内容已加密】
	- 违规信息：政治敏感、暴恐等内容替换为【违规内容已屏蔽】
	- 加密标记：使用〖〗包裹加密区域，示例：
	〖PII∶身份证号〗〖SEC∶机密字段〗

	示例：
	输入：
	<div class="article">
	<h1>知识清洗®</h1>
	<figure>
	<img src="process.png" alt="清洗流程图" title="三阶段处理">
	<figcaption>图1：核心流程</figcaption>
	</figure>
	<p>联系电话：<span class="phone">13800138000</span></p>
	<p>这是包含"花引号"和—破折号—的文本…</p>
	<table><tr><td>表格数据</td></tr></table>
	<pre><code>function test() {{</code></pre>
	<blockquote>签名：张三 <img src="sign.png" alt="电子签名"></blockquote>
	<p>机密信息：本项目预算为【机密】</p>
	<p>示意图：<img src="demo.jpg" class="diagram"></p>
	</div>

	输出：
	<cleaned_start>
	知识清洗®

	[引用图片：清洗流程图（三阶段处理）图1：核心流程]

	联系电话：138****8000

	这是包含"花引号"和-破折号-的文本...

	<table><tr><td>表格数据</td></tr></table>

	<code>function test() {{ /.../ }}</code>

	[签名引用]签名：张三 [引用图片：电子签名]

	涉密内容已加密

	示意图：[引用图片：示意图demo.jpg]
	<cleaned_end>
	"""

	    def Classic_COT_Prompt(self, raw_content: str) -> str:
	        """Assemble the complete step-by-step cleaning prompt for ``raw_content``.

	        The result is, in order: the rule header, a label followed by the raw
	        content, the list of processing steps, and the output-format
	        requirement. Leading and trailing whitespace of the whole prompt is
	        stripped.

	        Args:
	            raw_content: The raw text/markup to be cleaned; inserted verbatim.

	        Returns:
	            The assembled prompt string.
	        """
	        if self.lang == "en":
	            # Label the raw content in English so the whole English prompt
	            # is in a single language.
	            content_label = "Content to clean:"
	            processing_steps = """
	Processing Steps:
	1. [Tag Analysis] Classify markup tags
	2. [Reference Extraction] Isolate images/tables
	3. [Character Audit] Log special chars
	4. [Structure Check] Validate hierarchy
	5. [Final Output] Generate cleaned text
	""".strip()
	            output_requirement = 'Response must contain ONLY cleaned text between <cleaned_start> and <cleaned_end>.'
	        else:
	            content_label = "待清洗内容："
	            processing_steps = """
	处理步骤：
	1. [标签分析] 识别并分类所有标记标签
	2. [引用提取] 分离图片/表格/签名等引用内容
	3. [字符审核] 记录特殊字符变更
	4. [结构检查] 验证文本层级
	5. [最终输出] 生成清洗后文本
	""".strip()
	            output_requirement = '响应必须只包含清洗后文本，以<cleaned_start>开头，<cleaned_end>结尾，无其他内容。'

	        return f"""
	{self.prompt_header}

	{content_label}
	{raw_content}

	{processing_steps}

	{output_requirement}
	""".strip()

	    def post_process(self, cleaned_text: str) -> str:
	        """Post-process model output by normalising leftover image references.

	        Only active in strict mode. Markdown image references of the form
	        ``![alt](url)`` whose alt text contains "图片" are rewritten to the
	        balanced marker ``【引用图片：alt】``; all other text, including
	        image references without that keyword, is returned unchanged.

	        Args:
	            cleaned_text: Text produced by the cleaning step.

	        Returns:
	            The text with qualifying image references rewritten, or the
	            input unchanged when ``strict_mode`` is false.
	        """
	        if not self.strict_mode:
	            return cleaned_text

	        # Imported locally so the method does not rely on a module-level
	        # import that is not visible alongside this class.
	        import re

	        def _to_image_marker(match):
	            # Group 1 is the alt text between "![" and "]".
	            alt_text = match.group(1)
	            if "图片" in alt_text:
	                # Emit a closed marker so the reference brackets stay balanced.
	                return f"【引用图片：{alt_text}】"
	            return match.group(0)

	        return re.sub(r'!\[(.*?)\]\(.+?\)', _to_image_marker, cleaned_text)