PregoPal / modules /diet_extractor.py
J.B-Lin
chore: save current version before UI improvements (i18n + font fix)
ec90eae
Raw
History Blame Contribute Delete
9.63 kB
"""
PregoPal - 对话内容提取器
=========================
从 AI 回复中通过正则表达式提取结构化数据。
提取标记格式(在 system prompt 中告知模型使用):
[EXTRACT_DIET]...[/EXTRACT_DIET] # 饮食记录
[EXTRACT_RECIPE]...[/EXTRACT_RECIPE] # 菜谱信息
[EXTRACT_PREFERENCE]...[/EXTRACT_PREFERENCE] # 偏好/忌口/过敏
[EXTRACT_WEIGHT]...[/EXTRACT_WEIGHT] # 体重记录
[EXTRACT_MEMORY]...[/EXTRACT_MEMORY] # 家庭记忆
[THINKING]...[/THINKING] # 当前思考步骤
"""
import re
import datetime
class DietExtractor:
"""从 AI 回复中提取所有结构化数据"""
# ============================================================
# 正则模式
# ============================================================
DIET_PATTERN = re.compile(
r'\[EXTRACT_DIET\](.*?)\[/EXTRACT_DIET\]', re.DOTALL
)
RECIPE_PATTERN = re.compile(
r'\[EXTRACT_RECIPE\](.*?)\[/EXTRACT_RECIPE\]', re.DOTALL
)
PREFERENCE_PATTERN = re.compile(
r'\[EXTRACT_PREFERENCE\](.*?)\[/EXTRACT_PREFERENCE\]', re.DOTALL
)
WEIGHT_PATTERN = re.compile(
r'\[EXTRACT_WEIGHT\](.*?)\[/EXTRACT_WEIGHT\]', re.DOTALL
)
MEMORY_PATTERN = re.compile(
r'\[EXTRACT_MEMORY\](.*?)\[/EXTRACT_MEMORY\]', re.DOTALL
)
THINKING_PATTERN = re.compile(
r'\[THINKING\](.*?)\[/THINKING\]', re.DOTALL
)
# ============================================================
# 统一提取接口
# ============================================================
@staticmethod
def extract_all(text: str) -> dict:
"""从文本中提取所有结构化数据"""
return {
"diets": DietExtractor._parse_diets(text),
"recipes": DietExtractor._parse_recipes(text),
"preferences": DietExtractor._parse_preferences(text),
"weights": DietExtractor._parse_weights(text),
"memories": DietExtractor._parse_memories(text),
"thinking": DietExtractor._parse_thinking(text),
}
# ============================================================
# 各类型解析
# ============================================================
@staticmethod
def _parse_block(text: str, pattern: re.Pattern) -> list[dict]:
"""通用解析块"""
results = []
for match in pattern.finditer(text):
block = match.group(1).strip()
entry = {}
for line in block.split('\n'):
line = line.strip()
if ':' in line:
key, val = line.split(':', 1)
entry[key.strip()] = val.strip()
if entry:
results.append(entry)
return results
@staticmethod
def _parse_diets(text: str) -> list[dict]:
"""解析 [EXTRACT_DIET] 块"""
results = []
for match in DietExtractor.DIET_PATTERN.finditer(text):
block = match.group(1).strip()
entry = {"meals": {}}
for line in block.split('\n'):
line = line.strip()
if not line or ':' not in line:
continue
key, val = line.split(':', 1)
key = key.strip()
val = val.strip()
if key in ('日期', '记录人', '备注'):
entry[key] = val
elif key in ('早餐', '午餐', '晚餐', '加餐'):
if val:
entry["meals"][key] = val
if entry.get("meals"):
results.append(entry)
return results
@staticmethod
def _parse_recipes(text: str) -> list[dict]:
"""解析 [EXTRACT_RECIPE] 块"""
return DietExtractor._parse_block(text, DietExtractor.RECIPE_PATTERN)
@staticmethod
def _parse_preferences(text: str) -> list[dict]:
"""解析 [EXTRACT_PREFERENCE] 块"""
return DietExtractor._parse_block(text, DietExtractor.PREFERENCE_PATTERN)
@staticmethod
def _parse_weights(text: str) -> list[dict]:
"""解析 [EXTRACT_WEIGHT] 块"""
return DietExtractor._parse_block(text, DietExtractor.WEIGHT_PATTERN)
@staticmethod
def _parse_memories(text: str) -> list[dict]:
"""解析 [EXTRACT_MEMORY] 块"""
return DietExtractor._parse_block(text, DietExtractor.MEMORY_PATTERN)
@staticmethod
def _parse_thinking(text: str) -> dict | None:
"""解析 [THINKING] 块(只取第一个)"""
match = DietExtractor.THINKING_PATTERN.search(text)
if not match:
return None
block = match.group(1).strip()
result = {}
for line in block.split('\n'):
line = line.strip()
if ':' in line:
key, val = line.split(':', 1)
result[key.strip()] = val.strip()
return result if result else None
# ============================================================
# Fallback 关键词匹配(当正则失败时使用)
# ============================================================
@staticmethod
def fallback_extract_diet(text: str) -> dict | None:
"""
当 [EXTRACT_DIET] 格式不完整时,用关键词匹配尝试提取
匹配模式:
- "吃了" → 记录食物
- "想吃" → 记录愿望
- "记录" → 记录内容
"""
result = {"meals": {}, "notes": ""}
# 尝试匹配 "吃了" 模式
eat_patterns = [
r'(早餐|午餐|晚餐|加餐).{0,5}(?:吃了|吃的|吃的是|想吃)(.{1,50})',
r'(?:吃了|吃的|吃的是|想吃).{0,5}(早餐|午餐|晚餐|加餐)(.{1,50})',
]
for pattern in eat_patterns:
for match in re.finditer(pattern, text):
meal_time = match.group(1)
food = match.group(2).strip()
if meal_time in ('早餐', '午餐', '晚餐', '加餐') and food:
result["meals"][meal_time] = food
# 尝试匹配 "记录" 模式
record_match = re.search(r'记录[了::]?\s*(.{1,100})', text)
if record_match:
result["notes"] = record_match.group(1).strip()
return result if result["meals"] else None
@staticmethod
def fallback_extract_thinking(text: str) -> dict | None:
"""
当 [THINKING] 格式不完整时,用关键词匹配尝试提取
"""
result = {}
step_match = re.search(r'(?:当前步骤|正在|当前).{0,5}(.{5,30})', text)
if step_match:
result["当前步骤"] = step_match.group(1).strip()
next_match = re.search(r'(?:下一步|接下来|然后).{0,5}(.{5,30})', text)
if next_match:
result["下一步"] = next_match.group(1).strip()
return result if result else None
# ============================================================
# 综合提取(正则优先 + fallback)
# ============================================================
@staticmethod
def robust_extract(text: str) -> dict:
"""综合提取:先用正则,失败时用 fallback"""
result = DietExtractor.extract_all(text)
# 如果正则没提取到饮食,尝试 fallback
if not result["diets"]:
fallback = DietExtractor.fallback_extract_diet(text)
if fallback:
result["diets"] = [fallback]
# 如果正则没提取到 thinking,尝试 fallback
if not result["thinking"]:
fallback = DietExtractor.fallback_extract_thinking(text)
if fallback:
result["thinking"] = fallback
return result
# ============================================================
# System prompt template (tells the model which marker format to use)
# ============================================================
# Prompt text instructing the model to wrap structured data in the
# [EXTRACT_*] / [THINKING] markers matched by DietExtractor's patterns.
# The {date} placeholder (used in the diet and weight sections) is filled in
# by get_extract_prompt() through str.format(), so any literal brace added
# to this text must be doubled ("{{" / "}}").
EXTRACT_SYSTEM_PROMPT = """
## 数据记录格式
当对话中出现以下情况时,请使用特定标记包裹结构化数据:
### 1. 记录饮食
当孕妇或家人提到吃了什么、想吃什么时,输出:
[EXTRACT_DIET]
日期: {date}
餐次: 早餐/午餐/晚餐/加餐
食物: 具体食物名称
份量: 大概份量
记录人: 说话人身份
备注: 孕妇说想吃/孕妇吃了/家人说孕妇想吃
[/EXTRACT_DIET]
### 2. 记录菜谱
当家人提到会做什么菜时,输出:
[EXTRACT_RECIPE]
菜名: 菜名
制作人: 谁做的
难度: 简单/中等/困难
食材: 主要食材
备注: 孕妇爱吃/家人爱吃
[/EXTRACT_RECIPE]
### 3. 记录偏好/忌口/过敏
当提到饮食偏好、忌口或过敏时,输出:
[EXTRACT_PREFERENCE]
人员: 姓名
类型: 偏好/忌口/过敏
内容: 具体内容
[/EXTRACT_PREFERENCE]
### 4. 记录体重
当孕妇提到体重时,输出:
[EXTRACT_WEIGHT]
日期: {date}
体重: 数值(kg)
记录人: 说话人身份
[/EXTRACT_WEIGHT]
### 5. 记录家庭记忆
当提到家庭关系或重要事件时,输出:
[EXTRACT_MEMORY]
类型: 关系/事件/日常
内容: 具体内容
[/EXTRACT_MEMORY]
### 6. 显示当前思考
每次执行下一步操作时,输出:
[THINKING]
当前步骤: 正在做什么
下一步: 接下来要做什么
[/THINKING]
注意:标记块可以出现在回复的任何位置,不影响正常对话内容。
"""
def get_extract_prompt(date_str: str | None = None) -> str:
    """Return the extraction system prompt with its date placeholder filled in.

    Args:
        date_str: Text substituted for every ``{date}`` placeholder in
            ``EXTRACT_SYSTEM_PROMPT``. When ``None`` (the default), today's
            local date in ISO format (``YYYY-MM-DD``) is used.

    Returns:
        The formatted prompt text.
    """
    if date_str is None:
        date_str = datetime.date.today().isoformat()
    return EXTRACT_SYSTEM_PROMPT.format(date=date_str)