Spaces:
Runtime error
Runtime error
| """ | |
PregoPal - Conversation Content Extractor
=========================================
Extracts structured data from AI replies using regular expressions.
Marker formats (the system prompt instructs the model to use them):
[EXTRACT_DIET]...[/EXTRACT_DIET]              # diet record
[EXTRACT_RECIPE]...[/EXTRACT_RECIPE]          # recipe information
[EXTRACT_PREFERENCE]...[/EXTRACT_PREFERENCE]  # preference / dietary restriction / allergy
[EXTRACT_WEIGHT]...[/EXTRACT_WEIGHT]          # weight record
[EXTRACT_MEMORY]...[/EXTRACT_MEMORY]          # family memory
[THINKING]...[/THINKING]                      # current thinking step
| """ | |
| import re | |
| import datetime | |
| class DietExtractor: | |
| """从 AI 回复中提取所有结构化数据""" | |
| # ============================================================ | |
| # 正则模式 | |
| # ============================================================ | |
| DIET_PATTERN = re.compile( | |
| r'\[EXTRACT_DIET\](.*?)\[/EXTRACT_DIET\]', re.DOTALL | |
| ) | |
| RECIPE_PATTERN = re.compile( | |
| r'\[EXTRACT_RECIPE\](.*?)\[/EXTRACT_RECIPE\]', re.DOTALL | |
| ) | |
| PREFERENCE_PATTERN = re.compile( | |
| r'\[EXTRACT_PREFERENCE\](.*?)\[/EXTRACT_PREFERENCE\]', re.DOTALL | |
| ) | |
| WEIGHT_PATTERN = re.compile( | |
| r'\[EXTRACT_WEIGHT\](.*?)\[/EXTRACT_WEIGHT\]', re.DOTALL | |
| ) | |
| MEMORY_PATTERN = re.compile( | |
| r'\[EXTRACT_MEMORY\](.*?)\[/EXTRACT_MEMORY\]', re.DOTALL | |
| ) | |
| THINKING_PATTERN = re.compile( | |
| r'\[THINKING\](.*?)\[/THINKING\]', re.DOTALL | |
| ) | |
| # ============================================================ | |
| # 统一提取接口 | |
| # ============================================================ | |
| def extract_all(text: str) -> dict: | |
| """从文本中提取所有结构化数据""" | |
| return { | |
| "diets": DietExtractor._parse_diets(text), | |
| "recipes": DietExtractor._parse_recipes(text), | |
| "preferences": DietExtractor._parse_preferences(text), | |
| "weights": DietExtractor._parse_weights(text), | |
| "memories": DietExtractor._parse_memories(text), | |
| "thinking": DietExtractor._parse_thinking(text), | |
| } | |
| # ============================================================ | |
| # 各类型解析 | |
| # ============================================================ | |
| def _parse_block(text: str, pattern: re.Pattern) -> list[dict]: | |
| """通用解析块""" | |
| results = [] | |
| for match in pattern.finditer(text): | |
| block = match.group(1).strip() | |
| entry = {} | |
| for line in block.split('\n'): | |
| line = line.strip() | |
| if ':' in line: | |
| key, val = line.split(':', 1) | |
| entry[key.strip()] = val.strip() | |
| if entry: | |
| results.append(entry) | |
| return results | |
| def _parse_diets(text: str) -> list[dict]: | |
| """解析 [EXTRACT_DIET] 块""" | |
| results = [] | |
| for match in DietExtractor.DIET_PATTERN.finditer(text): | |
| block = match.group(1).strip() | |
| entry = {"meals": {}} | |
| for line in block.split('\n'): | |
| line = line.strip() | |
| if not line or ':' not in line: | |
| continue | |
| key, val = line.split(':', 1) | |
| key = key.strip() | |
| val = val.strip() | |
| if key in ('日期', '记录人', '备注'): | |
| entry[key] = val | |
| elif key in ('早餐', '午餐', '晚餐', '加餐'): | |
| if val: | |
| entry["meals"][key] = val | |
| if entry.get("meals"): | |
| results.append(entry) | |
| return results | |
| def _parse_recipes(text: str) -> list[dict]: | |
| """解析 [EXTRACT_RECIPE] 块""" | |
| return DietExtractor._parse_block(text, DietExtractor.RECIPE_PATTERN) | |
| def _parse_preferences(text: str) -> list[dict]: | |
| """解析 [EXTRACT_PREFERENCE] 块""" | |
| return DietExtractor._parse_block(text, DietExtractor.PREFERENCE_PATTERN) | |
| def _parse_weights(text: str) -> list[dict]: | |
| """解析 [EXTRACT_WEIGHT] 块""" | |
| return DietExtractor._parse_block(text, DietExtractor.WEIGHT_PATTERN) | |
| def _parse_memories(text: str) -> list[dict]: | |
| """解析 [EXTRACT_MEMORY] 块""" | |
| return DietExtractor._parse_block(text, DietExtractor.MEMORY_PATTERN) | |
| def _parse_thinking(text: str) -> dict | None: | |
| """解析 [THINKING] 块(只取第一个)""" | |
| match = DietExtractor.THINKING_PATTERN.search(text) | |
| if not match: | |
| return None | |
| block = match.group(1).strip() | |
| result = {} | |
| for line in block.split('\n'): | |
| line = line.strip() | |
| if ':' in line: | |
| key, val = line.split(':', 1) | |
| result[key.strip()] = val.strip() | |
| return result if result else None | |
| # ============================================================ | |
| # Fallback 关键词匹配(当正则失败时使用) | |
| # ============================================================ | |
| def fallback_extract_diet(text: str) -> dict | None: | |
| """ | |
| 当 [EXTRACT_DIET] 格式不完整时,用关键词匹配尝试提取 | |
| 匹配模式: | |
| - "吃了" → 记录食物 | |
| - "想吃" → 记录愿望 | |
| - "记录" → 记录内容 | |
| """ | |
| result = {"meals": {}, "notes": ""} | |
| # 尝试匹配 "吃了" 模式 | |
| eat_patterns = [ | |
| r'(早餐|午餐|晚餐|加餐).{0,5}(?:吃了|吃的|吃的是|想吃)(.{1,50})', | |
| r'(?:吃了|吃的|吃的是|想吃).{0,5}(早餐|午餐|晚餐|加餐)(.{1,50})', | |
| ] | |
| for pattern in eat_patterns: | |
| for match in re.finditer(pattern, text): | |
| meal_time = match.group(1) | |
| food = match.group(2).strip() | |
| if meal_time in ('早餐', '午餐', '晚餐', '加餐') and food: | |
| result["meals"][meal_time] = food | |
| # 尝试匹配 "记录" 模式 | |
| record_match = re.search(r'记录[了::]?\s*(.{1,100})', text) | |
| if record_match: | |
| result["notes"] = record_match.group(1).strip() | |
| return result if result["meals"] else None | |
| def fallback_extract_thinking(text: str) -> dict | None: | |
| """ | |
| 当 [THINKING] 格式不完整时,用关键词匹配尝试提取 | |
| """ | |
| result = {} | |
| step_match = re.search(r'(?:当前步骤|正在|当前).{0,5}(.{5,30})', text) | |
| if step_match: | |
| result["当前步骤"] = step_match.group(1).strip() | |
| next_match = re.search(r'(?:下一步|接下来|然后).{0,5}(.{5,30})', text) | |
| if next_match: | |
| result["下一步"] = next_match.group(1).strip() | |
| return result if result else None | |
| # ============================================================ | |
| # 综合提取(正则优先 + fallback) | |
| # ============================================================ | |
| def robust_extract(text: str) -> dict: | |
| """综合提取:先用正则,失败时用 fallback""" | |
| result = DietExtractor.extract_all(text) | |
| # 如果正则没提取到饮食,尝试 fallback | |
| if not result["diets"]: | |
| fallback = DietExtractor.fallback_extract_diet(text) | |
| if fallback: | |
| result["diets"] = [fallback] | |
| # 如果正则没提取到 thinking,尝试 fallback | |
| if not result["thinking"]: | |
| fallback = DietExtractor.fallback_extract_thinking(text) | |
| if fallback: | |
| result["thinking"] = fallback | |
| return result | |
| # ============================================================ | |
| # System Prompt 模板(告知模型使用标记格式) | |
| # ============================================================ | |
| EXTRACT_SYSTEM_PROMPT = """ | |
| ## 数据记录格式 | |
| 当对话中出现以下情况时,请使用特定标记包裹结构化数据: | |
| ### 1. 记录饮食 | |
| 当孕妇或家人提到吃了什么、想吃什么时,输出: | |
| [EXTRACT_DIET] | |
| 日期: {date} | |
| 餐次: 早餐/午餐/晚餐/加餐 | |
| 食物: 具体食物名称 | |
| 份量: 大概份量 | |
| 记录人: 说话人身份 | |
| 备注: 孕妇说想吃/孕妇吃了/家人说孕妇想吃 | |
| [/EXTRACT_DIET] | |
| ### 2. 记录菜谱 | |
| 当家人提到会做什么菜时,输出: | |
| [EXTRACT_RECIPE] | |
| 菜名: 菜名 | |
| 制作人: 谁做的 | |
| 难度: 简单/中等/困难 | |
| 食材: 主要食材 | |
| 备注: 孕妇爱吃/家人爱吃 | |
| [/EXTRACT_RECIPE] | |
| ### 3. 记录偏好/忌口/过敏 | |
| 当提到饮食偏好、忌口或过敏时,输出: | |
| [EXTRACT_PREFERENCE] | |
| 人员: 姓名 | |
| 类型: 偏好/忌口/过敏 | |
| 内容: 具体内容 | |
| [/EXTRACT_PREFERENCE] | |
| ### 4. 记录体重 | |
| 当孕妇提到体重时,输出: | |
| [EXTRACT_WEIGHT] | |
| 日期: {date} | |
| 体重: 数值(kg) | |
| 记录人: 说话人身份 | |
| [/EXTRACT_WEIGHT] | |
| ### 5. 记录家庭记忆 | |
| 当提到家庭关系或重要事件时,输出: | |
| [EXTRACT_MEMORY] | |
| 类型: 关系/事件/日常 | |
| 内容: 具体内容 | |
| [/EXTRACT_MEMORY] | |
| ### 6. 显示当前思考 | |
| 每次执行下一步操作时,输出: | |
| [THINKING] | |
| 当前步骤: 正在做什么 | |
| 下一步: 接下来要做什么 | |
| [/THINKING] | |
| 注意:标记块可以出现在回复的任何位置,不影响正常对话内容。 | |
| """ | |
| def get_extract_prompt(date_str: str = None) -> str: | |
| """获取带日期的提取提示词""" | |
| if date_str is None: | |
| date_str = datetime.date.today().isoformat() | |
| return EXTRACT_SYSTEM_PROMPT.format(date=date_str) | |