Spaces:
Running on Zero
Running on Zero
| """Dream text parser: extracts entities and emotions from dream descriptions. | |
| Supports both structured JSON output (8B model) and free-text fallback (1B model). | |
| """ | |
| import re | |
| from typing import Optional | |
| from core.models import Entity, DreamParseResult, _new_id | |
| import config | |
| def _strip_thinking(text: str) -> str: | |
| """Remove <think>...</think> tags and other model artifacts.""" | |
| # Collect thinking content for fallback | |
| thinking = re.findall(r'<think>(.*?)</think>', text, re.DOTALL) | |
| text = re.sub(r'<think>.*?</think>\s*', '', text, flags=re.DOTALL) | |
| text = re.sub(r'<\|im_start\|>.*', '', text, flags=re.DOTALL) | |
| text = re.sub(r'<\|im_end\|>.*', '', text, flags=re.DOTALL) | |
| result = text.strip() | |
| # If nothing left, try to extract from thinking content | |
| if not result and thinking: | |
| content = thinking[-1].strip() | |
| lines = [l.strip() for l in content.split('\n') if l.strip()] | |
| # Skip reasoning lines | |
| skip = ['首先', '然后', '最后', '嗯', '想到', '考虑', '需要', '用户要求'] | |
| good = [l for l in lines if not any(l.startswith(w) for w in skip)] | |
| result = '\n'.join(good[-5:]) if good else '\n'.join(lines[-3:]) | |
| return result | |
def _parse_text_format(text: str) -> Optional[dict]:
    """Parse the labelled-line text format requested by the parser prompt.

    Each line is expected to look like ``<label>: <name> — <description>``,
    where the label selects the entity type:

        地点            -> "location"
        人物 / NPC / 角色 -> "npc"
        物品 / 道具 / 东西 -> "item"
        生物 / 怪物 / 精灵 / 龙 -> "creature"

    plus two free-form lines, ``情绪: a、b`` (emotions) and ``总结: ...``
    (summary). Both the ASCII colon and the fullwidth colon are accepted
    after a label, and the description separator may be an em dash, an en
    dash, or a hyphen.

    Args:
        text: Raw model output; reasoning artifacts are stripped first.

    Returns:
        A dict with keys ``"entities"`` (list of dicts with ``name``,
        ``type``, ``description``), ``"emotions"`` (list of str, possibly
        empty) and ``"summary"`` (str, possibly empty); or ``None`` when no
        entity line was found.
    """
    text = _strip_thinking(text)

    # Chinese output normally uses the fullwidth colon (U+FF1A); accept both.
    colon = r'[:\uFF1A]'
    # Name (lazy) followed by an optional "— description" tail on one line.
    name_and_desc = r'\s*(.+?)(?:\s*[—\-–]\s*(.+))?$'

    # (label alternation, entity type). Order matters: entities are emitted
    # grouped by type in exactly this order.
    label_groups = (
        (r'地点', "location"),
        (r'(?:人物|NPC|角色)', "npc"),
        (r'(?:物品|道具|东西)', "item"),
        (r'(?:生物|怪物|精灵|龙)', "creature"),
    )

    entities = []
    for labels, entity_type in label_groups:
        pattern = labels + colon + name_and_desc
        for match in re.finditer(pattern, text, re.MULTILINE):
            name = match.group(1).strip()
            # Single-character names are treated as noise.
            if len(name) <= 1:
                continue
            # A missing or blank description falls back to the name itself.
            desc = (match.group(2) or "").strip() or name
            entities.append(
                {"name": name, "type": entity_type, "description": desc}
            )

    emotions = []
    emo_match = re.search(r'情绪' + colon + r'\s*(.+)$', text, re.MULTILINE)
    if emo_match:
        # Split on the enumeration comma, ASCII/fullwidth commas, or spaces.
        parts = re.split(r'[、,\uFF0C\s]+', emo_match.group(1).strip())
        emotions = [part for part in parts if part]

    summary = ""
    sum_match = re.search(r'总结' + colon + r'\s*(.+)$', text, re.MULTILINE)
    if sum_match:
        summary = sum_match.group(1).strip()

    if not entities:
        return None
    return {"entities": entities, "emotions": emotions, "summary": summary}
def _fallback_entity_extraction(text: str, dream_text: str) -> list[Entity]:
    """Heuristically extract entities from free-form model output and the dream.

    Strategies, in order:

    1. Quoted names (straight/curly quotes or CJK brackets) in the model
       output, at most five, all typed as "location".
    2. Only if nothing was found so far: CJK runs ending in a place suffix
       (城, 森林, 塔, ...), at most three, typed as "location".
    3. CJK runs ending in a character suffix (者, 骑士, 少女, ...), at most
       two, typed as "npc".
    4. Only if still nothing was found: the phrase following "在" in the
       dream text, at most three, typed as "location".

    Names are de-duplicated in first-seen order, so the result is
    deterministic for a given input.

    Args:
        text: Raw model output (may be empty); reasoning artifacts are
            stripped before matching.
        dream_text: The user's dream description.

    Returns:
        A possibly empty list of ``Entity`` objects whose description equals
        their name.
    """
    text = _strip_thinking(text)
    # A newline between the two sources stops a CJK run from matching across
    # the seam between model output and dream text.
    combined = text + "\n" + dream_text

    entities: list[Entity] = []
    seen: set[str] = set()

    def _add(names, entity_type: str, limit: int) -> None:
        """Append up to *limit* not-yet-seen names, keeping first-seen order."""
        cleaned = (name.strip() for name in names)
        fresh = [
            name for name in dict.fromkeys(cleaned)
            if name and name not in seen
        ]
        for name in fresh[:limit]:
            seen.add(name)
            entities.append(
                Entity(name=name, entity_type=entity_type, description=name)
            )

    # Strategy 1: names inside quotes, as in JSON-like or quoted output.
    quoted = re.findall(
        r'["\u201C「『]([^"\u201D」』]{1,20})["\u201D」』]',
        text,
    )
    _add(quoted, "location", 5)

    # Strategy 2: place-like nouns, only when quoting produced nothing.
    if not entities:
        places = re.findall(
            r'([\u4e00-\u9fff]{2,6}(?:城|宫|殿|森林|湖|海|山|塔|岛|馆|桥|堡|村|镇|世界|之地|遗迹))',
            combined,
        )
        _add(places, "location", 3)

    # Strategy 3: character-like nouns.
    # NOTE(review): assumed to run regardless of earlier hits - confirm
    # against the intended nesting.
    characters = re.findall(
        r'([\u4e00-\u9fff]{2,6}(?:者|人|师|王|精灵|龙|骑士|法师|旅人|老人|少女|守护者))',
        combined,
    )
    _add(characters, "npc", 2)

    # Strategy 4: last resort - the phrase after "在 (一)(个/座)" in the dream.
    # The capture needs a quantifier: a single-character capture could never
    # satisfy the minimum length of two and made this strategy dead code.
    if not entities:
        keywords = re.findall(
            r'在一?[个座]?([^,\uFF0C。的和与遇到]{2,8})',
            dream_text,
        )
        _add((kw for kw in keywords if len(kw.strip()) >= 2), "location", 3)

    return entities
| def _extract_emotions_from_text(text: str, dream_text: str) -> list[str]: | |
| """Extract emotions from text or infer from dream content.""" | |
| # Common emotion words | |
| emotion_keywords = [ | |
| "恐惧", "害怕", "紧张", "焦虑", "兴奋", "快乐", "开心", "幸福", | |
| "悲伤", "孤独", "失落", "惊奇", "好奇", "神秘", "平静", "愤怒", | |
| "温暖", "自由", "困惑", "迷惘", "震撼", "敬畏", "温馨", "浪漫", | |
| ] | |
| found = [] | |
| combined = text + dream_text | |
| for emo in emotion_keywords: | |
| if emo in combined: | |
| found.append(emo) | |
| return found[:3] if found else ["神秘"] | |
def _build_parse_prompt(dream_text: str, lang: str = "zh") -> str:
    """Build the main dream-parsing prompt for the given language.

    Args:
        dream_text: The dream description to embed in the prompt.
        lang: Language code. ``"zh"`` selects the template stored in
            ``config.PARSE_DREAM_PROMPT_ZH``; any other value selects the
            inline English prompt.

    Returns:
        For ``"zh"``, ``config.PARSE_DREAM_PROMPT_ZH`` formatted with
        ``dream_text``. Otherwise an English prompt asking the model to
        answer strictly with a JSON object holding ``entities`` (name, type,
        description), ``emotions`` and ``summary``.
    """
    if lang == "zh":
        return config.PARSE_DREAM_PROMPT_ZH.format(dream_text=dream_text)
    else:
        # Doubled braces are literal braces inside the f-string; only
        # {dream_text} is interpolated.
        return f"""Analyze the following dream and extract key information as JSON.
Dream: {dream_text}
Output strictly in this JSON format (no extra text):
{{
"entities": [
{{"name": "entity name", "type": "location|npc|item|creature", "description": "brief description"}}
],
"emotions": ["emotion1", "emotion2"],
"summary": "one-sentence dream summary"
}}"""
def _build_simple_prompt(dream_text: str) -> str:
    """Build a simpler prompt for small models that may struggle with JSON.

    The Chinese prompt asks the model to list the locations and characters
    of the dream, one per line, embeds ``dream_text``, and ends with an open
    "1." list item under the location heading (presumably so the model
    continues the list - confirm against the model's behavior).

    Args:
        dream_text: The dream description to embed in the prompt.

    Returns:
        The prompt string.
    """
    return f"""请列出这个梦境中的地点和人物,每行一个:
梦境:{dream_text}
地点:
1."""
def _parse_json_format(text: str) -> Optional[dict]:
    """Best-effort parse of a JSON answer with entities, emotions and summary.

    Returns a dict shaped like the result of ``_parse_text_format`` or
    ``None`` when no usable JSON object with at least one named entity is
    present in *text*.
    """
    import json  # local import keeps this block self-contained

    cleaned = _strip_thinking(text)
    # Models often wrap JSON in prose or code fences; take the outermost braces.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        payload = json.loads(cleaned[start:end + 1])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(payload, dict):
        return None
    raw_entities = payload.get("entities")
    if not isinstance(raw_entities, list):
        return None

    known_types = {"location", "npc", "item", "creature"}
    entities = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue
        name = str(item.get("name") or "").strip()
        if not name:
            continue
        entity_type = str(item.get("type") or "").strip()
        entities.append({
            "name": name,
            # An echoed template value such as "location|npc|..." is not a type.
            "type": entity_type if entity_type in known_types else "location",
            "description": str(item.get("description") or name).strip(),
        })
    if not entities:
        return None

    raw_emotions = payload.get("emotions")
    emotions = []
    if isinstance(raw_emotions, list):
        emotions = [str(e).strip() for e in raw_emotions if str(e).strip()]
    raw_summary = payload.get("summary")
    summary = raw_summary.strip() if isinstance(raw_summary, str) else ""
    return {"entities": entities, "emotions": emotions, "summary": summary}


def parse_dream(
    dream_text: str,
    generate_fn: callable,
    max_retries: int = 2,
) -> DreamParseResult:
    """Parse a dream description into structured entities and emotions.

    The function never raises on model failures; it degrades through three
    stages:

    1. Ask the model with the main parse prompt (up to ``max_retries``
       times) and parse the answer as JSON or, failing that, as the
       labelled-line text format.
    2. Ask the model with the simple list prompt and extract entities
       heuristically from the answer and the dream text.
    3. Extract entities from the dream text alone; if even that yields
       nothing, return a single generic location entity.

    Args:
        dream_text: Raw dream description from the user. Only the first
            1024 characters are used.
        generate_fn: Function that takes a prompt string and returns the
            model output text.
        max_retries: Number of stage-1 attempts; ``0`` skips stage 1.

    Returns:
        A ``DreamParseResult``. For input that is empty or shorter than five
        non-blank characters, the result has no entities or emotions and a
        summary stating that the dream is too short to parse.
    """
    import logging  # local import keeps this block self-contained

    logger = logging.getLogger(__name__)

    if not dream_text or len(dream_text.strip()) < 5:
        return DreamParseResult(
            entities=[],
            emotions=[],
            summary="梦境太短,无法解析",
        )

    dream_text = dream_text[:1024]
    lang = config.detect_language(dream_text)

    # --- Stage 1: structured extraction (JSON first, then labelled lines) ---
    prompt = _build_parse_prompt(dream_text, lang)
    for attempt in range(max_retries):
        try:
            raw_output = generate_fn(prompt)
            data = _parse_json_format(raw_output) or _parse_text_format(raw_output)
            if data and data.get("entities"):
                entities = [
                    Entity(
                        name=item.get("name", "未知"),
                        entity_type=item.get("type", "location"),
                        description=item.get("description", ""),
                    )
                    for item in data["entities"]
                ]
                return DreamParseResult(
                    entities=entities,
                    # The parsers always set these keys, possibly to empty
                    # values, so fall back on falsiness rather than absence.
                    emotions=data.get("emotions") or ["神秘"],
                    summary=data.get("summary")
                    or dream_text[:50].replace("\n", " "),
                )
        except Exception:
            # Deliberately best-effort: any model or parsing failure just
            # moves on to the next attempt or stage.
            logger.debug(
                "Structured dream parse attempt %d failed",
                attempt + 1,
                exc_info=True,
            )
            continue

    # --- Stage 2: simple list prompt plus heuristic extraction ---
    try:
        simple_prompt = _build_simple_prompt(dream_text)
        raw_output = generate_fn(simple_prompt)
        entities = _fallback_entity_extraction(raw_output, dream_text)
        if entities:
            return DreamParseResult(
                entities=entities,
                emotions=_extract_emotions_from_text(raw_output, dream_text),
                summary=dream_text[:60].replace("\n", " "),
            )
    except Exception:
        logger.debug("Simple-prompt dream parse failed", exc_info=True)

    # --- Stage 3: heuristics on the dream text alone ---
    entities = _fallback_entity_extraction("", dream_text)
    if not entities:
        # Absolute last resort: one generic location describing the dream.
        entities = [Entity(
            name="梦境之地",
            entity_type="location",
            description=dream_text[:100],
        )]
    return DreamParseResult(
        entities=entities,
        emotions=_extract_emotions_from_text("", dream_text),
        summary=dream_text[:60].replace("\n", " "),
    )