"""Dream text parser: extracts entities and emotions from dream descriptions. Supports both structured JSON output (8B model) and free-text fallback (1B model). """ import re from typing import Optional from core.models import Entity, DreamParseResult, _new_id import config def _strip_thinking(text: str) -> str: """Remove ... tags and other model artifacts.""" # Collect thinking content for fallback thinking = re.findall(r'(.*?)', text, re.DOTALL) text = re.sub(r'.*?\s*', '', text, flags=re.DOTALL) text = re.sub(r'<\|im_start\|>.*', '', text, flags=re.DOTALL) text = re.sub(r'<\|im_end\|>.*', '', text, flags=re.DOTALL) result = text.strip() # If nothing left, try to extract from thinking content if not result and thinking: content = thinking[-1].strip() lines = [l.strip() for l in content.split('\n') if l.strip()] # Skip reasoning lines skip = ['首先', '然后', '最后', '嗯', '想到', '考虑', '需要', '用户要求'] good = [l for l in lines if not any(l.startswith(w) for w in skip)] result = '\n'.join(good[-5:]) if good else '\n'.join(lines[-3:]) return result def _parse_text_format(text: str) -> Optional[dict]: """Parse the text-based format from the parser prompt. Expected format: 地点:[名称] — [描述] 人物:[名称] — [描述] 情绪:[情绪1]、[情绪2] 总结:[一句话总结] """ text = _strip_thinking(text) entities = [] emotions = [] summary = "" # Extract locations for match in re.finditer(r'地点[::]\s*(.+?)(?:\s*[—\-–]\s*(.+))?$', text, re.MULTILINE): name = match.group(1).strip() desc = (match.group(2) or name).strip() if name and len(name) > 1: entities.append({"name": name, "type": "location", "description": desc}) # Extract NPCs/characters for match in re.finditer(r'(?:人物|NPC|角色)[::]\s*(.+?)(?:\s*[—\-–]\s*(.+))?$', text, re.MULTILINE): name = match.group(1).strip() desc = (match.group(2) or name).strip() if name and len(name) > 1: entities.append({"name": name, "type": "npc", "description": desc}) # Extract items for match in re.finditer(r'(?:物品|道具|东西)[::]\s*(.+?)(?:\s*[—\-–]\s*(.+))?$', text, re.MULTILINE): name = match.group(1).strip() desc = (match.group(2) or name).strip() if name and len(name) > 1: entities.append({"name": name, "type": "item", "description": desc}) # Extract creatures for match in re.finditer(r'(?:生物|怪物|精灵|龙)[::]\s*(.+?)(?:\s*[—\-–]\s*(.+))?$', text, re.MULTILINE): name = match.group(1).strip() desc = (match.group(2) or name).strip() if name and len(name) > 1: entities.append({"name": name, "type": "creature", "description": desc}) # Extract emotions emo_match = re.search(r'情绪[::]\s*(.+)$', text, re.MULTILINE) if emo_match: raw = emo_match.group(1).strip() emotions = [e.strip() for e in re.split(r'[、,,,\s]+', raw) if e.strip()] # Extract summary sum_match = re.search(r'总结[::]\s*(.+)$', text, re.MULTILINE) if sum_match: summary = sum_match.group(1).strip() if entities: return {"entities": entities, "emotions": emotions, "summary": summary} return None def _fallback_entity_extraction(text: str, dream_text: str) -> list[Entity]: """Smart regex-based entity extraction from natural language output.""" text = _strip_thinking(text) entities = [] # Strategy 1: Extract from JSON-like patterns names_in_quotes = re.findall(r'[""「『]([^""」』]{1,20})[""」』]', text) for name in names_in_quotes[:5]: entities.append(Entity(name=name, entity_type="location", description=name)) # Strategy 2: Extract from Chinese noun patterns (XX的YY, XX城/森林/宫殿 etc.) if not entities: location_patterns = re.findall( r'([一-鿿]{2,6}(?:城|宫|殿|森林|湖|海|山|塔|岛|馆|桥|堡|村|镇|世界|之地|遗迹))', text + dream_text, ) for name in list(set(location_patterns))[:3]: entities.append(Entity(name=name, entity_type="location", description=name)) # Strategy 3: Extract character-like patterns (XX者/人/师/王/精灵) char_patterns = re.findall( r'([一-鿿]{2,6}(?:者|人|师|王|精灵|龙|骑士|法师|旅人|老人|少女|守护者))', text + dream_text, ) for name in list(set(char_patterns))[:2]: entities.append(Entity(name=name, entity_type="npc", description=name)) # Strategy 4: Last resort - extract key nouns from dream text if not entities: # Simple keyword extraction: words between common delimiters keywords = re.findall(r'在[一]?[个座]?([^,。的和与遇到])', dream_text) for kw in keywords[:3]: kw = kw.strip() if len(kw) >= 2: entities.append(Entity(name=kw, entity_type="location", description=kw)) return entities def _extract_emotions_from_text(text: str, dream_text: str) -> list[str]: """Extract emotions from text or infer from dream content.""" # Common emotion words emotion_keywords = [ "恐惧", "害怕", "紧张", "焦虑", "兴奋", "快乐", "开心", "幸福", "悲伤", "孤独", "失落", "惊奇", "好奇", "神秘", "平静", "愤怒", "温暖", "自由", "困惑", "迷惘", "震撼", "敬畏", "温馨", "浪漫", ] found = [] combined = text + dream_text for emo in emotion_keywords: if emo in combined: found.append(emo) return found[:3] if found else ["神秘"] def _build_parse_prompt(dream_text: str, lang: str = "zh") -> str: """Build the prompt for dream parsing.""" if lang == "zh": return config.PARSE_DREAM_PROMPT_ZH.format(dream_text=dream_text) else: return f"""Analyze the following dream and extract key information as JSON. Dream: {dream_text} Output strictly in this JSON format (no extra text): {{ "entities": [ {{"name": "entity name", "type": "location|npc|item|creature", "description": "brief description"}} ], "emotions": ["emotion1", "emotion2"], "summary": "one-sentence dream summary" }}""" def _build_simple_prompt(dream_text: str) -> str: """Build a simpler prompt for small models that may struggle with JSON.""" return f"""请列出这个梦境中的地点和人物,每行一个: 梦境:{dream_text} 地点: 1.""" def parse_dream( dream_text: str, generate_fn: callable, max_retries: int = 2, ) -> DreamParseResult: """Parse a dream description into structured entities and emotions. Tries structured JSON extraction first, falls back to plain-text extraction. Args: dream_text: Raw dream description from user. generate_fn: Function that takes a prompt string and returns model output text. max_retries: Number of attempts to get valid JSON from the model. Returns: DreamParseResult with entities, emotions, and summary. """ if not dream_text or len(dream_text.strip()) < 5: return DreamParseResult( entities=[], emotions=[], summary="梦境太短,无法解析" ) dream_text = dream_text[:1024] lang = config.detect_language(dream_text) # --- Attempt 1: Text-based format extraction --- prompt = _build_parse_prompt(dream_text, lang) for _ in range(max_retries): try: raw_output = generate_fn(prompt) data = _parse_text_format(raw_output) if data and data.get("entities"): entities = [] for item in data["entities"]: entities.append(Entity( name=item.get("name", "未知"), entity_type=item.get("type", "location"), description=item.get("description", ""), )) emotions = data.get("emotions", ["神秘"]) return DreamParseResult( entities=entities, emotions=emotions, summary=data.get("summary", dream_text[:50]), ) except Exception: continue # --- Attempt 2: Simple list-based extraction --- try: simple_prompt = _build_simple_prompt(dream_text) raw_output = generate_fn(simple_prompt) entities = _fallback_entity_extraction(raw_output, dream_text) if entities: emotions = _extract_emotions_from_text(raw_output, dream_text) summary = dream_text[:60].replace("\n", " ") return DreamParseResult( entities=entities, emotions=emotions, summary=summary, ) except Exception: pass # --- Attempt 3: Pure regex extraction from dream text --- entities = _fallback_entity_extraction("", dream_text) if not entities: # Absolute last resort: create a single location from the whole dream entities = [Entity( name="梦境之地", entity_type="location", description=dream_text[:100], )] return DreamParseResult( entities=entities, emotions=_extract_emotions_from_text("", dream_text), summary=dream_text[:60].replace("\n", " "), )