"""Dream text parser: extracts entities and emotions from dream descriptions.
Supports both structured JSON output (8B model) and free-text fallback (1B model).
"""
import re
from typing import Optional
from core.models import Entity, DreamParseResult, _new_id
import config
def _strip_thinking(text: str) -> str:
"""Remove ... tags and other model artifacts."""
# Collect thinking content for fallback
thinking = re.findall(r'(.*?)', text, re.DOTALL)
text = re.sub(r'.*?\s*', '', text, flags=re.DOTALL)
text = re.sub(r'<\|im_start\|>.*', '', text, flags=re.DOTALL)
text = re.sub(r'<\|im_end\|>.*', '', text, flags=re.DOTALL)
result = text.strip()
# If nothing left, try to extract from thinking content
if not result and thinking:
content = thinking[-1].strip()
lines = [l.strip() for l in content.split('\n') if l.strip()]
# Skip reasoning lines
skip = ['首先', '然后', '最后', '嗯', '想到', '考虑', '需要', '用户要求']
good = [l for l in lines if not any(l.startswith(w) for w in skip)]
result = '\n'.join(good[-5:]) if good else '\n'.join(lines[-3:])
return result
def _parse_text_format(text: str) -> Optional[dict]:
    """Parse the labelled-line format requested by the parser prompt.

    Expected format (one item per line, description optional)::

        地点:[name] — [description]
        人物:[name] — [description]
        情绪:[emotion1]、[emotion2]
        总结:[one-sentence summary]

    The label may be followed by an ASCII colon or a full-width colon
    (U+FF1A). Name and description are separated by an em dash, en dash or
    hyphen; when no description is given the name is reused as description.

    Args:
        text: Raw model output; think blocks and chat-template artifacts are
            removed via ``_strip_thinking`` before parsing.

    Returns:
        ``{"entities": [...], "emotions": [...], "summary": str}`` where each
        entity is ``{"name", "type", "description"}``, or ``None`` when no
        entity line was found.
    """
    text = _strip_thinking(text)

    # Shared tail of every entity pattern: colon, lazy name, optional
    # "dash + description", end of line. \uff1a is the full-width colon.
    tail = r'[:\uff1a]\s*(.+?)(?:\s*[—\-–]\s*(.+))?$'
    # Order matters: entities are emitted grouped by type in this order.
    label_groups = (
        ("location", r'地点'),
        ("npc", r'(?:人物|NPC|角色)'),
        ("item", r'(?:物品|道具|东西)'),
        ("creature", r'(?:生物|怪物|精灵|龙)'),
    )

    entities = []
    for entity_type, labels in label_groups:
        for match in re.finditer(labels + tail, text, re.MULTILINE):
            name = match.group(1).strip()
            desc = (match.group(2) or name).strip()
            # Single-character names are discarded.
            if len(name) > 1:
                entities.append(
                    {"name": name, "type": entity_type, "description": desc}
                )

    # Only the first emotion line is used. Separators: ideographic comma,
    # ASCII comma, full-width comma (\uff0c) or whitespace.
    emotions = []
    emo_match = re.search(r'情绪[:\uff1a]\s*(.+)$', text, re.MULTILINE)
    if emo_match:
        raw = emo_match.group(1).strip()
        emotions = [e.strip() for e in re.split(r'[、,\uff0c\s]+', raw) if e.strip()]

    summary = ""
    sum_match = re.search(r'总结[:\uff1a]\s*(.+)$', text, re.MULTILINE)
    if sum_match:
        summary = sum_match.group(1).strip()

    # Emotions and summary alone do not count as a successful parse.
    if entities:
        return {"entities": entities, "emotions": emotions, "summary": summary}
    return None
def _fallback_entity_extraction(text: str, dream_text: str) -> list[Entity]:
    """Extract entities from free-form output with progressively looser regexes.

    Strategies, each tried only while no entity has been found yet:

    1. Quoted names in the model output (up to 5, all typed ``location``).
    2. Place-like and 3. character-like CJK words found in the model output
       plus the dream text (up to 3 locations and 2 NPCs).
    4. Short phrases following ``在`` in the dream text (up to 3 locations).

    Args:
        text: Raw model output; cleaned with ``_strip_thinking`` first.
        dream_text: The user's original dream description.

    Returns:
        A possibly empty list of ``Entity`` objects whose description equals
        their name.
    """
    text = _strip_thinking(text)
    entities = []

    # Strategy 1: names wrapped in straight quotes, curly quotes
    # (\u201c / \u201d) or CJK corner brackets.
    names_in_quotes = re.findall(
        r'["\u201c「『]([^"\u201c\u201d」』]{1,20})["\u201d」』]', text
    )
    for name in names_in_quotes[:5]:
        entities.append(Entity(name=name, entity_type="location", description=name))

    if not entities:
        combined = text + dream_text

        # Strategy 2: 2-6 CJK characters followed by a place suffix.
        location_patterns = re.findall(
            r'([\u4e00-\u9fff]{2,6}(?:城|宫|殿|森林|湖|海|山|塔|岛|馆|桥|堡|村|镇|世界|之地|遗迹))',
            combined,
        )
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # chosen subset is the same on every run (set order is not).
        for name in list(dict.fromkeys(location_patterns))[:3]:
            entities.append(Entity(name=name, entity_type="location", description=name))

        # Strategy 3: 2-6 CJK characters followed by a character suffix.
        char_patterns = re.findall(
            r'([\u4e00-\u9fff]{2,6}(?:者|人|师|王|精灵|龙|骑士|法师|旅人|老人|少女|守护者))',
            combined,
        )
        for name in list(dict.fromkeys(char_patterns))[:2]:
            entities.append(Entity(name=name, entity_type="npc", description=name))

    # Strategy 4: last resort - the 2-6 characters after "在" (optionally
    # "在一个" / "在一座"), stopping at punctuation or a common connective.
    if not entities:
        keywords = re.findall(
            r'在一?[个座]?([^,\uff0c。的和与遇到]{2,6})', dream_text
        )
        for kw in keywords[:3]:
            kw = kw.strip()
            if len(kw) >= 2:
                entities.append(Entity(name=kw, entity_type="location", description=kw))

    return entities
def _extract_emotions_from_text(text: str, dream_text: str) -> list[str]:
"""Extract emotions from text or infer from dream content."""
# Common emotion words
emotion_keywords = [
"恐惧", "害怕", "紧张", "焦虑", "兴奋", "快乐", "开心", "幸福",
"悲伤", "孤独", "失落", "惊奇", "好奇", "神秘", "平静", "愤怒",
"温暖", "自由", "困惑", "迷惘", "震撼", "敬畏", "温馨", "浪漫",
]
found = []
combined = text + dream_text
for emo in emotion_keywords:
if emo in combined:
found.append(emo)
return found[:3] if found else ["神秘"]
def _build_parse_prompt(dream_text: str, lang: str = "zh") -> str:
"""Build the prompt for dream parsing."""
if lang == "zh":
return config.PARSE_DREAM_PROMPT_ZH.format(dream_text=dream_text)
else:
return f"""Analyze the following dream and extract key information as JSON.
Dream: {dream_text}
Output strictly in this JSON format (no extra text):
{{
"entities": [
{{"name": "entity name", "type": "location|npc|item|creature", "description": "brief description"}}
],
"emotions": ["emotion1", "emotion2"],
"summary": "one-sentence dream summary"
}}"""
def _build_simple_prompt(dream_text: str) -> str:
"""Build a simpler prompt for small models that may struggle with JSON."""
return f"""请列出这个梦境中的地点和人物,每行一个:
梦境:{dream_text}
地点:
1."""
def _parse_json_format(raw_output: str) -> Optional[dict]:
    """Parse a JSON-object reply into the dict shape of ``_parse_text_format``.

    The span from the first ``{`` to the last ``}`` of the cleaned output is
    decoded. Entities without a name are dropped, unknown entity types fall
    back to ``"location"``, and a missing description reuses the name.

    Returns:
        ``{"entities": [...], "emotions": [...], "summary": str}``, or
        ``None`` when the output holds no decodable JSON object with at least
        one usable entity.
    """
    import json  # local import keeps this helper self-contained

    cleaned = _strip_thinking(raw_output)
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        payload = json.loads(cleaned[start:end + 1])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(payload, dict):
        return None

    raw_entities = payload.get("entities")
    if not isinstance(raw_entities, list):
        return None

    known_types = {"location", "npc", "item", "creature"}
    entities = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue
        name = str(item.get("name") or "").strip()
        if not name:
            continue
        entity_type = str(item.get("type") or "").strip().lower()
        if entity_type not in known_types:
            entity_type = "location"
        description = str(item.get("description") or name).strip()
        entities.append(
            {"name": name, "type": entity_type, "description": description}
        )
    if not entities:
        return None

    raw_emotions = payload.get("emotions")
    emotions = []
    if isinstance(raw_emotions, list):
        emotions = [e.strip() for e in raw_emotions if isinstance(e, str) and e.strip()]

    raw_summary = payload.get("summary")
    summary = raw_summary.strip() if isinstance(raw_summary, str) else ""
    return {"entities": entities, "emotions": emotions, "summary": summary}


def parse_dream(
    dream_text: str,
    generate_fn: callable,
    max_retries: int = 2,
) -> DreamParseResult:
    """Parse a dream description into structured entities and emotions.

    Three stages are tried in order; the first one that yields entities wins:

    1. Structured prompt: the reply is parsed as JSON, then as the
       labelled-line text format. Repeated up to ``max_retries`` times.
    2. Simple list prompt: entities are pulled from the reply with regexes.
    3. No model call: regexes are applied to the dream text alone; if even
       that finds nothing, a single placeholder location is returned.

    Model or parsing errors in stages 1 and 2 are logged at debug level and
    never propagate.

    Args:
        dream_text: Raw dream description from the user. Only the first 1024
            characters are used.
        generate_fn: Function that takes a prompt string and returns the
            model's output text.
        max_retries: Number of stage-1 attempts; ``0`` skips stage 1.

    Returns:
        DreamParseResult with entities, emotions, and summary. Input that is
        empty or shorter than 5 characters after stripping yields an empty
        result with an explanatory summary.
    """
    import logging  # local import: the module header does not import logging

    logger = logging.getLogger(__name__)

    if not dream_text or len(dream_text.strip()) < 5:
        return DreamParseResult(
            entities=[],
            emotions=[],
            summary="梦境太短,无法解析",
        )

    dream_text = dream_text[:1024]
    lang = config.detect_language(dream_text)
    # Used whenever the model supplies no summary of its own.
    fallback_summary = dream_text[:60].replace("\n", " ")

    # --- Stage 1: structured prompt (JSON first, then labelled text) ---
    prompt = _build_parse_prompt(dream_text, lang)
    for attempt in range(max_retries):
        try:
            raw_output = generate_fn(prompt)
            data = _parse_json_format(raw_output) or _parse_text_format(raw_output)
            if data and data.get("entities"):
                entities = [
                    Entity(
                        name=item.get("name", "未知"),
                        entity_type=item.get("type", "location"),
                        description=item.get("description", ""),
                    )
                    for item in data["entities"]
                ]
                # The parsers always set these keys, possibly to empty
                # values, so fall back with `or` rather than a .get default.
                emotions = data.get("emotions") or _extract_emotions_from_text(
                    raw_output, dream_text
                )
                return DreamParseResult(
                    entities=entities,
                    emotions=emotions,
                    summary=data.get("summary") or fallback_summary,
                )
        except Exception:
            # Best effort: a failed attempt must not abort parsing.
            logger.debug(
                "Structured dream parse attempt %d failed", attempt + 1, exc_info=True
            )
            continue

    # --- Stage 2: simple list prompt + regex extraction ---
    try:
        simple_prompt = _build_simple_prompt(dream_text)
        raw_output = generate_fn(simple_prompt)
        entities = _fallback_entity_extraction(raw_output, dream_text)
        if entities:
            return DreamParseResult(
                entities=entities,
                emotions=_extract_emotions_from_text(raw_output, dream_text),
                summary=fallback_summary,
            )
    except Exception:
        logger.debug("Simple-prompt dream parse failed", exc_info=True)

    # --- Stage 3: pure regex extraction from the dream text ---
    entities = _fallback_entity_extraction("", dream_text)
    if not entities:
        # Absolute last resort: one location standing in for the whole dream.
        entities = [Entity(
            name="梦境之地",
            entity_type="location",
            description=dream_text[:100],
        )]
    return DreamParseResult(
        entities=entities,
        emotions=_extract_emotions_from_text("", dream_text),
        summary=fallback_summary,
    )