Spaces:

build-small-hackathon
/

dreamland

Running on Zero

panda-lsy

fix: robust thinking tag handling for MiniCPM5-1B

2e43ba3 21 days ago

9.63 kB

	"""Dream text parser: extracts entities and emotions from dream descriptions.

	Supports both structured JSON output (8B model) and free-text fallback (1B model).
	"""

	import re
	from typing import Optional

	from core.models import Entity, DreamParseResult, _new_id
	import config


	def _strip_thinking(text: str) -> str:
	"""Remove <think>...</think> tags and other model artifacts."""
	# Collect thinking content for fallback
	thinking = re.findall(r'<think>(.*?)</think>', text, re.DOTALL)
	text = re.sub(r'<think>.?</think>\s', '', text, flags=re.DOTALL)
	text = re.sub(r'<\\|im_start\\|>.*', '', text, flags=re.DOTALL)
	text = re.sub(r'<\\|im_end\\|>.*', '', text, flags=re.DOTALL)
	result = text.strip()

	# If nothing left, try to extract from thinking content
	if not result and thinking:
	content = thinking[-1].strip()
	lines = [l.strip() for l in content.split('\n') if l.strip()]
	# Skip reasoning lines
	skip = ['首先', '然后', '最后', '嗯', '想到', '考虑', '需要', '用户要求']
	good = [l for l in lines if not any(l.startswith(w) for w in skip)]
	result = '\n'.join(good[-5:]) if good else '\n'.join(lines[-3:])

	return result


	def _parse_text_format(text: str) -> Optional[dict]:
	"""Parse the text-based format from the parser prompt.

	Expected format:
	地点：[名称] — [描述]
	人物：[名称] — [描述]
	情绪：[情绪1]、[情绪2]
	总结：[一句话总结]
	"""
	text = _strip_thinking(text)
	entities = []
	emotions = []
	summary = ""

	# Extract locations
	for match in re.finditer(r'地点[：:]\s(.+?)(?:\s[—\-–]\s*(.+))?$', text, re.MULTILINE):
	name = match.group(1).strip()
	desc = (match.group(2) or name).strip()
	if name and len(name) > 1:
	entities.append({"name": name, "type": "location", "description": desc})

	# Extract NPCs/characters
	for match in re.finditer(r'(?:人物\|NPC\|角色)[：:]\s(.+?)(?:\s[—\-–]\s*(.+))?$', text, re.MULTILINE):
	name = match.group(1).strip()
	desc = (match.group(2) or name).strip()
	if name and len(name) > 1:
	entities.append({"name": name, "type": "npc", "description": desc})

	# Extract items
	for match in re.finditer(r'(?:物品\|道具\|东西)[：:]\s(.+?)(?:\s[—\-–]\s*(.+))?$', text, re.MULTILINE):
	name = match.group(1).strip()
	desc = (match.group(2) or name).strip()
	if name and len(name) > 1:
	entities.append({"name": name, "type": "item", "description": desc})

	# Extract creatures
	for match in re.finditer(r'(?:生物\|怪物\|精灵\|龙)[：:]\s(.+?)(?:\s[—\-–]\s*(.+))?$', text, re.MULTILINE):
	name = match.group(1).strip()
	desc = (match.group(2) or name).strip()
	if name and len(name) > 1:
	entities.append({"name": name, "type": "creature", "description": desc})

	# Extract emotions
	emo_match = re.search(r'情绪[：:]\s*(.+)$', text, re.MULTILINE)
	if emo_match:
	raw = emo_match.group(1).strip()
	emotions = [e.strip() for e in re.split(r'[、，,，\s]+', raw) if e.strip()]

	# Extract summary
	sum_match = re.search(r'总结[：:]\s*(.+)$', text, re.MULTILINE)
	if sum_match:
	summary = sum_match.group(1).strip()

	if entities:
	return {"entities": entities, "emotions": emotions, "summary": summary}
	return None


	def _fallback_entity_extraction(text: str, dream_text: str) -> list[Entity]:
	"""Smart regex-based entity extraction from natural language output."""
	text = _strip_thinking(text)
	entities = []

	# Strategy 1: Extract from JSON-like patterns
	names_in_quotes = re.findall(r'[""「『]([^""」』]{1,20})[""」』]', text)
	for name in names_in_quotes[:5]:
	entities.append(Entity(name=name, entity_type="location", description=name))

	# Strategy 2: Extract from Chinese noun patterns (XX的YY, XX城/森林/宫殿 etc.)
	if not entities:
	location_patterns = re.findall(
	r'([一-鿿]{2,6}(?:城\|宫\|殿\|森林\|湖\|海\|山\|塔\|岛\|馆\|桥\|堡\|村\|镇\|世界\|之地\|遗迹))',
	text + dream_text,
	)
	for name in list(set(location_patterns))[:3]:
	entities.append(Entity(name=name, entity_type="location", description=name))

	# Strategy 3: Extract character-like patterns (XX者/人/师/王/精灵)
	char_patterns = re.findall(
	r'([一-鿿]{2,6}(?:者\|人\|师\|王\|精灵\|龙\|骑士\|法师\|旅人\|老人\|少女\|守护者))',
	text + dream_text,
	)
	for name in list(set(char_patterns))[:2]:
	entities.append(Entity(name=name, entity_type="npc", description=name))

	# Strategy 4: Last resort - extract key nouns from dream text
	if not entities:
	# Simple keyword extraction: words between common delimiters
	keywords = re.findall(r'在[一]?[个座]?([^，。的和与遇到])', dream_text)
	for kw in keywords[:3]:
	kw = kw.strip()
	if len(kw) >= 2:
	entities.append(Entity(name=kw, entity_type="location", description=kw))

	return entities


	def _extract_emotions_from_text(text: str, dream_text: str) -> list[str]:
	"""Extract emotions from text or infer from dream content."""
	# Common emotion words
	emotion_keywords = [
	"恐惧", "害怕", "紧张", "焦虑", "兴奋", "快乐", "开心", "幸福",
	"悲伤", "孤独", "失落", "惊奇", "好奇", "神秘", "平静", "愤怒",
	"温暖", "自由", "困惑", "迷惘", "震撼", "敬畏", "温馨", "浪漫",
	]
	found = []
	combined = text + dream_text
	for emo in emotion_keywords:
	if emo in combined:
	found.append(emo)

	return found[:3] if found else ["神秘"]


	def _build_parse_prompt(dream_text: str, lang: str = "zh") -> str:
	"""Build the prompt for dream parsing."""
	if lang == "zh":
	return config.PARSE_DREAM_PROMPT_ZH.format(dream_text=dream_text)
	else:
	return f"""Analyze the following dream and extract key information as JSON.

	Dream: {dream_text}

	Output strictly in this JSON format (no extra text):
	{{
	"entities": [
	{{"name": "entity name", "type": "location\|npc\|item\|creature", "description": "brief description"}}
	],
	"emotions": ["emotion1", "emotion2"],
	"summary": "one-sentence dream summary"
	}}"""


	def _build_simple_prompt(dream_text: str) -> str:
	"""Build a simpler prompt for small models that may struggle with JSON."""
	return f"""请列出这个梦境中的地点和人物，每行一个：

	梦境：{dream_text}

	地点：
	1."""


	def parse_dream(
	dream_text: str,
	generate_fn: callable,
	max_retries: int = 2,
	) -> DreamParseResult:
	"""Parse a dream description into structured entities and emotions.

	Tries structured JSON extraction first, falls back to plain-text extraction.

	Args:
	dream_text: Raw dream description from user.
	generate_fn: Function that takes a prompt string and returns model output text.
	max_retries: Number of attempts to get valid JSON from the model.

	Returns:
	DreamParseResult with entities, emotions, and summary.
	"""
	if not dream_text or len(dream_text.strip()) < 5:
	return DreamParseResult(
	entities=[],
	emotions=[],
	summary="梦境太短，无法解析"
	)

	dream_text = dream_text[:1024]
	lang = config.detect_language(dream_text)

	# --- Attempt 1: Text-based format extraction ---
	prompt = _build_parse_prompt(dream_text, lang)
	for _ in range(max_retries):
	try:
	raw_output = generate_fn(prompt)
	data = _parse_text_format(raw_output)

	if data and data.get("entities"):
	entities = []
	for item in data["entities"]:
	entities.append(Entity(
	name=item.get("name", "未知"),
	entity_type=item.get("type", "location"),
	description=item.get("description", ""),
	))

	emotions = data.get("emotions", ["神秘"])
	return DreamParseResult(
	entities=entities,
	emotions=emotions,
	summary=data.get("summary", dream_text[:50]),
	)
	except Exception:
	continue

	# --- Attempt 2: Simple list-based extraction ---
	try:
	simple_prompt = _build_simple_prompt(dream_text)
	raw_output = generate_fn(simple_prompt)
	entities = _fallback_entity_extraction(raw_output, dream_text)

	if entities:
	emotions = _extract_emotions_from_text(raw_output, dream_text)
	summary = dream_text[:60].replace("\n", " ")
	return DreamParseResult(
	entities=entities,
	emotions=emotions,
	summary=summary,
	)
	except Exception:
	pass

	# --- Attempt 3: Pure regex extraction from dream text ---
	entities = _fallback_entity_extraction("", dream_text)
	if not entities:
	# Absolute last resort: create a single location from the whole dream
	entities = [Entity(
	name="梦境之地",
	entity_type="location",
	description=dream_text[:100],
	)]

	return DreamParseResult(
	entities=entities,
	emotions=_extract_emotions_from_text("", dream_text),
	summary=dream_text[:60].replace("\n", " "),
	)