File size: 3,141 Bytes
dff25f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
LLM output parsing utilities for the persona engine.

Parses structured output from the single-pass Actor:
  【内心独白】 → monologue
  【最终回复】 → reply
  【表达方式】 → modality (语音/文字/照片/…)

Supports both Chinese and English section headers.
"""

from __future__ import annotations

import re


# -- Modality Parsing --
# No hardcoded map — registered SKILLs are the source of truth.
# Parser only extracts the raw modality keyword from LLM output.

def _parse_modality(raw: str) -> str:
    """Extract primary modality keyword from Actor output.

    Returns the first token before any punctuation/space.
    Skill engine decides if it maps to a registered skill.
    """
    import re
    cleaned = raw.strip().lstrip("\uff1a: \n")
    # Take first token before punctuation (。.,,、/ or whitespace)
    match = re.match(r'[\w\u4e00-\u9fff]+', cleaned)
    return match.group(0) if match else "文字"


# -- Section header regex: Chinese 【】 and English [] formats --
_SECTION_RE = re.compile(
    r'(?:【(?P<zh>内心独白|最终回复|表达方式)】'
    r'|\[(?P<en>Inner Monologue|Final Reply|Expression Mode)\])'
)
_TAG_MAP = {
    '内心独白': 'monologue', 'Inner Monologue': 'monologue',
    '最终回复': 'reply',     'Final Reply': 'reply',
    '表达方式': 'modality',  'Expression Mode': 'modality',
}


# Scratchpad emitted by reasoning models; DOTALL lets it span several lines.
_THINK_RE = re.compile(r'<think>.*?</think>', re.DOTALL)
# Action tag wrapped in asterisks: ASCII "*" or fullwidth U+FF0A.
_STAR_ACTION_RE = re.compile(r'[*\uff0a][^*\uff0a]+[*\uff0a]')
# Short (1-40 chars), non-nested parenthetical such as a stage direction.
# Accepts ASCII "()" as well as fullwidth U+FF08 / U+FF09 parentheses.
_SHORT_PAREN_RE = re.compile(r'[(\uff08][^()\uff08\uff09]{1,40}[)\uff09]')
# Parenthetical of any length (including empty); used on the fallback path.
_ANY_PAREN_RE = re.compile(r'[(\uff08][^)\uff09]*[)\uff09]')
# Case-folded modality keywords meaning the Actor chose not to speak.
_SILENCE_KEYWORDS = frozenset({"静默", "silence"})


def extract_reply(raw: str) -> tuple[str, str, str]:
    """Split raw Actor output into ``(monologue, reply, modality)``.

    Section headers may be Chinese (【最终回复】) or English ([Final Reply]).
    ``<think>...</think>`` blocks are removed before parsing.

    Returns:
        monologue: Text of the inner-monologue section, or "" if absent.
        reply: Text of the final-reply section with action tags (asterisk-
            wrapped text and short parentheticals) removed. When there is no
            reply section, the whole cleaned output is used instead. Never
            empty unless the modality is silence: "..." is substituted when
            nothing is left after cleanup.
        modality: The keyword extracted by ``_parse_modality`` exactly as the
            Actor wrote it (no translation is applied), "文字" when there is
            no modality section, or "静默" when the Actor chose silence -- in
            which case ``reply`` is "".
    """
    raw = _THINK_RE.sub('', raw).strip()

    # Each section runs from the end of its header to the start of the next
    # header (or the end of the text). A repeated header overwrites the
    # earlier occurrence.
    headers = list(_SECTION_RE.finditer(raw))
    sections: dict[str, str] = {}
    for index, header in enumerate(headers):
        title = header.group('zh') or header.group('en')
        stop = headers[index + 1].start() if index + 1 < len(headers) else len(raw)
        sections[_TAG_MAP[title]] = raw[header.end():stop].strip()

    monologue = sections.get('monologue', '')
    reply = sections.get('reply', '')
    modality_raw = sections.get('modality', '')

    modality = _parse_modality(modality_raw) if modality_raw else "文字"

    # Silence short-circuit: the Actor chose not to speak. Compared
    # case-insensitively so "SILENCE" is recognised too; the result is always
    # normalised to "静默".
    if modality.casefold() in _SILENCE_KEYWORDS:
        return monologue, "", "静默"

    if reply:
        # Normal path: drop *action* tags and short (stage directions).
        reply = _STAR_ACTION_RE.sub('', reply).strip()
        reply = _SHORT_PAREN_RE.sub('', reply).strip()
    else:
        # Fallback: no usable reply section, so clean the whole output.
        # NOTE(review): if other section headers are present, their markers
        # and contents (including the monologue) end up in the reply --
        # confirm this is intended.
        reply = _ANY_PAREN_RE.sub('', raw).strip()
        reply = _STAR_ACTION_RE.sub('', reply).strip()

    # Never hand back an empty utterance on a non-silent turn.
    return monologue, reply or "...", modality