File size: 4,241 Bytes
c9bffa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""

Post-processing helpers for Smartwatch LM v0.1 replies.



Use after tokenizer decode to fix BPE artifacts, parse intents, and fill slot placeholders.

"""

from __future__ import annotations

import re
from dataclasses import dataclass

# BPE byte-level artifacts (GPT-style tokenizer): the visible stand-in
# characters for a space and a newline. clean_reply swaps them back to real
# whitespace.
_BPE_SPACE = "\u0120"  # Ġ
_BPE_NEWLINE = "\u010a"  # Ċ

# Mojibake sequences from bad UTF-8 round-trips.
# Each entry is a (bad, good) pair that clean_reply applies, in order, with
# str.replace. The first three entries look like the byte-level rendering of
# the UTF-8 bytes of an em dash and of curly single quotes; the last two
# normalize a curly apostrophe to ASCII and an en dash to an em dash.
_MOJIBAKE_REPLACEMENTS: tuple[tuple[str, str], ...] = (
    ("âĢĶ", "—"),
    ("âĢĻ", "'"),
    ("âĢĺ", "'"),
    ("’", "'"),
    ("–", "—"),
)


def build_prompt(history: list[tuple[str, str]], user_message: str) -> str:
    """Render a conversation as the ``user:``/``bot:`` transcript the model expects.

    Every ``(user_text, bot_text)`` pair in ``history`` becomes a ``user:``
    line followed by a ``bot:`` line. The new ``user_message`` is appended
    last, followed by an open ``bot:`` line for the model to complete.
    """
    past_turns = [
        f"user: {asked}\nbot: {answered}" for asked, answered in history
    ]
    return "\n".join([*past_turns, f"user: {user_message}", "bot:"])


def _compact_tag(match: re.Match[str]) -> str:
    """Rebuild a matched ``<...>`` tag with every whitespace run removed.

    ``match`` must expose the tag's inner text as group 1.
    """
    # Splitting on whitespace and re-joining drops all whitespace runs.
    pieces = match.group(1).split()
    return "<" + "".join(pieces) + ">"


def clean_reply(text: str) -> str:
    """Scrub tokenizer artifacts from decoded text and tidy intent/slot tags.

    Steps, in order: turn the BPE space/newline markers into real whitespace,
    apply every ``_MOJIBAKE_REPLACEMENTS`` pair, collapse runs of spaces,
    remove whitespace inside ``<...>`` tags, drop a space that directly
    precedes an apostrophe, and trim both ends.
    """
    cleaned = text.replace(_BPE_SPACE, " ")
    cleaned = cleaned.replace(_BPE_NEWLINE, "\n")
    for garbled, intended in _MOJIBAKE_REPLACEMENTS:
        cleaned = cleaned.replace(garbled, intended)
    # Collapse spaces before compacting tags, matching the original order.
    cleaned = re.sub(r" +", " ", cleaned)
    cleaned = re.sub(r"<\s*([^>]+?)\s*>", _compact_tag, cleaned)
    cleaned = cleaned.replace(" '", "'")
    return cleaned.strip()


def extract_bot_reply(prompt: str, generated: str) -> str:
    """Strip the prompt prefix and keep a single, cleaned bot line.

    Args:
        prompt: Transcript that was sent to the model. Trailing whitespace is
            ignored when matching it against ``generated``.
        generated: Decoded model output, typically the prompt echoed back
            followed by the model's continuation. It may still contain
            byte-level BPE whitespace markers.

    Returns:
        The first line of the continuation, passed through ``clean_reply``.
        If ``generated`` does not start with the prompt, the text after the
        last ``bot:`` marker is used; if there is no marker either, the whole
        text is used.
    """
    # Decode the BPE whitespace markers up front. Otherwise a marker-encoded
    # newline is invisible to the line split below and only turns into a real
    # newline inside clean_reply, letting later turns leak into the result.
    text = generated.replace(_BPE_SPACE, " ").replace(_BPE_NEWLINE, "\n")

    prefix = prompt.rstrip()
    if prefix and text.startswith(prefix):
        # Whatever separates the prompt from the reply (space, newline or
        # nothing) is removed by the lstrip below, so no literal space is
        # required here. This keeps us on the first bot turn instead of
        # falling back to the last "bot:" in a multi-turn continuation.
        reply = text[len(prefix):]
    elif "bot:" in text:
        reply = text.rsplit("bot:", 1)[-1]
    else:
        reply = text

    # The first line already excludes any following "\nuser:" turn and any
    # blank-line-separated paragraph, so one split is enough.
    first_line = reply.lstrip().split("\n", 1)[0]
    return clean_reply(first_line.strip())


@dataclass
class ParsedReply:
    """Outcome of parsing one model reply: an intent name plus its reply text."""

    # Name captured from the <INTENT:...> tag, or "NONE" when no tag was parsed.
    intent: str
    # Text following the intent tag, or the whole first line when no tag was
    # parsed. May still contain <SLOT_NAME> placeholders.
    template: str


def extract_intent_reply(text: str) -> ParsedReply:
    """Split model output into an intent name and the reply template after it.

    The text is cleaned first. Without an ``<INTENT:...>`` tag the first line
    is returned under intent ``"NONE"``. Otherwise parsing starts at the tag,
    stops before any following ``user:`` turn, and uses only the first line.
    If that line does not fit the strict ``<INTENT:NAME>`` shape, it is
    returned unchanged under intent ``"NONE"``.
    """
    cleaned = clean_reply(text)
    tag = re.search(r"<\s*INTENT\s*:[^>]+>", cleaned, re.IGNORECASE)
    if tag is None:
        leading_line = cleaned.partition("\n")[0].strip()
        return ParsedReply(intent="NONE", template=leading_line or cleaned)

    # Drop anything before the tag, then anything from the next user turn on.
    tail = cleaned[tag.start():]
    this_turn = re.split(r"\nuser\s*:", tail, maxsplit=1, flags=re.IGNORECASE)[0]
    line = this_turn.partition("\n")[0].strip()

    parsed = re.match(r"^<INTENT:([A-Z_]+)>\s*(.*)", line, re.IGNORECASE | re.DOTALL)
    if parsed is None:
        return ParsedReply(intent="NONE", template=line)
    name, template = parsed.groups()
    return ParsedReply(intent=name, template=template.strip())


def fill_slots(text: str, data: dict[str, str]) -> str:
    """Replace ``<SLOT_NAME>`` tokens with live sensor values.

    Args:
        text: Reply template containing placeholders such as
            ``<STEPS_TODAY>`` (upper-case letters and underscores only).
        data: Mapping of slot name to value. Values are rendered with
            ``str()``, so raw numeric readings are accepted as well as
            pre-formatted strings.

    Returns:
        ``text`` with every known placeholder substituted. A placeholder
        whose slot is missing from ``data``, or whose value is ``None``, is
        left in place unchanged.
    """

    def _render(match: re.Match[str]) -> str:
        value = data.get(match.group(1))
        # re.sub needs a str back from the callback; an int/float reading
        # would otherwise raise TypeError when the result is assembled.
        return match.group(0) if value is None else str(value)

    return re.sub(r"<([A-Z_]+)>", _render, text)


def process_model_output(
    prompt: str,
    generated: str,
    slot_data: dict[str, str] | None = None,
) -> tuple[str, ParsedReply, str]:
    """Run the full post-processing chain on one decoded generation.

    Pipeline: raw decode -> single-line reply -> intent parse -> filled
    display text. When ``slot_data`` is omitted or empty, no placeholders
    are filled.

    Returns ``(raw_bot_line, parsed, display_text)``.
    """
    bot_line = extract_bot_reply(prompt, generated)
    reply = extract_intent_reply(bot_line)
    slot_values = slot_data if slot_data else {}
    return bot_line, reply, fill_slots(reply.template, slot_values)


if __name__ == "__main__":
    # Self-contained demo: a hand-written string with tokenizer glitches, so
    # no model is needed to see the helpers at work.
    demo_output = (
        "Ġ<INTENT:GET_STEPS>ĠYou'reĠatĠ<STEPS_TODAY>ĠofĠ<STEP_GOAL>ĠâĢĶĠkeepĠgoing!\n"
        "user: what about yesterday"
    )
    sensor_values = {"STEPS_TODAY": "4,231", "STEP_GOAL": "10,000"}
    result = extract_intent_reply(demo_output)
    report = (
        ("intent:  ", result.intent),
        ("template:", result.template),
        ("display: ", fill_slots(result.template, sensor_values)),
    )
    for label, value in report:
        print(label, value)