"""
Post-processing helpers for Smartwatch LM v0.1 replies.

Use after tokenizer decode to fix BPE artifacts, parse intents, and fill slot placeholders.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
# BPE byte-level artifacts (GPT-style tokenizer).
# clean_reply() maps these stand-in characters back to a real space / newline.
_BPE_SPACE = "\u0120" # Ġ
_BPE_NEWLINE = "\u010a" # Ċ
# Garbled or typographic punctuation -> the character clean_reply() should emit.
# Pairs are (bad, good) and are applied in order with str.replace().
# NOTE(review): the three-character entries look like byte-level renderings of
# UTF-8 dashes/quotes rather than classic mojibake — confirm against the tokenizer.
_MOJIBAKE_REPLACEMENTS: tuple[tuple[str, str], ...] = (
    ("âĢĶ", "—"),
    ("âĢĻ", "'"),
    ("âĢĺ", "'"),
    # Typographic apostrophe is flattened to a plain ASCII apostrophe.
    ("’", "'"),
    # The shorter dash is normalized to the same long dash used above.
    ("–", "—"),
)
def build_prompt(history: list[tuple[str, str]], user_message: str) -> str:
    """Render the conversation as the ``user:``/``bot:`` transcript the model expects.

    Each ``(user_text, bot_text)`` pair in ``history`` becomes two lines. The
    new ``user_message`` follows, and the transcript ends with an open ``bot:``
    line for the model to complete. Lines are joined with ``"\\n"``.
    """
    past_turns = [
        line
        for user_text, bot_text in history
        for line in (f"user: {user_text}", f"bot: {bot_text}")
    ]
    return "\n".join([*past_turns, f"user: {user_message}", "bot:"])
def _compact_tag(match: re.Match[str]) -> str:
    """Rebuild a matched ``<...>`` tag with every whitespace character removed.

    Intended as a replacement callback for ``re.sub``; the tag's inner text is
    read from capture group 1.
    """
    # str.split() with no argument drops all runs of whitespace.
    squeezed = "".join(match.group(1).split())
    return "<" + squeezed + ">"
def clean_reply(text: str) -> str:
    """Strip tokenizer residue from ``text`` and normalize intent/slot tags.

    Steps, in order:
      1. map the BPE space/newline stand-ins to a real space / newline;
      2. apply every ``(bad, good)`` pair from ``_MOJIBAKE_REPLACEMENTS``;
      3. collapse runs of spaces into one space;
      4. remove whitespace inside ``<...>`` tags;
      5. drop a space that directly precedes an apostrophe;
      6. trim leading/trailing whitespace.
    """
    cleaned = text.replace(_BPE_SPACE, " ")
    cleaned = cleaned.replace(_BPE_NEWLINE, "\n")
    for garbled, fixed in _MOJIBAKE_REPLACEMENTS:
        cleaned = cleaned.replace(garbled, fixed)
    # Only literal spaces are collapsed; newlines are left alone.
    cleaned = re.sub(r" +", " ", cleaned)
    cleaned = re.sub(r"<\s*([^>]+?)\s*>", _compact_tag, cleaned)
    cleaned = cleaned.replace(" '", "'")
    return cleaned.strip()
def extract_bot_reply(prompt: str, generated: str) -> str:
    """Strip the prompt prefix from ``generated`` and keep a single bot line.

    Args:
        prompt: The transcript that was fed to the model.
        generated: The decoded model output, which may or may not echo the
            prompt before the new text.

    Returns:
        The first line of the bot's reply, passed through ``clean_reply``.
        Returns an empty string when nothing follows the prompt.

    The reply is located as follows:
      1. If ``generated`` starts with the right-stripped prompt and the text
         after it is empty or begins with whitespace (space, newline, tab),
         everything after the prompt is the reply.
      2. Otherwise, if ``"bot:"`` occurs in ``generated``, the text after its
         last occurrence is used.
      3. Otherwise ``generated`` is used as-is.
    """
    prefix = prompt.rstrip()
    remainder = (
        generated[len(prefix):] if prefix and generated.startswith(prefix) else None
    )
    if remainder is not None and (not remainder or remainder[0].isspace()):
        # Accept any whitespace after the prompt, not just one space. Otherwise
        # a newline after "bot:" drops to the rsplit fallback, which would pick
        # a later, hallucinated bot turn instead of the real reply.
        reply = remainder
    elif "bot:" in generated:
        reply = generated.rsplit("bot:", 1)[-1]
    else:
        reply = generated

    # The first line already ends before any following "\nuser:" turn or blank
    # line, so one split is enough.
    first_line = reply.lstrip().partition("\n")[0]

    # clean_reply() turns an undecoded BPE newline into "\n"; cut again so the
    # result is a single line.
    return clean_reply(first_line.strip()).partition("\n")[0].rstrip()
@dataclass
class ParsedReply:
    """Intent label and reply template parsed from one model reply."""

    # Name captured from an ``<INTENT:NAME>`` tag; the parser in this module
    # uses "NONE" when no tag could be parsed.
    intent: str
    # Reply text that followed the tag (or the whole line when there was no tag).
    template: str
def extract_intent_reply(text: str) -> ParsedReply:
    """Split raw or cleaned model output into an intent name and reply template.

    The text is first run through ``clean_reply``. If it contains an
    ``<INTENT:...>`` tag, the line starting at that tag is parsed: the tag's
    name becomes ``intent`` and the rest of the line becomes ``template``.
    When there is no tag, or the tag's name is not made only of letters and
    underscores, ``intent`` is ``"NONE"`` and ``template`` is the line itself.
    """
    cleaned = clean_reply(text)
    tag = re.search(r"<\s*INTENT\s*:[^>]+>", cleaned, re.IGNORECASE)
    if tag is None:
        # No intent tag anywhere: treat the first line as plain reply text.
        head = cleaned.split("\n", 1)[0].strip()
        return ParsedReply(intent="NONE", template=head or cleaned)

    # Drop anything before the tag, then any following user turn, then keep
    # just the tagged line.
    tail = cleaned[tag.start():]
    tail = re.split(r"\nuser\s*:", tail, maxsplit=1, flags=re.IGNORECASE)[0]
    tagged_line = tail.split("\n", 1)[0].strip()

    parts = re.match(
        r"^<INTENT:([A-Z_]+)>\s*(.*)", tagged_line, re.IGNORECASE | re.DOTALL
    )
    if parts is None:
        return ParsedReply(intent="NONE", template=tagged_line)
    intent_name, template = parts.groups()
    return ParsedReply(intent=intent_name, template=template.strip())
def fill_slots(text: str, data: dict[str, str]) -> str:
    """Replace ``<SLOT_NAME>`` tokens in ``text`` with values from ``data``.

    A slot name is an upper-case identifier: letters, digits and underscores,
    not starting with a digit (e.g. ``<STEPS_TODAY>``, ``<SPO2>``). Tags with
    other characters, such as ``<INTENT:GET_STEPS>``, are left untouched.

    Args:
        text: Template text that may contain slot placeholders.
        data: Mapping from slot name to the value to display. Values are
            converted with ``str()``, so numeric readings do not raise. A
            ``None`` value produces an empty string.

    Returns:
        ``text`` with every known slot replaced. Placeholders whose name is
        not a key of ``data`` are kept verbatim.
    """

    def _substitute(match: re.Match[str]) -> str:
        name = match.group(1)
        if name not in data:
            return match.group(0)
        value = data[name]
        # re.sub needs a str back from the callback; an int would raise TypeError.
        return "" if value is None else str(value)

    return re.sub(r"<([A-Z_][A-Z0-9_]*)>", _substitute, text)
def process_model_output(
    prompt: str,
    generated: str,
    slot_data: dict[str, str] | None = None,
) -> tuple[str, ParsedReply, str]:
    """Run the full pipeline on one decoded model output.

    Stages: ``extract_bot_reply`` (single bot line) -> ``extract_intent_reply``
    (intent + template) -> ``fill_slots`` (display text).

    Args:
        prompt: The transcript that was fed to the model.
        generated: The decoded model output.
        slot_data: Slot values for the template; ``None`` or an empty dict
            means no slots are filled.

    Returns:
        ``(raw_bot_line, parsed, display_text)``.
    """
    bot_line = extract_bot_reply(prompt, generated)
    reply = extract_intent_reply(bot_line)
    slot_values = slot_data if slot_data else {}
    return bot_line, reply, fill_slots(reply.template, slot_values)
if __name__ == "__main__":
    # Self-contained demo: a hand-written string with tokenizer glitches stands
    # in for real model output, so no model is required.
    slots = {"STEPS_TODAY": "4,231", "STEP_GOAL": "10,000"}
    messy = (
        "Ġ<INTENT:GET_STEPS>ĠYou'reĠatĠ<STEPS_TODAY>ĠofĠ<STEP_GOAL>ĠâĢĶĠkeepĠgoing!\n"
        "user: what about yesterday"
    )
    parsed = extract_intent_reply(messy)
    report = (
        ("intent: ", parsed.intent),
        ("template:", parsed.template),
        ("display: ", fill_slots(parsed.template, slots)),
    )
    for label, value in report:
        print(label, value)
|