smartwatch-lm-0.1 / reply_utils.py
prathamkode's picture
Self-contained gibberish cleanup docs, reply_utils and onnx_sample scripts
c9bffa5 verified
"""
Post-processing helpers for Smartwatch LM v0.1 replies.
Use after tokenizer decode to fix BPE artifacts, parse intents, and fill slot placeholders.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
# BPE byte-level artifacts (GPT-style tokenizer).
# clean_reply() maps _BPE_SPACE back to a plain space and _BPE_NEWLINE back
# to "\n" before doing any other normalisation.
_BPE_SPACE = "\u0120"  # Ġ
_BPE_NEWLINE = "\u010a"  # Ċ
# Mojibake sequences from bad UTF-8 round-trips.
# Each entry is a (bad, good) pair; clean_reply() applies them in order with
# str.replace. The last two entries are not garbled text: they normalise a
# curly apostrophe to a straight one and an en dash to an em dash.
_MOJIBAKE_REPLACEMENTS: tuple[tuple[str, str], ...] = (
    ("âĢĶ", "—"),
    ("âĢĻ", "'"),
    ("âĢĺ", "'"),
    ("’", "'"),
    ("–", "—"),
)
def build_prompt(history: list[tuple[str, str]], user_message: str) -> str:
    """Render a conversation as the ``user:``/``bot:`` transcript fed to the model.

    Every ``(user_text, bot_text)`` pair in *history* becomes two lines, then
    the new *user_message* is appended followed by an open ``bot:`` line for
    the model to complete. Lines are joined with ``"\\n"``.
    """
    transcript = [
        turn
        for user_text, bot_text in history
        for turn in (f"user: {user_text}", f"bot: {bot_text}")
    ]
    # The trailing bare "bot:" is the cue for the model to continue.
    transcript += [f"user: {user_message}", "bot:"]
    return "\n".join(transcript)
def _compact_tag(match: re.Match[str]) -> str:
    """Return the matched ``<...>`` tag with every whitespace run removed.

    Used as the replacement callback for ``re.sub``; group 1 of *match* is
    the text between the angle brackets.
    """
    tag_body = match.group(1)
    squeezed = re.sub(r"\s+", "", tag_body)
    return "<" + squeezed + ">"
def clean_reply(text: str) -> str:
    """Strip tokenizer artifacts from *text* and tidy intent/slot tags.

    Steps, in order:
      1. turn the BPE space/newline markers back into real whitespace;
      2. apply every ``_MOJIBAKE_REPLACEMENTS`` pair;
      3. collapse runs of spaces into one space;
      4. remove all whitespace inside ``<...>`` tags;
      5. drop the space in front of any apostrophe and strip both ends.
    """
    cleaned = text.replace(_BPE_SPACE, " ")
    cleaned = cleaned.replace(_BPE_NEWLINE, "\n")
    for garbled, fixed in _MOJIBAKE_REPLACEMENTS:
        cleaned = cleaned.replace(garbled, fixed)
    # Only spaces are collapsed; newlines are kept so callers can split lines.
    cleaned = re.sub(r" +", " ", cleaned)
    cleaned = re.sub(r"<\s*([^>]+?)\s*>", _compact_tag, cleaned)
    # Re-attach contractions that were split off, e.g. "You 're" -> "You're".
    cleaned = cleaned.replace(" '", "'")
    return cleaned.strip()
def extract_bot_reply(prompt: str, generated: str) -> str:
    """Strip the prompt prefix and keep a single, cleaned bot line.

    Args:
        prompt: The transcript that was fed to the model (normally ending in
            ``bot:``, as produced by ``build_prompt``).
        generated: The decoded model output, usually the prompt followed by
            the continuation. It may still contain BPE whitespace markers.

    Returns:
        The first line of the continuation, passed through ``clean_reply``.
        Falls back to the text after the last ``bot:`` (or to the whole
        output) when *generated* does not start with *prompt*.
    """
    # Undo the BPE whitespace markers up front so that both the prefix match
    # and the first-line split below see real spaces and newlines. Otherwise a
    # raw decode such as "HelloĊuser: ..." would be returned as several lines.
    text = generated.replace(_BPE_SPACE, " ").replace(_BPE_NEWLINE, "\n")

    # Match on the prompt itself rather than on prompt + " ": a continuation
    # that follows "bot:" directly (or after a newline) must still be cut at
    # the prompt. Falling through to the rsplit below would pick up a later
    # "bot:" turn that the model generated by itself.
    prefix = prompt.rstrip()
    if prefix and text.startswith(prefix):
        reply = text[len(prefix):]
    elif "bot:" in text:
        reply = text.rsplit("bot:", 1)[-1]
    else:
        reply = text

    # Keeping only the first line also discards any following "\nuser:" turn
    # or blank-line-separated text, so no separate checks are needed.
    first_line = reply.lstrip().split("\n", 1)[0]
    return clean_reply(first_line.strip())
@dataclass
class ParsedReply:
    """Intent name and reply template parsed from one model reply."""

    # Name captured from an <INTENT:NAME> tag, or "NONE" when
    # extract_intent_reply() could not parse a tag.
    intent: str
    # Reply text with the intent tag removed; it may still contain <SLOT>
    # placeholders, which fill_slots() substitutes.
    template: str
def extract_intent_reply(text: str) -> ParsedReply:
    """Split model output into an intent name and its reply template.

    *text* may be raw or already cleaned; it is run through ``clean_reply``
    first. When no ``<INTENT:...>`` tag is found, or the tag name is not made
    of letters and underscores, the intent is ``"NONE"`` and the template is
    the relevant first line.
    """
    normalized = clean_reply(text)
    tag = re.search(r"<\s*INTENT\s*:[^>]+>", normalized, re.IGNORECASE)
    if tag is None:
        head = normalized.partition("\n")[0].strip()
        return ParsedReply(intent="NONE", template=head or normalized)

    # Start at the tag, cut off any following user turn, keep one line.
    tail = normalized[tag.start():]
    tail = re.split(r"\nuser\s*:", tail, maxsplit=1, flags=re.IGNORECASE)[0]
    tagged_line = tail.partition("\n")[0].strip()

    parts = re.match(
        r"^<INTENT:([A-Z_]+)>\s*(.*)",
        tagged_line,
        re.IGNORECASE | re.DOTALL,
    )
    if parts is None:
        return ParsedReply(intent="NONE", template=tagged_line)
    intent_name, reply_body = parts.groups()
    return ParsedReply(intent=intent_name, template=reply_body.strip())
def fill_slots(text: str, data: dict[str, str]) -> str:
    """Replace ``<SLOT_NAME>`` tokens in *text* with values from *data*.

    Args:
        text: Reply template containing ``<UPPER_CASE>`` placeholders.
        data: Mapping from slot name (without angle brackets) to its value.
            Values that are not strings (e.g. an int reading) are converted
            with ``str()``.

    Returns:
        *text* with every known slot substituted. Placeholders whose name is
        missing from *data*, or mapped to ``None``, are left untouched.
    """

    def _substitute(match: re.Match[str]) -> str:
        value = data.get(match.group(1))
        if value is None:
            # Unknown or unset slot: keep the placeholder visible.
            return match.group(0)
        # re.sub requires the callback to return str; coerce so that numeric
        # values do not raise TypeError.
        return str(value)

    return re.sub(r"<([A-Z_]+)>", _substitute, text)
def process_model_output(
    prompt: str,
    generated: str,
    slot_data: dict[str, str] | None = None,
) -> tuple[str, ParsedReply, str]:
    """
    Run the whole post-processing chain on one decoded generation.

    The output is reduced to a single bot line, parsed for its intent tag,
    and the resulting template has its slots filled from *slot_data* (an
    empty mapping is used when none is given).
    Returns (raw_bot_line, parsed, display_text).
    """
    bot_line = extract_bot_reply(prompt, generated)
    reply = extract_intent_reply(bot_line)
    slot_values = slot_data if slot_data else {}
    display_text = fill_slots(reply.template, slot_values)
    return bot_line, reply, display_text
if __name__ == "__main__":
    # Self-contained demo: a hand-written string with tokenizer glitches
    # stands in for real model output, so no model is needed to run it.
    raw_output = (
        "Ġ<INTENT:GET_STEPS>ĠYou'reĠatĠ<STEPS_TODAY>ĠofĠ<STEP_GOAL>ĠâĢĶĠkeepĠgoing!\n"
        "user: what about yesterday"
    )
    demo_reply = extract_intent_reply(raw_output)
    sensor_values = {"STEPS_TODAY": "4,231", "STEP_GOAL": "10,000"}
    report = (
        ("intent: ", demo_reply.intent),
        ("template:", demo_reply.template),
        ("display: ", fill_slots(demo_reply.template, sensor_values)),
    )
    for label, value in report:
        print(label, value)