Self-contained gibberish cleanup docs, reply_utils and onnx_sample scripts

c9bffa5 verified 7 days ago

4.24 kB

	"""
	Post-processing helpers for Smartwatch LM v0.1 replies.

	Use after tokenizer decode to fix BPE artifacts, parse intents, and fill slot placeholders.
	"""

	from __future__ import annotations

	import re
	from dataclasses import dataclass

	# BPE byte-level artifacts (GPT-style tokenizer).
	# clean_reply() maps these marker characters back to plain whitespace.
	_BPE_SPACE = "\u0120" # Ġ (U+0120) -> replaced with " "
	_BPE_NEWLINE = "\u010a" # Ċ (U+010A) -> replaced with "\n"

	# Mojibake sequences from bad UTF-8 round-trips.
	# Each entry is (garbled text, replacement); clean_reply() applies the pairs
	# in the order listed using plain str.replace.
	# NOTE(review): the last garbled sequence looks like a doubly-decoded en dash
	# but is mapped to an em dash — confirm that is intended.
	_MOJIBAKE_REPLACEMENTS: tuple[tuple[str, str], ...] = (
	("âĢĶ", "—"),
	("âĢĻ", "'"),
	("âĢĺ", "'"),
	("Ã¢â‚¬â„¢", "'"),
	("Ã¢â‚¬â€œ", "—"),
	)


	def build_prompt(history: list[tuple[str, str]], user_message: str) -> str:
	    """Render prior turns plus the new message as a ``user:``/``bot:`` transcript.

	    Every ``(user_text, bot_text)`` pair in *history* contributes a
	    ``user: ...`` line followed by a ``bot: ...`` line. The new
	    *user_message* comes last, and the transcript ends with a bare ``bot:``
	    line (no trailing space or newline).
	    """
	    turns = [f"user: {asked}\nbot: {answered}" for asked, answered in history]
	    turns.append(f"user: {user_message}\nbot:")
	    return "\n".join(turns)


	def _compact_tag(match: re.Match[str]) -> str:
	    """Rebuild a matched ``<...>`` tag with all whitespace removed from its body.

	    Intended as a ``re.sub`` callback: group 1 of *match* is the text found
	    between the angle brackets.
	    """
	    body = match.group(1)
	    return "<" + re.sub(r"\s+", "", body) + ">"


	def clean_reply(text: str) -> str:
	    """Remove tokenizer gibberish and normalize intent/slot tags.

	    Steps, in order:

	    1. map the byte-level BPE markers to a real space / newline;
	    2. apply every ``_MOJIBAKE_REPLACEMENTS`` pair;
	    3. collapse runs of spaces into one;
	    4. strip whitespace inside ``<...>`` tags
	       (``< INTENT : X >`` and ``<INTENT : X>`` both become ``<INTENT:X>``);
	    5. re-attach an apostrophe that was split off by a space
	       (``You 're`` -> ``You're``) and trim both ends.

	    Args:
	        text: Decoded model output, possibly containing BPE markers.

	    Returns:
	        The cleaned text.
	    """
	    out = text.replace(_BPE_SPACE, " ").replace(_BPE_NEWLINE, "\n")
	    for bad, good in _MOJIBAKE_REPLACEMENTS:
	        out = out.replace(bad, good)
	    out = re.sub(r" +", " ", out)
	    # Match any single-line <...> span and let _compact_tag drop the inner
	    # whitespace, so padding on either side is optional rather than required.
	    # The body excludes "<" and newlines so that a stray "<" cannot swallow
	    # text up to a ">" further along or on a later line.
	    out = re.sub(r"<([^<>\n]+)>", _compact_tag, out)
	    return out.replace(" '", "'").strip()


	def extract_bot_reply(prompt: str, generated: str) -> str:
	    """Strip the prompt prefix and keep a single, cleaned bot line.

	    Args:
	        prompt: The transcript that was fed to the model (it is expected to
	            end with ``bot:``, as produced by ``build_prompt``).
	        generated: Decoded model output, with or without the prompt echoed
	            in front of the new text.

	    Returns:
	        The first non-blank line that follows the prompt, passed through
	        ``clean_reply``. Any later lines (for example a continued
	        ``user:`` turn) are discarded.
	    """
	    stem = prompt.rstrip()
	    if stem and generated.startswith(stem):
	        # Cut by prompt length instead of insisting on exactly one space
	        # after "bot:". Output such as "bot:Hello" or "bot:\nHello" would
	        # otherwise reach the rsplit fallback, which returns the *last*
	        # "bot:" turn and is wrong when the model keeps generating turns.
	        reply = generated[len(stem):]
	    elif "bot:" in generated:
	        # Prompt was not echoed verbatim: take what follows the last "bot:".
	        reply = generated.rsplit("bot:", 1)[-1]
	    else:
	        reply = generated

	    # Leading whitespace (including newlines) is dropped first, so the text
	    # before the next newline is the first non-blank line. That single cut
	    # also removes any following "user:" turn or blank-line separated text.
	    first_line = reply.lstrip().partition("\n")[0]
	    return clean_reply(first_line.strip())


	@dataclass
	class ParsedReply:
	    """Intent label and reply template parsed from one model reply."""

	    # Name taken from the <INTENT:...> tag; the parser in this module uses
	    # the literal "NONE" when no usable tag is present.
	    intent: str
	    # Reply text for display; may still contain <SLOT_NAME> placeholders.
	    template: str

	def extract_intent_reply(text: str) -> ParsedReply:
	    """Parse intent tag and reply template from raw or cleaned model output.

	    The text is first run through ``clean_reply``. Parsing then starts at the
	    first ``<INTENT:...>`` tag and only looks at the rest of that line.

	    Args:
	        text: Model output; BPE artifacts are tolerated.

	    Returns:
	        ``ParsedReply(intent, template)``. ``intent`` is the tag's name as
	        written in the text, and ``template`` is the remainder of that line.
	        When there is no intent tag, ``intent`` is ``"NONE"`` and
	        ``template`` is the first line of the cleaned text. When the tag's
	        name contains characters other than letters and underscores,
	        ``intent`` is ``"NONE"`` and ``template`` is the line including
	        the tag.
	    """
	    cleaned = clean_reply(text)
	    # Whitespace around INTENT and ":" is optional, so both a compacted
	    # "<INTENT:X>" and a padded "< INTENT : X >" are found.
	    match = re.search(r"<\s*INTENT\s*:[^>]+>", cleaned, re.IGNORECASE)
	    if not match:
	        first = cleaned.split("\n", 1)[0].strip()
	        return ParsedReply(intent="NONE", template=first or cleaned)

	    # Keep only the line that starts at the tag; this also drops any
	    # following "user:" turn, which can only begin after a newline.
	    line = cleaned[match.start():].split("\n", 1)[0].strip()

	    # Group 1 is the intent name, group 2 the whole rest of the line.
	    intent_match = re.match(
	        r"^<\s*INTENT\s*:\s*([A-Z_]+)\s*>\s*(.*)",
	        line,
	        re.IGNORECASE | re.DOTALL,
	    )
	    if intent_match:
	        return ParsedReply(
	            intent=intent_match.group(1),
	            template=intent_match.group(2).strip(),
	        )
	    return ParsedReply(intent="NONE", template=line)


	def fill_slots(text: str, data: dict[str, str]) -> str:
	    """Replace ``<SLOT_NAME>`` tokens with live sensor values.

	    A slot name is an upper-case letter or underscore followed by upper-case
	    letters, digits or underscores (``<STEP_GOAL>``, ``<SPO2>``). Tags that
	    contain other characters, such as ``<INTENT:...>``, are never touched.

	    Args:
	        text: Reply template containing zero or more slot tokens.
	        data: Mapping of slot name to value. Values are converted with
	            ``str()``, so numeric readings may be passed directly.

	    Returns:
	        *text* with every known slot substituted. A token whose name is
	        missing from *data*, or whose value is ``None``, is left as is.
	    """

	    def _substitute(found: re.Match[str]) -> str:
	        value = data.get(found.group(1))
	        if value is None:
	            return found.group(0)
	        # re.sub needs a str from its callback; an int/float reading would
	        # otherwise raise TypeError.
	        return str(value)

	    return re.sub(r"<([A-Z_][A-Z0-9_]*)>", _substitute, text)


	def process_model_output(
	    prompt: str,
	    generated: str,
	    slot_data: dict[str, str] | None = None,
	) -> tuple[str, ParsedReply, str]:
	    """
	    Full pipeline: raw decode -> single-line reply -> intent parse -> filled display text.

	    Args:
	        prompt: Transcript that was fed to the model.
	        generated: Decoded model output.
	        slot_data: Slot name -> value used to fill placeholders; ``None``
	            is treated as an empty mapping, leaving placeholders in place.

	    Returns (raw_bot_line, parsed, display_text).
	    """
	    raw = extract_bot_reply(prompt, generated)
	    parsed = extract_intent_reply(raw)
	    display = fill_slots(parsed.template, slot_data or {})
	    return raw, parsed, display


	if __name__ == "__main__":
	    # Offline demo: a hand-written string with fake tokenizer glitches is
	    # parsed and its slots are filled — no model required.
	    messy = (
	        "Ġ<INTENT:GET_STEPS>ĠYou'reĠatĠ<STEPS_TODAY>ĠofĠ<STEP_GOAL>ĠâĢĶĠkeepĠgoing!\n"
	        "user: what about yesterday"
	    )
	    slots = {"STEPS_TODAY": "4,231", "STEP_GOAL": "10,000"}
	    parsed = extract_intent_reply(messy)
	    report = (
	        ("intent: ", parsed.intent),
	        ("template:", parsed.template),
	        ("display: ", fill_slots(parsed.template, slots)),
	    )
	    for label, value in report:
	        print(label, value)