nano-proofread / data_proofread.py

Upload data_proofread.py with huggingface_hub

837909a verified 2 days ago

10.8 kB

	"""proofread — a code-generated supervised task: a short phrase with one common
	writing error -> the corrected phrase.

	The cleanest kind of must-beat-a-script story: the errors are CONTEXT-DEPENDENT
	homophone confusions a lookup table cannot resolve. `their`, `there`, and `they're`
	are spelled differently and sound the same; which one is right depends entirely on the
	surrounding words. A context-free script can only guess the most common one. The model
	reads the phrase and uses the grammatical context to pick.

	Scope (stated plainly): a FIXED set of high-frequency confusions plus doubled words,
	not general grammar correction:

	    their / there / they're   |   your / you're   |   its / it's
	    then / than               |   to / too        |   could have / could of

	Each example is one line::

	    their going to win => they're going to win
	    its raining again => it's raining again
	    the the cat sat => the cat sat

	Generation is correct by construction: phrases are built from ~70 distinct grammatical
	FRAMES with rich, agreement-checked slot fillers and optional prefix/suffix wrapping,
	so the surface form varies enormously and the model must learn the local grammatical
	CUE for each word rather than a template. We then inject one error (swap the confusion
	word, or double a word). ~15% identity. `proofread_pairs` is shared by train and eval;
	`naive_fix` is the best context-free script baseline; `OOD_CASES` is a hand-written
	natural-phrase set (NOT drawn from the frames) for measuring true generalisation.
	numpy-only, no torch.
	"""

	from __future__ import annotations

	import numpy as np

	# Slot vocabularies used to fill the frame templates. Each constant is a list of
	# lowercase words obtained by splitting a space-separated string, so the list order
	# is the order in which the words are written here.
	# Singular nouns.
	_NOUN = ("house car dog cat book phone plan idea project dinner movie story song city "
	"street dream bag coffee game party trip photo report meeting budget window "
	"door garden room school office team chair table laptop ticket letter").split()
	# Plural nouns.
	_NOUN_PL = ("friends parents kids keys shoes neighbours children students teachers dogs "
	"cats books phones plans ideas cars rooms photos tickets letters").split()
	# Adjectives.
	_ADJ = ("happy tired ready late early busy angry quiet loud bright dark cold warm funny "
	"serious tough easy strange lovely simple clever lucky honest calm noisy fancy "
	"gentle brave proud nervous curious cheerful sad hungry sleepy").split()
	# "-ing" verb forms.
	_VING = ("running coming leaving working playing waiting joking winning losing laughing "
	"talking cooking driving reading writing studying dancing moving growing "
	"building helping trying singing sleeping").split()
	# Base-form verbs.
	_VERB = ("win lose leave stay help start finish call visit move build cook read write "
	"study dance play sing watch learn travel rest eat go").split()
	# Simple-past verb forms.
	_PVERB = ("ran left came stayed helped started finished called visited moved built "
	"cooked wrote studied danced played sang watched learned travelled ate").split()
	# Past participles (the frames only use these after "could have").
	# NOTE(review): "called" is listed twice, so a uniform draw returns it twice as often
	# as any other entry. Removing the duplicate would change seeded output — confirm
	# before touching.
	_PP = ("won lost left helped started finished called moved built cooked played watched "
	"learned tried gone done seen made said known taken called").split()
	# Comparative adjectives.
	_ADJ_ER = ("bigger smaller faster slower taller shorter older younger richer stronger "
	"warmer colder louder quieter cheaper nicer braver kinder").split()
	# First names (kept lowercase like every other word).
	_NAME = "anna ben sara tom maria john lucy sam emma dave mike kate".split()
	# Place nouns.
	_PLACE = "park store office gym beach city market station library cafe shop airport".split()
	# Plural subjects.
	_SUBJ_PL = "we they people students teachers kids friends parents neighbours children".split()
	# Pronoun subjects; in the visible code this list is only used through _SUBJ_ANY.
	_SUBJ_SG = "i you he she".split()
	# Any subject: plural subjects, then pronouns, then names (concatenation order).
	_SUBJ_ANY = _SUBJ_PL + _SUBJ_SG + _NAME

	# Placeholder key -> vocabulary list. A `{key}` in a frame template is replaced by a
	# random entry of `_SLOTS[key]`.
	_SLOTS = {
	    "n": _NOUN,        # singular noun
	    "np": _NOUN_PL,    # plural noun
	    "a": _ADJ,         # adjective
	    "vg": _VING,       # "-ing" verb form
	    "v": _VERB,        # base-form verb
	    "pv": _PVERB,      # simple-past verb
	    "pp": _PP,         # past participle
	    "ae": _ADJ_ER,     # comparative adjective
	    "nm": _NAME,       # first name
	    "pl": _PLACE,      # place noun
	    "sp": _SUBJ_PL,    # plural subject
	    "sa": _SUBJ_ANY,   # any subject
	}

	# Correct form -> list of wrong forms that may be substituted for it.
	# Within a homophone family every member can be mistaken for every other member, so
	# each word maps to the remaining members of its family (in family order).
	_CONFUSIONS = {
	    word: [other for other in family if other != word]
	    for family in (
	        ("their", "there", "they're"),
	        ("your", "you're"),
	        ("its", "it's"),
	        ("then", "than"),
	        ("to", "too"),
	    )
	    for word in family
	}
	# One-directional entry: "could of" is only ever an error, never a correct target.
	_CONFUSIONS["could have"] = ["could of"]

	# Family member -> the single canonical (most-common) member that the context-free
	# baseline script falls back to for that family.
	_CANON = {
	    member: canonical
	    for canonical, members in (
	        ("there", ("their", "there", "they're")),
	        ("your", ("your", "you're")),
	        ("its", ("its", "it's")),
	        ("then", ("then", "than")),
	        ("to", ("to", "too")),
	    )
	    for member in members
	}

	# Grammatical frames: (template, target) pairs.
	#   template - a correct phrase; each `{key}` placeholder is later replaced by a random
	#              entry of `_SLOTS[key]`.
	#   target   - the confusion word (or phrase) occurring in the template; it is the part
	#              that may be swapped for a wrong form when an error is injected.
	# The list order matters: frames are drawn by index from the seeded generator.
	_FRAMES = [
	# target "their"
	("their {n}", "their"), ("their {a} {n}", "their"), ("{sp} love their {np}", "their"),
	("{nm} forgot their {n}", "their"), ("their {np} are {a}", "their"),
	("i saw their {n}", "their"),
	# target "there"
	("there is a {n}", "there"), ("there are {np}", "there"), ("over there", "there"),
	("{sa} went there", "there"), ("{nm} sat there", "there"), ("put it there", "there"),
	("there it is", "there"),
	# target "they're"
	("they're {vg}", "they're"), ("they're {a}", "they're"),
	("they're going to {v}", "they're"), ("they're not {a}", "they're"),
	("i think they're {a}", "they're"), ("they're here", "they're"),
	# target "your"
	("your {n}", "your"), ("your {a} {n}", "your"), ("i like your {n}", "your"),
	("your {np} are {a}", "your"), ("is your {n} ready", "your"),
	# target "you're"
	("you're {vg}", "you're"), ("you're {a}", "you're"),
	("you're going to {v}", "you're"), ("you're the best", "you're"),
	("i think you're {a}", "you're"), ("you're right", "you're"),
	# target "its"
	("its {n}", "its"), ("the {n} lost its {n}", "its"), ("every {n} has its {n}", "its"),
	("its {n} is {a}", "its"), ("the dog wagged its tail", "its"),
	# target "it's"
	("it's {a}", "it's"), ("it's {vg}", "it's"), ("it's a {n}", "it's"),
	("it's going to {v}", "it's"), ("it's late", "it's"), ("i think it's {a}", "it's"),
	# target "then"
	("and then {sa} {pv}", "then"), ("back then", "then"), ("{sa} {pv} then left", "then"),
	("see you then", "then"), ("first this then that", "then"),
	("we {pv} then went home", "then"),
	# target "than"
	("{ae} than before", "than"), ("more {a} than {nm}", "than"),
	("better than ever", "than"), ("{nm} is {ae} than {nm}", "than"),
	("rather than wait", "than"), ("it is {ae} than that", "than"),
	# target "to"
	("go to the {pl}", "to"), ("time to {v}", "to"), ("{sp} want to {v}", "to"),
	("talk to {nm}", "to"), ("back to work", "to"), ("i need to {v}", "to"),
	# target "too" (note one frame also contains "to"; the target "too" comes first)
	("too {a}", "too"), ("{a} too", "too"), ("me too", "too"), ("way too {a}", "too"),
	("it is too {a} to {v}", "too"), ("{nm} came too", "too"),
	# target "could have" (two-word target)
	("{sa} could have {pp}", "could have"), ("{nm} could have {pp}", "could have"),
	("we could have won", "could have"), ("you could have {pp}", "could have"),
	("it could have {pp}", "could have"),
	]

	# Optional filler words/phrases wrapped around a filled frame to vary the surface form.
	# Prefixes are single words; suffixes may span two words.
	_PREFIX = ["honestly", "well", "look", "ok", "hey", "so"]
	_SUFFIX = "today|again|right now|this time|for sure|i guess|as well".split("|")


	def _pick(rng, seq):
	    """Return one element of `seq`, selected with a single `rng.integers` draw.

	    `rng` is expected to provide `integers(high)` (as a numpy Generator does); the
	    drawn index lies in `[0, len(seq))`.
	    """
	    index = rng.integers(len(seq))
	    return seq[int(index)]


	def _fill(rng, template):
	    """Expand every `{key}` placeholder in `template` into a random slot word.

	    Placeholders are resolved one at a time, leftmost first, each with its own draw
	    from `_SLOTS[key]`, so the number and order of random draws follows the template.

	    Raises KeyError for an unknown slot key and ValueError (from `str.index`) when a
	    `{` has no `}` in the string.
	    """
	    filled = template
	    start = filled.find("{")
	    while start != -1:
	        end = filled.index("}")
	        word = _pick(rng, _SLOTS[filled[start + 1:end]])
	        filled = filled[:start] + word + filled[end + 1:]
	        # Look for the next placeholder in the updated string.
	        start = filled.find("{")
	    return filled


	def _correct_phrase(rng):
	    """Build one error-free phrase and return `(phrase, target)`.

	    A frame is drawn from `_FRAMES` and its slots are filled; then, independently and
	    each with probability 0.25, a prefix from `_PREFIX` and a suffix from `_SUFFIX`
	    are attached. `target` is the frame's confusion word, passed through unchanged.
	    """
	    template, target = _pick(rng, _FRAMES)
	    phrase = _fill(rng, template)

	    if rng.random() < 0.25:
	        opener = _pick(rng, _PREFIX)
	        # Skip the prefix if it would duplicate the first word: the correct phrase
	        # must never contain an accidental doubled word.
	        if phrase.split()[0] != opener:
	            phrase = f"{opener} {phrase}"

	    if rng.random() < 0.25:
	        closer = _pick(rng, _SUFFIX)
	        # Same guard at the other end (compare against the suffix's first word).
	        if phrase.split()[-1] != closer.split()[0]:
	            phrase = f"{phrase} {closer}"

	    return phrase, target


	def _double_a_word(rng, phrase):
	    """Inject a doubled-word error by repeating one randomly chosen word in place.

	    Args:
	        rng: generator providing `integers(high)` (e.g. a numpy Generator).
	        phrase: whitespace-separated words.

	    Returns:
	        The words re-joined with single spaces, with one word duplicated next to
	        itself (e.g. "the cat sat" -> "the the cat sat"). A phrase containing no
	        words is returned unchanged and no random draw is made.
	    """
	    words = phrase.split()
	    if not words:
	        # Nothing to double; `rng.integers(0)` would otherwise raise an opaque
	        # ValueError. Mirrors `_swap_target`, which also returns the phrase as-is
	        # when it has nothing to do.
	        return phrase
	    i = int(rng.integers(len(words)))
	    words.insert(i, words[i])
	    return " ".join(words)


	def _swap_target(phrase, target, wrong):
	    """Replace the first occurrence of `target` in `phrase` with `wrong`.

	    Matching is done on whole whitespace-separated tokens, so a multi-word target
	    such as "could have" must match as a consecutive token run. On a match the
	    result is re-joined with single spaces; if `target` does not occur, `phrase`
	    is returned exactly as given.
	    """
	    words = phrase.split()
	    needle = target.split()
	    span = len(needle)
	    for start in range(len(words) - span + 1):
	        stop = start + span
	        if words[start:stop] == needle:
	            words[start:stop] = wrong.split()
	            return " ".join(words)
	    return phrase


	def _corrupt(rng, phrase, target):
	    """Inject exactly one error into a correct `phrase`.

	    With probability 0.75 (and only if `target` has an entry in `_CONFUSIONS`) the
	    target is swapped for one of its wrong forms; otherwise a random word is doubled.
	    """
	    # The random draw happens first, regardless of whether `target` is known.
	    swap_confusion = rng.random() < 0.75 and target in _CONFUSIONS
	    if not swap_confusion:
	        return _double_a_word(rng, phrase)
	    wrong_form = _pick(rng, _CONFUSIONS[target])
	    return _swap_target(phrase, target, wrong_form)


	def proofread_pairs(seed, n):
	    """Generate `n` deterministic (prompt, corrected-phrase) pairs from `seed`.

	    Each prompt is the (possibly corrupted) phrase followed by " => "; the second
	    element is the correct phrase. Roughly 15% of candidates are identity cases in
	    which the input is already correct. Candidates whose prompt plus correction
	    plus one more character would exceed 64 characters are discarded and redrawn.
	    """
	    rng = np.random.default_rng(seed)
	    pairs = []
	    while len(pairs) < n:
	        correct, target = _correct_phrase(rng)
	        keep_identity = rng.random() < 0.15
	        inp = correct if keep_identity else _corrupt(rng, correct, target)
	        if not keep_identity and inp == correct:
	            # The swap was a no-op; fall back to doubling so the input has an error.
	            inp = _double_a_word(rng, correct)
	        prompt = f"{inp} => "
	        # Length budget; the extra 1 presumably reserves room for a terminator such
	        # as a newline — confirm with the consumer of these pairs.
	        if len(prompt) + len(correct) + 1 > 64:
	            continue
	        pairs.append((prompt, correct))
	    return pairs


	def naive_fix(phrase):
	    """Context-free baseline corrector.

	    Two passes over the whitespace-separated words of `phrase`:
	    1. collapse every run of adjacent identical words to a single word;
	    2. rewrite "could of" / "could have" to "could have" and map every other word
	       found in `_CANON` to its family's canonical member.

	    It repairs doubled words and "could of", but because it never looks at context
	    it cannot choose the right member of a homophone family. Returns the words
	    joined with single spaces.
	    """
	    collapsed = []
	    for word in phrase.split():
	        if collapsed and collapsed[-1] == word:
	            continue
	        collapsed.append(word)

	    fixed = []
	    total = len(collapsed)
	    pos = 0
	    while pos < total:
	        word = collapsed[pos]
	        following = collapsed[pos + 1] if pos + 1 < total else None
	        if word == "could" and following in ("of", "have"):
	            # Two-word pattern: consume both tokens.
	            fixed.extend(("could", "have"))
	            pos += 2
	            continue
	        fixed.append(_CANON.get(word, word))
	        pos += 1
	    return " ".join(fixed)


	# Hand-written natural phrases NOT drawn from the frames — the honest generalisation
	# test. (input, gold-correction). Mix of every confusion + identity (leave-alone) cases.
	# Each entry is a 2-tuple of plain strings; the input does not carry the " => "
	# separator that generated prompts end with.
	OOD_CASES = [
	# --- inputs containing one error (homophone swap, "could of", or a doubled word) ---
	("their going to win", "they're going to win"),
	("your the best", "you're the best"),
	("its raining again", "it's raining again"),
	("the the cat sat", "the cat sat"),
	("i could of helped", "i could have helped"),
	("this is bigger then that", "this is bigger than that"),
	("it is to late", "it is too late"),
	("they're house is big", "their house is big"),
	("we went too the park", "we went to the park"),
	("your going to love it", "you're going to love it"),
	("its a great day", "it's a great day"),
	("there going home", "they're going home"),
	("i like there car", "i like their car"),
	("i want too help", "i want to help"),
	("me to", "me too"),
	("she is faster then me", "she is faster than me"),
	("you could of won", "you could have won"),
	("its been a long day", "it's been a long day"),
	("their not ready", "they're not ready"),
	("put it over their", "put it over there"),
	# --- identity cases: the input is already correct and must come back unchanged ---
	("she is happy today", "she is happy today"),
	("the dog lost its bone", "the dog lost its bone"),
	("your dog is cute", "your dog is cute"),
	("they're going to win", "they're going to win"),
	("it is too late", "it is too late"),
	]