Initial upload

360e354 verified about 1 year ago

14.4 kB

	import random
	import string

	# Lookup table of frequently misspelled English words.
	#
	# Each key is a correctly spelled single token; the value is a non-empty list
	# of misspellings (or commonly confused words, e.g. "affect" -> "effect")
	# that may be substituted for it. generate_typo() looks words up here by
	# exact, case-sensitive match, so capitalised keys such as "Caribbean" only
	# match that exact capitalisation. Multi-token phrases cannot be keys and are
	# kept below as comments only.
	common_misspelled_words = {
	    "absence": ["absense", "absentse", "abcense", "absance"],
	    "acceptable": ["acceptible"],
	    "accidentally": ["accidently", "accidentaly"],
	    "accommodate": ["accomodate", "acommodate"],
	    "achieve": ["acheive"],
	    "acknowledge": ["acknowlege", "aknowledge"],
	    "acquaintance": ["acquaintence", "aquaintance"],
	    "acquire": ["aquire", "adquire"],
	    "acquit": ["aquit"],
	    "acreage": ["acrage", "acerage"],
	    "address": ["adress"],
	    "adultery": ["adultary"],
	    "advisable": ["adviseable", "advizable"],
	    "affect": ["effect"],
	    "aggression": ["agression"],
	    "aggressive": ["agressive"],
	    "allegiance": ["allegaince", "allegience", "alegiance"],
	    "almost": ["allmost"],
	    # "a lot": ["alot", "allot"]  # Not captured since "a lot" is two tokens.
	    "amateur": ["amatuer", "amature"],
	    "annually": ["anually", "annualy"],
	    "apparent": ["apparant", "aparent", "apparrent", "aparrent"],
	    "arctic": ["artic"],
	    "argument": ["arguement"],
	    "atheist": ["athiest", "athist"],
	    "awful": ["awfull", "aweful"],
	    "because": ["becuase", "becasue"],
	    "beautiful": ["beatiful"],
	    "becoming": ["becomeing"],
	    "beginning": ["begining"],
	    "believe": ["beleive"],
	    "bellwether": ["bellweather"],
	    "benefit": ["benifit"],
	    "buoy": ["bouy"],
	    "buoyant": ["bouyant"],
	    "business": ["buisness"],
	    "calendar": ["calender"],
	    "camouflage": ["camoflage", "camoflague"],
	    "capitol": ["capital"],
	    "Caribbean": ["Carribean"],  # More names?
	    "category": ["catagory"],
	    "caught": ["cauhgt", "caugt"],
	    "cemetery": ["cemetary", "cematery"],
	    "changeable": ["changable"],
	    "chief": ["cheif"],
	    "colleague": ["collaegue", "collegue"],
	    "column": ["colum"],
	    "coming": ["comming"],
	    "committed": ["commited", "comitted"],
	    "comparison": ["comparsion"],
	    "concede": ["conceed"],
	    "congratulate": ["congradulate"],
	    "conscientious": ["consciencious"],
	    "conscious": ["concious", "consious"],
	    "consensus": ["concensus"],
	    "controversy": ["contraversy"],
	    "coolly": ["cooly"],
	    "daiquiri": ["dacquiri", "daquiri"],
	    "deceive": ["decieve"],
	    "definite": ["definate", "definit"],
	    "definitely": ["definitly", "definately", "definatly", "defiantly"],
	    "desperate": ["desparate"],
	    "difference": ["diffrence"],
	    "dilemma": ["dilema"],
	    "disappoint": ["dissapoint"],
	    "disastrous": ["disasterous"],
	    "drunkenness": ["drunkeness"],
	    "dumbbell": ["dumbell"],
	    "embarrass": ["embarass"],
	    "equipment": ["equiptment"],
	    "exceed": ["excede"],
	    "exhilarate": ["exilerate"],
	    "existence": ["existance"],
	    "experience": ["experiance"],
	    "extreme": ["extreem"],
	    "fascinating": ["facinating"],
	    "fiery": ["firey"],
	    "fluorescent": ["flourescent"],
	    "foreign": ["foriegn"],
	    "forty": ["fourty"],
	    "friend": ["freind"],
	    "fulfil": ["fullfil", "fulfill"],
	    "gauge": ["guage"],
	    "grateful": ["gratefull", "greatful"],
	    "great": ["grate", "grat"],
	    "guarantee": ["garantee", "garentee", "garanty"],
	    "guidance": ["guidence"],
	    "harass": ["harrass"],
	    "height": ["heighth", "heigth"],
	    "hierarchy": ["heirarchy"],
	    # "hors d'oeuvres": ["hors derves", "ordeurves"]  # Not captured since "hors d'oeuvres" is two tokens.
	    "humorous": ["humerous"],
	    "hygiene": ["hygene", "hygine", "hiygeine", "higeine", "hygeine"],
	    "hypocrite": ["hipocrit"],
	    "ignorance": ["ignorence"],
	    "imitate": ["immitate"],
	    "immediately": ["imediately"],
	    "indict": ["indite"],
	    "independent": ["independant"],
	    "indispensable": ["indispensible"],
	    "inoculate": ["innoculate"],
	    "intelligence": ["inteligence", "intelligance"],
	    "jewelry": ["jewellery", "jewelery"],
	    "judgment": ["judgement"],
	    "kernel": ["kernal"],
	    "leisure": ["liesure"],
	    "liaison": ["liason"],
	    "library": ["libary", "liberry"],
	    "license": ["lisence", "licence"],
	    "lightning": ["lightening"],
	    "lose": ["loose"],
	    "maintenance": ["maintainance", "maintnance"],
	    "marshmallow": ["marshmellow"],
	    "medieval": ["medeval", "medevil", "mideval"],
	    "memento": ["momento"],
	    "millennium": ["millenium", "milennium"],
	    "miniature": ["miniture"],
	    "minuscule": ["miniscule"],
	    "mischievous": ["mischievious", "mischevous", "mischevious"],
	    "misspell": ["mispell", "misspel"],
	    "necessary": ["neccessary", "necessery"],
	    "niece": ["neice"],
	    "neighbour": ["nieghbor"],
	    "noticeable": ["noticable"],
	    "occasion": ["occassion"],
	    "occasionally": ["occasionaly", "occassionally"],
	    "occurrence": ["occurrance", "occurence"],
	    "occurred": ["occured"],
	    "omission": ["ommision", "omision"],
	    "original": ["orignal"],
	    "outrageous": ["outragous"],
	    "parliament": ["parliment"],
	    "pastime": ["passtime", "pasttime"],
	    "pedagogue": ["pedagoge"],
	    "perceive": ["percieve"],
	    "perseverance": ["perseverence"],
	    "personnel": ["personell", "personel"],
	    "plagiarize": ["plagerize"],
	    "playwright": ["playright", "playwrite"],
	    "possession": ["posession", "possesion"],
	    "potatoes": ["potatos"],
	    "precede": ["preceed"],
	    "presence": ["presance"],
	    "principle": ["principal"],
	    "privilege": ["privelege", "priviledge"],
	    "professor": ["professer"],
	    "protester": ["protestor"],
	    "promise": ["promiss"],
	    "pronunciation": ["pronounciation"],
	    "proof": ["prufe"],
	    "prophecy": ["prophesy"],
	    "publicly": ["publically"],
	    "quarantine": ["quarentine"],
	    "queue": ["que"],
	    "questionnaire": ["questionaire", "questionnair"],
	    "readable": ["readible"],
	    "really": ["realy"],
	    "receive": ["recieve"],
	    "receipt": ["reciept"],
	    "recommend": ["recomend", "reccommend"],
	    "referred": ["refered"],
	    "reference": ["referance", "refrence"],
	    "relevant": ["relevent", "revelant"],
	    "religious": ["religous", "religius"],
	    "repetition": ["repitition"],
	    "restaurant": ["restarant", "restaraunt"],
	    "rhyme": ["rime"],
	    "rhythm": ["rythm", "rythem"],
	    "secretary": ["secratary", "secretery"],
	    "seize": ["sieze"],
	    "separate": ["seperate"],
	    "sergeant": ["sargent"],
	    "similar": ["similer"],
	    "skilful": ["skilfull", "skillful"],
	    "speech": ["speach", "speeche"],
	    "successful": ["succesful", "successfull", "sucessful"],
	    "supersede": ["supercede"],
	    "surprise": ["suprise", "surprize"],
	    "than": ["then"],
	    "their": ["there", "they're"],
	    "tomatoes": ["tomatos"],
	    "tomorrow": ["tommorow", "tommorrow"],
	    "Tucson": ["Tuscon"],
	    "twelfth": ["twelth"],
	    "tyranny": ["tyrany"],
	    "underrate": ["underate"],
	    "until": ["untill"],
	    "upholstery": ["upholstry"],
	    "usable": ["useable", "usible"],
	    "vacuum": ["vaccuum", "vaccum", "vacume"],
	    "vehicle": ["vehical"],
	    "vicious": ["visious"],
	    "what": ["wat"],
	    "weather": ["wether", "whether"],
	    "weird": ["wierd"],
	    "welfare": ["wellfare", "welfair"],
	    "whether": ["wether"],
	    "wilful": ["wilfull", "willful"],
	    "withhold": ["withold"],
	    "writing": ["writting", "writeing"],
	    "you're": ["your"],
	    "your": ["you're"],
	}


	def apostrophe_error(word: str) -> str:
	    """
	    Return *word* with a typical apostrophe mistake applied.

	    Words without an apostrophe are only touched when they end in 's':
	    half of the time an apostrophe is slipped in before that final 's'
	    (a mistaken possessive); otherwise the word comes back unchanged.

	    For words that contain an apostrophe, one apostrophe is picked at
	    random and one of four edits is applied with equal probability:
	    removal, moving it one character left, moving it one character right,
	    or doubling it. A move that would fall off either end of the word
	    degrades to a plain removal.
	    """
	    if "'" not in word:
	        # Mistaken possessive, e.g. "books" -> "book's".
	        if word.endswith("s") and random.random() < 0.5:
	            return word[:-1] + "'" + word[-1]
	        return word

	    # Pick which apostrophe to damage, then which kind of damage to apply.
	    positions = [pos for pos, char in enumerate(word) if char == "'"]
	    pos = random.choice(positions)
	    action = random.choice(['remove', 'shift_left', 'shift_right', 'duplicate'])

	    if action == 'duplicate':
	        return word[:pos + 1] + "'" + word[pos + 1:]
	    if action == 'shift_left' and pos > 0:
	        # Swap the apostrophe with the character before it.
	        return word[:pos - 1] + "'" + word[pos - 1] + word[pos + 1:]
	    if action == 'shift_right' and pos < len(word) - 1:
	        # Swap the apostrophe with the character after it.
	        return word[:pos] + word[pos + 1] + "'" + word[pos + 2:]
	    # 'remove', or a shift that has no room to move.
	    return word[:pos] + word[pos + 1:]


	def delete_random_letter(word: str) -> str:
	    """Drop one randomly chosen character, mimicking a missed keystroke.

	    Words shorter than two characters are returned untouched so the
	    result is never empty.
	    """
	    if len(word) >= 2:
	        dropped = random.randint(0, len(word) - 1)
	        return ''.join(char for pos, char in enumerate(word) if pos != dropped)
	    return word


	def duplicate_random_letter(word: str) -> str:
	    """Double one randomly chosen character, mimicking a repeated keypress.

	    An empty string is returned as-is.
	    """
	    if word:
	        pos = random.randint(0, len(word) - 1)
	        chars = list(word)
	        # Inserting a copy next to the chosen character doubles it in place.
	        chars.insert(pos, chars[pos])
	        return ''.join(chars)
	    return word


	def insert_random_letter(word: str) -> str:
	    """Add one random lowercase ASCII letter at a random position.

	    Any position is possible, including before the first and after the
	    last character, so an empty input yields a single letter.
	    """
	    position = random.randint(0, len(word))
	    extra = random.choice(string.ascii_lowercase)
	    return ''.join((word[:position], extra, word[position:]))


	def replace_with_adjacent_key(word: str) -> str:
	    """
	    Replace one letter with a neighbouring key from a QWERTY layout.

	    Only characters whose lowercase form has an entry in the neighbour
	    table are candidates; if there are none the word is returned
	    unchanged. The replacement is upper-cased when the replaced character
	    was uppercase.
	    """
	    # Neighbour table keyed by lowercase letter. Each value lists the
	    # possible replacements; their order is significant because the
	    # replacement is drawn by position.
	    neighbours = {
	        'q': 'wa', 'w': 'qes', 'e': 'wrd', 'r': 'etf', 't': 'ryg',
	        'y': 'tuh', 'u': 'yij', 'i': 'uok', 'o': 'ipl', 'p': 'o',
	        'a': 'qsz', 's': 'adwx', 'd': 'sfec', 'f': 'dgrv', 'g': 'fhtb',
	        'h': 'gjyn', 'j': 'hkum', 'k': 'jli', 'l': 'ko',
	        'z': 'ax', 'x': 'zcs', 'c': 'xvd', 'v': 'cbf', 'b': 'vng',
	        'n': 'bmh', 'm': 'nj',
	    }

	    candidates = [pos for pos, char in enumerate(word) if char.lower() in neighbours]
	    if candidates:
	        pos = random.choice(candidates)
	        typed = word[pos]
	        wrong = random.choice(neighbours[typed.lower()])
	        # Keep the case of the character being replaced.
	        wrong = wrong.upper() if typed.isupper() else wrong
	        return word[:pos] + wrong + word[pos + 1:]
	    return word


	def swap_adjacent_letters(word: str) -> str:
	    """Exchange two neighbouring characters, mimicking a transposition slip.

	    Words shorter than two characters have nothing to swap and are
	    returned unchanged.
	    """
	    if len(word) >= 2:
	        left = random.randint(0, len(word) - 2)
	        return word[:left] + word[left + 1] + word[left] + word[left + 2:]
	    return word


	def switch_ie_ei(word: str) -> str:
	    """
	    Flip one 'ie' into 'ei', or failing that one 'ei' into 'ie'.

	    'ie' takes priority: 'ei' is only considered when the word contains
	    no 'ie' at all. When the chosen pair occurs several times, a single
	    occurrence is picked at random. Matching is case-sensitive, and a
	    word containing neither pair is returned unchanged.
	    """
	    for pair, flipped in (('ie', 'ei'), ('ei', 'ie')):
	        # Start offsets of every occurrence of the pair, left to right.
	        hits = [pos for pos in range(len(word)) if word.startswith(pair, pos)]
	        if hits:
	            pick = random.choice(hits)
	            return word[:pick] + flipped + word[pick + 2:]
	    return word


	def generate_typo(word: str) -> str:
	    """
	    Return a plausibly mistyped version of *word*.

	    An empty input is returned as-is. If the word is a key of
	    ``common_misspelled_words``, there is a 50% chance that one of its
	    listed misspellings is returned directly. Otherwise (or on the other
	    50%) one of these random edits is applied, and with 10% probability a
	    second one is applied on top of the first:
	    - apostrophe error (drop, shift, double or wrongly insert one)
	    - deletion of a letter
	    - duplication of a letter
	    - insertion of a random letter
	    - replacement with a neighbouring key (QWERTY)
	    - adjacent letter transposition
	    - switching 'ie' and 'ei' sequences

	    Some edits do nothing for words they do not apply to (for example the
	    'ie'/'ei' switch on a word containing neither), so the result can
	    occasionally equal the input.
	    """
	    if not word:
	        return word

	    # Prefer a documented real-world misspelling half of the time.
	    known = common_misspelled_words.get(word)
	    if known is not None and random.random() < 0.5:
	        return random.choice(known)

	    mutators = (
	        apostrophe_error,
	        delete_random_letter,
	        duplicate_random_letter,
	        insert_random_letter,
	        replace_with_adjacent_key,
	        swap_adjacent_letters,
	        switch_ie_ei,
	    )

	    typo = random.choice(mutators)(word)

	    # Occasionally stack a second edit (10% chance) for extra variety.
	    if random.random() < 0.1:
	        typo = random.choice(mutators)(typo)

	    return typo


	# Demo: when run as a script, print one generated typo per sample word.
	if __name__ == '__main__':
	    samples = (
	        "accommodate", "definitely", "receive", "mischievous", "calendar",
	        "equipment", "pronunciation", "consensus", "friend", "beautiful",
	        "doesn't", "books",
	    )
	    for sample in samples:
	        print(f"Original: {sample:15s} -> Typo: {generate_typo(sample)}")