veryfansome's picture
Initial upload
360e354 verified
import random
import string
# Lookup table of frequently misspelled English words.
#
# Each key is a correctly spelled word; the value lists misspellings that are
# commonly produced for it.  Some "misspellings" are themselves valid words
# that are often confused with the key (e.g. "affect" -> "effect",
# "than" -> "then", "their" -> "there").  Keys are single tokens and are
# case-sensitive, so proper nouns keep their capital letter.
common_misspelled_words = {
    "absence": ["absense", "absentse", "abcense", "absance"],
    "acceptable": ["acceptible"],
    "accidentally": ["accidently", "accidentaly"],
    "accommodate": ["accomodate", "acommodate"],
    "achieve": ["acheive"],
    "acknowledge": ["acknowlege", "aknowledge"],
    "acquaintance": ["acquaintence", "aquaintance"],
    "acquire": ["aquire", "adquire"],
    "acquit": ["aquit"],
    "acreage": ["acrage", "acerage"],
    "address": ["adress"],
    "adultery": ["adultary"],
    "advisable": ["adviseable", "advizable"],
    "affect": ["effect"],
    "aggression": ["agression"],
    "aggressive": ["agressive"],
    "allegiance": ["allegaince", "allegience", "alegiance"],
    "almost": ["allmost"],
    # "a lot" -> "alot", "allot" is not captured since "a lot" is two tokens.
    "amateur": ["amatuer", "amature"],
    "annually": ["anually", "annualy"],
    "apparent": ["apparant", "aparent", "apparrent", "aparrent"],
    "arctic": ["artic"],
    "argument": ["arguement"],
    "atheist": ["athiest", "athist"],
    "awful": ["awfull", "aweful"],
    "because": ["becuase", "becasue"],
    "beautiful": ["beatiful"],
    "becoming": ["becomeing"],
    "beginning": ["begining"],
    "believe": ["beleive"],
    "bellwether": ["bellweather"],
    "benefit": ["benifit"],
    "buoy": ["bouy"],
    "buoyant": ["bouyant"],
    "business": ["buisness"],
    "calendar": ["calender"],
    "camouflage": ["camoflage", "camoflague"],
    "capitol": ["capital"],
    "Caribbean": ["Carribean"],  # More names?
    "category": ["catagory"],
    "caught": ["cauhgt", "caugt"],
    "cemetery": ["cemetary", "cematery"],
    "changeable": ["changable"],
    "chief": ["cheif"],
    "colleague": ["collaegue", "collegue"],
    "column": ["colum"],
    "coming": ["comming"],
    "committed": ["commited", "comitted"],
    "comparison": ["comparsion"],
    "concede": ["conceed"],
    "congratulate": ["congradulate"],
    "conscientious": ["consciencious"],
    "conscious": ["concious", "consious"],
    "consensus": ["concensus"],
    "controversy": ["contraversy"],
    "coolly": ["cooly"],
    "daiquiri": ["dacquiri", "daquiri"],
    "deceive": ["decieve"],
    "definite": ["definate", "definit"],
    "definitely": ["definitly", "definately", "definatly", "defiantly"],
    "desperate": ["desparate"],
    "difference": ["diffrence"],
    "dilemma": ["dilema"],
    "disappoint": ["dissapoint"],
    "disastrous": ["disasterous"],
    "drunkenness": ["drunkeness"],
    "dumbbell": ["dumbell"],
    "embarrass": ["embarass"],
    "equipment": ["equiptment"],
    "exceed": ["excede"],
    "exhilarate": ["exilerate"],
    "existence": ["existance"],
    "experience": ["experiance"],
    "extreme": ["extreem"],
    "fascinating": ["facinating"],
    "fiery": ["firey"],
    "fluorescent": ["flourescent"],
    "foreign": ["foriegn"],
    "forty": ["fourty"],
    "friend": ["freind"],
    "fulfil": ["fullfil", "fulfill"],
    "gauge": ["guage"],
    "grateful": ["gratefull", "greatful"],
    "great": ["grate", "grat"],
    "guarantee": ["garantee", "garentee", "garanty"],
    "guidance": ["guidence"],
    "harass": ["harrass"],
    "height": ["heighth", "heigth"],
    "hierarchy": ["heirarchy"],
    # "hors d'oeuvres" -> "hors derves", "ordeurves" is not captured since
    # "hors d'oeuvres" is two tokens.
    "humorous": ["humerous"],
    "hygiene": ["hygene", "hygine", "hiygeine", "higeine", "hygeine"],
    "hypocrite": ["hipocrit"],
    "ignorance": ["ignorence"],
    "imitate": ["immitate"],
    "immediately": ["imediately"],
    "indict": ["indite"],
    "independent": ["independant"],
    "indispensable": ["indispensible"],
    "inoculate": ["innoculate"],
    "intelligence": ["inteligence", "intelligance"],
    "jewelry": ["jewellery", "jewelery"],
    "judgment": ["judgement"],
    "kernel": ["kernal"],
    "leisure": ["liesure"],
    "liaison": ["liason"],
    "library": ["libary", "liberry"],
    "license": ["lisence", "licence"],
    "lightning": ["lightening"],
    "lose": ["loose"],
    "maintenance": ["maintainance", "maintnance"],
    "marshmallow": ["marshmellow"],
    "medieval": ["medeval", "medevil", "mideval"],
    "memento": ["momento"],
    "millennium": ["millenium", "milennium"],
    "miniature": ["miniture"],
    "minuscule": ["miniscule"],
    "mischievous": ["mischievious", "mischevous", "mischevious"],
    "misspell": ["mispell", "misspel"],
    "necessary": ["neccessary", "necessery"],
    "niece": ["neice"],
    "neighbour": ["nieghbor"],
    "noticeable": ["noticable"],
    "occasion": ["occassion"],
    "occasionally": ["occasionaly", "occassionally"],
    "occurrence": ["occurrance", "occurence"],
    "occurred": ["occured"],
    "omission": ["ommision", "omision"],
    "original": ["orignal"],
    "outrageous": ["outragous"],
    "parliament": ["parliment"],
    "pastime": ["passtime", "pasttime"],
    "pedagogue": ["pedagoge"],
    "perceive": ["percieve"],
    "perseverance": ["perseverence"],
    "personnel": ["personell", "personel"],
    "plagiarize": ["plagerize"],
    "playwright": ["playright", "playwrite"],
    "possession": ["posession", "possesion"],
    "potatoes": ["potatos"],
    "precede": ["preceed"],
    "presence": ["presance"],
    "principle": ["principal"],
    "privilege": ["privelege", "priviledge"],
    "professor": ["professer"],
    "protester": ["protestor"],
    "promise": ["promiss"],
    "pronunciation": ["pronounciation"],
    "proof": ["prufe"],
    "prophecy": ["prophesy"],
    "publicly": ["publically"],
    "quarantine": ["quarentine"],
    "queue": ["que"],
    "questionnaire": ["questionaire", "questionnair"],
    "readable": ["readible"],
    "really": ["realy"],
    "receive": ["recieve"],
    "receipt": ["reciept"],
    "recommend": ["recomend", "reccommend"],
    "referred": ["refered"],
    "reference": ["referance", "refrence"],
    "relevant": ["relevent", "revelant"],
    "religious": ["religous", "religius"],
    "repetition": ["repitition"],
    "restaurant": ["restarant", "restaraunt"],
    "rhyme": ["rime"],
    "rhythm": ["rythm", "rythem"],
    "secretary": ["secratary", "secretery"],
    "seize": ["sieze"],
    "separate": ["seperate"],
    "sergeant": ["sargent"],
    "similar": ["similer"],
    "skilful": ["skilfull", "skillful"],
    "speech": ["speach", "speeche"],
    "successful": ["succesful", "successfull", "sucessful"],
    "supersede": ["supercede"],
    "surprise": ["suprise", "surprize"],
    "than": ["then"],
    "their": ["there", "they're"],
    "tomatoes": ["tomatos"],
    "tomorrow": ["tommorow", "tommorrow"],
    "Tucson": ["Tuscon"],
    "twelfth": ["twelth"],
    "tyranny": ["tyrany"],
    "underrate": ["underate"],
    "until": ["untill"],
    "upholstery": ["upholstry"],
    "usable": ["useable", "usible"],
    "vacuum": ["vaccuum", "vaccum", "vacume"],
    "vehicle": ["vehical"],
    "vicious": ["visious"],
    "what": ["wat"],
    "weather": ["wether", "whether"],
    "weird": ["wierd"],
    "welfare": ["wellfare", "welfair"],
    "whether": ["wether"],
    "wilful": ["wilfull", "willful"],
    "withhold": ["withold"],
    "writing": ["writting", "writeing"],
    "you're": ["your"],
    "your": ["you're"],
}
def apostrophe_error(word: str) -> str:
    """
    Return ``word`` with a typical apostrophe mistake applied.

    When the word contains an apostrophe, one occurrence is picked at random
    and then either dropped, moved one character to the left, moved one
    character to the right, or doubled.  A move that would push the
    apostrophe past either end of the word drops it instead.

    When the word has no apostrophe but ends with 's', an apostrophe is
    inserted before the final letter half of the time to mimic a mistaken
    possessive.  Any other word is returned unchanged.
    """
    if "'" not in word:
        # The random draw only happens for words that actually end in 's'.
        if word.endswith("s") and random.random() < 0.5:
            return word[:-1] + "'" + word[-1]
        return word

    # Pick which apostrophe to damage first, then which mistake to make.
    positions = [pos for pos, char in enumerate(word) if char == "'"]
    pos = random.choice(positions)
    mistake = random.choice(['remove', 'shift_left', 'shift_right', 'duplicate'])

    before, after = word[:pos], word[pos + 1:]
    if mistake == 'duplicate':
        return before + "''" + after
    if mistake == 'shift_left' and before:
        # Swap the apostrophe with the character preceding it.
        return before[:-1] + "'" + before[-1] + after
    if mistake == 'shift_right' and after:
        # Swap the apostrophe with the character following it.
        return before + after[0] + "'" + after[1:]
    # Plain removal, or a shift that has no room to move.
    return before + after
def delete_random_letter(word: str) -> str:
    """
    Simulate an omission error by dropping one randomly chosen character.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) < 2:
        return word
    chars = list(word)
    del chars[random.randint(0, len(chars) - 1)]
    return ''.join(chars)
def duplicate_random_letter(word: str) -> str:
    """
    Simulate an extra keypress by doubling the character at a random index.

    An empty word is returned unchanged.
    """
    if word:
        pos = random.randint(0, len(word) - 1)
        return ''.join([word[:pos], word[pos] * 2, word[pos + 1:]])
    return word
def insert_random_letter(word: str) -> str:
    """
    Simulate an insertion error by adding one random lowercase ASCII letter.

    The letter may land at any position, including before the first and
    after the last character, so the result is always one character longer.
    """
    # Draw the position before the letter.
    position = random.randint(0, len(word))
    extra = random.choice(string.ascii_lowercase)
    chars = list(word)
    chars.insert(position, extra)
    return ''.join(chars)
# Approximate QWERTY adjacency for lowercase letters.  Kept at module level so
# the table is built once instead of on every call.
_QWERTY_NEIGHBORS = {
    'q': ('w', 'a'),
    'w': ('q', 'e', 's'),
    'e': ('w', 'r', 'd'),
    'r': ('e', 't', 'f'),
    't': ('r', 'y', 'g'),
    'y': ('t', 'u', 'h'),
    'u': ('y', 'i', 'j'),
    'i': ('u', 'o', 'k'),
    'o': ('i', 'p', 'l'),
    'p': ('o',),
    'a': ('q', 's', 'z'),
    's': ('a', 'd', 'w', 'x'),
    'd': ('s', 'f', 'e', 'c'),
    'f': ('d', 'g', 'r', 'v'),
    'g': ('f', 'h', 't', 'b'),
    'h': ('g', 'j', 'y', 'n'),
    'j': ('h', 'k', 'u', 'm'),
    'k': ('j', 'l', 'i'),
    'l': ('k', 'o'),
    'z': ('a', 'x'),
    'x': ('z', 'c', 's'),
    'c': ('x', 'v', 'd'),
    'v': ('c', 'b', 'f'),
    'b': ('v', 'n', 'g'),
    'n': ('b', 'm', 'h'),
    'm': ('n', 'j'),
}


def replace_with_adjacent_key(word: str) -> str:
    """
    Simulate a typing error by replacing a letter with one of its QWERTY neighbors.

    One character whose lowercase form has an entry in the neighbor table is
    picked at random and swapped for a randomly chosen neighbor.  The case of
    the replaced character is preserved.  Words without any such character
    (e.g. digits or punctuation only, or the empty string) are returned
    unchanged.
    """
    # Only positions holding a letter with known neighbors can be replaced.
    candidates = [i for i, ch in enumerate(word) if ch.lower() in _QWERTY_NEIGHBORS]
    if not candidates:
        return word

    idx = random.choice(candidates)
    original = word[idx]
    replacement = random.choice(_QWERTY_NEIGHBORS[original.lower()])
    # The table is lowercase-only, so restore the original case afterwards.
    if original.isupper():
        replacement = replacement.upper()
    return word[:idx] + replacement + word[idx + 1:]
def swap_adjacent_letters(word: str) -> str:
    """
    Simulate a transposition error by exchanging two neighboring characters.

    The left character of the pair is chosen at random.  Words shorter than
    two characters are returned unchanged.
    """
    if len(word) < 2:
        return word
    left = random.randint(0, len(word) - 2)
    return word[:left] + word[left + 1] + word[left] + word[left + 2:]
def switch_ie_ei(word: str) -> str:
    """
    Flip one 'ie' into 'ei' (or one 'ei' into 'ie') to simulate a common
    vowel pair error.

    If the word contains 'ie', one of its occurrences is chosen at random and
    flipped; 'ei' is only considered when the word has no 'ie' at all.  The
    match is case-sensitive.  Words containing neither pair are returned
    unchanged.
    """
    # 'ie' is listed first so it wins when a word contains both pairs.
    for pattern, swapped in (('ie', 'ei'), ('ei', 'ie')):
        if pattern not in word:
            continue
        starts = [pos for pos in range(len(word) - 1) if word.startswith(pattern, pos)]
        chosen = random.choice(starts)
        return word[:chosen] + swapped + word[chosen + 2:]
    return word
def generate_typo(word: str) -> str:
    """
    Given an input word, return a version of it with a common typo.

    If the word is a key of ``common_misspelled_words``, one of its listed
    misspellings is returned half of the time.  Otherwise one of the
    following error types is applied:
    - apostrophe mistakes (removed, shifted, doubled or wrongly inserted)
    - deletion of a letter
    - duplication of a letter
    - insertion of a random letter
    - replacement with a neighboring key (QWERTY)
    - adjacent letter transposition
    - switching 'ie' and 'ei' sequences

    Several of these are no-ops for words they do not apply to (for example
    the 'ie'/'ei' switch on a word containing neither pair).  The
    transformations are therefore tried in random order until one actually
    changes the word, so a non-empty input is not handed back unmodified.
    With a 10% chance a second transformation is chained onto the result.

    While this method is by no means exhaustive, it reflects many of the
    typical errors documented.  An empty ``word`` is returned unchanged.
    """
    if not word:
        return word

    known_misspellings = common_misspelled_words.get(word)
    # 50% chance of selecting a common misspelling when one is known.
    if known_misspellings and random.random() < 0.5:
        return random.choice(known_misspellings)

    # List of available transformation functions
    transformations = [
        apostrophe_error,
        delete_random_letter,
        duplicate_random_letter,
        insert_random_letter,
        replace_with_adjacent_key,
        swap_adjacent_letters,
        switch_ie_ei,
    ]

    # Visit the transformations in random order and stop at the first one
    # that really alters the word; inapplicable ones would otherwise make
    # this function silently return its input.
    result = word
    for transformation in random.sample(transformations, len(transformations)):
        result = transformation(word)
        if result != word:
            break

    # Occasionally chain a second transformation (10% chance) for added variability
    if random.random() < 0.1:
        chained = random.choice(transformations)(result)
        # Discard a second step that merely undoes the first one.
        if chained != word:
            result = chained
    return result
# Demo: when run as a script, print one generated typo per sample word.
if __name__ == '__main__':
    test_words = [
        "accommodate",
        "definitely",
        "receive",
        "mischievous",
        "calendar",
        "equipment",
        "pronunciation",
        "consensus",
        "friend",
        "beautiful",
        "doesn't",
        "books",
    ]
    # map() is lazy, so each typo is generated right before its line is printed.
    for test_word, typo in zip(test_words, map(generate_typo, test_words)):
        print(f"Original: {test_word:15s} -> Typo: {typo}")