"""Generate realistic-looking typos and misspellings for single words.

The module offers a table of frequently misspelled English words plus a set
of small character-level transformations (transposition, omission,
duplication, insertion, adjacent-key substitution, ie/ei confusion and
apostrophe mistakes).  ``generate_typo`` is the main entry point.
"""

import random
import string

# Maps a correctly spelled word to misspellings (or commonly confused words)
# that people frequently write instead.  Keys are single tokens only.
common_misspelled_words = {
    "absence": ["absense", "absentse", "abcense", "absance"],
    "acceptable": ["acceptible"],
    "accidentally": ["accidently", "accidentaly"],
    "accommodate": ["accomodate", "acommodate"],
    "achieve": ["acheive"],
    "acknowledge": ["acknowlege", "aknowledge"],
    "acquaintance": ["acquaintence", "aquaintance"],
    "acquire": ["aquire", "adquire"],
    "acquit": ["aquit"],
    "acreage": ["acrage", "acerage"],
    "address": ["adress"],
    "adultery": ["adultary"],
    "advisable": ["adviseable", "advizable"],
    "affect": ["effect"],
    "aggression": ["agression"],
    "aggressive": ["agressive"],
    "allegiance": ["allegaince", "allegience", "alegiance"],
    "almost": ["allmost"],
    # "a lot": ["alot", "allot"]  # Not captured since "a lot" is two tokens.
    "amateur": ["amatuer", "amature"],
    "annually": ["anually", "annualy"],
    "apparent": ["apparant", "aparent", "apparrent", "aparrent"],
    "arctic": ["artic"],
    "argument": ["arguement"],
    "atheist": ["athiest", "athist"],
    "awful": ["awfull", "aweful"],
    "because": ["becuase", "becasue"],
    "beautiful": ["beatiful"],
    "becoming": ["becomeing"],
    "beginning": ["begining"],
    "believe": ["beleive"],
    "bellwether": ["bellweather"],
    "benefit": ["benifit"],
    "buoy": ["bouy"],
    "buoyant": ["bouyant"],
    "business": ["buisness"],
    "calendar": ["calender"],
    "camouflage": ["camoflage", "camoflague"],
    "capitol": ["capital"],
    "Caribbean": ["Carribean"],  # More names?
    "category": ["catagory"],
    "caught": ["cauhgt", "caugt"],
    "cemetery": ["cemetary", "cematery"],
    "changeable": ["changable"],
    "chief": ["cheif"],
    "colleague": ["collaegue", "collegue"],
    "column": ["colum"],
    "coming": ["comming"],
    "committed": ["commited", "comitted"],
    "comparison": ["comparsion"],
    "concede": ["conceed"],
    "congratulate": ["congradulate"],
    "conscientious": ["consciencious"],
    "conscious": ["concious", "consious"],
    "consensus": ["concensus"],
    "controversy": ["contraversy"],
    "coolly": ["cooly"],
    "daiquiri": ["dacquiri", "daquiri"],
    "deceive": ["decieve"],
    "definite": ["definate", "definit"],
    "definitely": ["definitly", "definately", "definatly", "defiantly"],
    "desperate": ["desparate"],
    "difference": ["diffrence"],
    "dilemma": ["dilema"],
    "disappoint": ["dissapoint"],
    "disastrous": ["disasterous"],
    "drunkenness": ["drunkeness"],
    "dumbbell": ["dumbell"],
    "embarrass": ["embarass"],
    "equipment": ["equiptment"],
    "exceed": ["excede"],
    "exhilarate": ["exilerate"],
    "existence": ["existance"],
    "experience": ["experiance"],
    "extreme": ["extreem"],
    "fascinating": ["facinating"],
    "fiery": ["firey"],
    "fluorescent": ["flourescent"],
    "foreign": ["foriegn"],
    "forty": ["fourty"],
    "friend": ["freind"],
    "fulfil": ["fullfil", "fulfill"],
    "gauge": ["guage"],
    "grateful": ["gratefull", "greatful"],
    "great": ["grate", "grat"],
    "guarantee": ["garantee", "garentee", "garanty"],
    "guidance": ["guidence"],
    "harass": ["harrass"],
    "height": ["heighth", "heigth"],
    "hierarchy": ["heirarchy"],
    # "hors d'oeuvres": ["hors derves", "ordeurves"]
    # Not captured since "hors d'oeuvres" is two tokens.
    "humorous": ["humerous"],
    "hygiene": ["hygene", "hygine", "hiygeine", "higeine", "hygeine"],
    "hypocrite": ["hipocrit"],
    "ignorance": ["ignorence"],
    "imitate": ["immitate"],
    "immediately": ["imediately"],
    "indict": ["indite"],
    "independent": ["independant"],
    "indispensable": ["indispensible"],
    "inoculate": ["innoculate"],
    "intelligence": ["inteligence", "intelligance"],
    "jewelry": ["jewellery", "jewelery"],
    "judgment": ["judgement"],
    "kernel": ["kernal"],
    "leisure": ["liesure"],
    "liaison": ["liason"],
    "library": ["libary", "liberry"],
    "license": ["lisence", "licence"],
    "lightning": ["lightening"],
    "lose": ["loose"],
    "maintenance": ["maintainance", "maintnance"],
    "marshmallow": ["marshmellow"],
    "medieval": ["medeval", "medevil", "mideval"],
    "memento": ["momento"],
    "millennium": ["millenium", "milennium"],
    "miniature": ["miniture"],
    "minuscule": ["miniscule"],
    "mischievous": ["mischievious", "mischevous", "mischevious"],
    "misspell": ["mispell", "misspel"],
    "necessary": ["neccessary", "necessery"],
    "niece": ["neice"],
    "neighbour": ["nieghbor"],
    "noticeable": ["noticable"],
    "occasion": ["occassion"],
    "occasionally": ["occasionaly", "occassionally"],
    "occurrence": ["occurrance", "occurence"],
    "occurred": ["occured"],
    "omission": ["ommision", "omision"],
    "original": ["orignal"],
    "outrageous": ["outragous"],
    "parliament": ["parliment"],
    "pastime": ["passtime", "pasttime"],
    "pedagogue": ["pedagoge"],
    "perceive": ["percieve"],
    "perseverance": ["perseverence"],
    "personnel": ["personell", "personel"],
    "plagiarize": ["plagerize"],
    "playwright": ["playright", "playwrite"],
    "possession": ["posession", "possesion"],
    "potatoes": ["potatos"],
    "precede": ["preceed"],
    "presence": ["presance"],
    "principle": ["principal"],
    "privilege": ["privelege", "priviledge"],
    "professor": ["professer"],
    "protester": ["protestor"],
    "promise": ["promiss"],
    "pronunciation": ["pronounciation"],
    "proof": ["prufe"],
    "prophecy": ["prophesy"],
    "publicly": ["publically"],
    "quarantine": ["quarentine"],
    "queue": ["que"],
    "questionnaire": ["questionaire", "questionnair"],
    "readable": ["readible"],
    "really": ["realy"],
    "receive": ["recieve"],
    "receipt": ["reciept"],
    "recommend": ["recomend", "reccommend"],
    "referred": ["refered"],
    "reference": ["referance", "refrence"],
    "relevant": ["relevent", "revelant"],
    "religious": ["religous", "religius"],
    "repetition": ["repitition"],
    "restaurant": ["restarant", "restaraunt"],
    "rhyme": ["rime"],
    "rhythm": ["rythm", "rythem"],
    "secretary": ["secratary", "secretery"],
    "seize": ["sieze"],
    "separate": ["seperate"],
    "sergeant": ["sargent"],
    "similar": ["similer"],
    "skilful": ["skilfull", "skillful"],
    "speech": ["speach", "speeche"],
    "successful": ["succesful", "successfull", "sucessful"],
    "supersede": ["supercede"],
    "surprise": ["suprise", "surprize"],
    "than": ["then"],
    "their": ["there", "they're"],
    "tomatoes": ["tomatos"],
    "tomorrow": ["tommorow", "tommorrow"],
    "Tucson": ["Tuscon"],
    "twelfth": ["twelth"],
    "tyranny": ["tyrany"],
    "underrate": ["underate"],
    "until": ["untill"],
    "upholstery": ["upholstry"],
    "usable": ["useable", "usible"],
    "vacuum": ["vaccuum", "vaccum", "vacume"],
    "vehicle": ["vehical"],
    "vicious": ["visious"],
    "what": ["wat"],
    "weather": ["wether", "whether"],
    "weird": ["wierd"],
    "welfare": ["wellfare", "welfair"],
    "whether": ["wether"],
    "wilful": ["wilfull", "willful"],
    "withhold": ["withold"],
    "writing": ["writting", "writeing"],
    "you're": ["your"],
    "your": ["you're"],
}

# Physically neighbouring keys on a QWERTY keyboard (lowercase letters only).
# Built once at import time instead of on every call.
_QWERTY_NEIGHBORS = {
    'q': ['w', 'a'],
    'w': ['q', 'e', 's'],
    'e': ['w', 'r', 'd'],
    'r': ['e', 't', 'f'],
    't': ['r', 'y', 'g'],
    'y': ['t', 'u', 'h'],
    'u': ['y', 'i', 'j'],
    'i': ['u', 'o', 'k'],
    'o': ['i', 'p', 'l'],
    'p': ['o'],
    'a': ['q', 's', 'z'],
    's': ['a', 'd', 'w', 'x'],
    'd': ['s', 'f', 'e', 'c'],
    'f': ['d', 'g', 'r', 'v'],
    'g': ['f', 'h', 't', 'b'],
    'h': ['g', 'j', 'y', 'n'],
    'j': ['h', 'k', 'u', 'm'],
    'k': ['j', 'l', 'i'],
    'l': ['k', 'o'],
    'z': ['a', 'x'],
    'x': ['z', 'c', 's'],
    'c': ['x', 'v', 'd'],
    'v': ['c', 'b', 'f'],
    'b': ['v', 'n', 'g'],
    'n': ['b', 'm', 'h'],
    'm': ['n', 'j'],
}


def apostrophe_error(word: str) -> str:
    """Simulate a common apostrophe mistake.

    If *word* contains an apostrophe, one occurrence is picked at random and
    is either removed, moved one position left, moved one position right, or
    duplicated.  A shift that would move the apostrophe past either end of
    the word removes it instead.

    If *word* has no apostrophe but ends with ``s``, an apostrophe is
    inserted before the final letter half of the time (a mistaken
    possessive).  Otherwise *word* is returned unchanged.
    """
    positions = [i for i, ch in enumerate(word) if ch == "'"]
    if not positions:
        if word.endswith("s") and random.random() < 0.5:
            return word[:-1] + "'" + word[-1]
        return word

    idx = random.choice(positions)
    error_choice = random.choice(
        ['remove', 'shift_left', 'shift_right', 'duplicate']
    )

    if error_choice == 'duplicate':
        return word[:idx + 1] + "'" + word[idx + 1:]
    if error_choice == 'shift_left' and idx > 0:
        # Swap the apostrophe with the character before it.
        return word[:idx - 1] + "'" + word[idx - 1] + word[idx + 1:]
    if error_choice == 'shift_right' and idx < len(word) - 1:
        # Swap the apostrophe with the character after it.
        return word[:idx] + word[idx + 1] + "'" + word[idx + 2:]
    # 'remove', or a shift that has no room to move: drop the apostrophe.
    return word[:idx] + word[idx + 1:]


def delete_random_letter(word: str) -> str:
    """Simulate an omission error by deleting one random character.

    Words shorter than two characters are returned unchanged so the result
    is never empty.
    """
    if len(word) < 2:
        return word
    idx = random.randint(0, len(word) - 1)
    return word[:idx] + word[idx + 1:]


def duplicate_random_letter(word: str) -> str:
    """Simulate an extra keypress by doubling one random character.

    An empty *word* is returned unchanged.
    """
    if not word:
        return word
    idx = random.randint(0, len(word) - 1)
    return word[:idx + 1] + word[idx] + word[idx + 1:]


def insert_random_letter(word: str) -> str:
    """Insert a random lowercase ASCII letter at a random position.

    The position may be before the first or after the last character, so
    this also works on an empty string.
    """
    idx = random.randint(0, len(word))
    letter = random.choice(string.ascii_lowercase)
    return word[:idx] + letter + word[idx:]


def replace_with_adjacent_key(word: str) -> str:
    """Replace one letter with a neighbouring key on a QWERTY keyboard.

    Only characters that have an entry in the neighbour table (ASCII
    letters, matched case-insensitively) are candidates; the case of the
    replaced character is preserved.  If *word* has no such character it is
    returned unchanged.
    """
    valid_indices = [
        i for i, ch in enumerate(word) if ch.lower() in _QWERTY_NEIGHBORS
    ]
    if not valid_indices:
        return word

    idx = random.choice(valid_indices)
    orig_char = word[idx]
    replacement = random.choice(_QWERTY_NEIGHBORS[orig_char.lower()])
    if orig_char.isupper():
        replacement = replacement.upper()
    return word[:idx] + replacement + word[idx + 1:]


def swap_adjacent_letters(word: str) -> str:
    """Simulate a transposition error by swapping two adjacent characters.

    Words shorter than two characters are returned unchanged.  Swapping two
    identical neighbours (e.g. the ``ll`` in ``hello``) leaves the word as
    it was.
    """
    if len(word) < 2:
        return word
    idx = random.randint(0, len(word) - 2)
    return word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]


def switch_ie_ei(word: str) -> str:
    """Swap one ``ie`` for ``ei`` or one ``ei`` for ``ie``.

    All lowercase occurrences of either pair are candidates and one is
    chosen at random, so a word containing both pairs can have either one
    flipped.  A word containing neither pair is returned unchanged.
    """
    positions = [
        i for i in range(len(word) - 1) if word[i:i + 2] in ('ie', 'ei')
    ]
    if not positions:
        return word
    idx = random.choice(positions)
    # Reversing the two-character pair turns 'ie' into 'ei' and vice versa.
    return word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]


# Character-level transformations that generate_typo chooses from.
_TRANSFORMATIONS = (
    apostrophe_error,
    delete_random_letter,
    duplicate_random_letter,
    insert_random_letter,
    replace_with_adjacent_key,
    swap_adjacent_letters,
    switch_ie_ei,
)


def _match_case(template: str, text: str) -> str:
    """Give *text* the capitalisation pattern of the non-empty *template*."""
    if template.isupper():
        return text.upper()
    if template[0].isupper():
        return text[0].upper() + text[1:]
    return text


def _common_misspellings_for(word: str) -> list:
    """Return known misspellings of *word*, or an empty list.

    An exact (case-sensitive) match wins.  Otherwise the lowercase form is
    looked up and the misspellings are re-capitalised to match *word*.
    """
    exact = common_misspelled_words.get(word)
    if exact:
        return exact
    lowered = word.lower()
    if lowered == word:
        return []
    return [
        _match_case(word, misspelling)
        for misspelling in common_misspelled_words.get(lowered, [])
    ]


def _apply_effective_transformation(word: str) -> str:
    """Apply one random transformation that actually changes *word*.

    Several transformations are no-ops on particular inputs (no ie/ei pair,
    no apostrophe, identical neighbours, ...).  They are tried in random
    order until one yields a different string.  Duplication and insertion
    always lengthen a non-empty word, so one of them is guaranteed to
    succeed for such input.
    """
    candidates = list(_TRANSFORMATIONS)
    random.shuffle(candidates)
    for transformation in candidates:
        result = transformation(word)
        if result != word:
            return result
    return word


def generate_typo(word: str) -> str:
    """Return a version of *word* containing a plausible typo.

    If *word* (compared exactly, then case-insensitively) is a key of
    ``common_misspelled_words``, there is a 50% chance that one of its
    listed misspellings is returned, re-capitalised to match the input when
    the match was case-insensitive.

    Otherwise one of the following transformations is applied at random:

    - apostrophe mistake (removed, shifted, duplicated or wrongly inserted)
    - adjacent letter transposition
    - deletion of a letter
    - duplication of a letter
    - insertion of a random letter
    - replacement with a neighbouring key (QWERTY)
    - switching 'ie' and 'ei' sequences

    With a 10% chance a second transformation is chained onto the first.

    For any non-empty *word* the result is guaranteed to differ from the
    input; an empty string is returned unchanged.  The method is by no means
    exhaustive, but reflects many typical, documented errors.
    """
    if not word:
        return word

    misspellings = _common_misspellings_for(word)
    if misspellings and random.random() < 0.5:
        return random.choice(misspellings)

    result = _apply_effective_transformation(word)

    # Occasionally chain a second transformation for added variability.
    if random.random() < 0.1:
        chained = _apply_effective_transformation(result)
        # Discard the second step if it undid the first (e.g. swap + swap).
        if chained != word:
            result = chained
    return result


# Example usage:
if __name__ == '__main__':
    test_words = [
        "accommodate", "definitely", "receive", "mischievous", "calendar",
        "equipment", "pronunciation", "consensus", "friend", "beautiful",
        "doesn't", "books",
    ]
    for test_word in test_words:
        typo = generate_typo(test_word)
        print(f"Original: {test_word:15s} -> Typo: {typo}")