| import random |
| import string |
|
|
# Lookup table of well-known spelling mistakes.
#
# Each key is a correctly spelled word; its value is the list of wrong forms
# that may be substituted for it.  Most values are genuine misspellings, but
# some are real words that are commonly confused with the key
# ("affect" -> "effect", "lose" -> "loose") or regional spelling variants
# ("judgment" -> "judgement", "license" -> "licence").
#
# generate_typo() tests membership with the word exactly as given, so the
# match is case-sensitive: capitalized keys such as "Caribbean" only match
# capitalized input.
common_misspelled_words = {
    "absence": ["absense", "absentse", "abcense", "absance"],
    "acceptable": ["acceptible"],
    "accidentally": ["accidently", "accidentaly"],
    "accommodate": ["accomodate", "acommodate"],
    "achieve": ["acheive"],
    "acknowledge": ["acknowlege", "aknowledge"],
    "acquaintance": ["acquaintence", "aquaintance"],
    "acquire": ["aquire", "adquire"],
    "acquit": ["aquit"],
    "acreage": ["acrage", "acerage"],
    "address": ["adress"],
    "adultery": ["adultary"],
    "advisable": ["adviseable", "advizable"],
    "affect": ["effect"],
    "aggression": ["agression"],
    "aggressive": ["agressive"],
    "allegiance": ["allegaince", "allegience", "alegiance"],
    "almost": ["allmost"],
    "amateur": ["amatuer", "amature"],
    "annually": ["anually", "annualy"],
    "apparent": ["apparant", "aparent", "apparrent", "aparrent"],
    "arctic": ["artic"],
    "argument": ["arguement"],
    "atheist": ["athiest", "athist"],
    "awful": ["awfull", "aweful"],
    "because": ["becuase", "becasue"],
    "beautiful": ["beatiful"],
    "becoming": ["becomeing"],
    "beginning": ["begining"],
    "believe": ["beleive"],
    "bellwether": ["bellweather"],
    "benefit": ["benifit"],
    "buoy": ["bouy"],
    "buoyant": ["bouyant"],
    "business": ["buisness"],
    "calendar": ["calender"],
    "camouflage": ["camoflage", "camoflague"],
    "capitol": ["capital"],
    "Caribbean": ["Carribean"],
    "category": ["catagory"],
    "caught": ["cauhgt", "caugt"],
    "cemetery": ["cemetary", "cematery"],
    "changeable": ["changable"],
    "chief": ["cheif"],
    "colleague": ["collaegue", "collegue"],
    "column": ["colum"],
    "coming": ["comming"],
    "committed": ["commited", "comitted"],
    "comparison": ["comparsion"],
    "concede": ["conceed"],
    "congratulate": ["congradulate"],
    "conscientious": ["consciencious"],
    "conscious": ["concious", "consious"],
    "consensus": ["concensus"],
    "controversy": ["contraversy"],
    "coolly": ["cooly"],
    "daiquiri": ["dacquiri", "daquiri"],
    "deceive": ["decieve"],
    "definite": ["definate", "definit"],
    "definitely": ["definitly", "definately", "definatly", "defiantly"],
    "desperate": ["desparate"],
    "difference": ["diffrence"],
    "dilemma": ["dilema"],
    "disappoint": ["dissapoint"],
    "disastrous": ["disasterous"],
    "drunkenness": ["drunkeness"],
    "dumbbell": ["dumbell"],
    "embarrass": ["embarass"],
    "equipment": ["equiptment"],
    "exceed": ["excede"],
    "exhilarate": ["exilerate"],
    "existence": ["existance"],
    "experience": ["experiance"],
    "extreme": ["extreem"],
    "fascinating": ["facinating"],
    "fiery": ["firey"],
    "fluorescent": ["flourescent"],
    "foreign": ["foriegn"],
    "forty": ["fourty"],
    "friend": ["freind"],
    "fulfil": ["fullfil", "fulfill"],
    "gauge": ["guage"],
    "grateful": ["gratefull", "greatful"],
    "great": ["grate", "grat"],
    "guarantee": ["garantee", "garentee", "garanty"],
    "guidance": ["guidence"],
    "harass": ["harrass"],
    "height": ["heighth", "heigth"],
    "hierarchy": ["heirarchy"],
    "humorous": ["humerous"],
    "hygiene": ["hygene", "hygine", "hiygeine", "higeine", "hygeine"],
    "hypocrite": ["hipocrit"],
    "ignorance": ["ignorence"],
    "imitate": ["immitate"],
    "immediately": ["imediately"],
    "indict": ["indite"],
    "independent": ["independant"],
    "indispensable": ["indispensible"],
    "inoculate": ["innoculate"],
    "intelligence": ["inteligence", "intelligance"],
    "jewelry": ["jewellery", "jewelery"],
    "judgment": ["judgement"],
    "kernel": ["kernal"],
    "leisure": ["liesure"],
    "liaison": ["liason"],
    "library": ["libary", "liberry"],
    "license": ["lisence", "licence"],
    "lightning": ["lightening"],
    "lose": ["loose"],
    "maintenance": ["maintainance", "maintnance"],
    "marshmallow": ["marshmellow"],
    "medieval": ["medeval", "medevil", "mideval"],
    "memento": ["momento"],
    "millennium": ["millenium", "milennium"],
    "miniature": ["miniture"],
    "minuscule": ["miniscule"],
    "mischievous": ["mischievious", "mischevous", "mischevious"],
    "misspell": ["mispell", "misspel"],
    "necessary": ["neccessary", "necessery"],
    "niece": ["neice"],
    "neighbour": ["nieghbor"],
    "noticeable": ["noticable"],
    "occasion": ["occassion"],
    "occasionally": ["occasionaly", "occassionally"],
    "occurrence": ["occurrance", "occurence"],
    "occurred": ["occured"],
    "omission": ["ommision", "omision"],
    "original": ["orignal"],
    "outrageous": ["outragous"],
    "parliament": ["parliment"],
    "pastime": ["passtime", "pasttime"],
    "pedagogue": ["pedagoge"],
    "perceive": ["percieve"],
    "perseverance": ["perseverence"],
    "personnel": ["personell", "personel"],
    "plagiarize": ["plagerize"],
    "playwright": ["playright", "playwrite"],
    "possession": ["posession", "possesion"],
    "potatoes": ["potatos"],
    "precede": ["preceed"],
    "presence": ["presance"],
    "principle": ["principal"],
    "privilege": ["privelege", "priviledge"],
    "professor": ["professer"],
    "protester": ["protestor"],
    "promise": ["promiss"],
    "pronunciation": ["pronounciation"],
    "proof": ["prufe"],
    "prophecy": ["prophesy"],
    "publicly": ["publically"],
    "quarantine": ["quarentine"],
    "queue": ["que"],
    "questionnaire": ["questionaire", "questionnair"],
    "readable": ["readible"],
    "really": ["realy"],
    "receive": ["recieve"],
    "receipt": ["reciept"],
    "recommend": ["recomend", "reccommend"],
    "referred": ["refered"],
    "reference": ["referance", "refrence"],
    "relevant": ["relevent", "revelant"],
    "religious": ["religous", "religius"],
    "repetition": ["repitition"],
    "restaurant": ["restarant", "restaraunt"],
    "rhyme": ["rime"],
    "rhythm": ["rythm", "rythem"],
    "secretary": ["secratary", "secretery"],
    "seize": ["sieze"],
    "separate": ["seperate"],
    "sergeant": ["sargent"],
    "similar": ["similer"],
    "skilful": ["skilfull", "skillful"],
    "speech": ["speach", "speeche"],
    "successful": ["succesful", "successfull", "sucessful"],
    "supersede": ["supercede"],
    "surprise": ["suprise", "surprize"],
    "than": ["then"],
    "their": ["there", "they're"],
    "tomatoes": ["tomatos"],
    "tomorrow": ["tommorow", "tommorrow"],
    "Tucson": ["Tuscon"],
    "twelfth": ["twelth"],
    "tyranny": ["tyrany"],
    "underrate": ["underate"],
    "until": ["untill"],
    "upholstery": ["upholstry"],
    "usable": ["useable", "usible"],
    "vacuum": ["vaccuum", "vaccum", "vacume"],
    "vehicle": ["vehical"],
    "vicious": ["visious"],
    "what": ["wat"],
    "weather": ["wether", "whether"],
    "weird": ["wierd"],
    "welfare": ["wellfare", "welfair"],
    "whether": ["wether"],
    "wilful": ["wilfull", "willful"],
    "withhold": ["withold"],
    "writing": ["writting", "writeing"],
    "you're": ["your"],
    "your": ["you're"],
}
|
|
|
|
def apostrophe_error(word: str) -> str:
    """
    Return *word* with a typical apostrophe mistake applied.

    When the word has at least one apostrophe, one of them is picked at
    random and one of four mistakes is applied to it:
      - remove: the apostrophe is dropped,
      - shift_left: it trades places with the character before it,
      - shift_right: it trades places with the character after it,
      - duplicate: it is typed twice.
    A shift that would move the apostrophe past either end of the word
    falls back to dropping it.

    When the word has no apostrophe but ends in 's', there is a 50% chance
    an apostrophe is inserted before that final 's' (a mistaken possessive).
    Any other word is returned unchanged.
    """
    positions = [pos for pos, char in enumerate(word) if char == "'"]

    if not positions:
        # No apostrophe to damage; possibly invent a bogus possessive instead.
        if word.endswith("s") and random.random() < 0.5:
            return word[:-1] + "'" + word[-1]
        return word

    pos = random.choice(positions)
    mistake = random.choice(['remove', 'shift_left', 'shift_right', 'duplicate'])

    # Text on either side of the chosen apostrophe.
    before, after = word[:pos], word[pos + 1:]

    if mistake == 'duplicate':
        return before + "''" + after
    if mistake == 'shift_left' and pos > 0:
        return before[:-1] + "'" + before[-1] + after
    if mistake == 'shift_right' and pos < len(word) - 1:
        return before + after[0] + "'" + after[1:]

    # Plain removal, and the fallback for shifts that have nowhere to go.
    return before + after
|
|
|
|
def delete_random_letter(word: str) -> str:
    """
    Drop one randomly chosen character from *word* (an omission error).

    Words shorter than two characters are returned untouched, so the
    result is never emptied by this function.
    """
    if len(word) >= 2:
        chars = list(word)
        del chars[random.randint(0, len(chars) - 1)]
        return ''.join(chars)
    return word
|
|
|
|
def duplicate_random_letter(word: str) -> str:
    """
    Double one randomly chosen character of *word* (an extra keypress).

    An empty string has nothing to double and is returned as is.
    """
    if word:
        chars = list(word)
        pos = random.randint(0, len(chars) - 1)
        # Inserting a copy next to the chosen character doubles it in place.
        chars.insert(pos, chars[pos])
        return ''.join(chars)
    return word
|
|
|
|
def insert_random_letter(word: str) -> str:
    """
    Add one random lowercase ASCII letter to *word* (an insertion error).

    The letter may land anywhere, including before the first or after the
    last character, so an empty input yields a single letter.
    """
    pos = random.randint(0, len(word))
    extra = random.choice(string.ascii_lowercase)
    return ''.join((word[:pos], extra, word[pos:]))
|
|
|
|
def replace_with_adjacent_key(word: str) -> str:
    """
    Swap one letter of *word* for a key that sits next to it on a QWERTY
    keyboard (a fat-finger error).

    Only characters whose lowercase form appears in the neighbor table are
    eligible; if none is, the word comes back unchanged.  An uppercase
    original gets an uppercase replacement.
    """
    # key -> its neighboring keys, one character per neighbor.
    neighbors_of = {
        'q': 'wa', 'w': 'qes', 'e': 'wrd', 'r': 'etf', 't': 'ryg',
        'y': 'tuh', 'u': 'yij', 'i': 'uok', 'o': 'ipl', 'p': 'o',
        'a': 'qsz', 's': 'adwx', 'd': 'sfec', 'f': 'dgrv', 'g': 'fhtb',
        'h': 'gjyn', 'j': 'hkum', 'k': 'jli', 'l': 'ko',
        'z': 'ax', 'x': 'zcs', 'c': 'xvd', 'v': 'cbf', 'b': 'vng',
        'n': 'bmh', 'm': 'nj',
    }

    candidates = [pos for pos, char in enumerate(word) if char.lower() in neighbors_of]
    if candidates:
        pos = random.choice(candidates)
        typed = word[pos]
        slipped = random.choice(neighbors_of[typed.lower()])
        # Keep the case of the character being replaced.
        if typed.isupper():
            slipped = slipped.upper()
        return word[:pos] + slipped + word[pos + 1:]
    return word
|
|
|
|
def swap_adjacent_letters(word: str) -> str:
    """
    Exchange two neighboring characters of *word* (a transposition error).

    The pair is chosen at random; words with fewer than two characters
    have no pair to exchange and are returned unchanged.
    """
    if len(word) < 2:
        return word
    pos = random.randint(0, len(word) - 2)
    return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
|
|
|
|
def switch_ie_ei(word: str) -> str:
    """
    Flip one 'ie' to 'ei', or one 'ei' to 'ie', imitating the classic
    vowel-pair mix-up.

    'ie' takes precedence: if the word contains any 'ie', one of those
    occurrences is flipped at random and 'ei' occurrences are left alone.
    Only when there is no 'ie' is a random 'ei' flipped.  Matching is
    lowercase-only; a word with neither pair is returned unchanged.
    """
    for pair, flipped in (('ie', 'ei'), ('ei', 'ie')):
        if pair in word:
            # Every index at which the two-letter pair starts.
            starts = [pos for pos in range(len(word) - 1) if word.startswith(pair, pos)]
            pos = random.choice(starts)
            return word[:pos] + flipped + word[pos + 2:]
    return word
|
|
|
|
def generate_typo(word: str) -> str:
    """
    Return a version of *word* that contains a plausible typo.

    If the word is a key of ``common_misspelled_words`` (exact,
    case-sensitive match), half of the time one of its listed misspellings
    is returned directly.  Otherwise one of the following error types is
    applied:
      - apostrophe mistakes (dropped, shifted, doubled, or bogus possessive)
      - adjacent letter transposition
      - deletion of a letter
      - duplication of a letter
      - insertion of a random letter
      - replacement with a neighboring key (QWERTY)
      - switching 'ie' and 'ei' sequences
    and, 10% of the time, a second one on top of it.

    Several of these do nothing for words they do not apply to (no
    apostrophe, no 'ie'/'ei', ...).  They are therefore tried in random
    order until one actually alters the word, so a non-empty input always
    comes back changed.  An empty string is returned as is.

    While this method is by no means exhaustive, it reflects many of the
    typical errors documented.
    """
    if not word:
        return word

    # Prefer a documented real-world misspelling about half of the time.
    known_misspellings = common_misspelled_words.get(word)
    if known_misspellings and random.random() < 0.5:
        return random.choice(known_misspellings)

    transformations = [
        apostrophe_error,
        delete_random_letter,
        duplicate_random_letter,
        insert_random_letter,
        replace_with_adjacent_key,
        swap_adjacent_letters,
        switch_ie_ei,
    ]

    # Try the transformations in random order and keep the first one that
    # really changes the word.  Duplication and insertion always lengthen a
    # non-empty word, so this loop is guaranteed to find one.
    result = word
    for transformation in random.sample(transformations, k=len(transformations)):
        result = transformation(word)
        if result != word:
            break

    # Occasionally stack a second error on top of the first.
    if random.random() < 0.1:
        twice = random.choice(transformations)(result)
        # Discard the second pass if it merely undid the first (for example
        # swapping the same two letters back).
        if twice != word:
            result = twice

    return result
|
|
|
|
| |
if __name__ == '__main__':
    # Demo: print one generated typo for each sample word.  The samples
    # include dictionary words, a contraction and a plural ending in 's'.
    test_words = [
        "accommodate",
        "definitely",
        "receive",
        "mischievous",
        "calendar",
        "equipment",
        "pronunciation",
        "consensus",
        "friend",
        "beautiful",
        "doesn't",
        "books",
    ]
    # map() is lazy, so each typo is generated right before its line is printed.
    for test_word, typo in zip(test_words, map(generate_typo, test_words)):
        print(f"Original: {test_word:15s} -> Typo: {typo}")
|
|