| | import random |
| | import string |
| |
|
# Maps a correctly spelled word to the misspellings people commonly produce
# for it. Lookups are case-sensitive (note the capitalised proper nouns).
# Some "misspellings" are themselves valid words that are frequently confused
# with the key (e.g. "affect" -> "effect") or regional spelling variants
# (e.g. "license" -> "licence"); they are listed on purpose.
common_misspelled_words = {
    "absence": ["absense", "absentse", "abcense", "absance"],
    "acceptable": ["acceptible"],
    "accidentally": ["accidently", "accidentaly"],
    "accommodate": ["accomodate", "acommodate"],
    "achieve": ["acheive"],
    "acknowledge": ["acknowlege", "aknowledge"],
    "acquaintance": ["acquaintence", "aquaintance"],
    "acquire": ["aquire", "adquire"],
    "acquit": ["aquit"],
    "acreage": ["acrage", "acerage"],
    "address": ["adress"],
    "adultery": ["adultary"],
    "advisable": ["adviseable", "advizable"],
    "affect": ["effect"],
    "aggression": ["agression"],
    "aggressive": ["agressive"],
    "allegiance": ["allegaince", "allegience", "alegiance"],
    "almost": ["allmost"],

    "amateur": ["amatuer", "amature"],
    "annually": ["anually", "annualy"],
    "apparent": ["apparant", "aparent", "apparrent", "aparrent"],
    "arctic": ["artic"],
    "argument": ["arguement"],
    "atheist": ["athiest", "athist"],
    "awful": ["awfull", "aweful"],
    "because": ["becuase", "becasue"],
    "beautiful": ["beatiful"],
    "becoming": ["becomeing"],
    "beginning": ["begining"],
    "believe": ["beleive"],
    "bellwether": ["bellweather"],
    "benefit": ["benifit"],
    "buoy": ["bouy"],
    "buoyant": ["bouyant"],
    "business": ["buisness"],
    "calendar": ["calender"],
    "camouflage": ["camoflage", "camoflague"],
    "capitol": ["capital"],
    "Caribbean": ["Carribean"],
    "category": ["catagory"],
    "caught": ["cauhgt", "caugt"],
    "cemetery": ["cemetary", "cematery"],
    "changeable": ["changable"],
    "chief": ["cheif"],
    "colleague": ["collaegue", "collegue"],
    "column": ["colum"],
    "coming": ["comming"],
    "committed": ["commited", "comitted"],
    "comparison": ["comparsion"],
    "concede": ["conceed"],
    "congratulate": ["congradulate"],
    "conscientious": ["consciencious"],
    "conscious": ["concious", "consious"],
    "consensus": ["concensus"],
    "controversy": ["contraversy"],
    "coolly": ["cooly"],
    "daiquiri": ["dacquiri", "daquiri"],
    "deceive": ["decieve"],
    "definite": ["definate", "definit"],
    "definitely": ["definitly", "definately", "definatly", "defiantly"],
    "desperate": ["desparate"],
    "difference": ["diffrence"],
    "dilemma": ["dilema"],
    "disappoint": ["dissapoint"],
    "disastrous": ["disasterous"],
    "drunkenness": ["drunkeness"],
    "dumbbell": ["dumbell"],
    "embarrass": ["embarass"],
    "equipment": ["equiptment"],
    "exceed": ["excede"],
    "exhilarate": ["exilerate"],
    "existence": ["existance"],
    "experience": ["experiance"],
    "extreme": ["extreem"],
    "fascinating": ["facinating"],
    "fiery": ["firey"],
    "fluorescent": ["flourescent"],
    "foreign": ["foriegn"],
    "forty": ["fourty"],
    "friend": ["freind"],
    "fulfil": ["fullfil", "fulfill"],
    "gauge": ["guage"],
    "grateful": ["gratefull", "greatful"],
    "great": ["grate", "grat"],
    "guarantee": ["garantee", "garentee", "garanty"],
    "guidance": ["guidence"],
    "harass": ["harrass"],
    "height": ["heighth", "heigth"],
    "hierarchy": ["heirarchy"],

    "humorous": ["humerous"],
    "hygiene": ["hygene", "hygine", "hiygeine", "higeine", "hygeine"],
    "hypocrite": ["hipocrit"],
    "ignorance": ["ignorence"],
    "imitate": ["immitate"],
    "immediately": ["imediately"],
    "indict": ["indite"],
    "independent": ["independant"],
    "indispensable": ["indispensible"],
    "inoculate": ["innoculate"],
    "intelligence": ["inteligence", "intelligance"],
    "jewelry": ["jewellery", "jewelery"],
    "judgment": ["judgement"],
    "kernel": ["kernal"],
    "leisure": ["liesure"],
    "liaison": ["liason"],
    "library": ["libary", "liberry"],
    "license": ["lisence", "licence"],
    "lightning": ["lightening"],
    "lose": ["loose"],
    "maintenance": ["maintainance", "maintnance"],
    "marshmallow": ["marshmellow"],
    "medieval": ["medeval", "medevil", "mideval"],
    "memento": ["momento"],
    "millennium": ["millenium", "milennium"],
    "miniature": ["miniture"],
    "minuscule": ["miniscule"],
    "mischievous": ["mischievious", "mischevous", "mischevious"],
    "misspell": ["mispell", "misspel"],
    "necessary": ["neccessary", "necessery"],
    "niece": ["neice"],
    "neighbour": ["nieghbor"],
    "noticeable": ["noticable"],
    "occasion": ["occassion"],
    "occasionally": ["occasionaly", "occassionally"],
    "occurrence": ["occurrance", "occurence"],
    "occurred": ["occured"],
    "omission": ["ommision", "omision"],
    "original": ["orignal"],
    "outrageous": ["outragous"],
    "parliament": ["parliment"],
    "pastime": ["passtime", "pasttime"],
    "pedagogue": ["pedagoge"],
    "perceive": ["percieve"],
    "perseverance": ["perseverence"],
    "personnel": ["personell", "personel"],
    "plagiarize": ["plagerize"],
    "playwright": ["playright", "playwrite"],
    "possession": ["posession", "possesion"],
    "potatoes": ["potatos"],
    "precede": ["preceed"],
    "presence": ["presance"],
    "principle": ["principal"],
    "privilege": ["privelege", "priviledge"],
    "professor": ["professer"],
    "protester": ["protestor"],
    "promise": ["promiss"],
    "pronunciation": ["pronounciation"],
    "proof": ["prufe"],
    "prophecy": ["prophesy"],
    "publicly": ["publically"],
    "quarantine": ["quarentine"],
    "queue": ["que"],
    "questionnaire": ["questionaire", "questionnair"],
    "readable": ["readible"],
    "really": ["realy"],
    "receive": ["recieve"],
    "receipt": ["reciept"],
    "recommend": ["recomend", "reccommend"],
    "referred": ["refered"],
    "reference": ["referance", "refrence"],
    "relevant": ["relevent", "revelant"],
    "religious": ["religous", "religius"],
    "repetition": ["repitition"],
    "restaurant": ["restarant", "restaraunt"],
    "rhyme": ["rime"],
    "rhythm": ["rythm", "rythem"],
    "secretary": ["secratary", "secretery"],
    "seize": ["sieze"],
    "separate": ["seperate"],
    "sergeant": ["sargent"],
    "similar": ["similer"],
    "skilful": ["skilfull", "skillful"],
    "speech": ["speach", "speeche"],
    "successful": ["succesful", "successfull", "sucessful"],
    "supersede": ["supercede"],
    "surprise": ["suprise", "surprize"],
    "than": ["then"],
    "their": ["there", "they're"],
    "tomatoes": ["tomatos"],
    "tomorrow": ["tommorow", "tommorrow"],
    "Tucson": ["Tuscon"],
    "twelfth": ["twelth"],
    "tyranny": ["tyrany"],
    "underrate": ["underate"],
    "until": ["untill"],
    "upholstery": ["upholstry"],
    "usable": ["useable", "usible"],
    "vacuum": ["vaccuum", "vaccum", "vacume"],
    "vehicle": ["vehical"],
    "vicious": ["visious"],
    "what": ["wat"],
    "weather": ["wether", "whether"],
    "weird": ["wierd"],
    "welfare": ["wellfare", "welfair"],
    "whether": ["wether"],
    "wilful": ["wilfull", "willful"],
    "withhold": ["withold"],
    "writing": ["writting", "writeing"],
    "you're": ["your"],
    "your": ["you're"],
}
| |
|
| |
|
def apostrophe_error(word: str) -> str:
    """
    Introduce a typical apostrophe mistake into ``word``.

    When the word has at least one apostrophe, one of them is picked at
    random and one of four equally likely mistakes is applied to it:
    dropping it, swapping it with the character on its left, swapping it
    with the character on its right, or doubling it. A swap that would run
    off either end of the word degrades to dropping the apostrophe.

    When the word has no apostrophe but ends in 's', an apostrophe is placed
    in front of that final 's' half of the time (a bogus possessive).
    Any other word is returned untouched.
    """
    if "'" not in word:
        # The coin is only flipped for words that actually end in 's'.
        if word.endswith("s") and random.random() < 0.5:
            return word[:-1] + "'" + word[-1]
        return word

    positions = [pos for pos, char in enumerate(word) if char == "'"]
    pos = random.choice(positions)
    mistake = random.choice(['remove', 'shift_left', 'shift_right', 'duplicate'])

    # Text on either side of the chosen apostrophe.
    before = word[:pos]
    after = word[pos + 1:]

    if mistake == 'duplicate':
        return before + "'" + "'" + after
    if mistake == 'shift_left' and pos > 0:
        return before[:-1] + "'" + before[-1] + after
    if mistake == 'shift_right' and pos < len(word) - 1:
        return before + after[0] + "'" + after[1:]

    # 'remove', or a shift that had no neighbour to swap with.
    return before + after
| |
|
| |
|
def delete_random_letter(word: str) -> str:
    """Drop one randomly chosen character, mimicking a missed keystroke.

    Words shorter than two characters are returned as-is so the result is
    never emptied.
    """
    if len(word) >= 2:
        drop_at = random.randint(0, len(word) - 1)
        remaining = list(word)
        del remaining[drop_at]
        return ''.join(remaining)
    return word
| |
|
| |
|
def duplicate_random_letter(word: str) -> str:
    """Double the character at a random position, mimicking a key pressed twice.

    An empty word is returned unchanged.
    """
    if word:
        pos = random.randint(0, len(word) - 1)
        chars = list(word)
        # Inserting a copy right at ``pos`` leaves two identical neighbours.
        chars.insert(pos, word[pos])
        return ''.join(chars)
    return word
| |
|
| |
|
def insert_random_letter(word: str) -> str:
    """Insert one random lowercase ASCII letter at a random position.

    The position may be anywhere from before the first character to after
    the last one, so the result is always exactly one character longer.
    """
    position = random.randint(0, len(word))
    extra = random.choice(string.ascii_lowercase)
    return ''.join((word[:position], extra, word[position:]))
| |
|
| |
|
def replace_with_adjacent_key(word: str) -> str:
    """
    Replace one letter with a key that sits next to it on a QWERTY keyboard.

    Only characters whose lowercase form appears in the neighbour table are
    candidates; if the word has none, it is returned unchanged. The
    replacement is upper-cased when the character it replaces is uppercase.
    """
    # Neighbour lists, keyed by lowercase letter.
    neighbours_of = {
        'q': ['w', 'a'], 'w': ['q', 'e', 's'], 'e': ['w', 'r', 'd'],
        'r': ['e', 't', 'f'], 't': ['r', 'y', 'g'], 'y': ['t', 'u', 'h'],
        'u': ['y', 'i', 'j'], 'i': ['u', 'o', 'k'], 'o': ['i', 'p', 'l'],
        'p': ['o'],
        'a': ['q', 's', 'z'], 's': ['a', 'd', 'w', 'x'],
        'd': ['s', 'f', 'e', 'c'], 'f': ['d', 'g', 'r', 'v'],
        'g': ['f', 'h', 't', 'b'], 'h': ['g', 'j', 'y', 'n'],
        'j': ['h', 'k', 'u', 'm'], 'k': ['j', 'l', 'i'], 'l': ['k', 'o'],
        'z': ['a', 'x'], 'x': ['z', 'c', 's'], 'c': ['x', 'v', 'd'],
        'v': ['c', 'b', 'f'], 'b': ['v', 'n', 'g'], 'n': ['b', 'm', 'h'],
        'm': ['n', 'j'],
    }

    candidates = [
        pos for pos, char in enumerate(word) if char.lower() in neighbours_of
    ]
    if len(candidates) == 0:
        return word

    pos = random.choice(candidates)
    original = word[pos]
    substitute = random.choice(neighbours_of[original.lower()])
    # Keep the casing of the character being replaced.
    substitute = substitute.upper() if original.isupper() else substitute
    return word[:pos] + substitute + word[pos + 1:]
| |
|
| |
|
def swap_adjacent_letters(word: str) -> str:
    """Exchange two neighbouring characters at a random spot (a transposition).

    Words with fewer than two characters have nothing to swap and are
    returned unchanged.
    """
    if len(word) >= 2:
        left = random.randint(0, len(word) - 2)
        # Rebuild the word with the pair at ``left`` / ``left + 1`` reversed.
        return word[:left] + word[left + 1] + word[left] + word[left + 2:]
    return word
| |
|
| |
|
def switch_ie_ei(word: str) -> str:
    """
    Flip one 'ie' into 'ei', or failing that one 'ei' into 'ie'.

    'ie' takes priority: 'ei' is only considered when the word contains no
    'ie' at all. When the chosen pair occurs several times, one occurrence
    is picked at random. Matching is case-sensitive, and a word containing
    neither pair is returned unchanged.
    """
    for pair, flipped in (('ie', 'ei'), ('ei', 'ie')):
        if pair in word:
            # Every start position of the pair, in ascending order.
            starts = [
                pos for pos in range(len(word) - 1)
                if word.startswith(pair, pos)
            ]
            chosen = random.choice(starts)
            return word[:chosen] + flipped + word[chosen + 2:]
    return word
| |
|
| |
|
def generate_typo(word: str) -> str:
    """
    Return a version of ``word`` that contains a plausible typo.

    If the word is a key of ``common_misspelled_words``, half of the time one
    of its listed misspellings is returned directly. Otherwise (or the other
    half of the time) one of the following error types is applied:
      - apostrophe error (removed, shifted, doubled or spuriously added)
      - deletion of a letter
      - duplication of a letter
      - insertion of a random letter
      - replacement with a neighboring key (QWERTY)
      - adjacent letter transposition
      - switching 'ie' and 'ei' sequences
    About one time in ten a second error is stacked on top of the first.

    Several of these error types leave some words untouched (for example
    there is no apostrophe or 'ie'/'ei' to work with), so they are tried in
    random order until one actually changes the word. A stacked second error
    is discarded if it would restore the original spelling.

    An empty string is returned unchanged. While this method is by no means
    exhaustive, it reflects many of the typical errors documented.
    """
    if not word:
        return word

    # Known real-world misspellings are used half of the time when available.
    known_misspellings = common_misspelled_words.get(word)
    if known_misspellings and random.random() < 0.5:
        return random.choice(known_misspellings)

    transformations = [
        apostrophe_error,
        delete_random_letter,
        duplicate_random_letter,
        insert_random_letter,
        replace_with_adjacent_key,
        swap_adjacent_letters,
        switch_ie_ei,
    ]

    # Try the error types in random order and keep the first one that really
    # alters the word. insert_random_letter always lengthens its input, so at
    # least one candidate is guaranteed to succeed for any non-empty word.
    result = word
    for transformation in random.sample(transformations, len(transformations)):
        result = transformation(word)
        if result != word:
            break

    # Occasionally stack a second error on top of the first one.
    if random.random() < 0.1:
        second_transformation = random.choice(transformations)
        stacked = second_transformation(result)
        # Ignore a second error that merely undoes the first.
        if stacked != word:
            result = stacked

    return result
| |
|
| |
|
| | |
if __name__ == '__main__':
    # Manual demo: print one generated typo for each sample word. The last
    # two samples exercise the apostrophe and trailing-'s' paths.
    test_words = [
        "accommodate",
        "definitely",
        "receive",
        "mischievous",
        "calendar",
        "equipment",
        "pronunciation",
        "consensus",
        "friend",
        "beautiful",
        "doesn't",
        "books",
    ]
    for test_word in test_words:
        typo = generate_typo(test_word)
        print(f"Original: {test_word:15s} -> Typo: {typo}")
| |
|