|
|
import random |
|
|
import string |
|
|
|
|
|
# Maps a correctly spelled word to the misspellings (or commonly confused
# words) that people tend to write in its place.  Keys are case-sensitive.
# Some variants are real words in their own right (homophones such as
# "effect", or regional spellings such as "fulfill"); they are kept because
# they are frequent substitutions for the key.
common_misspelled_words = {
    "absence": ["absense", "absentse", "abcense", "absance"],
    "acceptable": ["acceptible"],
    "accidentally": ["accidently", "accidentaly"],
    "accommodate": ["accomodate", "acommodate"],
    "achieve": ["acheive"],
    "acknowledge": ["acknowlege", "aknowledge"],
    "acquaintance": ["acquaintence", "aquaintance"],
    "acquire": ["aquire", "adquire"],
    "acquit": ["aquit"],
    "acreage": ["acrage", "acerage"],
    "address": ["adress"],
    "adultery": ["adultary"],
    "advisable": ["adviseable", "advizable"],
    "affect": ["effect"],
    "aggression": ["agression"],
    "aggressive": ["agressive"],
    "allegiance": ["allegaince", "allegience", "alegiance"],
    "almost": ["allmost"],
    "amateur": ["amatuer", "amature"],
    "annually": ["anually", "annualy"],
    "apparent": ["apparant", "aparent", "apparrent", "aparrent"],
    "arctic": ["artic"],
    "argument": ["arguement"],
    "atheist": ["athiest", "athist"],
    "awful": ["awfull", "aweful"],
    "because": ["becuase", "becasue"],
    "beautiful": ["beatiful"],
    "becoming": ["becomeing"],
    "beginning": ["begining"],
    "believe": ["beleive"],
    "bellwether": ["bellweather"],
    "benefit": ["benifit"],
    "buoy": ["bouy"],
    "buoyant": ["bouyant"],
    "business": ["buisness"],
    "calendar": ["calender"],
    "camouflage": ["camoflage", "camoflague"],
    "capitol": ["capital"],
    "Caribbean": ["Carribean"],
    "category": ["catagory"],
    "caught": ["cauhgt", "caugt"],
    "cemetery": ["cemetary", "cematery"],
    "changeable": ["changable"],
    "chief": ["cheif"],
    "colleague": ["collaegue", "collegue"],
    "column": ["colum"],
    "coming": ["comming"],
    "committed": ["commited", "comitted"],
    "comparison": ["comparsion"],
    "concede": ["conceed"],
    "congratulate": ["congradulate"],
    "conscientious": ["consciencious"],
    "conscious": ["concious", "consious"],
    "consensus": ["concensus"],
    "controversy": ["contraversy"],
    "coolly": ["cooly"],
    "daiquiri": ["dacquiri", "daquiri"],
    "deceive": ["decieve"],
    "definite": ["definate", "definit"],
    "definitely": ["definitly", "definately", "definatly", "defiantly"],
    "desperate": ["desparate"],
    "difference": ["diffrence"],
    "dilemma": ["dilema"],
    "disappoint": ["dissapoint"],
    "disastrous": ["disasterous"],
    "drunkenness": ["drunkeness"],
    "dumbbell": ["dumbell"],
    "embarrass": ["embarass"],
    "equipment": ["equiptment"],
    "exceed": ["excede"],
    "exhilarate": ["exilerate"],
    "existence": ["existance"],
    "experience": ["experiance"],
    "extreme": ["extreem"],
    "fascinating": ["facinating"],
    "fiery": ["firey"],
    "fluorescent": ["flourescent"],
    "foreign": ["foriegn"],
    "forty": ["fourty"],
    "friend": ["freind"],
    "fulfil": ["fullfil", "fulfill"],
    "gauge": ["guage"],
    "grateful": ["gratefull", "greatful"],
    "great": ["grate", "grat"],
    "guarantee": ["garantee", "garentee", "garanty"],
    "guidance": ["guidence"],
    "harass": ["harrass"],
    "height": ["heighth", "heigth"],
    "hierarchy": ["heirarchy"],
    "humorous": ["humerous"],
    "hygiene": ["hygene", "hygine", "hiygeine", "higeine", "hygeine"],
    "hypocrite": ["hipocrit"],
    "ignorance": ["ignorence"],
    "imitate": ["immitate"],
    "immediately": ["imediately"],
    "indict": ["indite"],
    "independent": ["independant"],
    "indispensable": ["indispensible"],
    "inoculate": ["innoculate"],
    "intelligence": ["inteligence", "intelligance"],
    "jewelry": ["jewellery", "jewelery"],
    "judgment": ["judgement"],
    "kernel": ["kernal"],
    "leisure": ["liesure"],
    "liaison": ["liason"],
    "library": ["libary", "liberry"],
    "license": ["lisence", "licence"],
    "lightning": ["lightening"],
    "lose": ["loose"],
    "maintenance": ["maintainance", "maintnance"],
    "marshmallow": ["marshmellow"],
    "medieval": ["medeval", "medevil", "mideval"],
    "memento": ["momento"],
    "millennium": ["millenium", "milennium"],
    "miniature": ["miniture"],
    "minuscule": ["miniscule"],
    "mischievous": ["mischievious", "mischevous", "mischevious"],
    "misspell": ["mispell", "misspel"],
    "necessary": ["neccessary", "necessery"],
    "niece": ["neice"],
    "neighbour": ["nieghbor"],
    "noticeable": ["noticable"],
    "occasion": ["occassion"],
    "occasionally": ["occasionaly", "occassionally"],
    "occurrence": ["occurrance", "occurence"],
    "occurred": ["occured"],
    "omission": ["ommision", "omision"],
    "original": ["orignal"],
    "outrageous": ["outragous"],
    "parliament": ["parliment"],
    "pastime": ["passtime", "pasttime"],
    "pedagogue": ["pedagoge"],
    "perceive": ["percieve"],
    "perseverance": ["perseverence"],
    "personnel": ["personell", "personel"],
    "plagiarize": ["plagerize"],
    "playwright": ["playright", "playwrite"],
    "possession": ["posession", "possesion"],
    "potatoes": ["potatos"],
    "precede": ["preceed"],
    "presence": ["presance"],
    "principle": ["principal"],
    "privilege": ["privelege", "priviledge"],
    "professor": ["professer"],
    "protester": ["protestor"],
    "promise": ["promiss"],
    "pronunciation": ["pronounciation"],
    "proof": ["prufe"],
    "prophecy": ["prophesy"],
    "publicly": ["publically"],
    "quarantine": ["quarentine"],
    "queue": ["que"],
    "questionnaire": ["questionaire", "questionnair"],
    "readable": ["readible"],
    "really": ["realy"],
    "receive": ["recieve"],
    "receipt": ["reciept"],
    "recommend": ["recomend", "reccommend"],
    "referred": ["refered"],
    "reference": ["referance", "refrence"],
    "relevant": ["relevent", "revelant"],
    "religious": ["religous", "religius"],
    "repetition": ["repitition"],
    "restaurant": ["restarant", "restaraunt"],
    "rhyme": ["rime"],
    "rhythm": ["rythm", "rythem"],
    "secretary": ["secratary", "secretery"],
    "seize": ["sieze"],
    "separate": ["seperate"],
    "sergeant": ["sargent"],
    "similar": ["similer"],
    "skilful": ["skilfull", "skillful"],
    "speech": ["speach", "speeche"],
    "successful": ["succesful", "successfull", "sucessful"],
    "supersede": ["supercede"],
    "surprise": ["suprise", "surprize"],
    "than": ["then"],
    "their": ["there", "they're"],
    "tomatoes": ["tomatos"],
    "tomorrow": ["tommorow", "tommorrow"],
    "Tucson": ["Tuscon"],
    "twelfth": ["twelth"],
    "tyranny": ["tyrany"],
    "underrate": ["underate"],
    "until": ["untill"],
    "upholstery": ["upholstry"],
    "usable": ["useable", "usible"],
    "vacuum": ["vaccuum", "vaccum", "vacume"],
    "vehicle": ["vehical"],
    "vicious": ["visious"],
    "what": ["wat"],
    "weather": ["wether", "whether"],
    "weird": ["wierd"],
    "welfare": ["wellfare", "welfair"],
    "whether": ["wether"],
    "wilful": ["wilfull", "willful"],
    "withhold": ["withold"],
    "writing": ["writting", "writeing"],
    "you're": ["your"],
    "your": ["you're"],
}
|
|
|
|
|
|
|
|
def apostrophe_error(word: str) -> str:
    """
    Return ``word`` with a typical apostrophe mistake applied.

    When the word has at least one apostrophe, one of them is picked at
    random and is either removed, swapped with the character to its left,
    swapped with the character to its right, or doubled.  A shift that would
    run off either end of the word degrades to a removal.

    When the word has no apostrophe but ends in a lowercase "s", an
    apostrophe is inserted before that final "s" half of the time (a bogus
    possessive).  Any other word is returned unchanged.
    """
    if "'" not in word:
        # Mistaken possessive: "books" -> "book's" (50% chance).
        if word.endswith("s") and random.random() < 0.5:
            return word[:-1] + "'" + word[-1]
        return word

    positions = [pos for pos, char in enumerate(word) if char == "'"]
    pos = random.choice(positions)
    action = random.choice(['remove', 'shift_left', 'shift_right', 'duplicate'])

    # Everything before / after the chosen apostrophe (apostrophe excluded).
    before, after = word[:pos], word[pos + 1:]

    if action == 'duplicate':
        return before + "''" + after
    if action == 'shift_left' and before:
        return before[:-1] + "'" + before[-1] + after
    if action == 'shift_right' and after:
        return before + after[0] + "'" + after[1:]
    # Plain removal, or a shift with no neighbour on that side.
    return before + after
|
|
|
|
|
|
|
|
def delete_random_letter(word: str) -> str:
    """Drop one randomly chosen character from ``word`` (an omission typo).

    Words shorter than two characters are returned unchanged so the result
    is never emptied.
    """
    length = len(word)
    if length <= 1:
        return word
    dropped = random.randint(0, length - 1)
    return ''.join(char for pos, char in enumerate(word) if pos != dropped)
|
|
|
|
|
|
|
|
def duplicate_random_letter(word: str) -> str:
    """Double one randomly chosen character of ``word`` (a repeated keypress).

    An empty string is returned unchanged.
    """
    if not word:
        return word
    pos = random.randint(0, len(word) - 1)
    chars = list(word)
    # Inserting a copy next to the chosen character doubles it in place.
    chars.insert(pos, word[pos])
    return ''.join(chars)
|
|
|
|
|
|
|
|
def insert_random_letter(word: str) -> str:
    """Insert one random lowercase ASCII letter at a random position.

    The position may be anywhere from before the first character to after
    the last one, so an empty ``word`` yields a single letter.
    """
    pos = random.randint(0, len(word))
    extra = random.choice(string.ascii_lowercase)
    head, tail = word[:pos], word[pos:]
    return f"{head}{extra}{tail}"
|
|
|
|
|
|
|
|
# For each lowercase letter, the keys treated as its neighbours on a QWERTY
# layout.  Each value is a string so a neighbour can be drawn directly with
# random.choice.
_QWERTY_NEIGHBORS = {
    'q': 'wa',
    'w': 'qes',
    'e': 'wrd',
    'r': 'etf',
    't': 'ryg',
    'y': 'tuh',
    'u': 'yij',
    'i': 'uok',
    'o': 'ipl',
    'p': 'o',
    'a': 'qsz',
    's': 'adwx',
    'd': 'sfec',
    'f': 'dgrv',
    'g': 'fhtb',
    'h': 'gjyn',
    'j': 'hkum',
    'k': 'jli',
    'l': 'ko',
    'z': 'ax',
    'x': 'zcs',
    'c': 'xvd',
    'v': 'cbf',
    'b': 'vng',
    'n': 'bmh',
    'm': 'nj',
}


def replace_with_adjacent_key(word: str) -> str:
    """
    Replace one character of ``word`` with a neighbouring QWERTY key.

    Only characters whose lowercase form appears in the neighbour table are
    candidates; if there are none, ``word`` is returned unchanged.  The
    replacement is uppercased when the character it replaces is uppercase.
    """
    candidates = [
        pos for pos, char in enumerate(word)
        if char.lower() in _QWERTY_NEIGHBORS
    ]
    if not candidates:
        return word

    pos = random.choice(candidates)
    original = word[pos]
    substitute = random.choice(_QWERTY_NEIGHBORS[original.lower()])
    if original.isupper():
        substitute = substitute.upper()
    return word[:pos] + substitute + word[pos + 1:]
|
|
|
|
|
|
|
|
def swap_adjacent_letters(word: str) -> str:
    """Exchange two neighbouring characters of ``word`` (a transposition typo).

    Words shorter than two characters are returned unchanged.
    """
    if len(word) <= 1:
        return word
    left = random.randint(0, len(word) - 2)
    right = left + 1
    return word[:left] + word[right] + word[left] + word[right + 1:]
|
|
|
|
|
|
|
|
def switch_ie_ei(word: str) -> str:
    """
    Flip one 'ie' into 'ei', or failing that one 'ei' into 'ie'.

    'ie' takes priority: a word containing both pairs only ever has an 'ie'
    flipped.  When a pair occurs several times, one occurrence is chosen at
    random.  The match is case-sensitive, and a word with neither pair is
    returned unchanged.
    """
    for pair in ('ie', 'ei'):
        hits = [pos for pos in range(len(word) - 1) if word.startswith(pair, pos)]
        if hits:
            pos = random.choice(hits)
            # The reversed pair is the swapped spelling ('ie' <-> 'ei').
            return word[:pos] + pair[::-1] + word[pos + 2:]
    return word
|
|
|
|
|
|
|
|
def generate_typo(word: str) -> str:
    """
    Return a version of ``word`` that contains a common typo.

    If ``word`` is a key of ``common_misspelled_words`` (case-sensitive),
    half of the time one of its listed misspellings is returned directly.
    Otherwise one of the following error types is chosen at random and,
    10% of the time, a second one is applied on top of it:

    - apostrophe error (removed, shifted, doubled, or bogus possessive)
    - deletion of a letter
    - duplication of a letter
    - insertion of a random letter
    - replacement with a neighboring key (QWERTY)
    - adjacent letter transposition
    - switching 'ie' and 'ei' sequences

    Some of these cannot alter every word (for example the apostrophe and
    'ie'/'ei' errors on a word that has neither), so the draw is repeated a
    bounded number of times until the result differs from the input.  The
    only input returned unchanged is the empty string.

    While this method is by no means exhaustive, it reflects many of the
    typical errors documented.
    """
    if not word:
        return word

    # Prefer a well-known human misspelling half of the time.
    known_misspellings = common_misspelled_words.get(word)
    if known_misspellings and random.random() < 0.5:
        return random.choice(known_misspellings)

    transformations = [
        apostrophe_error,
        delete_random_letter,
        duplicate_random_letter,
        insert_random_letter,
        replace_with_adjacent_key,
        swap_adjacent_letters,
        switch_ie_ei,
    ]

    max_attempts = 10
    for _ in range(max_attempts):
        result = random.choice(transformations)(word)

        # Occasionally stack a second error on top of the first.
        if random.random() < 0.1:
            result = random.choice(transformations)(result)

        # A no-op pick (or a second error undoing the first) is not a typo;
        # draw again instead of handing the input back.
        if result != word:
            return result

    # Insertion always lengthens the word, so it is a guaranteed change.
    return insert_random_letter(word)
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Demo: print one randomly generated typo for each sample word.  The
    # samples include words from the misspelling table, a contraction and a
    # plural ending in "s".
    test_words = (
        "accommodate",
        "definitely",
        "receive",
        "mischievous",
        "calendar",
        "equipment",
        "pronunciation",
        "consensus",
        "friend",
        "beautiful",
        "doesn't",
        "books",
    )
    for test_word in test_words:
        typo = generate_typo(test_word)
        print(f"Original: {test_word:15s} -> Typo: {typo}")
|
|
|