File size: 14,418 Bytes

360e354

import random
import string

# Lookup table of frequently misspelled English words.
#
# Each key is a correctly spelled, single-token word. Its value lists the
# misspellings (or commonly confused words, e.g. "affect" -> "effect") that
# generate_typo() may substitute for it. Lookups are exact and case-sensitive,
# so capitalized keys such as "Caribbean" only match capitalized input.
common_misspelled_words = {
    "absence": ["absense", "absentse", "abcense", "absance"],
    "acceptable": ["acceptible"],
    "accidentally": ["accidently", "accidentaly"],
    "accommodate": ["accomodate", "acommodate"],
    "achieve": ["acheive"],
    "acknowledge": ["acknowlege", "aknowledge"],
    "acquaintance": ["acquaintence", "aquaintance"],
    "acquire": ["aquire", "adquire"],
    "acquit": ["aquit"],
    "acreage": ["acrage", "acerage"],
    "address": ["adress"],
    "adultery": ["adultary"],
    "advisable": ["adviseable", "advizable"],
    "affect": ["effect"],
    "aggression": ["agression"],
    "aggressive": ["agressive"],
    "allegiance": ["allegaince", "allegience", "alegiance"],
    "almost": ["allmost"],
    # "a lot": ["alot", "allot"] is omitted because "a lot" is two tokens.
    "amateur": ["amatuer", "amature"],
    "annually": ["anually", "annualy"],
    "apparent": ["apparant", "aparent", "apparrent", "aparrent"],
    "arctic": ["artic"],
    "argument": ["arguement"],
    "atheist": ["athiest", "athist"],
    "awful": ["awfull", "aweful"],
    "because": ["becuase", "becasue"],
    "beautiful": ["beatiful"],
    "becoming": ["becomeing"],
    "beginning": ["begining"],
    "believe": ["beleive"],
    "bellwether": ["bellweather"],
    "benefit": ["benifit"],
    "buoy": ["bouy"],
    "buoyant": ["bouyant"],
    "business": ["buisness"],
    "calendar": ["calender"],
    "camouflage": ["camoflage", "camoflague"],
    "capitol": ["capital"],
    "Caribbean": ["Carribean"],  # More names?
    "category": ["catagory"],
    "caught": ["cauhgt", "caugt"],
    "cemetery": ["cemetary", "cematery"],
    "changeable": ["changable"],
    "chief": ["cheif"],
    "colleague": ["collaegue", "collegue"],
    "column": ["colum"],
    "coming": ["comming"],
    "committed": ["commited", "comitted"],
    "comparison": ["comparsion"],
    "concede": ["conceed"],
    "congratulate": ["congradulate"],
    "conscientious": ["consciencious"],
    "conscious": ["concious", "consious"],
    "consensus": ["concensus"],
    "controversy": ["contraversy"],
    "coolly": ["cooly"],
    "daiquiri": ["dacquiri", "daquiri"],
    "deceive": ["decieve"],
    "definite": ["definate", "definit"],
    "definitely": ["definitly", "definately", "definatly", "defiantly"],
    "desperate": ["desparate"],
    "difference": ["diffrence"],
    "dilemma": ["dilema"],
    "disappoint": ["dissapoint"],
    "disastrous": ["disasterous"],
    "drunkenness": ["drunkeness"],
    "dumbbell": ["dumbell"],
    "embarrass": ["embarass"],
    "equipment": ["equiptment"],
    "exceed": ["excede"],
    "exhilarate": ["exilerate"],
    "existence": ["existance"],
    "experience": ["experiance"],
    "extreme": ["extreem"],
    "fascinating": ["facinating"],
    "fiery": ["firey"],
    "fluorescent": ["flourescent"],
    "foreign": ["foriegn"],
    "forty": ["fourty"],
    "friend": ["freind"],
    "fulfil": ["fullfil", "fulfill"],
    "gauge": ["guage"],
    "grateful": ["gratefull", "greatful"],
    "great": ["grate", "grat"],
    "guarantee": ["garantee", "garentee", "garanty"],
    "guidance": ["guidence"],
    "harass": ["harrass"],
    "height": ["heighth", "heigth"],
    "hierarchy": ["heirarchy"],
    # "hors d'oeuvres": ["hors derves", "ordeurves"] is omitted because
    # "hors d'oeuvres" is two tokens.
    "humorous": ["humerous"],
    "hygiene": ["hygene", "hygine", "hiygeine", "higeine", "hygeine"],
    "hypocrite": ["hipocrit"],
    "ignorance": ["ignorence"],
    "imitate": ["immitate"],
    "immediately": ["imediately"],
    "indict": ["indite"],
    "independent": ["independant"],
    "indispensable": ["indispensible"],
    "inoculate": ["innoculate"],
    "intelligence": ["inteligence", "intelligance"],
    "jewelry": ["jewellery", "jewelery"],
    "judgment": ["judgement"],
    "kernel": ["kernal"],
    "leisure": ["liesure"],
    "liaison": ["liason"],
    "library": ["libary", "liberry"],
    "license": ["lisence", "licence"],
    "lightning": ["lightening"],
    "lose": ["loose"],
    "maintenance": ["maintainance", "maintnance"],
    "marshmallow": ["marshmellow"],
    "medieval": ["medeval", "medevil", "mideval"],
    "memento": ["momento"],
    "millennium": ["millenium", "milennium"],
    "miniature": ["miniture"],
    "minuscule": ["miniscule"],
    "mischievous": ["mischievious", "mischevous", "mischevious"],
    "misspell": ["mispell", "misspel"],
    "necessary": ["neccessary", "necessery"],
    "niece": ["neice"],
    "neighbour": ["nieghbor"],
    "noticeable": ["noticable"],
    "occasion": ["occassion"],
    "occasionally": ["occasionaly", "occassionally"],
    "occurrence": ["occurrance", "occurence"],
    "occurred": ["occured"],
    "omission": ["ommision", "omision"],
    "original": ["orignal"],
    "outrageous": ["outragous"],
    "parliament": ["parliment"],
    "pastime": ["passtime", "pasttime"],
    "pedagogue": ["pedagoge"],
    "perceive": ["percieve"],
    "perseverance": ["perseverence"],
    "personnel": ["personell", "personel"],
    "plagiarize": ["plagerize"],
    "playwright": ["playright", "playwrite"],
    "possession": ["posession", "possesion"],
    "potatoes": ["potatos"],
    "precede": ["preceed"],
    "presence": ["presance"],
    "principle": ["principal"],
    "privilege": ["privelege", "priviledge"],
    "professor": ["professer"],
    "protester": ["protestor"],
    "promise": ["promiss"],
    "pronunciation": ["pronounciation"],
    "proof": ["prufe"],
    "prophecy": ["prophesy"],
    "publicly": ["publically"],
    "quarantine": ["quarentine"],
    "queue": ["que"],
    "questionnaire": ["questionaire", "questionnair"],
    "readable": ["readible"],
    "really": ["realy"],
    "receive": ["recieve"],
    "receipt": ["reciept"],
    "recommend": ["recomend", "reccommend"],
    "referred": ["refered"],
    "reference": ["referance", "refrence"],
    "relevant": ["relevent", "revelant"],
    "religious": ["religous", "religius"],
    "repetition": ["repitition"],
    "restaurant": ["restarant", "restaraunt"],
    "rhyme": ["rime"],
    "rhythm": ["rythm", "rythem"],
    "secretary": ["secratary", "secretery"],
    "seize": ["sieze"],
    "separate": ["seperate"],
    "sergeant": ["sargent"],
    "similar": ["similer"],
    "skilful": ["skilfull", "skillful"],
    "speech": ["speach", "speeche"],
    "successful": ["succesful", "successfull", "sucessful"],
    "supersede": ["supercede"],
    "surprise": ["suprise", "surprize"],
    "than": ["then"],
    "their": ["there", "they're"],
    "tomatoes": ["tomatos"],
    "tomorrow": ["tommorow", "tommorrow"],
    "Tucson": ["Tuscon"],
    "twelfth": ["twelth"],
    "tyranny": ["tyrany"],
    "underrate": ["underate"],
    "until": ["untill"],
    "upholstery": ["upholstry"],
    "usable": ["useable", "usible"],
    "vacuum": ["vaccuum", "vaccum", "vacume"],
    "vehicle": ["vehical"],
    "vicious": ["visious"],
    "what": ["wat"],
    "weather": ["wether", "whether"],
    "weird": ["wierd"],
    "welfare": ["wellfare", "welfair"],
    "whether": ["wether"],
    "wilful": ["wilfull", "willful"],
    "withhold": ["withold"],
    "writing": ["writting", "writeing"],
    "you're": ["your"],
    "your": ["you're"],
}


def apostrophe_error(word: str) -> str:
    """
    Introduce a typical apostrophe mistake into ``word``.

    When the word has no apostrophe, a word ending in 's' gets one inserted
    before that final 's' about half of the time (a mistaken possessive);
    any other word is returned unchanged.

    When the word has at least one apostrophe, one of them is picked at
    random and one of four mistakes is applied to it:
      - 'remove': drop the apostrophe,
      - 'shift_left': swap it with the preceding character,
      - 'shift_right': swap it with the following character,
      - 'duplicate': type it twice.
    A shift that is impossible (apostrophe at the very start or end) falls
    back to removing the apostrophe.
    """
    if "'" not in word:
        # Short-circuit keeps the random draw limited to words ending in 's'.
        if word.endswith("s") and random.random() < 0.5:
            return word[:-1] + "'s"
        return word

    positions = [offset for offset, char in enumerate(word) if char == "'"]
    target = random.choice(positions)
    mistake = random.choice(['remove', 'shift_left', 'shift_right', 'duplicate'])

    # Text on either side of the chosen apostrophe.
    before = word[:target]
    after = word[target + 1:]

    if mistake == 'duplicate':
        return before + "''" + after
    if mistake == 'shift_left' and before:
        return before[:-1] + "'" + before[-1] + after
    if mistake == 'shift_right' and after:
        return before + after[0] + "'" + after[1:]
    # Plain removal, also the fallback for a shift with no room to move.
    return before + after


def delete_random_letter(word: str) -> str:
    """
    Simulate an omission error by removing one randomly chosen character.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) >= 2:
        drop_at = random.randint(0, len(word) - 1)
        chars = list(word)
        del chars[drop_at]
        return ''.join(chars)
    return word


def duplicate_random_letter(word: str) -> str:
    """
    Simulate a double keypress by repeating one randomly chosen character.

    An empty word is returned unchanged.
    """
    if word:
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos] * 2 + word[pos + 1:]
    return word


def insert_random_letter(word: str) -> str:
    """
    Simulate a stray keypress by inserting a random lowercase ASCII letter.

    The letter may land at any position, including before the first or
    after the last character, so the result is always one character longer.
    """
    slot = random.randint(0, len(word))
    extra = random.choice(string.ascii_lowercase)
    head, tail = word[:slot], word[slot:]
    return ''.join((head, extra, tail))


# Simplified QWERTY adjacency map: each lowercase ASCII letter maps to a string
# of the keys treated as its neighbors. Defined once at import time so the
# table is not rebuilt on every call.
_QWERTY_NEIGHBORS = {
    'q': 'wa',
    'w': 'qes',
    'e': 'wrd',
    'r': 'etf',
    't': 'ryg',
    'y': 'tuh',
    'u': 'yij',
    'i': 'uok',
    'o': 'ipl',
    'p': 'o',
    'a': 'qsz',
    's': 'adwx',
    'd': 'sfec',
    'f': 'dgrv',
    'g': 'fhtb',
    'h': 'gjyn',
    'j': 'hkum',
    'k': 'jli',
    'l': 'ko',
    'z': 'ax',
    'x': 'zcs',
    'c': 'xvd',
    'v': 'cbf',
    'b': 'vng',
    'n': 'bmh',
    'm': 'nj',
}


def replace_with_adjacent_key(word: str) -> str:
    """
    Simulate a mistyped key by replacing one letter with a QWERTY neighbor.

    Only characters whose lowercase form has an entry in the neighbor table
    are eligible. If the word contains none (empty string, digits,
    punctuation only), it is returned unchanged. The replacement is
    uppercased when the replaced character was uppercase.
    """
    eligible = [i for i, ch in enumerate(word) if ch.lower() in _QWERTY_NEIGHBORS]
    if not eligible:
        return word

    idx = random.choice(eligible)
    original = word[idx]
    replacement = random.choice(_QWERTY_NEIGHBORS[original.lower()])
    # Preserve the case of the character being replaced.
    if original.isupper():
        replacement = replacement.upper()
    return word[:idx] + replacement + word[idx + 1:]


def swap_adjacent_letters(word: str) -> str:
    """
    Simulate a transposition error by exchanging two neighboring characters.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) >= 2:
        left = random.randint(0, len(word) - 2)
        return word[:left] + word[left + 1] + word[left] + word[left + 2:]
    return word


def switch_ie_ei(word: str) -> str:
    """
    Flip one 'ie' to 'ei', or one 'ei' to 'ie', to mimic a vowel-pair mix-up.

    'ie' takes priority: if the word contains any 'ie', one of those
    occurrences (chosen at random) is flipped and 'ei' is not considered.
    Matching is case-sensitive. A word containing neither pair is returned
    unchanged.
    """
    for pair, flipped in (('ie', 'ei'), ('ei', 'ie')):
        # Start offsets of every occurrence of the pair, in left-to-right order.
        starts = [i for i in range(len(word) - 1) if word.startswith(pair, i)]
        if starts:
            at = random.choice(starts)
            return word[:at] + flipped + word[at + 2:]
    return word


def generate_typo(word: str) -> str:
    """
    Return a version of ``word`` containing a plausible typo.

    If the word is a key of ``common_misspelled_words``, one of its listed
    misspellings is returned about half of the time. Otherwise one of the
    following transformations is applied:
      - apostrophe error
      - deletion of a letter
      - duplication of a letter
      - insertion of a random letter
      - replacement with a neighboring key (QWERTY)
      - adjacent letter transposition
      - switching 'ie' and 'ei' sequences
    About 10% of the time a second transformation is chained onto the first.

    Several transformations leave many inputs untouched (no apostrophe, no
    'ie'/'ei', a one-letter word, swapping two identical letters), so they
    are tried in random order until one actually changes the word. A chained
    second edit that would restore the original word is discarded.

    An empty string is returned unchanged. While this method is by no means
    exhaustive, it reflects many of the typical errors documented.
    """
    if not word:
        return word

    # Known word: half of the time use a documented real-world misspelling.
    if word in common_misspelled_words and random.random() < 0.5:
        return random.choice(common_misspelled_words[word])

    transformations = [
        apostrophe_error,
        delete_random_letter,
        duplicate_random_letter,
        insert_random_letter,
        replace_with_adjacent_key,
        swap_adjacent_letters,
        switch_ie_ei,
    ]

    # Walk the transformations in a random order and stop at the first one
    # that alters the word. insert_random_letter always lengthens the word,
    # so the loop cannot finish without a change.
    result = word
    for transformation in random.sample(transformations, len(transformations)):
        result = transformation(word)
        if result != word:
            break

    # Occasionally chain a second transformation (10% chance) for variability.
    if random.random() < 0.1:
        chained = random.choice(transformations)(result)
        # Keep the single typo if the second edit would undo the first.
        if chained != word:
            result = chained

    return result


# Demo: when run as a script, print one generated typo per sample word.
if __name__ == '__main__':
    test_words = (
        # Words that have entries in the misspelling table.
        "accommodate",
        "definitely",
        "receive",
        "mischievous",
        "calendar",
        "equipment",
        "pronunciation",
        "consensus",
        "friend",
        "beautiful",
        # Words that exercise the apostrophe handling.
        "doesn't",
        "books",
    )
    for test_word, typo in ((sample, generate_typo(sample)) for sample in test_words):
        print(f"Original: {test_word:15s} -> Typo: {typo}")