File size: 3,562 Bytes
40a04d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""
corrupter.py - light English typo/misspelling injection to simulate
"human-typed" text.

This module provides `corrupt(text, ...)`, which returns the original text with
a small amount of realistic noise (common misspellings and keyboard typos). See
function docstring for arguments and explanations.
"""
import re
import argparse
import random

import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
from nlpaug.flow import Sequential


def parse_args(argv=None) -> argparse.Namespace:
    """
    Parse the command-line options for the interactive corrupter.

    argv: Optional list of argument strings to parse. When None (the
        default), argparse falls back to the process command line, which is
        the original behavior. Passing a list lets other code (and tests)
        call this function without touching sys.argv.

    Returns the argparse.Namespace holding `misspelling_prob`, `typo_prob`,
    `min_len` and `verbose`.
    """
    parser = argparse.ArgumentParser(
        description="Interactive text corrupter.")
    parser.add_argument(
        "--misspelling-prob",
        type=float,
        default=0.04,
        help="Fraction of words to misspell (roughly)"
    )
    parser.add_argument(
        "--typo-prob",
        type=float,
        default=0.01,
        help="Fraction of words to finger fart"
    )
    parser.add_argument(
        "--min-len",
        type=int,
        default=3,
        help="Minimum length word to possibly corrupt"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="If True, print words and corruptions (if any)."
    )
    return parser.parse_args(argv)


# Tokenizer alternatives, tried in order:
#   1. an ASCII alphanumeric run, optionally followed by an apostrophe suffix
#      (e.g. "don't");
#   2. a run of whitespace;
#   3. any other single character (punctuation, "_", non-ASCII letters, ...).
# The catch-all third alternative guarantees that joining the tokens
# reproduces the input exactly; nothing is silently dropped.
_TOKEN_RE = re.compile(
    r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?|\s+|.",
    re.UNICODE | re.DOTALL
)
# Only purely alphabetic ASCII tokens (plus optional apostrophe suffix) are
# candidates for corruption; tokens containing digits are left alone.
_WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z0-9]+)?", re.UNICODE)


def _first_augmented(result, fallback: str) -> str:
    """Return the augmented string from an nlpaug `augment` result."""
    # Accept either a plain string or a list of strings, so that indexing
    # never reduces a string result to its first character.
    if isinstance(result, str):
        return result
    return result[0] if result else fallback


def corrupt(
    text: str,
    misspell_prob: float = 0.04,
    typo_prob: float = 0.01,
    min_len: int = 3,
    misspell_aug_p: float = 0.02,
    typo_aug_p: float = 0.001,
    verbose: bool = False
) -> str:
    """
    Modestly corrupt the text passed to get a version with a small number of
    misspellings and typos. Whitespace, punctuation, numbers and any
    non-English characters are passed through unchanged.

    text: The text to corrupt.
    misspell_prob: Probability of each word (of sufficient length) being
        misspelled.
    typo_prob: Probability of each word (of sufficient length) having a typo.
        A word that was just misspelled can additionally receive a typo.
    min_len: The minimum length word that will be considered for corruption.
    misspell_aug_p: Passed through directly to the nlpaug.augmenter. From their
        docs, it seems to mean "for words chosen for misspelling, what
        percentage of the word is misspelled?"
    typo_aug_p: Passed through directly to the nlpaug.augmenter. From their
        docs, it seems to mean "for words chosen for typos, what percentage of
        the word will have typos?"
    verbose: if True, print words as encountered, plus their corruptions (if
        any).

    Returns the (possibly) corrupted text.
    """
    tokens = _TOKEN_RE.findall(text)

    # The augmenters are built lazily: with the small default probabilities
    # many calls never select a word, so there is no point constructing them
    # up front on every call.
    misspell_aug = None
    typo_aug = None

    for i, token in enumerate(tokens):
        if verbose:
            print(f"Considering {token}...")
        if not _WORD_RE.fullmatch(token):
            continue

        if len(token) >= min_len and random.random() < misspell_prob:
            if misspell_aug is None:
                misspell_aug = naw.SpellingAug(aug_p=misspell_aug_p)
            token = _first_augmented(misspell_aug.augment(token), token)
            if verbose:
                print(f"  ...misspelled to {token}")

        # The length check is repeated on purpose: the misspelling step may
        # have changed the length of the word.
        if len(token) >= min_len and random.random() < typo_prob:
            if typo_aug is None:
                typo_aug = nac.KeyboardAug(aug_word_p=typo_aug_p)
            token = _first_augmented(typo_aug.augment(token), token)
            if verbose:
                print(f"  ...corrupted to {token}")

        tokens[i] = token

    return "".join(tokens)


if __name__ == "__main__":

    # Fixed seed so the random draws made through Python's `random` module
    # repeat from run to run.
    random.seed(123)

    args = parse_args()

    # Interactive loop: corrupt each line entered until the user types "done"
    # or the input stream ends.
    while True:
        try:
            s = input("Enter text: ")
        except EOFError:
            # Ctrl-D or exhausted piped stdin: finish cleanly rather than
            # ending the session with a traceback.
            print()
            break
        if s == "done":
            break
        corrupted = corrupt(
            s,
            args.misspelling_prob,
            args.typo_prob,
            args.min_len,
            verbose=args.verbose,
        )
        print(f"Corrupted version: {corrupted}")