File size: 4,147 Bytes
0b4a16b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""

Generate realistic typo-based misspellings from words.txt → misspellings.txt



Colab version

Place words.txt in /content/ before running

"""

import os
import time

# Optional: mount Google Drive if your file is there
# from google.colab import drive
# drive.mount('/content/drive')
# words_path = '/content/drive/MyDrive/words.txt'

# Input word list (one entry per line) and the file the script writes its
# "misspelling=correction" lines to.
words_path = '/content/words.txt'
output_path = '/content/misspellings.txt'

# Maps each lowercase letter to the characters that may be substituted for it
# when simulating a "hit the wrong key" typo (read by generate_nearby_key_subs).
# Characters without an entry here are never substituted.
# NOTE(review): the entries look like QWERTY neighbours, but 'r', 't', 'y',
# 'u', 'i' and 'o' also list 's', which is not adjacent to those keys on a
# standard QWERTY layout, and 'p' lists only 'o' - confirm this is intended.
KEYBOARD_NEIGHBORS = {
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etfs', 't': 'rygs',
    'y': 'tuhs', 'u': 'yijs', 'i': 'uoks', 'o': 'ipls', 'p': 'o',
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop', 'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}

def generate_adjacent_swaps(word):
    """Return the misspellings made by transposing two neighbouring characters.

    Args:
        word: The correctly spelled word.

    Returns:
        A list of strings, ordered by the position of the swapped pair.
        Swapping two identical characters reproduces ``word`` and is skipped.
    """
    transposed = (
        word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
        for pos in range(len(word) - 1)
    )
    return [candidate for candidate in transposed if candidate != word]

def generate_deletions(word):
    """Return the misspellings made by dropping exactly one character.

    Args:
        word: The correctly spelled word.

    Returns:
        A list with one entry per character position, in order. Duplicates
        are kept (deleting either of two equal adjacent letters gives the
        same string). Results shorter than two characters are never emitted.
    """
    # Every result has len(word) - 1 characters, so the "at least two
    # characters" rule is decided once by the input length.
    if len(word) < 3:
        return []
    return [word[:pos] + word[pos + 1:] for pos in range(len(word))]

def generate_duplications(word):
    """Return the misspellings made by typing one character twice.

    Args:
        word: The correctly spelled word.

    Returns:
        A list with one entry per character position, in order. Each entry
        is one character longer than ``word``, so it can never equal it;
        doubling either of two equal adjacent letters yields duplicates,
        which are kept.
    """
    return [
        word[:pos] + letter + word[pos:]
        for pos, letter in enumerate(word)
    ]

def generate_nearby_key_subs(word):
    """Return misspellings made by replacing one letter with a nearby key.

    The word is lowercased first; every character that has an entry in
    ``KEYBOARD_NEIGHBORS`` is replaced, one position at a time, by each of
    its listed neighbours.

    Args:
        word: The correctly spelled word (any case).

    Returns:
        A list of lowercase strings, ordered by position and then by the
        neighbour order in ``KEYBOARD_NEIGHBORS``.
    """
    lower = word.lower()
    typos = []
    # Iterate the lowercased text itself rather than range(len(word)):
    # str.lower() can lengthen a string (e.g. 'İ' becomes two code points),
    # and indexing by the original length would skip the trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in KEYBOARD_NEIGHBORS.get(ch, ''):
            typo = lower[:i] + neighbor + lower[i + 1:]
            # Guards against a table entry that lists a key as its own neighbour.
            if typo != lower:
                typos.append(typo)
    return typos

def generate_all_typos(word):
    """Return the set of all distinct misspellings produced for ``word``.

    Combines the adjacent-swap, deletion, duplication and nearby-key
    generators, then removes the word itself and its lowercase form so a
    correct spelling is never reported as a typo.

    Args:
        word: The correctly spelled word.

    Returns:
        A set of misspelled strings.
    """
    strategies = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    candidates = {typo for make in strategies for typo in make(word)}
    return candidates - {word, word.lower()}

def is_pure_alpha(word: str) -> bool:
    """Return True when *word* is non-empty and consists only of letters.

    Delegates to ``str.isalpha``, so non-ASCII letters (for example accented
    characters) count as alphabetic, while digits, hyphens, apostrophes and
    whitespace do not.
    """
    return word.isalpha()

# ── Check file ──────────────────────────────────────────────
# Fail early with an actionable message rather than a bare open() error.
if not os.path.exists(words_path):
    raise FileNotFoundError(f"{words_path} not found. Upload it to /content/ first.")

print(f"Reading words from: {words_path}")

# 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a leading
# byte-order mark. With plain 'utf-8' the BOM stays attached to the first
# word, which then fails the isalpha() filter below and is silently lost.
# errors='replace' keeps a stray undecodable byte from aborting the whole run.
with open(words_path, 'r', encoding='utf-8-sig', errors='replace') as f:
    # One entry per line; blank and whitespace-only lines are skipped.
    raw_words = [entry for entry in map(str.strip, f) if entry]

print(f"Total raw entries: {len(raw_words):,}")

# Keep only purely alphabetic entries of at least three characters.
words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

# perf_counter() is monotonic, so the elapsed time and rate below cannot be
# skewed by wall-clock adjustments the way time.time() differences can.
start = time.perf_counter()
total_typos = 0
batch_size = 10_000  # print a progress line every this many words

print(f"Generating typos → {output_path}")

# newline='\n' forces LF line endings regardless of platform.
with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
    out.write("# Auto-generated misspellings database\n")
    out.write("# Format: misspelling=correction\n\n")

    for idx, word in enumerate(words):
        # Typos are built from the lowercased word, while the correction keeps
        # the spelling (including case) exactly as it appears in the word list.
        correction = word
        typos = generate_all_typos(word.lower())

        # NOTE(review): a generated typo may itself be another valid entry in
        # the word list (deleting a letter can spell a different word); such
        # lines are written unfiltered - confirm that is intended.
        # Sorted so the output is deterministic; one batched write per word.
        out.writelines(f"{typo}={correction}\n" for typo in sorted(typos))
        total_typos += len(typos)

        if (idx + 1) % batch_size == 0:
            elapsed = time.perf_counter() - start
            pct = (idx + 1) / len(words) * 100
            rate = (idx + 1) / elapsed if elapsed > 0 else 0
            print(f"[{pct:5.1f}%] {idx + 1:,}/{len(words):,} words | "
                  f"{total_typos:,} typos | {rate:.0f} words/sec")

elapsed = time.perf_counter() - start
file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

# Final summary of the run.
print("\n" + "=" * 60)
print(f"Done in {elapsed:.1f}s")
print(f"Words processed : {len(words):,}")
print(f"Typos generated : {total_typos:,}")
print(f"Output file     : {output_path}")
print(f"File size       : {file_size_mb:.1f} MB")
print("=" * 60)