"""
Generate realistic typo-based misspellings from words.txt → misspellings.txt
Colab version
Place words.txt in /content/ before running
"""
import os
import time
# Optional: mount Google Drive if your file is there
# from google.colab import drive
# drive.mount('/content/drive')
# words_path = '/content/drive/MyDrive/words.txt'
# Input word list (one entry per line) and the generated output file.
words_path = '/content/words.txt'
output_path = '/content/misspellings.txt'

# Maps each lowercase letter to the characters substituted for it when
# simulating a "hit the wrong key" typo. Each value is a string that is
# iterated character by character.
# NOTE(review): the entries for r, t, y, u, i and o include 's', which is not
# physically adjacent to those keys on a standard QWERTY layout, and 'p' lists
# only 'o' — confirm whether this is intentional before relying on the table.
KEYBOARD_NEIGHBORS = {
    # top row
    'q': 'wa',
    'w': 'qeas',
    'e': 'wrds',
    'r': 'etfs',
    't': 'rygs',
    'y': 'tuhs',
    'u': 'yijs',
    'i': 'uoks',
    'o': 'ipls',
    'p': 'o',
    # home row
    'a': 'qwsz',
    's': 'awedxz',
    'd': 'serfcx',
    'f': 'drtgvc',
    'g': 'ftyhbv',
    'h': 'gyujnb',
    'j': 'huikmn',
    'k': 'jiolm',
    'l': 'kop',
    # bottom row
    'z': 'asx',
    'x': 'zsdc',
    'c': 'xdfv',
    'v': 'cfgb',
    'b': 'vghn',
    'n': 'bhjm',
    'm': 'njk',
}
def generate_adjacent_swaps(word):
    """Return the transposition typos of *word*.

    Each result is *word* with one pair of neighbouring characters swapped.
    Results are listed in order of the swap position, left to right. Pairs of
    identical characters are skipped because swapping them would reproduce
    the input unchanged.
    """
    transposed = []
    for left, (first, second) in enumerate(zip(word, word[1:])):
        if first == second:
            continue  # swap would be a no-op
        transposed.append(word[:left] + second + first + word[left + 2:])
    return transposed
def generate_deletions(word):
    """Return the dropped-character typos of *word*.

    One result is produced per character position, in order. Results shorter
    than two characters are never returned, so any input of fewer than three
    characters yields an empty list. Repeated letters can yield duplicate
    entries (deleting either 'a' of "aab" gives "ab").
    """
    # Every deletion is exactly one character shorter than the input, so the
    # minimum-length rule can be decided once up front.
    if len(word) < 3:
        return []
    return [word[:pos] + word[pos + 1:] for pos, _ in enumerate(word)]
def generate_duplications(word):
    """Return the doubled-character typos of *word*.

    One result is produced per character position, in order, each with that
    character repeated once. Every result is one character longer than the
    input, so none can equal it. Runs of the same letter yield duplicate
    entries ("aa" gives "aaa" twice).
    """
    return [word[:pos] + ch + word[pos:] for pos, ch in enumerate(word)]
def generate_nearby_key_subs(word):
    """Return the wrong-key substitution typos of *word*.

    The word is lowercased first. For each character that has an entry in
    KEYBOARD_NEIGHBORS, one typo is produced per listed neighbour, with that
    single character replaced. Characters without an entry are left alone.

    Results are always lowercase, ordered by position and then by the order
    of the neighbours in the table.
    """
    lower = word.lower()
    typos = []
    # Iterate the lowercased string itself, not range(len(word)): lowercasing
    # can lengthen a string (e.g. 'İ' becomes two code points), and indexing
    # by the original length would silently skip the trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in KEYBOARD_NEIGHBORS.get(ch, ''):
            # Guard against a table entry that lists a key as its own neighbour.
            if neighbor != ch:
                typos.append(lower[:i] + neighbor + lower[i + 1:])
    return typos
def generate_all_typos(word):
    """Return the set of all typo variants of *word*.

    Combines adjacent swaps, deletions, duplications and nearby-key
    substitutions, de-duplicated. Neither *word* nor its lowercase form is
    ever included in the result.
    """
    generators = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    candidates = {typo for make_typos in generators for typo in make_typos(word)}
    # A "typo" identical to the input (in either casing) is not a misspelling.
    return candidates - {word, word.lower()}
def is_pure_alpha(word):
    """Return True when *word* is non-empty and made up only of letters.

    Delegates to the ``isalpha`` method, so digits, punctuation, whitespace
    and the empty string all give False.
    """
    only_letters = word.isalpha()
    return only_letters
# ── Check file ──────────────────────────────────────────────
# Fail early with an actionable message instead of a bare open() error.
if not os.path.exists(words_path):
    raise FileNotFoundError(f"{words_path} not found. Upload it to /content/ first.")

# ── Load and filter the word list ───────────────────────────
print(f"Reading words from: {words_path}")
# errors='replace' keeps the run going on undecodable bytes; the replacement
# character is not alphabetic, so affected entries are dropped by the filter.
with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
    stripped_lines = (line.strip() for line in f)
    raw_words = [entry for entry in stripped_lines if entry]
print(f"Total raw entries: {len(raw_words):,}")

# Keep only purely alphabetic entries of at least three characters.
words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

# ── Generate and write the misspellings ─────────────────────
start = time.time()
total_typos = 0
batch_size = 10_000  # progress is reported once per this many words
word_count = len(words)

print(f"Generating typos → {output_path}")
# newline='\n' forces LF line endings regardless of platform.
with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
    out.write("# Auto-generated misspellings database\n")
    out.write("# Format: misspelling=correction\n\n")

    for idx, word in enumerate(words):
        # Typos are generated from the lowercase form, but the correction
        # keeps the casing found in the word list.
        # NOTE(review): a typo that is itself a valid word (or that several
        # words share) is still emitted — confirm whether consumers of the
        # file expect such entries to be filtered out.
        correction = word
        typos = sorted(generate_all_typos(word.lower()))

        # One buffered writelines call per word instead of one write per typo.
        out.writelines(f"{typo}={correction}\n" for typo in typos)
        total_typos += len(typos)

        done = idx + 1
        if done % batch_size == 0:
            elapsed = time.time() - start
            pct = done / word_count * 100
            rate = done / elapsed if elapsed > 0 else 0
            print(f"[{pct:5.1f}%] {done:,}/{word_count:,} words | "
                  f"{total_typos:,} typos | {rate:.0f} words/sec")

# ── Summary ─────────────────────────────────────────────────
elapsed = time.time() - start
file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
print("\n" + "=" * 60)
print(f"Done in {elapsed:.1f}s")
print(f"Words processed : {len(words):,}")
print(f"Typos generated : {total_typos:,}")
print(f"Output file : {output_path}")
print(f"File size : {file_size_mb:.1f} MB")
print("=" * 60)