misspelling-generator / generate_typos_local.py
algorembrant's picture
Upload 4 files
0b4a16b verified
"""
Generate realistic typo-based misspellings from words.txt → misspellings.txt
Typo strategies:
1. Adjacent letter swaps ("hello" → "hlelo", "helol")
2. Single character deletion ("hello" → "hllo", "helo")
3. Single character duplication ("hello" → "hhello", "heello")
4. Nearby keyboard key sub ("hello" → "gello", "jello")
Output format: misspelling=correction (one per line)
"""
import sys
import os
import time
# QWERTY keyboard proximity map.
# Each lowercase letter maps to the letter keys that touch it on a standard
# staggered QWERTY layout: its left/right neighbours on the same row plus the
# diagonally adjacent keys on the rows directly above and below.
# The relation is symmetric (if 'r' lists 'd', then 'd' lists 'r'); keep it
# that way when editing, otherwise substitutions become one-directional.
KEYBOARD_NEIGHBORS = {
    # top row
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    # home row
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop',
    # bottom row
    'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
def generate_adjacent_swaps(word):
    """Return the transpositions of every pair of neighbouring characters.

    Results are listed in order of the position of the swapped pair. Pairs of
    identical characters are skipped because swapping them would reproduce
    the original word.
    """
    variants = []
    for left, (first, second) in enumerate(zip(word, word[1:])):
        if first == second:
            # Swapping equal letters gives back the input; not a typo.
            continue
        variants.append(word[:left] + second + first + word[left + 2:])
    return variants
def generate_deletions(word):
    """Return every string obtained by removing exactly one character.

    Results shorter than two characters are never produced, so words with
    fewer than three characters yield an empty list. Duplicate results (from
    deleting either of two equal neighbouring letters) are kept.
    """
    # Each deletion leaves len(word) - 1 characters; bail out early when that
    # would fall below the two-character minimum.
    if len(word) - 1 < 2:
        return []
    return [word[:pos] + word[pos + 1:] for pos in range(len(word))]
def generate_duplications(word):
    """Return every string obtained by doubling exactly one character.

    One result is produced per character position, in order. A result is
    always one character longer than the input, so it can never equal it.
    """
    return [word[:pos] + letter + word[pos:] for pos, letter in enumerate(word)]
def generate_nearby_key_subs(word):
    """Replace one character at a time with a neighbouring keyboard key.

    The word is lowercased first, so every returned typo is lowercase.
    Characters that have no entry in KEYBOARD_NEIGHBORS are left untouched.

    Args:
        word: The correctly spelled word.

    Returns:
        A list of lowercase strings, each differing from ``word.lower()`` in
        exactly one position.
    """
    lower = word.lower()
    typos = []
    # Walk the lowercased text itself rather than range(len(word)):
    # lowercasing can change a string's length (e.g. 'İ' becomes two code
    # points), and indexing by the original length would then skip the
    # trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in KEYBOARD_NEIGHBORS.get(ch, ''):
            typo = lower[:i] + neighbor + lower[i + 1:]
            if typo != lower:
                typos.append(typo)
    return typos
def generate_all_typos(word):
    """Collect the unique typo variants of a word from every strategy.

    Runs the swap, deletion, duplication and nearby-key generators and merges
    their results into a single set. The word itself and its lowercase form
    are removed so a word is never reported as its own misspelling.
    """
    strategies = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    candidates = set()
    for strategy in strategies:
        candidates.update(strategy(word))
    # Drop both spellings of the original in one step.
    candidates -= {word, word.lower()}
    return candidates
def is_pure_alpha(word):
    """Return True only for non-empty words made solely of ASCII letters.

    ``str.isalpha()`` alone also accepts accented and non-Latin letters
    (e.g. "café"), which contradicts the a-z contract of this filter, so
    each character is checked against the ASCII letter ranges explicitly.
    Both lowercase and uppercase ASCII letters are accepted.
    """
    return bool(word) and all(
        'a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word
    )
def main():
    """Read data/words.txt and write data/misspellings.txt next to this script.

    Every non-empty line of the word list is treated as one entry. Entries
    that pass ``is_pure_alpha`` and have at least three characters get typo
    variants generated from their lowercase form; each variant is written as
    ``misspelling=correction``, where the correction keeps the entry's
    original casing. Progress is printed every ``batch_size`` words and a
    summary is printed at the end.

    Exits with status 1 if the word list does not exist.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    words_path = os.path.join(base_dir, 'data', 'words.txt')
    output_path = os.path.join(base_dir, 'data', 'misspellings.txt')
    if not os.path.exists(words_path):
        # Diagnostics belong on stderr so they are not mixed into piped output.
        print(f"ERROR: {words_path} not found.", file=sys.stderr)
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {words_path}")
    with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]
    print(f"Total raw entries: {len(raw_words):,}")

    # Filter to pure-alpha words with length >= 3
    words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
    print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

    # A generated "typo" that is itself an entry of the word list (e.g. "form"
    # produced from "from") is a correctly spelled word and must not be
    # emitted as a misspelling of something else.
    known_words = {w.lower() for w in raw_words}

    # ── Generate typos ──────────────────────────────────────────
    start = time.time()
    total_typos = 0
    batch_size = 10_000
    print(f"Generating typos → {output_path}")
    print("This may take a few minutes for 466k words...")
    with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
        out.write("# Auto-generated misspellings database\n")
        out.write("# Format: misspelling=correction\n")
        out.write("# Generated by generate_typos.py\n")
        out.write("#\n")
        out.write("# Strategies: adjacent swaps, deletions, duplications, keyboard proximity\n")
        out.write("\n")
        for idx, word in enumerate(words):
            correction = word  # original is the correct form
            typos = sorted(
                typo for typo in generate_all_typos(word.lower())
                if typo not in known_words
            )
            # One batched write per word instead of one call per typo.
            out.writelines(f"{typo}={correction}\n" for typo in typos)
            total_typos += len(typos)

            # Progress reporting
            if (idx + 1) % batch_size == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                print(f" [{pct:5.1f}%] {idx + 1:>7,} / {len(words):,} words |"
                      f" {total_typos:>10,} typos | {rate:.0f} words/sec")

    elapsed = time.time() - start
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
    print()
    print("=" * 60)
    print(f" Done in {elapsed:.1f}s")
    print(f" Words processed : {len(words):,}")
    print(f" Typos generated : {total_typos:,}")
    print(f" Output file : {output_path}")
    print(f" File size : {file_size_mb:.1f} MB")
    print("=" * 60)
if __name__ == '__main__':
main()