"""
Generate realistic typo-based misspellings from words.txt → misspellings.txt

Typo strategies:
  1. Adjacent letter swaps        ("hello" → "hlelo", "helol")
  2. Single character deletion    ("hello" → "hllo", "helo")
  3. Single character duplication ("hello" → "hhello", "heello")
  4. Nearby keyboard key sub      ("hello" → "gello", "jello")

Output format: misspelling=correction (one per line)
"""

import sys
import os
import time

# QWERTY keyboard proximity map: each letter maps to the letters physically
# next to it.  The relation is symmetric (if 'd' lists 'r', then 'r' lists
# 'd'); keep it that way when editing.
KEYBOARD_NEIGHBORS = {
    # top row
    'q': 'wa',
    'w': 'qeas',
    'e': 'wrds',
    'r': 'etdf',
    't': 'ryfg',
    'y': 'tugh',
    'u': 'yihj',
    'i': 'uojk',
    'o': 'ipkl',
    'p': 'ol',
    # home row
    'a': 'qwsz',
    's': 'awedxz',
    'd': 'serfcx',
    'f': 'drtgvc',
    'g': 'ftyhbv',
    'h': 'gyujnb',
    'j': 'huikmn',
    'k': 'jiolm',
    'l': 'kop',
    # bottom row
    'z': 'asx',
    'x': 'zsdc',
    'c': 'xdfv',
    'v': 'cfgb',
    'b': 'vghn',
    'n': 'bhjm',
    'm': 'njk',
}


def generate_adjacent_swaps(word):
    """Return every variant of *word* with one adjacent pair transposed.

    Pairs of identical characters are skipped because swapping them would
    reproduce the original word.
    """
    typos = []
    for i in range(len(word) - 1):
        left, right = word[i], word[i + 1]
        if left == right:
            continue
        typos.append(word[:i] + right + left + word[i + 2:])
    return typos


def generate_deletions(word):
    """Return every variant of *word* with exactly one character removed.

    Results shorter than two characters are not produced, so words of
    length two or less yield an empty list.
    """
    if len(word) < 3:
        return []
    return [word[:i] + word[i + 1:] for i in range(len(word))]


def generate_duplications(word):
    """Return every variant of *word* with exactly one character doubled."""
    # The result is always one character longer than the input, so it can
    # never equal the input and no self-check is needed.
    return [word[:i] + ch + word[i:] for i, ch in enumerate(word)]


def generate_nearby_key_subs(word):
    """Return variants of *word* with one letter replaced by a QWERTY neighbor.

    The word is lowercased first; characters without an entry in
    ``KEYBOARD_NEIGHBORS`` are left alone.
    """
    lower = word.lower()
    typos = []
    for i, ch in enumerate(lower):
        for neighbor in KEYBOARD_NEIGHBORS.get(ch, ''):
            if neighbor != ch:
                typos.append(lower[:i] + neighbor + lower[i + 1:])
    return typos


def generate_all_typos(word):
    """Return the set of all typo variants of *word* from the four strategies.

    The word itself (as given and lowercased) is never part of the result.
    """
    typos = set()
    typos.update(generate_adjacent_swaps(word))
    typos.update(generate_deletions(word))
    typos.update(generate_duplications(word))
    typos.update(generate_nearby_key_subs(word))
    # Never map a word to itself.
    typos.discard(word)
    typos.discard(word.lower())
    return typos


def is_pure_alpha(word):
    """Return True if *word* is non-empty and consists only of ASCII letters.

    ``str.isalpha`` alone would also accept non-ASCII letters such as "é",
    which the keyboard map cannot handle and which would not match the
    documented a-z restriction.
    """
    if not word:
        return False
    return all('a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word)


def main():
    """Read data/words.txt next to this script and write data/misspellings.txt.

    Exits with status 1 if the word list does not exist.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    words_path = os.path.join(base_dir, 'data', 'words.txt')
    output_path = os.path.join(base_dir, 'data', 'misspellings.txt')

    if not os.path.exists(words_path):
        print(f"ERROR: {words_path} not found.", file=sys.stderr)
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {words_path}")
    with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]
    print(f"Total raw entries: {len(raw_words):,}")

    # Filter to pure-alpha words with length >= 3
    words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
    word_count = len(words)
    print(f"Filtered to {word_count:,} alphabetical words (len >= 3)")

    # ── Generate typos ──────────────────────────────────────────
    start = time.time()
    total_typos = 0
    batch_size = 10_000

    print(f"Generating typos → {output_path}")
    print("This may take a few minutes for 466k words...")

    with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
        out.write("# Auto-generated misspellings database\n")
        out.write("# Format: misspelling=correction\n")
        out.write("# Generated by generate_typos.py\n")
        out.write("#\n")
        out.write("# Strategies: adjacent swaps, deletions, duplications, keyboard proximity\n")
        out.write("\n")

        # NOTE(review): a typo that is itself a valid word (e.g. "hell" for
        # "hello") is still emitted; confirm the consumer checks its
        # dictionary before consulting this table.
        for idx, word in enumerate(words):
            correction = word  # original is the correct form
            typos = generate_all_typos(word.lower())
            # One buffered call per word instead of one per typo.
            out.writelines(f"{typo}={correction}\n" for typo in sorted(typos))
            total_typos += len(typos)

            # Progress reporting
            done = idx + 1
            if done % batch_size == 0:
                elapsed = time.time() - start
                pct = done / word_count * 100
                rate = done / elapsed if elapsed > 0 else 0
                print(f" [{pct:5.1f}%] {done:>7,} / {word_count:,} words |"
                      f" {total_typos:>10,} typos | {rate:.0f} words/sec")

    elapsed = time.time() - start
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

    print()
    print("=" * 60)
    print(f" Done in {elapsed:.1f}s")
    print(f" Words processed : {word_count:,}")
    print(f" Typos generated : {total_typos:,}")
    print(f" Output file : {output_path}")
    print(f" File size : {file_size_mb:.1f} MB")
    print("=" * 60)


if __name__ == '__main__':
    main()