"""
Generate realistic typo-based misspellings from words.txt → misspellings.txt

Typo strategies:
  1. Adjacent letter swaps         ("hello" → "hlelo", "helol")
  2. Single character deletion     ("hello" → "hllo", "helo")
  3. Single character duplication  ("hello" → "hhello", "heello")
  4. Nearby keyboard key sub       ("hello" → "gello", "jello")

Output format: misspelling=correction (one per line)
"""
|
| |
|
| | import sys
|
| | import os
|
| | import time
|
| |
|
| |
|
# Physical neighbours of each letter key on a standard QWERTY layout.
# Each value lists same-row neighbours first, then the keys on the adjacent
# row(s). The relation is symmetric: if 'r' lists 'd', then 'd' lists 'r'.
# Only letters are included; digits and punctuation keys are deliberately
# left out so that substitutions always yield alphabetical typos.
KEYBOARD_NEIGHBORS = {
    # Top row
    'q': 'wa', 'w': 'qeas', 'e': 'wrsd', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    # Home row
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop',
    # Bottom row
    'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
|
| |
|
| |
|
def generate_adjacent_swaps(word):
    """Return each variant of *word* with one pair of neighbouring letters transposed.

    Variants are listed left to right by swap position. Transposing two
    identical neighbours would just reproduce the word, so those positions
    contribute nothing.
    """
    swapped = []
    for pos, (left, right) in enumerate(zip(word, word[1:])):
        if left == right:
            continue
        swapped.append(word[:pos] + right + left + word[pos + 2:])
    return swapped
|
| |
|
| |
|
def generate_deletions(word):
    """Return each variant of *word* with exactly one character removed.

    Results are ordered by the position of the removed character and may
    contain duplicates (dropping either letter of a doubled pair gives the
    same string). Variants shorter than two characters are never produced,
    so words of fewer than three characters yield an empty list.
    """
    # Every candidate has length len(word) - 1, so the minimum-length rule
    # can be decided once up front.
    if len(word) < 3:
        return []
    return [word[:pos] + word[pos + 1:] for pos in range(len(word))]
|
| |
|
| |
|
def generate_duplications(word):
    """Return each variant of *word* with exactly one character doubled.

    Results are ordered by the position of the doubled character and may
    contain duplicates (doubling either letter of an existing pair gives the
    same string). Each variant is one character longer than *word*, so it can
    never equal the original.
    """
    return [word[:pos] + letter + word[pos:] for pos, letter in enumerate(word)]
|
| |
|
| |
|
def generate_nearby_key_subs(word, neighbors=None):
    """Replace one character at a time with a nearby keyboard key.

    The word is lowercased first, so every returned typo is lowercase.

    Args:
        word: The word to corrupt.
        neighbors: Optional mapping from a lowercase character to a string
            (or other iterable) of characters that may replace it. When
            omitted, the module-level ``KEYBOARD_NEIGHBORS`` table is used.

    Returns:
        A list of typos ordered by character position and then by the order
        of the neighbours in the table. Characters that have no entry in
        the table are left untouched. Duplicates are not removed.
    """
    if neighbors is None:
        neighbors = KEYBOARD_NEIGHBORS

    lower = word.lower()
    typos = []
    # Iterate over the lowercased text itself, not range(len(word)):
    # lowercasing can change the length of a string (e.g. U+0130 becomes two
    # code points), and indexing by the original length would silently skip
    # the trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in neighbors.get(ch, ''):
            typo = lower[:i] + neighbor + lower[i + 1:]
            if typo != lower:
                typos.append(typo)
    return typos
|
| |
|
| |
|
def generate_all_typos(word):
    """Collect the distinct typo variants of *word* from every strategy.

    Runs the adjacent-swap, deletion, duplication and nearby-key generators,
    merges their results into one set, and removes the word itself (both as
    given and lowercased) so that a word is never listed as its own typo.
    """
    strategies = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    variants = set()
    for make_typos in strategies:
        variants.update(make_typos(word))
    return variants - {word, word.lower()}
|
| |
|
| |
|
def is_pure_alpha(word):
    """Return True only if *word* is non-empty and made solely of ASCII letters.

    ``str.isalpha()`` alone also accepts non-ASCII letters (accented
    characters, other scripts), which contradicts the a-z contract and yields
    entries the keyboard-neighbour table cannot handle. The extra code-point
    check restricts the result to a-z / A-Z; callers lowercase afterwards.
    """
    return word.isalpha() and all(ord(ch) < 128 for ch in word)
|
| |
|
| |
|
def main():
    """Read ``data/words.txt`` and write all generated typos to ``data/misspellings.txt``.

    Both paths are resolved relative to the directory containing this script.
    Each non-blank input line is one word; only purely alphabetical words of
    at least three characters are processed. For each word, every typo is
    written as ``typo=word`` (typo lowercase, correction as it appears in the
    input), sorted alphabetically within the word. Progress is printed every
    10,000 words and a summary is printed at the end.

    Exits with status 1 if the word list does not exist.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    words_path = os.path.join(base_dir, 'data', 'words.txt')
    output_path = os.path.join(base_dir, 'data', 'misspellings.txt')

    if not os.path.exists(words_path):
        print(f"ERROR: {words_path} not found.")
        sys.exit(1)

    print(f"Reading words from: {words_path}")
    # errors='replace' keeps a single badly-encoded line from aborting the
    # run; the replacement character is not alphabetical, so such entries
    # are dropped by the filter below.
    with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
        stripped_lines = (line.strip() for line in f)
        raw_words = [entry for entry in stripped_lines if entry]

    print(f"Total raw entries: {len(raw_words):,}")

    words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
    print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

    # perf_counter is monotonic, so the elapsed time and rate cannot be
    # skewed by system clock adjustments during a long run.
    start = time.perf_counter()
    total_typos = 0
    batch_size = 10_000

    # Plain ASCII arrow: a Unicode arrow here can raise UnicodeEncodeError on
    # consoles whose encoding cannot represent it.
    print(f"Generating typos -> {output_path}")
    print(f"This may take a few minutes for {len(words):,} words...")

    # NOTE(review): a typo that is itself a valid word (e.g. "from" for
    # "form") is still emitted, and words differing only in case produce the
    # same typo keys with different corrections. Confirm that the consumer of
    # misspellings.txt tolerates both.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
        out.write("# Auto-generated misspellings database\n")
        out.write("# Format: misspelling=correction\n")
        out.write("# Generated by generate_typos.py\n")
        out.write("#\n")
        out.write("# Strategies: adjacent swaps, deletions, duplications, keyboard proximity\n")
        out.write("\n")

        for idx, word in enumerate(words):
            correction = word
            typos = generate_all_typos(word.lower())

            # One batched write per word instead of one call per typo.
            lines = [f"{typo}={correction}\n" for typo in sorted(typos)]
            out.writelines(lines)
            total_typos += len(lines)

            if (idx + 1) % batch_size == 0:
                elapsed = time.perf_counter() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                print(f" [{pct:5.1f}%] {idx + 1:>7,} / {len(words):,} words |"
                      f" {total_typos:>10,} typos | {rate:.0f} words/sec")

    elapsed = time.perf_counter() - start
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

    print()
    print("=" * 60)
    print(f" Done in {elapsed:.1f}s")
    print(f" Words processed : {len(words):,}")
    print(f" Typos generated : {total_typos:,}")
    print(f" Output file : {output_path}")
    print(f" File size : {file_size_mb:.1f} MB")
    print("=" * 60)
|
| |
|
| |
|
# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|
| |
|