""" ============================================================================= FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition) ============================================================================= Purpose: Generate ALL possible letter permutations of each word from words.txt and write them as misspelling=correction pairs. WARNING — READ BEFORE RUNNING This is computationally EXTREME. A single 10-letter word has 3,628,800 permutations. A 12-letter word has 479,001,600. For 466k words, the full output could be PETABYTES. You WILL need to limit word length. ============================================================================= HOW TO USE ON GOOGLE COLAB ============================================================================= 1. Open Google Colab → https://colab.research.google.com 2. Create a new notebook (Python 3) 3. Upload your words.txt: ───────────────────────────────────── # CELL 1: Upload words.txt from google.colab import files uploaded = files.upload() # click "Choose Files" → select words.txt ───────────────────────────────────── 4. Copy-paste this ENTIRE script into a new cell and run it. 5. Download the result: ───────────────────────────────────── # CELL 3: Download the output files.download('misspellings_permutations.txt') ───────────────────────────────────── ============================================================================= OR: Use Google Drive for large files ============================================================================= # Mount Google Drive (you get 15 GB free) from google.colab import drive drive.mount('/content/drive') # Then set OUTPUT_PATH below to: OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt' ============================================================================= CONFIGURATION — Adjust these before running! ============================================================================= """ import os import sys import time import math from itertools import permutations # ── CONFIGURATION ─────────────────────────────────────────────────────────── WORDS_PATH = 'words.txt' # path to your words.txt OUTPUT_PATH = 'misspellings_permutations.txt' # output file path MIN_WORD_LEN = 3 # skip words shorter than this MAX_WORD_LEN = 7 # CRITICAL: max word length to permute # 7 → max 5,040 perms/word (manageable) # 8 → max 40,320 perms/word (large) # 9 → max 362,880 perms/word (very large) # 10 → max 3,628,800 perms/word (EXTREME) # Increase at your own risk! ONLY_ALPHA = True # only process pure-alphabetical words BATCH_LOG = 5000 # print progress every N words # ── ESTIMATION TABLE ──────────────────────────────────────────────────────── # Here's roughly how big the output gets at each MAX_WORD_LEN setting, # assuming ~200k qualifying words at each length bracket: # # MAX_WORD_LEN │ Perms per word (worst) │ Rough output size # ─────────────┼────────────────────────┼────────────────── # 5 │ 120 │ ~200 MB # 6 │ 720 │ ~1-2 GB # 7 │ 5,040 │ ~5-15 GB # 8 │ 40,320 │ ~50-150 GB # 9 │ 362,880 │ ~500 GB - 1 TB # 10 │ 3,628,800 │ ~5-50 TB ← won't fit anywhere # # Google Colab free tier gives you: # • ~78 GB disk on the VM (temporary, lost on disconnect) # • 15 GB Google Drive (persistent) # • Colab Pro: 225 GB disk, longer runtimes # # RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size, # then increase if you have space. # ──────────────────────────────────────────────────────────────────────────── def estimate_output(words): """Estimate total permutations and file size before generating.""" total_perms = 0 for w in words: n = len(w) # Account for duplicate letters: n! / (c1! * c2! * ...) freq = {} for ch in w.lower(): freq[ch] = freq.get(ch, 0) + 1 unique_perms = math.factorial(n) for count in freq.values(): unique_perms //= math.factorial(count) total_perms += unique_perms - 1 # subtract the original word # Estimate ~15 bytes per line (avg) → "typo=word\n" avg_bytes_per_line = 15 est_bytes = total_perms * avg_bytes_per_line est_gb = est_bytes / (1024 ** 3) return total_perms, est_gb def generate_unique_permutations(word): """ Generate all unique permutations of a word's letters, excluding the original word itself. Uses set() to deduplicate (handles repeated letters efficiently). """ lower = word.lower() perms = set(''.join(p) for p in permutations(lower)) perms.discard(lower) # remove the correctly-spelled word return perms def is_pure_alpha(word): return word.isalpha() def main(): if not os.path.exists(WORDS_PATH): print(f"ERROR: '{WORDS_PATH}' not found!") print("Make sure you uploaded words.txt or set WORDS_PATH correctly.") sys.exit(1) # ── Read words ────────────────────────────────────────────── print(f"Reading words from: {WORDS_PATH}") with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f: raw_words = [line.strip() for line in f if line.strip()] print(f"Total raw entries: {len(raw_words):,}") # Filter words = [] for w in raw_words: if ONLY_ALPHA and not is_pure_alpha(w): continue if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN: continue words.append(w) print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})") if len(words) == 0: print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.") sys.exit(1) # ── Estimate ──────────────────────────────────────────────── print("\nEstimating output size (this may take a moment)...") total_perms, est_gb = estimate_output(words) print(f" Estimated permutations : {total_perms:,}") print(f" Estimated file size : {est_gb:.2f} GB") # Safety check if est_gb > 70: print(f"\n WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).") print(" Reduce MAX_WORD_LEN or the script will crash when disk fills up.") print(" Aborting. Set MAX_WORD_LEN lower and re-run.") sys.exit(1) print(f"\nProceeding with generation → {OUTPUT_PATH}") print("=" * 60) # ── Generate ──────────────────────────────────────────────── start = time.time() total_written = 0 with open(OUTPUT_PATH, 'w', encoding='utf-8') as out: out.write("# Auto-generated FULL PERMUTATION misspellings\n") out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n") out.write("# Format: misspelling=correction\n\n") for idx, word in enumerate(words): perms = generate_unique_permutations(word) for typo in sorted(perms): out.write(f"{typo}={word}\n") total_written += 1 # Progress if (idx + 1) % BATCH_LOG == 0: elapsed = time.time() - start pct = (idx + 1) / len(words) * 100 rate = (idx + 1) / elapsed if elapsed > 0 else 0 cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3) print(f" [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |" f" {total_written:>12,} lines | {cur_size:.2f} GB |" f" {rate:.0f} words/sec") elapsed = time.time() - start final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3) print() print("=" * 60) print(f" DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)") print(f" Words processed : {len(words):,}") print(f" Lines written : {total_written:,}") print(f" Output file : {OUTPUT_PATH}") print(f" File size : {final_size:.2f} GB") print("=" * 60) if __name__ == '__main__': main()