| | """
|
| | =============================================================================
|
| | FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition)
|
| | =============================================================================
|
| |
|
| | Purpose:
|
| | Generate ALL possible letter permutations of each word from words.txt
|
| | and write them as misspelling=correction pairs.
|
| |
|
| | WARNING β READ BEFORE RUNNING
|
| | This is computationally EXTREME. A single 10-letter word has 3,628,800
|
| | permutations. A 12-letter word has 479,001,600. For 466k words, the full
|
| | output could be PETABYTES. You WILL need to limit word length.
|
| |
|
| | =============================================================================
|
| | HOW TO USE ON GOOGLE COLAB
|
| | =============================================================================
|
| |
|
| | 1. Open Google Colab β https://colab.research.google.com
|
| | 2. Create a new notebook (Python 3)
|
| |
|
| | 3. Upload your words.txt:
|
| | βββββββββββββββββββββββββββββββββββββ
|
| | # CELL 1: Upload words.txt
|
| | from google.colab import files
|
| | uploaded = files.upload() # click "Choose Files" β select words.txt
|
| | βββββββββββββββββββββββββββββββββββββ
|
| |
|
| | 4. Copy-paste this ENTIRE script into a new cell and run it.
|
| |
|
| | 5. Download the result:
|
| | βββββββββββββββββββββββββββββββββββββ
|
| | # CELL 3: Download the output
|
| | files.download('misspellings_permutations.txt')
|
| | βββββββββββββββββββββββββββββββββββββ
|
| |
|
| | =============================================================================
|
| | OR: Use Google Drive for large files
|
| | =============================================================================
|
| |
|
| | # Mount Google Drive (you get 15 GB free)
|
| | from google.colab import drive
|
| | drive.mount('/content/drive')
|
| |
|
| | # Then set OUTPUT_PATH below to:
|
| | OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'
|
| |
|
| | =============================================================================
|
| | CONFIGURATION β Adjust these before running!
|
| | =============================================================================
|
| | """
|
| |
|
import math
import os
import sys
import time
from collections import Counter
from itertools import permutations
|
| |
|
| |
|
| |
|
# --- File locations --------------------------------------------------------
WORDS_PATH = 'words.txt'                       # input word list, one word per line
OUTPUT_PATH = 'misspellings_permutations.txt'  # output: "misspelling=correction" lines

# --- Word-length filter ----------------------------------------------------
# Permutation counts grow factorially with word length (7 letters => 5,040
# arrangements per word), so the upper bound keeps output size sane.
MIN_WORD_LEN = 3
MAX_WORD_LEN = 7

# Skip entries containing digits, hyphens, apostrophes, etc. when True.
ONLY_ALPHA = True
# Emit one progress line every BATCH_LOG processed words.
BATCH_LOG = 5000
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
def estimate_output(words):
    """Estimate the total output line count and file size before generating.

    For each word, the number of *distinct* letter arrangements is the
    multinomial coefficient n! / (c1! * c2! * ...), where the c_i are the
    per-letter counts of the lowercased word.  One is subtracted per word
    because the original spelling itself is never written out (it matches
    what generate_unique_permutations() discards).

    Args:
        words: iterable of words to be processed.

    Returns:
        Tuple (total_perms, est_gb): the total number of output lines and
        a rough size estimate in gibibytes (assumes ~15 bytes per line).
    """
    total_perms = 0
    for w in words:
        # Counter replaces the hand-rolled frequency dict.
        counts = Counter(w.lower())
        unique_perms = math.factorial(len(w))
        for c in counts.values():
            unique_perms //= math.factorial(c)
        # Exclude the correctly spelled word itself.
        total_perms += unique_perms - 1

    # Rough average line: misspelling + '=' + correction + newline.
    avg_bytes_per_line = 15
    est_bytes = total_perms * avg_bytes_per_line
    est_gb = est_bytes / (1024 ** 3)

    return total_perms, est_gb
|
| |
|
| |
|
def generate_unique_permutations(word):
    """Return every distinct rearrangement of *word*'s letters.

    The word is lowercased first, and its own (lowercased) spelling is
    removed from the result, so only genuine "misspellings" remain.
    Building a set collapses duplicate arrangements that arise from
    repeated letters.
    """
    base = word.lower()
    variants = {''.join(letters) for letters in permutations(base)}
    variants.discard(base)
    return variants
|
| |
|
| |
|
def is_pure_alpha(word):
    """Return True if *word* is non-empty and contains only letters."""
    # str.isalpha() is already False for "", so the explicit emptiness
    # check only short-circuits; results are identical either way.
    return bool(word) and word.isalpha()
|
| |
|
| |
|
def main():
    """Read the word list, generate permutation misspellings, write output.

    Flow: validate that WORDS_PATH exists -> filter words by the
    module-level config (ONLY_ALPHA, MIN_WORD_LEN, MAX_WORD_LEN) ->
    estimate output size and abort if it would exceed the Colab disk
    budget -> stream "misspelling=correction" lines to OUTPUT_PATH with
    periodic progress logging.

    Exits via sys.exit(1) when the input file is missing, no words pass
    the filter, or the size estimate exceeds ~70 GB.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)

    print(f"Reading words from: {WORDS_PATH}")
    # errors='replace' keeps the run alive on stray non-UTF-8 bytes.
    with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]

    print(f"Total raw entries: {len(raw_words):,}")

    # Apply the config filters (alphabetic-only, length bounds).
    words = []
    for w in raw_words:
        if ONLY_ALPHA and not is_pure_alpha(w):
            continue
        if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:
            continue
        words.append(w)

    print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})")

    if not words:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)

    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f" Estimated permutations : {total_perms:,}")
    print(f" Estimated file size : {est_gb:.2f} GB")

    # Colab instances ship with roughly 78 GB of disk; bail out early
    # instead of crashing mid-run when the disk fills.
    if est_gb > 70:
        print(f"\n WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print(" Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print(" Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)

    # NOTE: fixed mojibake here ("β" was a mis-encoded arrow).
    print(f"\nProceeding with generation → {OUTPUT_PATH}")
    print("=" * 60)

    start = time.time()
    total_written = 0

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")

        for idx, word in enumerate(words):
            perms = generate_unique_permutations(word)

            # Sorted output keeps the file deterministic across runs.
            for typo in sorted(perms):
                out.write(f"{typo}={word}\n")
                total_written += 1

            if (idx + 1) % BATCH_LOG == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                # Flush so getsize() reflects what was actually written;
                # otherwise buffered data makes the reported size lag.
                out.flush()
                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
                print(f" [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |"
                      f" {total_written:>12,} lines | {cur_size:.2f} GB |"
                      f" {rate:.0f} words/sec")

    elapsed = time.time() - start
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)

    print()
    print("=" * 60)
    print(f" DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f" Words processed : {len(words):,}")
    print(f" Lines written : {total_written:,}")
    print(f" Output file : {OUTPUT_PATH}")
    print(f" File size : {final_size:.2f} GB")
    print("=" * 60)
|
| |
|
| |
|
# Entry point: run directly or paste the whole script into a Colab cell.
if __name__ == '__main__':
    main()
|
| |
|