File size: 9,618 Bytes
2b97944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""

=============================================================================

  FULL PERMUTATION MISSPELLINGS GENERATOR  (Google Colab Edition)

=============================================================================



Purpose:

  Generate ALL possible letter permutations of each word from words.txt

  and write them as misspelling=correction pairs.



  WARNING — READ BEFORE RUNNING

  This is computationally EXTREME. A single 10-letter word has 3,628,800

  permutations. A 12-letter word has 479,001,600. For 466k words, the full

  output could be PETABYTES. You WILL need to limit word length.



=============================================================================

  HOW TO USE ON GOOGLE COLAB

=============================================================================



1. Open Google Colab  →  https://colab.research.google.com

2. Create a new notebook (Python 3)



3. Upload your words.txt:

   ─────────────────────────────────────

   # CELL 1: Upload words.txt

   from google.colab import files

   uploaded = files.upload()     # click "Choose Files" → select words.txt

   ─────────────────────────────────────



4. Copy-paste this ENTIRE script into a new cell and run it.



5. Download the result:

   ─────────────────────────────────────

   # CELL 3: Download the output

   files.download('misspellings_permutations.txt')

   ─────────────────────────────────────



=============================================================================

  OR: Use Google Drive for large files

=============================================================================



   # Mount Google Drive (you get 15 GB free)

   from google.colab import drive

   drive.mount('/content/drive')



   # Then set OUTPUT_PATH below to:

   OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'



=============================================================================

  CONFIGURATION β€” Adjust these before running!

=============================================================================

"""

import math
import os
import sys
import time
from collections import Counter
from itertools import permutations

# ── CONFIGURATION ───────────────────────────────────────────────────────────

WORDS_PATH   = 'words.txt'                          # path to your words.txt
OUTPUT_PATH  = 'misspellings_permutations.txt'       # output file path

MIN_WORD_LEN = 3     # skip words shorter than this
MAX_WORD_LEN = 7     # CRITICAL: max word length to permute
                      # 7  → max 5,040 perms/word   (manageable)
                      # 8  → max 40,320 perms/word  (large)
                      # 9  → max 362,880 perms/word (very large)
                      # 10 → max 3,628,800 perms/word (EXTREME)
                      # Increase at your own risk!

ONLY_ALPHA   = True   # only process pure-alphabetical words
BATCH_LOG    = 5000   # print progress every N words

# ── ESTIMATION TABLE ────────────────────────────────────────────────────────
# Here's roughly how big the output gets at each MAX_WORD_LEN setting,
# assuming ~200k qualifying words at each length bracket:
#
# MAX_WORD_LEN │ Perms per word (worst) │ Rough output size
# ─────────────┼────────────────────────┼──────────────────
#      5       │          120           │   ~200 MB
#      6       │          720           │   ~1-2 GB
#      7       │        5,040           │   ~5-15 GB
#      8       │       40,320           │   ~50-150 GB
#      9       │      362,880           │   ~500 GB - 1 TB
#     10       │    3,628,800           │   ~5-50 TB  ← won't fit anywhere
#
# Google Colab free tier gives you:
#   • ~78 GB disk on the VM (temporary, lost on disconnect)
#   • 15 GB Google Drive (persistent)
#   • Colab Pro: 225 GB disk, longer runtimes
#
# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,
# then increase if you have space.
# ────────────────────────────────────────────────────────────────────────────


def estimate_output(words):
    """Estimate total permutations and file size before generating.

    Args:
        words: iterable of candidate words (already filtered by length).

    Returns:
        (total_perms, est_gb): the total number of unique misspellings
        (each word's distinct letter permutations, minus the correct
        spelling) and the estimated output size in gigabytes.
    """
    total_perms = 0
    for w in words:
        # Count on the lowered word, since generate_unique_permutations
        # permutes w.lower() (lower() can change length for some
        # non-ASCII letters, so len(w) would be the wrong n there).
        lw = w.lower()
        # Multiset permutation count: n! / (c1! * c2! * ...) accounts for
        # repeated letters producing identical strings.
        unique_perms = math.factorial(len(lw))
        for count in Counter(lw).values():
            unique_perms //= math.factorial(count)
        total_perms += unique_perms - 1  # subtract the original word

    # Estimate ~15 bytes per line (avg)  →  "typo=word\n"
    avg_bytes_per_line = 15
    est_bytes = total_perms * avg_bytes_per_line
    est_gb = est_bytes / (1024 ** 3)

    return total_perms, est_gb


def generate_unique_permutations(word):
    """Return every distinct rearrangement of *word*'s letters.

    The word is lowercased before permuting, and the correct spelling is
    removed from the result. Collecting the joined permutations into a set
    deduplicates rearrangements made identical by repeated letters.
    """
    base = word.lower()
    rearrangements = {''.join(letters) for letters in permutations(base)}
    rearrangements.discard(base)  # never emit the correct spelling as a typo
    return rearrangements


def is_pure_alpha(word):
    """Return True when *word* is non-empty and entirely alphabetic."""
    return bool(word) and all(ch.isalpha() for ch in word)


def _read_words(path):
    """Return stripped, non-blank lines from the word-list file at *path*."""
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return [line.strip() for line in f if line.strip()]


def _filter_words(raw_words):
    """Apply the ONLY_ALPHA and MIN/MAX_WORD_LEN config filters."""
    words = []
    for w in raw_words:
        if ONLY_ALPHA and not is_pure_alpha(w):
            continue
        if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:
            continue
        words.append(w)
    return words


def _generate_file(words):
    """Write every misspelling=correction pair to OUTPUT_PATH.

    Logs progress every BATCH_LOG words. Returns
    (total_written, elapsed_seconds).
    """
    start = time.time()
    total_written = 0

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")

        for idx, word in enumerate(words):
            perms = generate_unique_permutations(word)

            # Sorted so output is deterministic/reproducible across runs.
            for typo in sorted(perms):
                out.write(f"{typo}={word}\n")
                total_written += 1

            if (idx + 1) % BATCH_LOG == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                # Flush buffered writes so getsize reflects real progress.
                out.flush()
                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
                print(f"  [{pct:5.1f}%]  {idx+1:>7,}/{len(words):,} words  |"
                      f"  {total_written:>12,} lines  |  {cur_size:.2f} GB  |"
                      f"  {rate:.0f} words/sec")

    return total_written, time.time() - start


def main():
    """Drive the pipeline: read, filter, estimate, generate, report.

    Exits with status 1 when the word list is missing, no words pass the
    filter, or the estimated output would not fit on Colab's disk.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {WORDS_PATH}")
    raw_words = _read_words(WORDS_PATH)
    print(f"Total raw entries: {len(raw_words):,}")

    words = _filter_words(raw_words)
    print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})")

    if len(words) == 0:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)

    # ── Estimate ────────────────────────────────────────────────
    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f"  Estimated permutations : {total_perms:,}")
    print(f"  Estimated file size    : {est_gb:.2f} GB")

    # Safety check: refuse to start if the output cannot fit on disk.
    if est_gb > 70:
        print(f"\n  WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print("  Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print("  Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)

    print(f"\nProceeding with generation → {OUTPUT_PATH}")
    print("=" * 60)

    # ── Generate ────────────────────────────────────────────────
    total_written, elapsed = _generate_file(words)
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)

    print()
    print("=" * 60)
    print(f"  DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f"  Words processed  : {len(words):,}")
    print(f"  Lines written    : {total_written:,}")
    print(f"  Output file      : {OUTPUT_PATH}")
    print(f"  File size        : {final_size:.2f} GB")
    print("=" * 60)


if __name__ == '__main__':
    main()