misspelling-generator / generate_permutations_colab.py
algorembrant's picture
Upload 5 files
2b97944 verified
"""
=============================================================================
FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition)
=============================================================================
Purpose:
Generate ALL possible letter permutations of each word from words.txt
and write them as misspelling=correction pairs.
WARNING β€” READ BEFORE RUNNING
This is computationally EXTREME. A single 10-letter word has 3,628,800
permutations. A 12-letter word has 479,001,600. For 466k words, the full
output could be PETABYTES. You WILL need to limit word length.
=============================================================================
HOW TO USE ON GOOGLE COLAB
=============================================================================
1. Open Google Colab β†’ https://colab.research.google.com
2. Create a new notebook (Python 3)
3. Upload your words.txt:
─────────────────────────────────────
# CELL 1: Upload words.txt
from google.colab import files
uploaded = files.upload() # click "Choose Files" β†’ select words.txt
─────────────────────────────────────
4. Copy-paste this ENTIRE script into a new cell and run it.
5. Download the result:
─────────────────────────────────────
# CELL 3: Download the output
files.download('misspellings_permutations.txt')
─────────────────────────────────────
=============================================================================
OR: Use Google Drive for large files
=============================================================================
# Mount Google Drive (you get 15 GB free)
from google.colab import drive
drive.mount('/content/drive')
# Then set OUTPUT_PATH below to:
OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'
=============================================================================
CONFIGURATION β€” Adjust these before running!
=============================================================================
"""
import os
import sys
import time
import math
from itertools import permutations
# ── CONFIGURATION ───────────────────────────────────────────────────────────
WORDS_PATH: str = 'words.txt' # path to your words.txt
OUTPUT_PATH: str = 'misspellings_permutations.txt' # output file path
MIN_WORD_LEN: int = 3 # skip words shorter than this
MAX_WORD_LEN: int = 7 # CRITICAL: max word length to permute
# 7 -> max 5,040 perms/word (manageable)
# 8 -> max 40,320 perms/word (large)
# 9 -> max 362,880 perms/word (very large)
# 10 -> max 3,628,800 perms/word (EXTREME)
# Increase at your own risk!
ONLY_ALPHA: bool = True # only process pure-alphabetical words
BATCH_LOG: int = 5000 # print progress every N words
# ── ESTIMATION TABLE ────────────────────────────────────────────────────────
# Here's roughly how big the output gets at each MAX_WORD_LEN setting,
# assuming ~200k qualifying words at each length bracket:
#
# MAX_WORD_LEN β”‚ Perms per word (worst) β”‚ Rough output size
# ─────────────┼────────────────────────┼──────────────────
# 5 β”‚ 120 β”‚ ~200 MB
# 6 β”‚ 720 β”‚ ~1-2 GB
# 7 β”‚ 5,040 β”‚ ~5-15 GB
# 8 β”‚ 40,320 β”‚ ~50-150 GB
# 9 β”‚ 362,880 β”‚ ~500 GB - 1 TB
# 10 β”‚ 3,628,800 β”‚ ~5-50 TB ← won't fit anywhere
#
# Google Colab free tier gives you:
# β€’ ~78 GB disk on the VM (temporary, lost on disconnect)
# β€’ 15 GB Google Drive (persistent)
# β€’ Colab Pro: 225 GB disk, longer runtimes
#
# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,
# then increase if you have space.
# ────────────────────────────────────────────────────────────────────────────
def estimate_output(words):
    """Estimate the total pair count and output size before generating.

    For each word of length n, the number of distinct letter arrangements
    is the multinomial  n! / (c1! * c2! * ...)  where the c_i are the
    per-letter frequencies; the correctly-spelled word itself is excluded
    (matching generate_unique_permutations).

    The byte estimate is exact rather than a flat per-line guess: each
    output line is "typo=word\\n", i.e. len(typo) + 1 + len(word) + 1 =
    2*n + 2 bytes, since filtered words are ASCII-alphabetic (one byte
    per character in UTF-8).

    Args:
        words: iterable of words to be permuted.

    Returns:
        Tuple (total_perms, est_gb): the number of misspelling lines that
        will be written, and the estimated file size in GiB.
    """
    total_perms = 0
    total_bytes = 0
    for w in words:
        lower = w.lower()
        n = len(lower)
        # n! / (c1! * c2! * ...) counts *distinct* permutations when
        # letters repeat.
        unique_perms = math.factorial(n)
        for ch in set(lower):
            unique_perms //= math.factorial(lower.count(ch))
        pairs = unique_perms - 1  # exclude the correctly-spelled word
        total_perms += pairs
        total_bytes += pairs * (2 * n + 2)  # exact "typo=word\n" length
    est_gb = total_bytes / (1024 ** 3)
    return total_perms, est_gb
def generate_unique_permutations(word):
    """Return every distinct rearrangement of *word*'s letters.

    The word is lowercased first, and its correct spelling is removed
    from the result, so only genuine misspellings remain. Collecting
    into a set deduplicates arrangements that coincide when the word
    contains repeated letters.

    Args:
        word: the correctly-spelled source word.

    Returns:
        A set of lowercase misspelling strings (possibly empty, e.g.
        for words whose letters are all identical).
    """
    base = word.lower()
    candidates = {''.join(letters) for letters in permutations(base)}
    candidates.discard(base)
    return candidates
def is_pure_alpha(word):
    """True when *word* is non-empty and every character is alphabetic."""
    return bool(word) and all(ch.isalpha() for ch in word)
def _load_filtered_words():
    """Read WORDS_PATH and return the words passing the configured filters.

    Filters: optionally alpha-only (ONLY_ALPHA) and length within
    [MIN_WORD_LEN, MAX_WORD_LEN]. Exits the process with status 1 when
    the input file is missing or no word survives filtering, matching
    the original script's behaviour.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)
    print(f"Reading words from: {WORDS_PATH}")
    # errors='replace' keeps the read alive on stray non-UTF-8 bytes.
    with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]
    print(f"Total raw entries: {len(raw_words):,}")
    words = []
    for w in raw_words:
        if ONLY_ALPHA and not is_pure_alpha(w):
            continue
        if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:
            continue
        words.append(w)
    print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})")
    if len(words) == 0:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)
    return words


def _generate_file(words):
    """Write all misspelling=correction pairs for *words* to OUTPUT_PATH.

    Returns:
        Tuple (total_written, elapsed_seconds, final_size_gb).
    """
    start = time.time()
    total_written = 0
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")
        for idx, word in enumerate(words):
            perms = generate_unique_permutations(word)
            # Sorted output keeps the file deterministic run-to-run.
            for typo in sorted(perms):
                out.write(f"{typo}={word}\n")
                total_written += 1
            if (idx + 1) % BATCH_LOG == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                # NOTE(review): size on disk lags slightly behind the
                # line count because the file handle is still buffered.
                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
                print(f" [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |"
                      f" {total_written:>12,} lines | {cur_size:.2f} GB |"
                      f" {rate:.0f} words/sec")
    elapsed = time.time() - start
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
    return total_written, elapsed, final_size


def main():
    """Drive the pipeline: load words, estimate output size, generate file.

    Aborts (exit status 1) when the size estimate exceeds the Colab free
    tier's ~78 GB VM disk, before any data is written.
    """
    words = _load_filtered_words()
    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f" Estimated permutations : {total_perms:,}")
    print(f" Estimated file size : {est_gb:.2f} GB")
    # Safety check: refuse to start a run that would fill the VM disk.
    if est_gb > 70:
        print(f"\n WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print(" Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print(" Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)
    print(f"\nProceeding with generation β†’ {OUTPUT_PATH}")
    print("=" * 60)
    total_written, elapsed, final_size = _generate_file(words)
    print()
    print("=" * 60)
    print(f" DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f" Words processed : {len(words):,}")
    print(f" Lines written : {total_written:,}")
    print(f" Output file : {OUTPUT_PATH}")
    print(f" File size : {final_size:.2f} GB")
    print("=" * 60)


if __name__ == '__main__':
    main()