"""
=============================================================================
FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition)
=============================================================================
Purpose:
Generate ALL possible letter permutations of each word from words.txt
and write them as misspelling=correction pairs.

WARNING — READ BEFORE RUNNING
This is computationally EXTREME. A single 10-letter word has 3,628,800
permutations. A 12-letter word has 479,001,600. For 466k words, the full
output could be PETABYTES. You WILL need to limit word length.
=============================================================================
HOW TO USE ON GOOGLE COLAB
=============================================================================
1. Open Google Colab → https://colab.research.google.com
2. Create a new notebook (Python 3)
3. Upload your words.txt:
   ─────────────────────────────────────
   # CELL 1: Upload words.txt
   from google.colab import files
   uploaded = files.upload()   # click "Choose Files" → select words.txt
   ─────────────────────────────────────
4. Copy-paste this ENTIRE script into a new cell and run it.
5. Download the result:
   ─────────────────────────────────────
   # CELL 3: Download the output
   files.download('misspellings_permutations.txt')
   ─────────────────────────────────────
=============================================================================
OR: Use Google Drive for large files
=============================================================================
# Mount Google Drive (you get 15 GB free)
from google.colab import drive
drive.mount('/content/drive')
# Then set OUTPUT_PATH below to:
OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'
=============================================================================
CONFIGURATION — Adjust these before running!
=============================================================================
"""
import math
import os
import sys
import time
from collections import Counter
from itertools import permutations
# ── CONFIGURATION ────────────────────────────────────────────────────────────
WORDS_PATH = 'words.txt'                        # path to your words.txt
OUTPUT_PATH = 'misspellings_permutations.txt'   # output file path
MIN_WORD_LEN = 3    # skip words shorter than this
MAX_WORD_LEN = 7    # CRITICAL: max word length to permute
                    #   7 → max 5,040 perms/word (manageable)
                    #   8 → max 40,320 perms/word (large)
                    #   9 → max 362,880 perms/word (very large)
                    #  10 → max 3,628,800 perms/word (EXTREME)
                    # Increase at your own risk!
ONLY_ALPHA = True   # only process pure-alphabetical words
BATCH_LOG = 5000    # print progress every N words
# ── ESTIMATION TABLE ─────────────────────────────────────────────────────────
# Here's roughly how big the output gets at each MAX_WORD_LEN setting,
# assuming ~200k qualifying words at each length bracket:
#
#  MAX_WORD_LEN │ Perms per word (worst) │ Rough output size
#  ─────────────┼────────────────────────┼──────────────────
#        5      │          120           │   ~200 MB
#        6      │          720           │   ~1-2 GB
#        7      │        5,040           │   ~5-15 GB
#        8      │       40,320           │   ~50-150 GB
#        9      │      362,880           │   ~500 GB - 1 TB
#       10      │    3,628,800           │   ~5-50 TB → won't fit anywhere
#
# Google Colab free tier gives you:
#   • ~78 GB disk on the VM (temporary, lost on disconnect)
#   • 15 GB Google Drive (persistent)
#   • Colab Pro: 225 GB disk, longer runtimes
#
# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,
# then increase if you have space.
# ────────────────────────────────────────────────────────────────────────────
def estimate_output(words):
    """Estimate total permutations and output size before generating.

    For each word, counts the distinct letter arrangements with the
    multinomial formula n! / (c1! * c2! * ...) so repeated letters are
    not over-counted, then subtracts 1 because the correctly-spelled
    word itself is never written as a misspelling.

    Args:
        words: iterable of words that will be permuted.

    Returns:
        Tuple (total_perms, est_gb) where total_perms is the number of
        misspelling lines that would be produced and est_gb is the
        estimated output file size in GiB (assuming ~15 bytes per
        "typo=word\\n" line on average).
    """
    total_perms = 0
    for w in words:
        # Counter gives the letter multiplicities for the multinomial divisor.
        counts = Counter(w.lower())
        unique_perms = math.factorial(len(w)) // math.prod(
            math.factorial(c) for c in counts.values()
        )
        total_perms += unique_perms - 1  # exclude the original word
    avg_bytes_per_line = 15  # rough average for "typo=word\n"
    est_gb = total_perms * avg_bytes_per_line / (1024 ** 3)
    return total_perms, est_gb
def generate_unique_permutations(word):
    """Return every distinct rearrangement of *word*'s letters as a set.

    The word is lowercased first, and the correct (lowercased) spelling
    itself is excluded from the result. Building a set automatically
    collapses duplicates produced by repeated letters.
    """
    base = word.lower()
    rearranged = {''.join(letters) for letters in permutations(base)}
    return rearranged - {base}
def is_pure_alpha(word):
    """True when *word* is non-empty and consists solely of alphabetic characters."""
    return bool(word) and all(ch.isalpha() for ch in word)
def main():
    """Run the full pipeline: read, filter, estimate, generate, report.

    Reads WORDS_PATH, filters by ONLY_ALPHA and the MIN/MAX length window,
    refuses to run when the estimated output would overflow Colab's disk,
    then streams every misspelling=correction pair to OUTPUT_PATH.

    Exits with status 1 on missing input, empty filter result, or an
    over-budget size estimate.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {WORDS_PATH}")
    # errors='replace' keeps the run alive if words.txt has bad bytes.
    with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]
    print(f"Total raw entries: {len(raw_words):,}")

    # Filter to qualifying words (alpha-only, within the length window).
    words = []
    for w in raw_words:
        if ONLY_ALPHA and not is_pure_alpha(w):
            continue
        if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:
            continue
        words.append(w)
    print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})")
    if len(words) == 0:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)

    # ── Estimate ────────────────────────────────────────────────
    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f" Estimated permutations : {total_perms:,}")
    print(f" Estimated file size : {est_gb:.2f} GB")

    # Safety check: abort before filling the Colab VM's ~78 GB disk.
    if est_gb > 70:
        print(f"\n WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print(" Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print(" Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)

    print(f"\nProceeding with generation → {OUTPUT_PATH}")
    print("=" * 60)

    # ── Generate ────────────────────────────────────────────────
    start = time.time()
    total_written = 0
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")
        for idx, word in enumerate(words):
            perms = generate_unique_permutations(word)
            # Sorted output keeps the file deterministic across runs.
            for typo in sorted(perms):
                out.write(f"{typo}={word}\n")
                total_written += 1
            # Progress report every BATCH_LOG words.
            if (idx + 1) % BATCH_LOG == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
                print(f" [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |"
                      f" {total_written:>12,} lines | {cur_size:.2f} GB |"
                      f" {rate:.0f} words/sec")

    # ── Summary ─────────────────────────────────────────────────
    elapsed = time.time() - start
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
    print()
    print("=" * 60)
    print(f" DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f" Words processed : {len(words):,}")
    print(f" Lines written : {total_written:,}")
    print(f" Output file : {OUTPUT_PATH}")
    print(f" File size : {final_size:.2f} GB")
    print("=" * 60)
# Entry point: run the generator only when executed as a script,
# not when this file is imported as a module.
if __name__ == '__main__':
    main()