misspelling-generator / generate_permutations_colab.py
algorembrant's picture
Upload 5 files
2b97944 verified
"""
=============================================================================
FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition)
=============================================================================
Purpose:
Generate ALL possible letter permutations of each word from words.txt
and write them as misspelling=correction pairs.
WARNING β€” READ BEFORE RUNNING
This is computationally EXTREME. A single 10-letter word has 3,628,800
permutations. A 12-letter word has 479,001,600. For 466k words, the full
output could be PETABYTES. You WILL need to limit word length.
=============================================================================
HOW TO USE ON GOOGLE COLAB
=============================================================================
1. Open Google Colab β†’ https://colab.research.google.com
2. Create a new notebook (Python 3)
3. Upload your words.txt:
─────────────────────────────────────
# CELL 1: Upload words.txt
from google.colab import files
uploaded = files.upload() # click "Choose Files" β†’ select words.txt
─────────────────────────────────────
4. Copy-paste this ENTIRE script into a new cell and run it.
5. Download the result:
─────────────────────────────────────
# CELL 3: Download the output
files.download('misspellings_permutations.txt')
─────────────────────────────────────
=============================================================================
OR: Use Google Drive for large files
=============================================================================
# Mount Google Drive (you get 15 GB free)
from google.colab import drive
drive.mount('/content/drive')
# Then set OUTPUT_PATH below to:
OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'
=============================================================================
CONFIGURATION β€” Adjust these before running!
=============================================================================
"""
import os
import sys
import time
import math
from itertools import permutations
# ── CONFIGURATION ───────────────────────────────────────────────────────────
WORDS_PATH: str = 'words.txt' # path to your words.txt
OUTPUT_PATH: str = 'misspellings_permutations.txt' # output file path
MIN_WORD_LEN: int = 3 # skip words shorter than this
MAX_WORD_LEN: int = 7 # CRITICAL: max word length to permute
# 7 -> max 5,040 perms/word (manageable)
# 8 -> max 40,320 perms/word (large)
# 9 -> max 362,880 perms/word (very large)
# 10 -> max 3,628,800 perms/word (EXTREME)
# Increase at your own risk!
ONLY_ALPHA: bool = True # only process pure-alphabetical words
BATCH_LOG: int = 5000 # print progress every N words
# ── ESTIMATION TABLE ────────────────────────────────────────────────────────
# Here's roughly how big the output gets at each MAX_WORD_LEN setting,
# assuming ~200k qualifying words at each length bracket:
#
# MAX_WORD_LEN β”‚ Perms per word (worst) β”‚ Rough output size
# ─────────────┼────────────────────────┼──────────────────
# 5 β”‚ 120 β”‚ ~200 MB
# 6 β”‚ 720 β”‚ ~1-2 GB
# 7 β”‚ 5,040 β”‚ ~5-15 GB
# 8 β”‚ 40,320 β”‚ ~50-150 GB
# 9 β”‚ 362,880 β”‚ ~500 GB - 1 TB
# 10 β”‚ 3,628,800 β”‚ ~5-50 TB ← won't fit anywhere
#
# Google Colab free tier gives you:
# β€’ ~78 GB disk on the VM (temporary, lost on disconnect)
# β€’ 15 GB Google Drive (persistent)
# β€’ Colab Pro: 225 GB disk, longer runtimes
#
# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,
# then increase if you have space.
# ────────────────────────────────────────────────────────────────────────────
def estimate_output(words):
    """Estimate the total pair count and output size before generating.

    For each word of length n, the number of distinct letter arrangements
    is the multinomial  n! / (c1! * c2! * ...)  where the c_i are the
    per-letter frequencies; the correctly-spelled word itself is excluded
    (matching generate_unique_permutations).

    The byte estimate is exact rather than a flat per-line guess: each
    output line is "typo=word\\n", i.e. len(typo) + 1 + len(word) + 1 =
    2*n + 2 bytes, since filtered words are ASCII-alphabetic (one byte
    per character in UTF-8).

    Args:
        words: iterable of words to be permuted.

    Returns:
        Tuple (total_perms, est_gb): the number of misspelling lines that
        will be written, and the estimated file size in GiB.
    """
    total_perms = 0
    total_bytes = 0
    for w in words:
        lower = w.lower()
        n = len(lower)
        # n! / (c1! * c2! * ...) counts *distinct* permutations when
        # letters repeat.
        unique_perms = math.factorial(n)
        for ch in set(lower):
            unique_perms //= math.factorial(lower.count(ch))
        pairs = unique_perms - 1  # exclude the correctly-spelled word
        total_perms += pairs
        total_bytes += pairs * (2 * n + 2)  # exact "typo=word\n" length
    est_gb = total_bytes / (1024 ** 3)
    return total_perms, est_gb
def generate_unique_permutations(word):
    """Return every distinct rearrangement of *word*'s letters.

    The word is lowercased first, and its correct spelling is removed
    from the result, so only genuine misspellings remain. Collecting
    into a set deduplicates arrangements that coincide when the word
    contains repeated letters.

    Args:
        word: the correctly-spelled source word.

    Returns:
        A set of lowercase misspelling strings (possibly empty, e.g.
        for words whose letters are all identical).
    """
    base = word.lower()
    candidates = {''.join(letters) for letters in permutations(base)}
    candidates.discard(base)
    return candidates
def is_pure_alpha(word):
    """True when *word* is non-empty and every character is alphabetic."""
    return bool(word) and all(ch.isalpha() for ch in word)
def _load_filtered_words():
    """Read WORDS_PATH and return the words passing the configured filters.

    Filters: optionally alpha-only (ONLY_ALPHA) and length within
    [MIN_WORD_LEN, MAX_WORD_LEN]. Exits the process with status 1 when
    the input file is missing or no word survives filtering, matching
    the original script's behaviour.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)
    print(f"Reading words from: {WORDS_PATH}")
    # errors='replace' keeps the read alive on stray non-UTF-8 bytes.
    with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]
    print(f"Total raw entries: {len(raw_words):,}")
    words = []
    for w in raw_words:
        if ONLY_ALPHA and not is_pure_alpha(w):
            continue
        if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:
            continue
        words.append(w)
    print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})")
    if len(words) == 0:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)
    return words


def _generate_file(words):
    """Write all misspelling=correction pairs for *words* to OUTPUT_PATH.

    Returns:
        Tuple (total_written, elapsed_seconds, final_size_gb).
    """
    start = time.time()
    total_written = 0
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")
        for idx, word in enumerate(words):
            perms = generate_unique_permutations(word)
            # Sorted output keeps the file deterministic run-to-run.
            for typo in sorted(perms):
                out.write(f"{typo}={word}\n")
                total_written += 1
            if (idx + 1) % BATCH_LOG == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                # NOTE(review): size on disk lags slightly behind the
                # line count because the file handle is still buffered.
                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
                print(f" [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |"
                      f" {total_written:>12,} lines | {cur_size:.2f} GB |"
                      f" {rate:.0f} words/sec")
    elapsed = time.time() - start
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
    return total_written, elapsed, final_size


def main():
    """Drive the pipeline: load words, estimate output size, generate file.

    Aborts (exit status 1) when the size estimate exceeds the Colab free
    tier's ~78 GB VM disk, before any data is written.
    """
    words = _load_filtered_words()
    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f" Estimated permutations : {total_perms:,}")
    print(f" Estimated file size : {est_gb:.2f} GB")
    # Safety check: refuse to start a run that would fill the VM disk.
    if est_gb > 70:
        print(f"\n WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print(" Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print(" Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)
    print(f"\nProceeding with generation β†’ {OUTPUT_PATH}")
    print("=" * 60)
    total_written, elapsed, final_size = _generate_file(words)
    print()
    print("=" * 60)
    print(f" DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f" Words processed : {len(words):,}")
    print(f" Lines written : {total_written:,}")
    print(f" Output file : {OUTPUT_PATH}")
    print(f" File size : {final_size:.2f} GB")
    print("=" * 60)


if __name__ == '__main__':
    main()