# Cell 1: upload the word list and store it under the name Cell 2 reads.
#
# Why the explicit write: files.upload() never overwrites an existing file.
# If 'words.txt' is already on the VM, Colab saves the new upload under a
# de-duplicated name such as 'words (1).txt', and the generator cell would
# then silently read the OLD 'words.txt'. Writing the returned bytes to the
# expected path makes the latest upload the one that is actually used.
from google.colab import files

UPLOAD_TARGET = 'words.txt'  # must match WORDS_PATH in the generator cell

uploaded = files.upload()  # select words.txt from your PC

if len(uploaded) != 1:
    raise ValueError(
        f"Expected exactly one uploaded file, got {len(uploaded)}: {sorted(uploaded)}"
    )

# files.upload() returns {original_filename: file_content_bytes}.
(uploaded_name, uploaded_bytes), = uploaded.items()
with open(UPLOAD_TARGET, 'wb') as fh:
    fh.write(uploaded_bytes)

print(f"Stored '{uploaded_name}' as '{UPLOAD_TARGET}' ({len(uploaded_bytes):,} bytes)")
"""
=============================================================================
  FULL PERMUTATION MISSPELLINGS GENERATOR  (Google Colab Edition)
=============================================================================

Purpose:
  Generate ALL possible letter permutations of each word from words.txt
  and write them as misspelling=correction pairs.

⚠️  WARNING — READ BEFORE RUNNING ⚠️
  This is computationally EXTREME. A single 10-letter word has 3,628,800
  permutations. A 12-letter word has 479,001,600. For 466k words, the full
  output could be PETABYTES. You WILL need to limit word length.

=============================================================================
  HOW TO USE ON GOOGLE COLAB
=============================================================================

1. Open Google Colab → https://colab.research.google.com
2. Create a new notebook (Python 3)

3. Upload your words.txt:
   ─────────────────────────────────────
   # CELL 1: Upload words.txt
   from google.colab import files
   uploaded = files.upload()   # click "Choose Files" → select words.txt
   ─────────────────────────────────────

4. Copy-paste this ENTIRE script into a new cell and run it.

5. Download the result:
   ─────────────────────────────────────
   # CELL 3: Download the output
   files.download('misspellings_permutations.txt')
   ─────────────────────────────────────

=============================================================================
  OR: Use Google Drive for large files
=============================================================================

   # Mount Google Drive (you get 15 GB free)
   from google.colab import drive
   drive.mount('/content/drive')

   # Then set OUTPUT_PATH below to:
   OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'

=============================================================================
  CONFIGURATION — Adjust these before running!
=============================================================================
"""

import math
import os
import shutil
import sys
import time
from collections import Counter
from itertools import permutations

# ── CONFIGURATION ───────────────────────────────────────────────────────────

WORDS_PATH   = 'words.txt'                       # path to your words.txt
OUTPUT_PATH  = 'misspellings_permutations.txt'   # output file path

MIN_WORD_LEN = 3       # skip words shorter than this
MAX_WORD_LEN = 7       # ⚠️ CRITICAL: max word length to permute
                       #   7  → max 5,040 perms/word     (manageable)
                       #   8  → max 40,320 perms/word    (large)
                       #   9  → max 362,880 perms/word   (very large)
                       #   10 → max 3,628,800 perms/word (EXTREME)
                       #   Increase at your own risk!

ONLY_ALPHA   = True    # only process pure-alphabetical words
BATCH_LOG    = 5000    # print progress every N words

SKIP_REAL_WORDS = True  # drop permutations that are themselves words in the
                        # list (e.g. "silent" for "listen"); otherwise valid
                        # words get written out as "misspellings"
MAX_OUTPUT_GB   = 70    # hard ceiling on the estimated output size
                        # (Colab free-tier VM disk is ~78 GB)

# ── ESTIMATION TABLE ────────────────────────────────────────────────────────
# Here's roughly how big the output gets at each MAX_WORD_LEN setting,
# assuming ~200k qualifying words at each length bracket:
#
#   MAX_WORD_LEN │ Perms per word (worst) │ Rough output size
#   ─────────────┼────────────────────────┼──────────────────
#        5       │         120            │  ~200 MB
#        6       │         720            │  ~1-2 GB
#        7       │       5,040            │  ~5-15 GB
#        8       │      40,320            │  ~50-150 GB
#        9       │     362,880            │  ~500 GB - 1 TB
#       10       │   3,628,800            │  ~5-50 TB  ← won't fit anywhere
#
# Measured data point: a 466,550-entry word list at MAX_WORD_LEN = 7 kept
# 125,414 words and produced 173,110,626 lines (2.53 GB) with every
# permutation written (i.e. SKIP_REAL_WORDS = False).
#
# Google Colab free tier gives you:
#   • ~78 GB disk on the VM (temporary, lost on disconnect)
#   • 15 GB Google Drive (persistent)
#   • Colab Pro: 225 GB disk, longer runtimes
#
# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,
#                 then increase if you have space.
# ────────────────────────────────────────────────────────────────────────────


def estimate_output(words):
    """Estimate total permutations and file size before generating.

    For each word the number of distinct rearrangements is the multinomial
    n! / (c1! * c2! * ...) over its lowercased letters, minus one for the
    word itself. The byte size is computed per word from the exact length of
    an output line, "typo=word\\n", assuming UTF-8 and '\\n' newlines.

    Args:
        words: iterable of words exactly as they will be passed to
            generate_unique_permutations().

    Returns:
        (total_perms, est_gb): the number of lines that would be written
        when no permutation is excluded, and the corresponding size in GiB.
        This is an upper bound when real words are being skipped.
    """
    total_perms = 0
    total_bytes = 0
    for w in words:
        # Count on the lowercased form because that is what gets permuted.
        # Lowercasing can change the length for a few Unicode characters,
        # so len(w) would disagree with the generator there.
        lower = w.lower()
        unique_perms = math.factorial(len(lower))
        for count in Counter(lower).values():
            unique_perms //= math.factorial(count)
        unique_perms -= 1  # subtract the original word
        total_perms += unique_perms

        # Every typo has the same encoded size as `lower`; the correction is
        # written in its original casing. +2 covers '=' and the newline.
        line_bytes = len(lower.encode('utf-8')) + len(w.encode('utf-8')) + 2
        total_bytes += unique_perms * line_bytes

    est_gb = total_bytes / (1024 ** 3)
    return total_perms, est_gb


def generate_unique_permutations(word, exclude=None):
    """Return every distinct rearrangement of a word's lowercased letters.

    The lowercased word itself is never included. Building a set removes the
    duplicates that repeated letters produce.

    Args:
        word: the correctly spelled word.
        exclude: optional container of lowercase strings to leave out as
            well (for example the set of all known words, so that anagrams
            which are real words are not reported as misspellings).

    Returns:
        A set of lowercase strings.
    """
    lower = word.lower()
    perms = {''.join(p) for p in permutations(lower)}
    perms.discard(lower)  # remove the correctly-spelled word
    if exclude:
        # Filter by iterating over the (small) permutation set rather than
        # the (potentially huge) exclusion set.
        perms = {p for p in perms if p not in exclude}
    return perms


def is_pure_alpha(word):
    """Return True when the word is non-empty and consists only of letters."""
    return word.isalpha()


def _free_disk_gb(path):
    """Return free space in GiB for the directory that will hold `path`, or None if unknown."""
    target_dir = os.path.dirname(os.path.abspath(path))
    try:
        return shutil.disk_usage(target_dir).free / (1024 ** 3)
    except OSError:
        # NOTE(review): some mounted filesystems may not report usage;
        # treat that as "unknown" instead of blocking the run.
        return None


def main():
    """Read WORDS_PATH, filter it, size-check the job, then write OUTPUT_PATH.

    Exits with status 1 (via sys.exit) when the word list is missing, when no
    word passes the filter, or when the estimated output would not fit.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {WORDS_PATH}")
    with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]

    print(f"Total raw entries: {len(raw_words):,}")

    # Filter
    words = [
        w for w in raw_words
        if (not ONLY_ALPHA or is_pure_alpha(w))
        and MIN_WORD_LEN <= len(w) <= MAX_WORD_LEN
    ]

    print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})")

    if len(words) == 0:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)

    # ── Estimate ────────────────────────────────────────────────
    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f"  Estimated permutations : {total_perms:,}")
    print(f"  Estimated file size    : {est_gb:.2f} GB")
    if SKIP_REAL_WORDS:
        print("  (upper bound: permutations that are real words are skipped)")

    # Safety check
    if est_gb > MAX_OUTPUT_GB:
        print(f"\n⚠️  WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print("   Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print("   Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)

    # The fixed ceiling above says nothing about the real target (e.g. a
    # 15 GB Drive folder), so also compare against actual free space.
    free_gb = _free_disk_gb(OUTPUT_PATH)
    if free_gb is not None and est_gb > free_gb:
        print(f"\n⚠️  WARNING: Estimated output ({est_gb:.1f} GB) exceeds the"
              f" free space at the output location ({free_gb:.1f} GB).")
        print("   Aborting. Set MAX_WORD_LEN lower or change OUTPUT_PATH and re-run.")
        sys.exit(1)

    print(f"\nProceeding with generation → {OUTPUT_PATH}")
    print("=" * 60)

    # Permutations keep the length of the source word, so the filtered list
    # already contains every real word a permutation could collide with.
    known_words = {w.lower() for w in words} if SKIP_REAL_WORDS else None

    # ── Generate ────────────────────────────────────────────────
    start = time.time()
    total_written = 0

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")

        for idx, word in enumerate(words):
            perms = generate_unique_permutations(word, exclude=known_words)

            # Sorted so the file content is deterministic across runs.
            out.writelines(f"{typo}={word}\n" for typo in sorted(perms))
            total_written += len(perms)

            # Progress
            if (idx + 1) % BATCH_LOG == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                # Size on disk; may lag slightly behind because of buffering.
                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
                print(f"  [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |"
                      f" {total_written:>12,} lines | {cur_size:.2f} GB |"
                      f" {rate:.0f} words/sec")

    elapsed = time.time() - start
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)

    print()
    print("=" * 60)
    print(f"  ✅ DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f"  Words processed : {len(words):,}")
    print(f"  Lines written   : {total_written:,}")
    print(f"  Output file     : {OUTPUT_PATH}")
    print(f"  File size       : {final_size:.2f} GB")
    print("=" * 60)


if __name__ == '__main__':
    main()
# Cell 3: fetch the generated file.
#
# - Imports what it uses, so the cell does not depend on `files` having been
#   imported by the upload cell.
# - Uses OUTPUT_PATH from the generator cell instead of repeating the file
#   name, so changing the configuration cannot leave this cell pointing at
#   the wrong file.
# - Zips before downloading: files.download() sends the whole file through
#   the browser, and the raw output is multiple GB (a recorded run of this
#   notebook transferred 2,721,877,361 bytes).
import os
import zipfile

from google.colab import files

DRIVE_MOUNT = '/content/drive'

if not os.path.exists(OUTPUT_PATH):
    raise FileNotFoundError(
        f"'{OUTPUT_PATH}' does not exist - run the generator cell first."
    )

if os.path.abspath(OUTPUT_PATH).startswith(DRIVE_MOUNT + os.sep):
    # If saved to Google Drive: just access it from drive.google.com
    print(f"Output is on Google Drive: {OUTPUT_PATH} (no download needed)")
else:
    # If saved to VM disk: compress, then download the archive.
    archive_path = OUTPUT_PATH + '.zip'
    with zipfile.ZipFile(archive_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write(OUTPUT_PATH, arcname=os.path.basename(OUTPUT_PATH))
    archive_gb = os.path.getsize(archive_path) / (1024 ** 3)
    print(f"Compressed to {archive_path} ({archive_gb:.2f} GB)")
    files.download(archive_path)