Upload 4 files

Browse files

Files changed (4) hide show

generate_typos_colab.py +127 -0
generate_typos_local.py +157 -0
google_collab_173MSW.ipynb +608 -0
google_collab_263MSW.ipynb +523 -0

generate_typos_colab.py ADDED Viewed

	@@ -0,0 +1,127 @@

+"""
+Generate realistic typo-based misspellings from words.txt → misspellings.txt
+Colab version
+Place words.txt in /content/ before running
+"""
+import os
+import time
+# Optional: mount Google Drive if your file is there
+# from google.colab import drive
+# drive.mount('/content/drive')
+# words_path = '/content/drive/MyDrive/words.txt'
+words_path = '/content/words.txt'
+output_path = '/content/misspellings.txt'
# QWERTY proximity map: each lowercase letter maps to the letters on the
# physically adjacent keys (left/right on the same row, plus the keys that
# touch it on the row above or below of a standard staggered layout).
# The relation is symmetric: if 'b' is listed for 'a', 'a' is listed for 'b'.
KEYBOARD_NEIGHBORS = {
    # top row
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    # home row
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop',
    # bottom row
    'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
def generate_adjacent_swaps(word):
    """Return the variants of ``word`` with one adjacent pair transposed.

    Variants are listed by swap position, left to right. A swap of two
    equal letters reproduces ``word`` and is left out.
    """
    transposed = (
        word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
        for pos in range(len(word) - 1)
    )
    return [candidate for candidate in transposed if candidate != word]
def generate_deletions(word):
    """Return the variants of ``word`` with exactly one character removed.

    Variants shorter than two characters are not produced, so words of
    fewer than three characters yield an empty list. Deleting either of
    two equal neighbouring letters gives the same text, so the list may
    contain repeats.
    """
    if len(word) < 3:
        # Every deletion would leave fewer than two characters.
        return []
    return [word[:cut] + word[cut + 1:] for cut in range(len(word))]
def generate_duplications(word):
    """Return the variants of ``word`` with one character typed twice.

    One variant is produced per position, in order. Each variant is one
    character longer than ``word``, so it can never equal it. Doubling
    either of two equal neighbouring letters gives the same text, so the
    list may contain repeats.
    """
    return [word[:pos + 1] + word[pos:] for pos in range(len(word))]
def generate_nearby_key_subs(word):
    """Return variants of ``word`` with one letter replaced by a nearby key.

    The word is lowercased first, so every returned variant is lowercase.
    Characters without an entry in ``KEYBOARD_NEIGHBORS`` are never
    substituted.

    Args:
        word: The word to mutate.

    Returns:
        A list of variants ordered by character position, then by the
        neighbor order stored in ``KEYBOARD_NEIGHBORS``.
    """
    lower = word.lower()
    typos = []
    # Walk the lowercased text itself: str.lower() can change the length
    # (for example 'İ' becomes two code points), so positions taken from
    # ``word`` could stop before the end of ``lower``.
    for i, ch in enumerate(lower):
        for neighbor in KEYBOARD_NEIGHBORS.get(ch, ''):
            typo = lower[:i] + neighbor + lower[i + 1:]
            if typo != lower:
                typos.append(typo)
    return typos
def generate_all_typos(word):
    """Return the set of all typo variants of ``word`` from the four strategies.

    Combines adjacent swaps, deletions, duplications and nearby-key
    substitutions, then removes ``word`` and its lowercase form so a word
    is never listed as its own misspelling.
    """
    candidates = {
        *generate_adjacent_swaps(word),
        *generate_deletions(word),
        *generate_duplications(word),
        *generate_nearby_key_subs(word),
    }
    return candidates - {word, word.lower()}
def is_pure_alpha(word):
    """Return True when ``word`` is non-empty and made only of ASCII letters.

    ``str.isalpha()`` alone also accepts accented and non-Latin letters,
    which the QWERTY map in this script has no entries for, so the check
    is restricted to a-z / A-Z.
    """
    return bool(word) and all(
        'a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word
    )
# ── Check file ──────────────────────────────────────────────
if not os.path.exists(words_path):
    raise FileNotFoundError(f"{words_path} not found. Upload it to /content/ first.")

# ── Read words ──────────────────────────────────────────────
print(f"Reading words from: {words_path}")
with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
    raw_words = [line.strip() for line in f if line.strip()]
print(f"Total raw entries: {len(raw_words):,}")

# Only purely alphabetical words of three or more letters get typos.
words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

# Every entry of the word list counts as correctly spelled. A generated
# variant that matches one of them (e.g. "from" derived from "form") is a
# real word, not a misspelling, and must not be written as one.
known_words = {w.lower() for w in raw_words}

# ── Generate typos ──────────────────────────────────────────
start = time.time()
total_typos = 0
batch_size = 10_000
print(f"Generating typos → {output_path}")
with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
    out.write("# Auto-generated misspellings database\n")
    out.write("# Format: misspelling=correction\n\n")
    for done, word in enumerate(words, start=1):
        # The correction keeps the word's original casing; the typos are
        # generated from (and written in) lowercase.
        typos = generate_all_typos(word.lower()) - known_words
        out.writelines(f"{typo}={word}\n" for typo in sorted(typos))
        total_typos += len(typos)
        if done % batch_size == 0:
            elapsed = time.time() - start
            pct = done / len(words) * 100
            rate = done / elapsed if elapsed > 0 else 0
            print(f"[{pct:5.1f}%] {done:,}/{len(words):,} words | "
                  f"{total_typos:,} typos | {rate:.0f} words/sec")

# ── Summary ─────────────────────────────────────────────────
elapsed = time.time() - start
file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
print("\n" + "=" * 60)
print(f"Done in {elapsed:.1f}s")
print(f"Words processed : {len(words):,}")
print(f"Typos generated : {total_typos:,}")
print(f"Output file     : {output_path}")
print(f"File size       : {file_size_mb:.1f} MB")
print("=" * 60)

generate_typos_local.py ADDED Viewed

	@@ -0,0 +1,157 @@

+"""
+Generate realistic typo-based misspellings from words.txt → misspellings.txt
+Typo strategies:
+  1. Adjacent letter swaps      ("hello" → "hlelo", "helol")
+  2. Single character deletion   ("hello" → "hllo", "helo")
+  3. Single character duplication ("hello" → "hhello", "heello")
+  4. Nearby keyboard key sub     ("hello" → "gello", "jello")
+Output format: misspelling=correction (one per line)
+"""
+import sys
+import os
+import time
# QWERTY keyboard proximity map.
# Each lowercase letter maps to the letters on the physically adjacent keys
# (left/right on the same row, plus the keys that touch it on the row above
# or below of a standard staggered layout). The relation is symmetric: if
# 'b' is listed for 'a', then 'a' is listed for 'b'.
KEYBOARD_NEIGHBORS = {
    # top row
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    # home row
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop',
    # bottom row
    'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
def generate_adjacent_swaps(word):
    """Transpose each pair of neighbouring characters in turn.

    Returns the resulting variants ordered by swap position. Pairs of
    equal characters are skipped because swapping them gives ``word`` back.
    """
    results = []
    for left in range(len(word) - 1):
        right = left + 1
        if word[left] == word[right]:
            continue
        results.append(
            word[:left] + word[right] + word[left] + word[right + 1:]
        )
    return results
def generate_deletions(word):
    """Drop one character at a time and collect the shortened variants.

    Only variants that still have at least two characters are returned,
    in order of the deleted position.
    """
    shortened = (
        word[:idx] + word[idx + 1:] for idx, _ in enumerate(word)
    )
    return [variant for variant in shortened if len(variant) >= 2]
def generate_duplications(word):
    """Double one character at a time and collect the lengthened variants.

    Every variant has one more character than ``word``, so none of them
    can be equal to it; one variant is returned per position.
    """
    return [
        word[:idx] + letter + word[idx:]
        for idx, letter in enumerate(word)
    ]
def generate_nearby_key_subs(word):
    """Replace one character at a time with a nearby keyboard key.

    The word is lowercased before substitution, so all returned variants
    are lowercase. Characters that are not keys of ``KEYBOARD_NEIGHBORS``
    are left untouched.

    Args:
        word: The word to mutate.

    Returns:
        A list of variants ordered by character position, then by the
        neighbor order stored in ``KEYBOARD_NEIGHBORS``.
    """
    lower = word.lower()
    typos = []
    # Iterate the lowercased text rather than range(len(word)): lowercasing
    # can lengthen a string (e.g. 'İ' becomes two code points), which would
    # otherwise leave the last characters of ``lower`` unvisited.
    for i, ch in enumerate(lower):
        for neighbor in KEYBOARD_NEIGHBORS.get(ch, ''):
            typo = lower[:i] + neighbor + lower[i + 1:]
            if typo != lower:
                typos.append(typo)
    return typos
def generate_all_typos(word):
    """Collect the typo variants of ``word`` from all four strategies.

    Returns a set, so variants produced by more than one strategy appear
    once. ``word`` and its lowercase form are removed so a word is never
    mapped to itself.
    """
    strategies = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    typos = set()
    for make_typos in strategies:
        typos |= set(make_typos(word))
    typos -= {word, word.lower()}
    return typos
def is_pure_alpha(word):
    """Return True only for non-empty words made purely of ASCII letters.

    ``str.isalpha()`` alone would also accept accented and non-Latin
    letters, contradicting the a-z contract, so each character is checked
    against the a-z / A-Z ranges explicitly.
    """
    return bool(word) and all(
        'a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word
    )
def main():
    """Build ``data/misspellings.txt`` from ``data/words.txt`` beside this script.

    Reads one word per line, keeps purely alphabetical words of at least
    three letters, and writes one ``misspelling=correction`` line per
    generated typo. Typos that are themselves entries of the word list are
    skipped, because a correctly spelled word must not be recorded as a
    misspelling of another word. Progress is printed every 10,000 words.

    Exits with status 1 if the word list is missing.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    words_path = os.path.join(base_dir, 'data', 'words.txt')
    output_path = os.path.join(base_dir, 'data', 'misspellings.txt')
    if not os.path.exists(words_path):
        print(f"ERROR: {words_path} not found.")
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {words_path}")
    with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]
    print(f"Total raw entries: {len(raw_words):,}")

    # Filter to pure-alpha words with length >= 3
    words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
    print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

    # Every entry of the word list counts as correctly spelled, including
    # the short ones that were filtered out above (a deletion can produce
    # a two-letter word such as "at" from "cat").
    known_words = {w.lower() for w in raw_words}

    # ── Generate typos ──────────────────────────────────────────
    start = time.time()
    total_typos = 0
    batch_size = 10_000
    print(f"Generating typos → {output_path}")
    print("This may take a few minutes for 466k words...")
    with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
        out.write("# Auto-generated misspellings database\n")
        out.write("# Format: misspelling=correction\n")
        out.write("# Generated by generate_typos.py\n")
        out.write("#\n")
        out.write("# Strategies: adjacent swaps, deletions, duplications, keyboard proximity\n")
        out.write("\n")
        for done, word in enumerate(words, start=1):
            # The original spelling (with its casing) is the correction;
            # typos are generated from the lowercase form.
            typos = generate_all_typos(word.lower()) - known_words
            out.writelines(f"{typo}={word}\n" for typo in sorted(typos))
            total_typos += len(typos)

            # Progress reporting
            if done % batch_size == 0:
                elapsed = time.time() - start
                pct = done / len(words) * 100
                rate = done / elapsed if elapsed > 0 else 0
                print(f"  [{pct:5.1f}%] {done:>7,} / {len(words):,} words  |"
                      f"  {total_typos:>10,} typos  |  {rate:.0f} words/sec")

    # ── Summary ─────────────────────────────────────────────────
    elapsed = time.time() - start
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
    print()
    print("=" * 60)
    print(f"  Done in {elapsed:.1f}s")
    print(f"  Words processed : {len(words):,}")
    print(f"  Typos generated : {total_typos:,}")
    print(f"  Output file     : {output_path}")
    print(f"  File size       : {file_size_mb:.1f} MB")
    print("=" * 60)


if __name__ == '__main__':
    main()

google_collab_173MSW.ipynb ADDED Viewed

	@@ -0,0 +1,608 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 73
+        },
+        "id": "FaSiqVnTItLq",
+        "outputId": "1ef8a78c-7421-41eb-8cf4-db8426edeed9"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "\n",
+              "     <input type=\"file\" id=\"files-a9b68a68-38ec-4a0a-8037-171b8cfec796\" name=\"files[]\" multiple disabled\n",
+              "        style=\"border:none\" />\n",
+              "     <output id=\"result-a9b68a68-38ec-4a0a-8037-171b8cfec796\">\n",
+              "      Upload widget is only available when the cell has been executed in the\n",
+              "      current browser session. Please rerun this cell to enable.\n",
+              "      </output>\n",
+              "      <script>// Copyright 2017 Google LLC\n",
+              "//\n",
+              "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+              "// you may not use this file except in compliance with the License.\n",
+              "// You may obtain a copy of the License at\n",
+              "//\n",
+              "//      http://www.apache.org/licenses/LICENSE-2.0\n",
+              "//\n",
+              "// Unless required by applicable law or agreed to in writing, software\n",
+              "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+              "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+              "// See the License for the specific language governing permissions and\n",
+              "// limitations under the License.\n",
+              "\n",
+              "/**\n",
+              " * @fileoverview Helpers for google.colab Python module.\n",
+              " */\n",
+              "(function(scope) {\n",
+              "function span(text, styleAttributes = {}) {\n",
+              "  const element = document.createElement('span');\n",
+              "  element.textContent = text;\n",
+              "  for (const key of Object.keys(styleAttributes)) {\n",
+              "    element.style[key] = styleAttributes[key];\n",
+              "  }\n",
+              "  return element;\n",
+              "}\n",
+              "\n",
+              "// Max number of bytes which will be uploaded at a time.\n",
+              "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
+              "\n",
+              "function _uploadFiles(inputId, outputId) {\n",
+              "  const steps = uploadFilesStep(inputId, outputId);\n",
+              "  const outputElement = document.getElementById(outputId);\n",
+              "  // Cache steps on the outputElement to make it available for the next call\n",
+              "  // to uploadFilesContinue from Python.\n",
+              "  outputElement.steps = steps;\n",
+              "\n",
+              "  return _uploadFilesContinue(outputId);\n",
+              "}\n",
+              "\n",
+              "// This is roughly an async generator (not supported in the browser yet),\n",
+              "// where there are multiple asynchronous steps and the Python side is going\n",
+              "// to poll for completion of each step.\n",
+              "// This uses a Promise to block the python side on completion of each step,\n",
+              "// then passes the result of the previous step as the input to the next step.\n",
+              "function _uploadFilesContinue(outputId) {\n",
+              "  const outputElement = document.getElementById(outputId);\n",
+              "  const steps = outputElement.steps;\n",
+              "\n",
+              "  const next = steps.next(outputElement.lastPromiseValue);\n",
+              "  return Promise.resolve(next.value.promise).then((value) => {\n",
+              "    // Cache the last promise value to make it available to the next\n",
+              "    // step of the generator.\n",
+              "    outputElement.lastPromiseValue = value;\n",
+              "    return next.value.response;\n",
+              "  });\n",
+              "}\n",
+              "\n",
+              "/**\n",
+              " * Generator function which is called between each async step of the upload\n",
+              " * process.\n",
+              " * @param {string} inputId Element ID of the input file picker element.\n",
+              " * @param {string} outputId Element ID of the output display.\n",
+              " * @return {!Iterable<!Object>} Iterable of next steps.\n",
+              " */\n",
+              "function* uploadFilesStep(inputId, outputId) {\n",
+              "  const inputElement = document.getElementById(inputId);\n",
+              "  inputElement.disabled = false;\n",
+              "\n",
+              "  const outputElement = document.getElementById(outputId);\n",
+              "  outputElement.innerHTML = '';\n",
+              "\n",
+              "  const pickedPromise = new Promise((resolve) => {\n",
+              "    inputElement.addEventListener('change', (e) => {\n",
+              "      resolve(e.target.files);\n",
+              "    });\n",
+              "  });\n",
+              "\n",
+              "  const cancel = document.createElement('button');\n",
+              "  inputElement.parentElement.appendChild(cancel);\n",
+              "  cancel.textContent = 'Cancel upload';\n",
+              "  const cancelPromise = new Promise((resolve) => {\n",
+              "    cancel.onclick = () => {\n",
+              "      resolve(null);\n",
+              "    };\n",
+              "  });\n",
+              "\n",
+              "  // Wait for the user to pick the files.\n",
+              "  const files = yield {\n",
+              "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
+              "    response: {\n",
+              "      action: 'starting',\n",
+              "    }\n",
+              "  };\n",
+              "\n",
+              "  cancel.remove();\n",
+              "\n",
+              "  // Disable the input element since further picks are not allowed.\n",
+              "  inputElement.disabled = true;\n",
+              "\n",
+              "  if (!files) {\n",
+              "    return {\n",
+              "      response: {\n",
+              "        action: 'complete',\n",
+              "      }\n",
+              "    };\n",
+              "  }\n",
+              "\n",
+              "  for (const file of files) {\n",
+              "    const li = document.createElement('li');\n",
+              "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
+              "    li.append(span(\n",
+              "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
+              "        `last modified: ${\n",
+              "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
+              "                                    'n/a'} - `));\n",
+              "    const percent = span('0% done');\n",
+              "    li.appendChild(percent);\n",
+              "\n",
+              "    outputElement.appendChild(li);\n",
+              "\n",
+              "    const fileDataPromise = new Promise((resolve) => {\n",
+              "      const reader = new FileReader();\n",
+              "      reader.onload = (e) => {\n",
+              "        resolve(e.target.result);\n",
+              "      };\n",
+              "      reader.readAsArrayBuffer(file);\n",
+              "    });\n",
+              "    // Wait for the data to be ready.\n",
+              "    let fileData = yield {\n",
+              "      promise: fileDataPromise,\n",
+              "      response: {\n",
+              "        action: 'continue',\n",
+              "      }\n",
+              "    };\n",
+              "\n",
+              "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
+              "    let position = 0;\n",
+              "    do {\n",
+              "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
+              "      const chunk = new Uint8Array(fileData, position, length);\n",
+              "      position += length;\n",
+              "\n",
+              "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
+              "      yield {\n",
+              "        response: {\n",
+              "          action: 'append',\n",
+              "          file: file.name,\n",
+              "          data: base64,\n",
+              "        },\n",
+              "      };\n",
+              "\n",
+              "      let percentDone = fileData.byteLength === 0 ?\n",
+              "          100 :\n",
+              "          Math.round((position / fileData.byteLength) * 100);\n",
+              "      percent.textContent = `${percentDone}% done`;\n",
+              "\n",
+              "    } while (position < fileData.byteLength);\n",
+              "  }\n",
+              "\n",
+              "  // All done.\n",
+              "  yield {\n",
+              "    response: {\n",
+              "      action: 'complete',\n",
+              "    }\n",
+              "  };\n",
+              "}\n",
+              "\n",
+              "scope.google = scope.google || {};\n",
+              "scope.google.colab = scope.google.colab || {};\n",
+              "scope.google.colab._files = {\n",
+              "  _uploadFiles,\n",
+              "  _uploadFilesContinue,\n",
+              "};\n",
+              "})(self);\n",
+              "</script> "
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Saving words.txt to words (1).txt\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Cell 1\n",
+        "from google.colab import files\n",
+        "uploaded = files.upload()   # select words.txt from your PC\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "\"\"\"\n",
+        "=============================================================================\n",
+        "  FULL PERMUTATION MISSPELLINGS GENERATOR  (Google Colab Edition)\n",
+        "=============================================================================\n",
+        "\n",
+        "Purpose:\n",
+        "  Generate ALL possible letter permutations of each word from words.txt\n",
+        "  and write them as misspelling=correction pairs.\n",
+        "\n",
+        "⚠️  WARNING — READ BEFORE RUNNING  ⚠️\n",
+        "  This is computationally EXTREME. A single 10-letter word has 3,628,800\n",
+        "  permutations. A 12-letter word has 479,001,600. For 466k words, the full\n",
+        "  output could be PETABYTES. You WILL need to limit word length.\n",
+        "\n",
+        "=============================================================================\n",
+        "  HOW TO USE ON GOOGLE COLAB\n",
+        "=============================================================================\n",
+        "\n",
+        "1. Open Google Colab  →  https://colab.research.google.com\n",
+        "2. Create a new notebook (Python 3)\n",
+        "\n",
+        "3. Upload your words.txt:\n",
+        "   ─────────────────────────────────────\n",
+        "   # CELL 1: Upload words.txt\n",
+        "   from google.colab import files\n",
+        "   uploaded = files.upload()     # click \"Choose Files\" → select words.txt\n",
+        "   ─────────────────────────────────────\n",
+        "\n",
+        "4. Copy-paste this ENTIRE script into a new cell and run it.\n",
+        "\n",
+        "5. Download the result:\n",
+        "   ─────────────────────────────────────\n",
+        "   # CELL 3: Download the output\n",
+        "   files.download('misspellings_permutations.txt')\n",
+        "   ─────────────────────────────────────\n",
+        "\n",
+        "=============================================================================\n",
+        "  OR: Use Google Drive for large files\n",
+        "=============================================================================\n",
+        "\n",
+        "   # Mount Google Drive (you get 15 GB free)\n",
+        "   from google.colab import drive\n",
+        "   drive.mount('/content/drive')\n",
+        "\n",
+        "   # Then set OUTPUT_PATH below to:\n",
+        "   OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'\n",
+        "\n",
+        "=============================================================================\n",
+        "  CONFIGURATION — Adjust these before running!\n",
+        "=============================================================================\n",
+        "\"\"\"\n",
+        "\n",
+        "import os\n",
+        "import sys\n",
+        "import time\n",
+        "import math\n",
+        "from itertools import permutations\n",
+        "\n",
+        "# ── CONFIGURATION ───────────────────────────────────────────────────────────\n",
+        "\n",
+        "WORDS_PATH   = 'words.txt'                          # path to your words.txt\n",
+        "OUTPUT_PATH  = 'misspellings_permutations.txt'       # output file path\n",
+        "\n",
+        "MIN_WORD_LEN = 3     # skip words shorter than this\n",
+        "MAX_WORD_LEN = 7     # ⚠️ CRITICAL: max word length to permute\n",
+        "                      # 7  → max 5,040 perms/word   (manageable)\n",
+        "                      # 8  → max 40,320 perms/word  (large)\n",
+        "                      # 9  → max 362,880 perms/word (very large)\n",
+        "                      # 10 → max 3,628,800 perms/word (EXTREME)\n",
+        "                      # Increase at your own risk!\n",
+        "\n",
+        "ONLY_ALPHA   = True   # only process pure-alphabetical words\n",
+        "BATCH_LOG    = 5000   # print progress every N words\n",
+        "\n",
+        "# ── ESTIMATION TABLE ────────────────────────────────────────────────────────\n",
+        "# Here's roughly how big the output gets at each MAX_WORD_LEN setting,\n",
+        "# assuming ~200k qualifying words at each length bracket:\n",
+        "#\n",
+        "# MAX_WORD_LEN │ Perms per word (worst) │ Rough output size\n",
+        "# ─────────────┼────────────────────────┼──────────────────\n",
+        "#      5       │          120           │   ~200 MB\n",
+        "#      6       │          720           │   ~1-2 GB\n",
+        "#      7       │        5,040           │   ~5-15 GB\n",
+        "#      8       │       40,320           │   ~50-150 GB\n",
+        "#      9       │      362,880           │   ~500 GB - 1 TB\n",
+        "#     10       │    3,628,800           │   ~5-50 TB  ← won't fit anywhere\n",
+        "#\n",
+        "# Google Colab free tier gives you:\n",
+        "#   • ~78 GB disk on the VM (temporary, lost on disconnect)\n",
+        "#   • 15 GB Google Drive (persistent)\n",
+        "#   • Colab Pro: 225 GB disk, longer runtimes\n",
+        "#\n",
+        "# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,\n",
+        "# then increase if you have space.\n",
+        "# ────────────────────────────────────────────────────────────────────────────\n",
+        "\n",
+        "\n",
+        "def estimate_output(words):\n",
+        "    \"\"\"Estimate total permutations and file size before generating.\"\"\"\n",
+        "    total_perms = 0\n",
+        "    for w in words:\n",
+        "        n = len(w)\n",
+        "        # Account for duplicate letters: n! / (c1! * c2! * ...)\n",
+        "        freq = {}\n",
+        "        for ch in w.lower():\n",
+        "            freq[ch] = freq.get(ch, 0) + 1\n",
+        "        unique_perms = math.factorial(n)\n",
+        "        for count in freq.values():\n",
+        "            unique_perms //= math.factorial(count)\n",
+        "        total_perms += unique_perms - 1  # subtract the original word\n",
+        "\n",
+        "    # Estimate ~15 bytes per line (avg)  →  \"typo=word\\n\"\n",
+        "    avg_bytes_per_line = 15\n",
+        "    est_bytes = total_perms * avg_bytes_per_line\n",
+        "    est_gb = est_bytes / (1024 ** 3)\n",
+        "\n",
+        "    return total_perms, est_gb\n",
+        "\n",
+        "\n",
+        "def generate_unique_permutations(word):\n",
+        "    \"\"\"\n",
+        "    Generate all unique permutations of a word's letters,\n",
+        "    excluding the original word itself.\n",
+        "\n",
+        "    Uses set() to drop the duplicate orderings that repeated letters produce.\n",
+        "    \"\"\"\n",
+        "    lower = word.lower()\n",
+        "    perms = set(''.join(p) for p in permutations(lower))\n",
+        "    perms.discard(lower)  # remove the correctly-spelled word\n",
+        "    return perms\n",
+        "\n",
+        "\n",
+        "def is_pure_alpha(word):\n",
+        "    return word.isalpha()\n",
+        "\n",
+        "\n",
+        "def main():\n",
+        "    if not os.path.exists(WORDS_PATH):\n",
+        "        print(f\"ERROR: '{WORDS_PATH}' not found!\")\n",
+        "        print(\"Make sure you uploaded words.txt or set WORDS_PATH correctly.\")\n",
+        "        sys.exit(1)\n",
+        "\n",
+        "    # ── Read words ──────────────────────────────────────────────\n",
+        "    print(f\"Reading words from: {WORDS_PATH}\")\n",
+        "    with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:\n",
+        "        raw_words = [line.strip() for line in f if line.strip()]\n",
+        "\n",
+        "    print(f\"Total raw entries: {len(raw_words):,}\")\n",
+        "\n",
+        "    # Filter\n",
+        "    words = []\n",
+        "    for w in raw_words:\n",
+        "        if ONLY_ALPHA and not is_pure_alpha(w):\n",
+        "            continue\n",
+        "        if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:\n",
+        "            continue\n",
+        "        words.append(w)\n",
+        "\n",
+        "    print(f\"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})\")\n",
+        "\n",
+        "    if len(words) == 0:\n",
+        "        print(\"No words matched the filter. Adjust MIN/MAX_WORD_LEN.\")\n",
+        "        sys.exit(1)\n",
+        "\n",
+        "    # ── Estimate ────────────────────────────────────────────────\n",
+        "    print(\"\\nEstimating output size (this may take a moment)...\")\n",
+        "    total_perms, est_gb = estimate_output(words)\n",
+        "    print(f\"  Estimated permutations : {total_perms:,}\")\n",
+        "    print(f\"  Estimated file size    : {est_gb:.2f} GB\")\n",
+        "\n",
+        "    # Safety check\n",
+        "    if est_gb > 70:\n",
+        "        print(f\"\\n⚠️  WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).\")\n",
+        "        print(\"  Reduce MAX_WORD_LEN or the script will crash when disk fills up.\")\n",
+        "        print(\"  Aborting. Set MAX_WORD_LEN lower and re-run.\")\n",
+        "        sys.exit(1)\n",
+        "\n",
+        "    print(f\"\\nProceeding with generation → {OUTPUT_PATH}\")\n",
+        "    print(\"=\" * 60)\n",
+        "\n",
+        "    # ── Generate ────────────────────────────────────────────────\n",
+        "    start = time.time()\n",
+        "    total_written = 0\n",
+        "\n",
+        "    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:\n",
+        "        out.write(\"# Auto-generated FULL PERMUTATION misspellings\\n\")\n",
+        "        out.write(f\"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\\n\")\n",
+        "        out.write(\"# Format: misspelling=correction\\n\\n\")\n",
+        "\n",
+        "        for idx, word in enumerate(words):\n",
+        "            perms = generate_unique_permutations(word)\n",
+        "\n",
+        "            for typo in sorted(perms):\n",
+        "                out.write(f\"{typo}={word}\\n\")\n",
+        "                total_written += 1\n",
+        "\n",
+        "            # Progress\n",
+        "            if (idx + 1) % BATCH_LOG == 0:\n",
+        "                elapsed = time.time() - start\n",
+        "                pct = (idx + 1) / len(words) * 100\n",
+        "                rate = (idx + 1) / elapsed if elapsed > 0 else 0\n",
+        "                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)\n",
+        "                print(f\"  [{pct:5.1f}%]  {idx+1:>7,}/{len(words):,} words  |\"\n",
+        "                      f\"  {total_written:>12,} lines  |  {cur_size:.2f} GB  |\"\n",
+        "                      f\"  {rate:.0f} words/sec\")\n",
+        "\n",
+        "    elapsed = time.time() - start\n",
+        "    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)\n",
+        "\n",
+        "    print()\n",
+        "    print(\"=\" * 60)\n",
+        "    print(f\"  ✅  DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)\")\n",
+        "    print(f\"  Words processed  : {len(words):,}\")\n",
+        "    print(f\"  Lines written    : {total_written:,}\")\n",
+        "    print(f\"  Output file      : {OUTPUT_PATH}\")\n",
+        "    print(f\"  File size        : {final_size:.2f} GB\")\n",
+        "    print(\"=\" * 60)\n",
+        "\n",
+        "\n",
+        "if __name__ == '__main__':\n",
+        "    main()\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Et0QfIxpJz_5",
+        "outputId": "e7e72965-f709-45c0-ae56-abf76b89d714"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Reading words from: words.txt\n",
+            "Total raw entries: 466,550\n",
+            "Filtered to 125,414 words (alpha-only, len 3-7)\n",
+            "\n",
+            "Estimating output size (this may take a moment)...\n",
+            "  Estimated permutations : 173,110,626\n",
+            "  Estimated file size    : 2.42 GB\n",
+            "\n",
+            "Proceeding with generation → misspellings_permutations.txt\n",
+            "============================================================\n",
+            "  [  4.0%]    5,000/125,414 words  |     5,810,553 lines  |  0.08 GB  |  898 words/sec\n",
+            "  [  8.0%]   10,000/125,414 words  |    11,972,245 lines  |  0.18 GB  |  781 words/sec\n",
+            "  [ 12.0%]   15,000/125,414 words  |    19,094,747 lines  |  0.28 GB  |  775 words/sec\n",
+            "  [ 15.9%]   20,000/125,414 words  |    26,800,249 lines  |  0.39 GB  |  721 words/sec\n",
+            "  [ 19.9%]   25,000/125,414 words  |    35,047,153 lines  |  0.51 GB  |  690 words/sec\n",
+            "  [ 23.9%]   30,000/125,414 words  |    42,273,166 lines  |  0.62 GB  |  695 words/sec\n",
+            "  [ 27.9%]   35,000/125,414 words  |    48,702,338 lines  |  0.71 GB  |  692 words/sec\n",
+            "  [ 31.9%]   40,000/125,414 words  |    55,295,151 lines  |  0.81 GB  |  703 words/sec\n",
+            "  [ 35.9%]   45,000/125,414 words  |    62,710,327 lines  |  0.92 GB  |  690 words/sec\n",
+            "  [ 39.9%]   50,000/125,414 words  |    69,722,485 lines  |  1.02 GB  |  690 words/sec\n",
+            "  [ 43.9%]   55,000/125,414 words  |    76,146,526 lines  |  1.12 GB  |  674 words/sec\n",
+            "  [ 47.8%]   60,000/125,414 words  |    81,994,038 lines  |  1.20 GB  |  686 words/sec\n",
+            "  [ 51.8%]   65,000/125,414 words  |    88,058,594 lines  |  1.29 GB  |  683 words/sec\n",
+            "  [ 55.8%]   70,000/125,414 words  |    94,651,291 lines  |  1.39 GB  |  688 words/sec\n",
+            "  [ 59.8%]   75,000/125,414 words  |   101,636,647 lines  |  1.49 GB  |  679 words/sec\n",
+            "  [ 63.8%]   80,000/125,414 words  |   107,086,424 lines  |  1.57 GB  |  691 words/sec\n",
+            "  [ 67.8%]   85,000/125,414 words  |   114,898,717 lines  |  1.68 GB  |  678 words/sec\n",
+            "  [ 71.8%]   90,000/125,414 words  |   123,278,791 lines  |  1.80 GB  |  675 words/sec\n",
+            "  [ 75.7%]   95,000/125,414 words  |   129,821,900 lines  |  1.90 GB  |  669 words/sec\n",
+            "  [ 79.7%]  100,000/125,414 words  |   136,429,269 lines  |  2.00 GB  |  673 words/sec\n",
+            "  [ 83.7%]  105,000/125,414 words  |   143,342,171 lines  |  2.10 GB  |  667 words/sec\n",
+            "  [ 87.7%]  110,000/125,414 words  |   150,701,210 lines  |  2.21 GB  |  666 words/sec\n",
+            "  [ 91.7%]  115,000/125,414 words  |   157,479,616 lines  |  2.31 GB  |  665 words/sec\n",
+            "  [ 95.7%]  120,000/125,414 words  |   165,619,673 lines  |  2.43 GB  |  662 words/sec\n",
+            "  [ 99.7%]  125,000/125,414 words  |   172,558,768 lines  |  2.53 GB  |  661 words/sec\n",
+            "\n",
+            "============================================================\n",
+            "  ✅  DONE in 189.5s (3.2 min)\n",
+            "  Words processed  : 125,414\n",
+            "  Lines written    : 173,110,626\n",
+            "  Output file      : misspellings_permutations.txt\n",
+            "  File size        : 2.53 GB\n",
+            "============================================================\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# If saved to VM disk: download it to your PC (needs `from google.colab import files`)\n",
+        "files.download('misspellings_permutations.txt')\n",
+        "\n",
+        "# If saved to Google Drive: just access it from drive.google.com\n",
+        "\n"
+      ],
+      "metadata": {
+        "id": "y9jWxvv8LWoH",
+        "outputId": "d8d754d3-234e-4020-bcc7-a19f3fc5fb26",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ],
+            "application/javascript": [
+              "\n",
+              "    async function download(id, filename, size) {\n",
+              "      if (!google.colab.kernel.accessAllowed) {\n",
+              "        return;\n",
+              "      }\n",
+              "      const div = document.createElement('div');\n",
+              "      const label = document.createElement('label');\n",
+              "      label.textContent = `Downloading \"${filename}\": `;\n",
+              "      div.appendChild(label);\n",
+              "      const progress = document.createElement('progress');\n",
+              "      progress.max = size;\n",
+              "      div.appendChild(progress);\n",
+              "      document.body.appendChild(div);\n",
+              "\n",
+              "      const buffers = [];\n",
+              "      let downloaded = 0;\n",
+              "\n",
+              "      const channel = await google.colab.kernel.comms.open(id);\n",
+              "      // Send a message to notify the kernel that we're ready.\n",
+              "      channel.send({})\n",
+              "\n",
+              "      for await (const message of channel.messages) {\n",
+              "        // Send a message to notify the kernel that we're ready.\n",
+              "        channel.send({})\n",
+              "        if (message.buffers) {\n",
+              "          for (const buffer of message.buffers) {\n",
+              "            buffers.push(buffer);\n",
+              "            downloaded += buffer.byteLength;\n",
+              "            progress.value = downloaded;\n",
+              "          }\n",
+              "        }\n",
+              "      }\n",
+              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
+              "      const a = document.createElement('a');\n",
+              "      a.href = window.URL.createObjectURL(blob);\n",
+              "      a.download = filename;\n",
+              "      div.appendChild(a);\n",
+              "      a.click();\n",
+              "      div.remove();\n",
+              "    }\n",
+              "  "
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ],
+            "application/javascript": [
+              "download(\"download_10941777-78c6-4833-b8e6-093feee02e11\", \"misspellings_permutations.txt\", 2721877361)"
+            ]
+          },
+          "metadata": {}
+        }
+      ]
+    }
+  ]
+}

google_collab_263MSW.ipynb ADDED Viewed

	@@ -0,0 +1,523 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 73
+        },
+        "id": "NKDQAIA9bkTI",
+        "outputId": "b21b6fd4-cbe0-46f2-ae31-639ac21e04c4"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "\n",
+              "     <input type=\"file\" id=\"files-27caebec-daaf-4dc1-9317-a13c04ecdb3b\" name=\"files[]\" multiple disabled\n",
+              "        style=\"border:none\" />\n",
+              "     <output id=\"result-27caebec-daaf-4dc1-9317-a13c04ecdb3b\">\n",
+              "      Upload widget is only available when the cell has been executed in the\n",
+              "      current browser session. Please rerun this cell to enable.\n",
+              "      </output>\n",
+              "      <script>// Copyright 2017 Google LLC\n",
+              "//\n",
+              "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+              "// you may not use this file except in compliance with the License.\n",
+              "// You may obtain a copy of the License at\n",
+              "//\n",
+              "//      http://www.apache.org/licenses/LICENSE-2.0\n",
+              "//\n",
+              "// Unless required by applicable law or agreed to in writing, software\n",
+              "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+              "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+              "// See the License for the specific language governing permissions and\n",
+              "// limitations under the License.\n",
+              "\n",
+              "/**\n",
+              " * @fileoverview Helpers for google.colab Python module.\n",
+              " */\n",
+              "(function(scope) {\n",
+              "function span(text, styleAttributes = {}) {\n",
+              "  const element = document.createElement('span');\n",
+              "  element.textContent = text;\n",
+              "  for (const key of Object.keys(styleAttributes)) {\n",
+              "    element.style[key] = styleAttributes[key];\n",
+              "  }\n",
+              "  return element;\n",
+              "}\n",
+              "\n",
+              "// Max number of bytes which will be uploaded at a time.\n",
+              "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
+              "\n",
+              "function _uploadFiles(inputId, outputId) {\n",
+              "  const steps = uploadFilesStep(inputId, outputId);\n",
+              "  const outputElement = document.getElementById(outputId);\n",
+              "  // Cache steps on the outputElement to make it available for the next call\n",
+              "  // to uploadFilesContinue from Python.\n",
+              "  outputElement.steps = steps;\n",
+              "\n",
+              "  return _uploadFilesContinue(outputId);\n",
+              "}\n",
+              "\n",
+              "// This is roughly an async generator (not supported in the browser yet),\n",
+              "// where there are multiple asynchronous steps and the Python side is going\n",
+              "// to poll for completion of each step.\n",
+              "// This uses a Promise to block the python side on completion of each step,\n",
+              "// then passes the result of the previous step as the input to the next step.\n",
+              "function _uploadFilesContinue(outputId) {\n",
+              "  const outputElement = document.getElementById(outputId);\n",
+              "  const steps = outputElement.steps;\n",
+              "\n",
+              "  const next = steps.next(outputElement.lastPromiseValue);\n",
+              "  return Promise.resolve(next.value.promise).then((value) => {\n",
+              "    // Cache the last promise value to make it available to the next\n",
+              "    // step of the generator.\n",
+              "    outputElement.lastPromiseValue = value;\n",
+              "    return next.value.response;\n",
+              "  });\n",
+              "}\n",
+              "\n",
+              "/**\n",
+              " * Generator function which is called between each async step of the upload\n",
+              " * process.\n",
+              " * @param {string} inputId Element ID of the input file picker element.\n",
+              " * @param {string} outputId Element ID of the output display.\n",
+              " * @return {!Iterable<!Object>} Iterable of next steps.\n",
+              " */\n",
+              "function* uploadFilesStep(inputId, outputId) {\n",
+              "  const inputElement = document.getElementById(inputId);\n",
+              "  inputElement.disabled = false;\n",
+              "\n",
+              "  const outputElement = document.getElementById(outputId);\n",
+              "  outputElement.innerHTML = '';\n",
+              "\n",
+              "  const pickedPromise = new Promise((resolve) => {\n",
+              "    inputElement.addEventListener('change', (e) => {\n",
+              "      resolve(e.target.files);\n",
+              "    });\n",
+              "  });\n",
+              "\n",
+              "  const cancel = document.createElement('button');\n",
+              "  inputElement.parentElement.appendChild(cancel);\n",
+              "  cancel.textContent = 'Cancel upload';\n",
+              "  const cancelPromise = new Promise((resolve) => {\n",
+              "    cancel.onclick = () => {\n",
+              "      resolve(null);\n",
+              "    };\n",
+              "  });\n",
+              "\n",
+              "  // Wait for the user to pick the files.\n",
+              "  const files = yield {\n",
+              "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
+              "    response: {\n",
+              "      action: 'starting',\n",
+              "    }\n",
+              "  };\n",
+              "\n",
+              "  cancel.remove();\n",
+              "\n",
+              "  // Disable the input element since further picks are not allowed.\n",
+              "  inputElement.disabled = true;\n",
+              "\n",
+              "  if (!files) {\n",
+              "    return {\n",
+              "      response: {\n",
+              "        action: 'complete',\n",
+              "      }\n",
+              "    };\n",
+              "  }\n",
+              "\n",
+              "  for (const file of files) {\n",
+              "    const li = document.createElement('li');\n",
+              "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
+              "    li.append(span(\n",
+              "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
+              "        `last modified: ${\n",
+              "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
+              "                                    'n/a'} - `));\n",
+              "    const percent = span('0% done');\n",
+              "    li.appendChild(percent);\n",
+              "\n",
+              "    outputElement.appendChild(li);\n",
+              "\n",
+              "    const fileDataPromise = new Promise((resolve) => {\n",
+              "      const reader = new FileReader();\n",
+              "      reader.onload = (e) => {\n",
+              "        resolve(e.target.result);\n",
+              "      };\n",
+              "      reader.readAsArrayBuffer(file);\n",
+              "    });\n",
+              "    // Wait for the data to be ready.\n",
+              "    let fileData = yield {\n",
+              "      promise: fileDataPromise,\n",
+              "      response: {\n",
+              "        action: 'continue',\n",
+              "      }\n",
+              "    };\n",
+              "\n",
+              "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
+              "    let position = 0;\n",
+              "    do {\n",
+              "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
+              "      const chunk = new Uint8Array(fileData, position, length);\n",
+              "      position += length;\n",
+              "\n",
+              "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
+              "      yield {\n",
+              "        response: {\n",
+              "          action: 'append',\n",
+              "          file: file.name,\n",
+              "          data: base64,\n",
+              "        },\n",
+              "      };\n",
+              "\n",
+              "      let percentDone = fileData.byteLength === 0 ?\n",
+              "          100 :\n",
+              "          Math.round((position / fileData.byteLength) * 100);\n",
+              "      percent.textContent = `${percentDone}% done`;\n",
+              "\n",
+              "    } while (position < fileData.byteLength);\n",
+              "  }\n",
+              "\n",
+              "  // All done.\n",
+              "  yield {\n",
+              "    response: {\n",
+              "      action: 'complete',\n",
+              "    }\n",
+              "  };\n",
+              "}\n",
+              "\n",
+              "scope.google = scope.google || {};\n",
+              "scope.google.colab = scope.google.colab || {};\n",
+              "scope.google.colab._files = {\n",
+              "  _uploadFiles,\n",
+              "  _uploadFilesContinue,\n",
+              "};\n",
+              "})(self);\n",
+              "</script> "
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Saving words.txt to words.txt\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Cell 1\n",
+        "from google.colab import files\n",
+        "uploaded = files.upload()   # select words.txt from your PC\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
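+        "\"\"\"\n",
+        "Generate realistic typo-based misspellings from words.txt → misspellings.txt\n",
+        "\n",
+        "Colab version. Place words.txt in /content/ before running.\n",
+        "\n",
+        "Every purely alphabetic word of at least 3 characters is lowercased and expanded\n",
+        "into adjacent-character swaps, single-character deletions, character duplications\n",
+        "and nearby-key substitutions. Each distinct typo is written as one\n",
+        "`misspelling=correction` line, where the correction keeps the word's original casing.\n",
+        "\"\"\"\n",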
+        "\n",
+        "import os\n",
+        "import time\n",
+        "\n",
+        "# Optional: mount Google Drive if your file is there\n",
+        "# from google.colab import drive\n",
+        "# drive.mount('/content/drive')\n",
+        "# words_path = '/content/drive/MyDrive/words.txt'\n",
+        "\n",
+        "words_path = '/content/words.txt'\n",
+        "output_path = '/content/misspellings.txt'\n",
+        "\n",
+        "KEYBOARD_NEIGHBORS = {\n",
+        "    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etfs', 't': 'rygs',\n",
+        "    'y': 'tuhs', 'u': 'yijs', 'i': 'uoks', 'o': 'ipls', 'p': 'o',\n",
+        "    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',\n",
+        "    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',\n",
+        "    'l': 'kop', 'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',\n",
+        "    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',\n",
+        "}\n",
+        "\n",
+        "def generate_adjacent_swaps(word):\n",
+        "    typos = []\n",
+        "    for i in range(len(word) - 1):\n",
+        "        chars = list(word)\n",
+        "        chars[i], chars[i + 1] = chars[i + 1], chars[i]\n",
+        "        typo = ''.join(chars)\n",
+        "        if typo != word:\n",
+        "            typos.append(typo)\n",
+        "    return typos\n",
+        "\n",
+        "def generate_deletions(word):\n",
+        "    typos = []\n",
+        "    for i in range(len(word)):\n",
+        "        typo = word[:i] + word[i + 1:]\n",
+        "        if len(typo) >= 2:\n",
+        "            typos.append(typo)\n",
+        "    return typos\n",
+        "\n",
+        "def generate_duplications(word):\n",
+        "    typos = []\n",
+        "    for i in range(len(word)):\n",
+        "        typo = word[:i] + word[i] + word[i:]\n",
+        "        if typo != word:\n",
+        "            typos.append(typo)\n",
+        "    return typos\n",
+        "\n",
+        "def generate_nearby_key_subs(word):\n",
+        "    typos = []\n",
+        "    lower = word.lower()\n",
+        "    for i in range(len(word)):\n",
+        "        ch = lower[i]\n",
+        "        if ch in KEYBOARD_NEIGHBORS:\n",
+        "            for neighbor in KEYBOARD_NEIGHBORS[ch]:\n",
+        "                typo = lower[:i] + neighbor + lower[i + 1:]\n",
+        "                if typo != lower:\n",
+        "                    typos.append(typo)\n",
+        "    return typos\n",
+        "\n",
+        "def generate_all_typos(word):\n",
+        "    typos = set()\n",
+        "    typos.update(generate_adjacent_swaps(word))\n",
+        "    typos.update(generate_deletions(word))\n",
+        "    typos.update(generate_duplications(word))\n",
+        "    typos.update(generate_nearby_key_subs(word))\n",
+        "    typos.discard(word)\n",
+        "    typos.discard(word.lower())\n",
+        "    return typos\n",
+        "\n",
+        "def is_pure_alpha(word):\n",
+        "    return word.isalpha()\n",
+        "\n",
+        "# ── Check file ──────────────────────────────────────────────\n",
+        "if not os.path.exists(words_path):\n",
+        "    raise FileNotFoundError(f\"{words_path} not found. Upload it to /content/ first.\")\n",
+        "\n",
+        "print(f\"Reading words from: {words_path}\")\n",
+        "\n",
+        "with open(words_path, 'r', encoding='utf-8', errors='replace') as f:\n",
+        "    raw_words = [line.strip() for line in f if line.strip()]\n",
+        "\n",
+        "print(f\"Total raw entries: {len(raw_words):,}\")\n",
+        "\n",
+        "words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]\n",
+        "print(f\"Filtered to {len(words):,} alphabetical words (len >= 3)\")\n",
+        "\n",
+        "start = time.time()\n",
+        "total_typos = 0\n",
+        "batch_size = 10_000\n",
+        "\n",
+        "print(f\"Generating typos → {output_path}\")\n",
+        "\n",
+        "with open(output_path, 'w', encoding='utf-8', newline='\\n') as out:\n",
+        "    out.write(\"# Auto-generated misspellings database\\n\")\n",
+        "    out.write(\"# Format: misspelling=correction\\n\\n\")\n",
+        "\n",
+        "    for idx, word in enumerate(words):\n",
+        "        correction = word\n",
+        "        typos = generate_all_typos(word.lower())\n",
+        "\n",
+        "        for typo in sorted(typos):\n",
+        "            out.write(f\"{typo}={correction}\\n\")\n",
+        "            total_typos += 1\n",
+        "\n",
+        "        if (idx + 1) % batch_size == 0:\n",
+        "            elapsed = time.time() - start\n",
+        "            pct = (idx + 1) / len(words) * 100\n",
+        "            rate = (idx + 1) / elapsed if elapsed > 0 else 0\n",
+        "            print(f\"[{pct:5.1f}%] {idx + 1:,}/{len(words):,} words | \"\n",
+        "                  f\"{total_typos:,} typos | {rate:.0f} words/sec\")\n",
+        "\n",
+        "elapsed = time.time() - start\n",
+        "file_size_mb = os.path.getsize(output_path) / (1024 * 1024)\n",
+        "\n",
+        "print(\"\\n\" + \"=\" * 60)\n",
+        "print(f\"Done in {elapsed:.1f}s\")\n",
+        "print(f\"Words processed : {len(words):,}\")\n",
+        "print(f\"Typos generated : {total_typos:,}\")\n",
+        "print(f\"Output file     : {output_path}\")\n",
+        "print(f\"File size       : {file_size_mb:.1f} MB\")\n",
+        "print(\"=\" * 60)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "8wpfrH2Rev6c",
+        "outputId": "c5b782a4-01e2-46e9-cf19-628f0315eb03"
+      },
+      "execution_count": 3,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Reading words from: /content/words.txt\n",
+            "Total raw entries: 466,550\n",
+            "Filtered to 415,701 alphabetical words (len >= 3)\n",
+            "Generating typos → /content/misspellings.txt\n",
+            "[  2.4%] 10,000/415,701 words | 606,939 typos | 25472 words/sec\n",
+            "[  4.8%] 20,000/415,701 words | 1,280,904 typos | 24508 words/sec\n",
+            "[  7.2%] 30,000/415,701 words | 1,896,445 typos | 24634 words/sec\n",
+            "[  9.6%] 40,000/415,701 words | 2,472,636 typos | 25175 words/sec\n",
+            "[ 12.0%] 50,000/415,701 words | 3,046,929 typos | 25615 words/sec\n",
+            "[ 14.4%] 60,000/415,701 words | 3,658,494 typos | 25610 words/sec\n",
+            "[ 16.8%] 70,000/415,701 words | 4,310,538 typos | 25453 words/sec\n",
+            "[ 19.2%] 80,000/415,701 words | 4,990,356 typos | 25166 words/sec\n",
+            "[ 21.7%] 90,000/415,701 words | 5,607,705 typos | 25045 words/sec\n",
+            "[ 24.1%] 100,000/415,701 words | 6,313,297 typos | 24478 words/sec\n",
+            "[ 26.5%] 110,000/415,701 words | 6,924,705 typos | 24476 words/sec\n",
+            "[ 28.9%] 120,000/415,701 words | 7,551,152 typos | 24435 words/sec\n",
+            "[ 31.3%] 130,000/415,701 words | 8,173,721 typos | 24412 words/sec\n",
+            "[ 33.7%] 140,000/415,701 words | 8,784,574 typos | 24411 words/sec\n",
+            "[ 36.1%] 150,000/415,701 words | 9,371,986 typos | 24565 words/sec\n",
+            "[ 38.5%] 160,000/415,701 words | 10,066,265 typos | 24395 words/sec\n",
+            "[ 40.9%] 170,000/415,701 words | 10,683,848 typos | 24422 words/sec\n",
+            "[ 43.3%] 180,000/415,701 words | 11,419,079 typos | 24226 words/sec\n",
+            "[ 45.7%] 190,000/415,701 words | 11,935,360 typos | 24456 words/sec\n",
+            "[ 48.1%] 200,000/415,701 words | 12,506,920 typos | 24350 words/sec\n",
+            "[ 50.5%] 210,000/415,701 words | 13,082,705 typos | 23918 words/sec\n",
+            "[ 52.9%] 220,000/415,701 words | 13,740,979 typos | 23111 words/sec\n",
+            "[ 55.3%] 230,000/415,701 words | 14,339,517 typos | 23098 words/sec\n",
+            "[ 57.7%] 240,000/415,701 words | 15,158,921 typos | 22855 words/sec\n",
+            "[ 60.1%] 250,000/415,701 words | 15,771,208 typos | 22941 words/sec\n",
+            "[ 62.5%] 260,000/415,701 words | 16,479,864 typos | 22901 words/sec\n",
+            "[ 65.0%] 270,000/415,701 words | 17,144,444 typos | 22915 words/sec\n",
+            "[ 67.4%] 280,000/415,701 words | 17,764,197 typos | 23001 words/sec\n",
+            "[ 69.8%] 290,000/415,701 words | 18,511,700 typos | 22932 words/sec\n",
+            "[ 72.2%] 300,000/415,701 words | 19,126,791 typos | 22983 words/sec\n",
+            "[ 74.6%] 310,000/415,701 words | 19,770,597 typos | 22941 words/sec\n",
+            "[ 77.0%] 320,000/415,701 words | 20,369,517 typos | 23014 words/sec\n",
+            "[ 79.4%] 330,000/415,701 words | 21,019,600 typos | 23035 words/sec\n",
+            "[ 81.8%] 340,000/415,701 words | 21,631,279 typos | 23071 words/sec\n",
+            "[ 84.2%] 350,000/415,701 words | 22,312,850 typos | 23047 words/sec\n",
+            "[ 86.6%] 360,000/415,701 words | 22,968,756 typos | 23043 words/sec\n",
+            "[ 89.0%] 370,000/415,701 words | 23,596,078 typos | 23056 words/sec\n",
+            "[ 91.4%] 380,000/415,701 words | 24,266,024 typos | 23043 words/sec\n",
+            "[ 93.8%] 390,000/415,701 words | 25,041,545 typos | 22925 words/sec\n",
+            "[ 96.2%] 400,000/415,701 words | 25,744,156 typos | 22899 words/sec\n",
+            "[ 98.6%] 410,000/415,701 words | 26,322,505 typos | 22958 words/sec\n",
+            "\n",
+            "============================================================\n",
+            "Done in 18.1s\n",
+            "Words processed : 415,701\n",
+            "Typos generated : 26,636,990\n",
+            "Output file     : /content/misspellings.txt\n",
+            "File size       : 566.3 MB\n",
+            "============================================================\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# If saved to VM disk: download it to your PC (uses `files` imported in Cell 1)\n",
+        "files.download('misspellings.txt')\n",
+        "\n",
+        "# If saved to Google Drive: just access it from drive.google.com"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 17
+        },
+        "id": "HVq_gU0qfG9u",
+        "outputId": "dc770f0b-76d2-4ad7-93ba-bef0e9da45e3"
+      },
+      "execution_count": 4,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ],
+            "application/javascript": [
+              "\n",
+              "    async function download(id, filename, size) {\n",
+              "      if (!google.colab.kernel.accessAllowed) {\n",
+              "        return;\n",
+              "      }\n",
+              "      const div = document.createElement('div');\n",
+              "      const label = document.createElement('label');\n",
+              "      label.textContent = `Downloading \"${filename}\": `;\n",
+              "      div.appendChild(label);\n",
+              "      const progress = document.createElement('progress');\n",
+              "      progress.max = size;\n",
+              "      div.appendChild(progress);\n",
+              "      document.body.appendChild(div);\n",
+              "\n",
+              "      const buffers = [];\n",
+              "      let downloaded = 0;\n",
+              "\n",
+              "      const channel = await google.colab.kernel.comms.open(id);\n",
+              "      // Send a message to notify the kernel that we're ready.\n",
+              "      channel.send({})\n",
+              "\n",
+              "      for await (const message of channel.messages) {\n",
+              "        // Send a message to notify the kernel that we're ready.\n",
+              "        channel.send({})\n",
+              "        if (message.buffers) {\n",
+              "          for (const buffer of message.buffers) {\n",
+              "            buffers.push(buffer);\n",
+              "            downloaded += buffer.byteLength;\n",
+              "            progress.value = downloaded;\n",
+              "          }\n",
+              "        }\n",
+              "      }\n",
+              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
+              "      const a = document.createElement('a');\n",
+              "      a.href = window.URL.createObjectURL(blob);\n",
+              "      a.download = filename;\n",
+              "      div.appendChild(a);\n",
+              "      a.click();\n",
+              "      div.remove();\n",
+              "    }\n",
+              "  "
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ],
+            "application/javascript": [
+              "download(\"download_ef5c634e-3ae3-4a85-a7b4-8f9422b11298\", \"misspellings.txt\", 593809553)"
+            ]
+          },
+          "metadata": {}
+        }
+      ]
+    }
+  ]
+}