{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 73 }, "id": "NKDQAIA9bkTI", "outputId": "b21b6fd4-cbe0-46f2-ae31-639ac21e04c4" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving words.txt to words.txt\n" ] } ], "source": [ "# Cell 1\n", "from google.colab import files\n", "uploaded = files.upload() # select words.txt from your PC\n" ] }, { "cell_type": "code", "source": [ "\"\"\"\n", "Generate realistic typo-based misspellings from words.txt → misspellings.txt\n", "\n", "Colab version\n", "Place words.txt in /content/ before running\n", "\"\"\"\n", "\n", "import os\n", "import time\n", "\n", "# Optional: mount Google Drive if your file is there\n", "# from google.colab import drive\n", "# drive.mount('/content/drive')\n", "# words_path = '/content/drive/MyDrive/words.txt'\n", "\n", "words_path = '/content/words.txt'\n", "output_path = '/content/misspellings.txt'\n", "\n", "KEYBOARD_NEIGHBORS = {\n", " 'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etfs', 't': 'rygs',\n", " 'y': 'tuhs', 'u': 'yijs', 'i': 'uoks', 'o': 'ipls', 'p': 'o',\n", " 'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',\n", " 'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',\n", " 'l': 'kop', 'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',\n", " 'b': 'vghn', 'n': 'bhjm', 'm': 'njk',\n", "}\n", "\n", "def generate_adjacent_swaps(word):\n", " typos = []\n", " for i in range(len(word) - 1):\n", " chars = list(word)\n", " chars[i], chars[i + 1] = chars[i + 1], chars[i]\n", " typo = ''.join(chars)\n", " if typo != word:\n", " typos.append(typo)\n", " return typos\n", "\n", "def generate_deletions(word):\n", " typos = []\n", " for i in range(len(word)):\n", " typo = word[:i] + word[i + 1:]\n", " if len(typo) >= 2:\n", " typos.append(typo)\n", " return typos\n", "\n", "def generate_duplications(word):\n", " typos = []\n", " for i in range(len(word)):\n", " typo = word[:i] + word[i] + word[i:]\n", " if typo != word:\n", " typos.append(typo)\n", " return typos\n", "\n", "def generate_nearby_key_subs(word):\n", " typos = []\n", " lower = word.lower()\n", " for i in range(len(word)):\n", " ch = lower[i]\n", " if ch in KEYBOARD_NEIGHBORS:\n", " for neighbor in KEYBOARD_NEIGHBORS[ch]:\n", " typo = lower[:i] + neighbor + lower[i + 1:]\n", " if typo != lower:\n", " typos.append(typo)\n", " return typos\n", "\n", "def generate_all_typos(word):\n", " typos = set()\n", " typos.update(generate_adjacent_swaps(word))\n", " typos.update(generate_deletions(word))\n", " typos.update(generate_duplications(word))\n", " typos.update(generate_nearby_key_subs(word))\n", " typos.discard(word)\n", " typos.discard(word.lower())\n", " return typos\n", "\n", "def is_pure_alpha(word):\n", " return word.isalpha()\n", "\n", "# ── Check file ──────────────────────────────────────────────\n", "if not os.path.exists(words_path):\n", " raise FileNotFoundError(f\"{words_path} not found. Upload it to /content/ first.\")\n", "\n", "print(f\"Reading words from: {words_path}\")\n", "\n", "with open(words_path, 'r', encoding='utf-8', errors='replace') as f:\n", " raw_words = [line.strip() for line in f if line.strip()]\n", "\n", "print(f\"Total raw entries: {len(raw_words):,}\")\n", "\n", "words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]\n", "print(f\"Filtered to {len(words):,} alphabetical words (len >= 3)\")\n", "\n", "start = time.time()\n", "total_typos = 0\n", "batch_size = 10_000\n", "\n", "print(f\"Generating typos → {output_path}\")\n", "\n", "with open(output_path, 'w', encoding='utf-8', newline='\\n') as out:\n", " out.write(\"# Auto-generated misspellings database\\n\")\n", " out.write(\"# Format: misspelling=correction\\n\\n\")\n", "\n", " for idx, word in enumerate(words):\n", " correction = word\n", " typos = generate_all_typos(word.lower())\n", "\n", " for typo in sorted(typos):\n", " out.write(f\"{typo}={correction}\\n\")\n", " total_typos += 1\n", "\n", " if (idx + 1) % batch_size == 0:\n", " elapsed = time.time() - start\n", " pct = (idx + 1) / len(words) * 100\n", " rate = (idx + 1) / elapsed if elapsed > 0 else 0\n", " print(f\"[{pct:5.1f}%] {idx + 1:,}/{len(words):,} words | \"\n", " f\"{total_typos:,} typos | {rate:.0f} words/sec\")\n", "\n", "elapsed = time.time() - start\n", "file_size_mb = os.path.getsize(output_path) / (1024 * 1024)\n", "\n", "print(\"\\n\" + \"=\" * 60)\n", "print(f\"Done in {elapsed:.1f}s\")\n", "print(f\"Words processed : {len(words):,}\")\n", "print(f\"Typos generated : {total_typos:,}\")\n", "print(f\"Output file : {output_path}\")\n", "print(f\"File size : {file_size_mb:.1f} MB\")\n", "print(\"=\" * 60)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8wpfrH2Rev6c", "outputId": "c5b782a4-01e2-46e9-cf19-628f0315eb03" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Reading words from: /content/words.txt\n", "Total raw entries: 466,550\n", "Filtered to 415,701 alphabetical words (len >= 3)\n", "Generating typos → /content/misspellings.txt\n", "[ 2.4%] 10,000/415,701 words | 606,939 typos | 25472 words/sec\n", "[ 4.8%] 20,000/415,701 words | 1,280,904 typos | 24508 words/sec\n", "[ 7.2%] 30,000/415,701 words | 1,896,445 typos | 24634 words/sec\n", "[ 9.6%] 40,000/415,701 words | 2,472,636 typos | 25175 words/sec\n", "[ 12.0%] 50,000/415,701 words | 3,046,929 typos | 25615 words/sec\n", "[ 14.4%] 60,000/415,701 words | 3,658,494 typos | 25610 words/sec\n", "[ 16.8%] 70,000/415,701 words | 4,310,538 typos | 25453 words/sec\n", "[ 19.2%] 80,000/415,701 words | 4,990,356 typos | 25166 words/sec\n", "[ 21.7%] 90,000/415,701 words | 5,607,705 typos | 25045 words/sec\n", "[ 24.1%] 100,000/415,701 words | 6,313,297 typos | 24478 words/sec\n", "[ 26.5%] 110,000/415,701 words | 6,924,705 typos | 24476 words/sec\n", "[ 28.9%] 120,000/415,701 words | 7,551,152 typos | 24435 words/sec\n", "[ 31.3%] 130,000/415,701 words | 8,173,721 typos | 24412 words/sec\n", "[ 33.7%] 140,000/415,701 words | 8,784,574 typos | 24411 words/sec\n", "[ 36.1%] 150,000/415,701 words | 9,371,986 typos | 24565 words/sec\n", "[ 38.5%] 160,000/415,701 words | 10,066,265 typos | 24395 words/sec\n", "[ 40.9%] 170,000/415,701 words | 10,683,848 typos | 24422 words/sec\n", "[ 43.3%] 180,000/415,701 words | 11,419,079 typos | 24226 words/sec\n", "[ 45.7%] 190,000/415,701 words | 11,935,360 typos | 24456 words/sec\n", "[ 48.1%] 200,000/415,701 words | 12,506,920 typos | 24350 words/sec\n", "[ 50.5%] 210,000/415,701 words | 13,082,705 typos | 23918 words/sec\n", "[ 52.9%] 220,000/415,701 words | 13,740,979 typos | 23111 words/sec\n", "[ 55.3%] 230,000/415,701 words | 14,339,517 typos | 23098 words/sec\n", "[ 57.7%] 240,000/415,701 words | 15,158,921 typos | 22855 words/sec\n", "[ 60.1%] 250,000/415,701 words | 15,771,208 typos | 22941 words/sec\n", "[ 62.5%] 260,000/415,701 words | 16,479,864 typos | 22901 words/sec\n", "[ 65.0%] 270,000/415,701 words | 17,144,444 typos | 22915 words/sec\n", "[ 67.4%] 280,000/415,701 words | 17,764,197 typos | 23001 words/sec\n", "[ 69.8%] 290,000/415,701 words | 18,511,700 typos | 22932 words/sec\n", "[ 72.2%] 300,000/415,701 words | 19,126,791 typos | 22983 words/sec\n", "[ 74.6%] 310,000/415,701 words | 19,770,597 typos | 22941 words/sec\n", "[ 77.0%] 320,000/415,701 words | 20,369,517 typos | 23014 words/sec\n", "[ 79.4%] 330,000/415,701 words | 21,019,600 typos | 23035 words/sec\n", "[ 81.8%] 340,000/415,701 words | 21,631,279 typos | 23071 words/sec\n", "[ 84.2%] 350,000/415,701 words | 22,312,850 typos | 23047 words/sec\n", "[ 86.6%] 360,000/415,701 words | 22,968,756 typos | 23043 words/sec\n", "[ 89.0%] 370,000/415,701 words | 23,596,078 typos | 23056 words/sec\n", "[ 91.4%] 380,000/415,701 words | 24,266,024 typos | 23043 words/sec\n", "[ 93.8%] 390,000/415,701 words | 25,041,545 typos | 22925 words/sec\n", "[ 96.2%] 400,000/415,701 words | 25,744,156 typos | 22899 words/sec\n", "[ 98.6%] 410,000/415,701 words | 26,322,505 typos | 22958 words/sec\n", "\n", "============================================================\n", "Done in 18.1s\n", "Words processed : 415,701\n", "Typos generated : 26,636,990\n", "Output file : /content/misspellings.txt\n", "File size : 566.3 MB\n", "============================================================\n" ] } ] }, { "cell_type": "code", "source": [ "# If saved to VM disk:\n", "files.download('misspellings.txt')\n", "\n", "# If saved to Google Drive: just access it from drive.google.com" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 17 }, "id": "HVq_gU0qfG9u", "outputId": "dc770f0b-76d2-4ad7-93ba-bef0e9da45e3" }, "execution_count": 4, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "application/javascript": [ "\n", " async function download(id, filename, size) {\n", " if (!google.colab.kernel.accessAllowed) {\n", " return;\n", " }\n", " const div = document.createElement('div');\n", " const label = document.createElement('label');\n", " label.textContent = `Downloading \"${filename}\": `;\n", " div.appendChild(label);\n", " const progress = document.createElement('progress');\n", " progress.max = size;\n", " div.appendChild(progress);\n", " document.body.appendChild(div);\n", "\n", " const buffers = [];\n", " let downloaded = 0;\n", "\n", " const channel = await google.colab.kernel.comms.open(id);\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", "\n", " for await (const message of channel.messages) {\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", " if (message.buffers) {\n", " for (const buffer of message.buffers) {\n", " buffers.push(buffer);\n", " downloaded += buffer.byteLength;\n", " progress.value = downloaded;\n", " }\n", " }\n", " }\n", " const blob = new Blob(buffers, {type: 'application/binary'});\n", " const a = document.createElement('a');\n", " a.href = window.URL.createObjectURL(blob);\n", " a.download = filename;\n", " div.appendChild(a);\n", " a.click();\n", " div.remove();\n", " }\n", " " ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "application/javascript": [ "download(\"download_ef5c634e-3ae3-4a85-a7b4-8f9422b11298\", \"misspellings.txt\", 593809553)" ] }, "metadata": {} } ] } ] }