algorembrant committed on
Commit
0b4a16b
·
verified ·
1 Parent(s): 2f36fdf

Upload 4 files

Browse files
generate_typos_colab.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate realistic typo-based misspellings from words.txt → misspellings.txt
3
+
4
+ Colab version
5
+ Place words.txt in /content/ before running
6
+ """
7
+
8
+ import os
9
+ import time
10
+
11
+ # Optional: mount Google Drive if your file is there
12
+ # from google.colab import drive
13
+ # drive.mount('/content/drive')
14
+ # words_path = '/content/drive/MyDrive/words.txt'
15
+
16
# Input word list (read line by line further down) and the file that receives
# the generated "misspelling=correction" lines. Both point at Colab's /content/.
words_path = '/content/words.txt'
output_path = '/content/misspellings.txt'
18
+
19
# QWERTY proximity map: each lowercase letter maps to the letters on the keys
# that physically touch it. The relation is kept symmetric (if 'd' lists 'r',
# then 'r' lists 'd'); the previous table listed 's' as a neighbour of
# r/t/y/u/i/o and dropped the home-row key that actually sits below them.
KEYBOARD_NEIGHBORS = {
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop', 'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
27
+
28
def generate_adjacent_swaps(word):
    """Return every string obtained by transposing two neighbouring characters.

    Pairs of identical characters are skipped because swapping them would
    reproduce ``word`` itself. Results are ordered by swap position.
    """
    return [
        word[:k] + word[k + 1] + word[k] + word[k + 2:]
        for k in range(len(word) - 1)
        if word[k] != word[k + 1]
    ]
37
+
38
def generate_deletions(word):
    """Return the strings produced by dropping one character at a time.

    Every candidate is one character shorter than ``word``, so the
    "keep at least two characters" rule means words shorter than three
    characters yield nothing. Repeated letters can yield duplicate entries.
    """
    if len(word) < 3:
        return []
    return [word[:cut] + word[cut + 1:] for cut in range(len(word))]
45
+
46
def generate_duplications(word):
    """Return the strings produced by doubling one character at a time.

    The prefix up to and including position ``pos`` is followed by the suffix
    that starts at ``pos``, so the character at ``pos`` appears twice.
    Repeated letters can yield duplicate entries.
    """
    return [word[:pos + 1] + word[pos:] for pos in range(len(word))]
53
+
54
def generate_nearby_key_subs(word, neighbors=None):
    """Return typos made by replacing one character with an adjacent key.

    Args:
        word: The word to corrupt. It is lowercased first, so every returned
            typo is lowercase.
        neighbors: Optional mapping from a character to a string (or other
            iterable) of its neighbouring characters. Defaults to the
            module-level ``KEYBOARD_NEIGHBORS`` table.

    Returns:
        A list of typos ordered by position, then by neighbour order.
        Characters missing from the mapping are left alone.
    """
    if neighbors is None:
        neighbors = KEYBOARD_NEIGHBORS
    lower = word.lower()
    typos = []
    # Walk the lowercased string itself: lowercasing can change the length
    # (e.g. U+0130), so indexing it with positions of ``word`` would skip
    # trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in neighbors.get(ch, ''):
            if neighbor != ch:  # a self-mapping would reproduce the word
                typos.append(lower[:i] + neighbor + lower[i + 1:])
    return typos
65
+
66
def generate_all_typos(word):
    """Return the set of typos produced by all four strategies for ``word``.

    The strategies are adjacent swaps, deletions, duplications and nearby-key
    substitutions. The word itself and its lowercase form are never included.
    """
    strategies = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    variants = {typo for make in strategies for typo in make(word)}
    # Never map a word to itself, in either its given or lowercase spelling.
    return variants - {word, word.lower()}
75
+
76
def is_pure_alpha(word):
    """Return True when ``word`` is non-empty and contains only ASCII letters.

    ``str.isalpha()`` also accepts accented and non-Latin letters, which the
    a-z keyboard map cannot substitute, so the check is restricted to
    ``a-z`` / ``A-Z`` explicitly.
    """
    return bool(word) and all(
        'a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word
    )
78
+
79
# ── Make sure the word list is present ──────────────────────
if not os.path.exists(words_path):
    raise FileNotFoundError(f"{words_path} not found. Upload it to /content/ first.")

# ── Load the word list ──────────────────────────────────────
print(f"Reading words from: {words_path}")

# Undecodable bytes are replaced instead of raising; blank lines are dropped.
with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
    stripped_lines = (line.strip() for line in f)
    raw_words = [entry for entry in stripped_lines if entry]

print(f"Total raw entries: {len(raw_words):,}")

# Keep only entries of at least three characters that pass is_pure_alpha().
words = [w for w in raw_words if len(w) >= 3 and is_pure_alpha(w)]
print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

# ── Generate typos and stream them to the output file ───────
start = time.time()
total_typos = 0
batch_size = 10_000  # progress is reported every batch_size words
word_count = len(words)

print(f"Generating typos → {output_path}")

with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
    out.write("# Auto-generated misspellings database\n")
    out.write("# Format: misspelling=correction\n\n")

    for done, word in enumerate(words, start=1):
        # Typos come from the lowercased word; the correction keeps the
        # spelling found in the input file.
        lines = [f"{typo}={word}\n" for typo in sorted(generate_all_typos(word.lower()))]
        out.writelines(lines)
        total_typos += len(lines)

        if done % batch_size != 0:
            continue
        elapsed = time.time() - start
        pct = done / word_count * 100
        if elapsed > 0:
            rate = done / elapsed
        else:
            rate = 0
        print(f"[{pct:5.1f}%] {done:,}/{word_count:,} words | "
              f"{total_typos:,} typos | {rate:.0f} words/sec")

# ── Summary ─────────────────────────────────────────────────
elapsed = time.time() - start
file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

rule = "=" * 60
print("\n" + rule)
print(f"Done in {elapsed:.1f}s")
print(f"Words processed : {word_count:,}")
print(f"Typos generated : {total_typos:,}")
print(f"Output file : {output_path}")
print(f"File size : {file_size_mb:.1f} MB")
print(rule)
generate_typos_local.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate realistic typo-based misspellings from words.txt → misspellings.txt
3
+
4
+ Typo strategies:
5
+ 1. Adjacent letter swaps ("hello" → "hlelo", "helol")
6
+ 2. Single character deletion ("hello" → "hllo", "helo")
7
+ 3. Single character duplication ("hello" → "hhello", "heello")
8
+ 4. Nearby keyboard key sub ("hello" → "gello", "jello")
9
+
10
+ Output format: misspelling=correction (one per line)
11
+ """
12
+
13
+ import sys
14
+ import os
15
+ import time
16
+
17
# QWERTY keyboard proximity map: each lowercase letter maps to the letters on
# the keys that physically touch it. The relation is kept symmetric (if 'd'
# lists 'r', then 'r' lists 'd'); the previous table listed 's' as a neighbour
# of r/t/y/u/i/o and dropped the home-row key that actually sits below them.
KEYBOARD_NEIGHBORS = {
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop', 'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
26
+
27
+
28
def generate_adjacent_swaps(word):
    """Transpose each pair of neighbouring characters, one pair per typo.

    A pair of identical characters is skipped, since swapping it would give
    back ``word`` unchanged. Results are ordered by swap position.
    """
    swapped = []
    for pos, (left, right) in enumerate(zip(word, word[1:])):
        if left == right:
            continue
        swapped.append(word[:pos] + right + left + word[pos + 2:])
    return swapped
38
+
39
+
40
def generate_deletions(word):
    """Drop one character at a time and return the shortened strings.

    Only results of at least two characters are kept. Because each result is
    exactly one character shorter than ``word``, words under three characters
    produce an empty list. Repeated letters can yield duplicate entries.
    """
    shortened = []
    if len(word) - 1 < 2:  # every candidate would be shorter than 2 chars
        return shortened
    for cut, _ in enumerate(word):
        shortened.append(word[:cut] + word[cut + 1:])
    return shortened
48
+
49
+
50
def generate_duplications(word):
    """Double one character at a time and return the lengthened strings.

    Each result is one character longer than ``word``. Repeated letters can
    yield duplicate entries.
    """
    doubled = []
    for pos, letter in enumerate(word):
        head, tail = word[:pos], word[pos:]
        doubled.append(head + letter + tail)
    return doubled
58
+
59
+
60
def generate_nearby_key_subs(word, neighbors=None):
    """Replace one character with a nearby keyboard key.

    Args:
        word: The word to corrupt. It is lowercased first, so every returned
            typo is lowercase.
        neighbors: Optional mapping from a character to a string (or other
            iterable) of its neighbouring characters. Defaults to the
            module-level ``KEYBOARD_NEIGHBORS`` table.

    Returns:
        A list of typos ordered by position, then by neighbour order.
        Characters missing from the mapping are left alone.
    """
    if neighbors is None:
        neighbors = KEYBOARD_NEIGHBORS
    lower = word.lower()
    typos = []
    # Walk the lowercased string itself: lowercasing can change the length
    # (e.g. U+0130), so indexing it with positions of ``word`` would skip
    # trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in neighbors.get(ch, ''):
            if neighbor != ch:  # a self-mapping would reproduce the word
                typos.append(lower[:i] + neighbor + lower[i + 1:])
    return typos
72
+
73
+
74
def generate_all_typos(word):
    """Collect the typos from every strategy into one de-duplicated set.

    Runs adjacent swaps, deletions, duplications and nearby-key substitutions
    on ``word`` and returns the union, minus ``word`` and ``word.lower()``.
    """
    strategies = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    variants = set()
    for make_typos in strategies:
        variants |= set(make_typos(word))
    # Never map a word to itself, in either its given or lowercase spelling.
    return variants.difference((word, word.lower()))
84
+
85
+
86
def is_pure_alpha(word):
    """Return True when ``word`` is non-empty and purely alphabetical (a-z).

    Matching is case-insensitive but ASCII-only. ``str.isalpha()`` would also
    accept accented and non-Latin letters, which contradicts the a-z contract
    and which the keyboard map cannot substitute.
    """
    return bool(word) and all(
        'a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word
    )
89
+
90
+
91
def main():
    """Read data/words.txt next to this script and write data/misspellings.txt.

    Each qualifying word (at least three characters and accepted by
    ``is_pure_alpha``) contributes one ``typo=word`` line per generated typo.
    Exits with status 1 when the word list is missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(script_dir, 'data')
    words_path = os.path.join(data_dir, 'words.txt')
    output_path = os.path.join(data_dir, 'misspellings.txt')

    if not os.path.exists(words_path):
        print(f"ERROR: {words_path} not found.")
        sys.exit(1)

    # ── Load the word list ──────────────────────────────────────
    print(f"Reading words from: {words_path}")
    # Undecodable bytes are replaced instead of raising; blank lines are dropped.
    with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
        stripped_lines = (line.strip() for line in f)
        raw_words = [entry for entry in stripped_lines if entry]

    print(f"Total raw entries: {len(raw_words):,}")

    # Keep only entries of at least three characters that pass is_pure_alpha().
    words = [w for w in raw_words if len(w) >= 3 and is_pure_alpha(w)]
    word_count = len(words)
    print(f"Filtered to {word_count:,} alphabetical words (len >= 3)")

    # ── Generate typos and stream them to the output file ───────
    start = time.time()
    total_typos = 0
    batch_size = 10_000  # progress is reported every batch_size words

    print(f"Generating typos → {output_path}")
    print("This may take a few minutes for 466k words...")

    header = (
        "# Auto-generated misspellings database\n",
        "# Format: misspelling=correction\n",
        "# Generated by generate_typos.py\n",
        "#\n",
        "# Strategies: adjacent swaps, deletions, duplications, keyboard proximity\n",
        "\n",
    )

    with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
        out.writelines(header)

        for done, word in enumerate(words, start=1):
            # Typos come from the lowercased word; the correction keeps the
            # spelling found in the input file.
            lines = [f"{typo}={word}\n" for typo in sorted(generate_all_typos(word.lower()))]
            out.writelines(lines)
            total_typos += len(lines)

            # Progress reporting
            if done % batch_size != 0:
                continue
            elapsed = time.time() - start
            pct = done / word_count * 100
            if elapsed > 0:
                rate = done / elapsed
            else:
                rate = 0
            print(f" [{pct:5.1f}%] {done:>7,} / {word_count:,} words |"
                  f" {total_typos:>10,} typos | {rate:.0f} words/sec")

    # ── Summary ─────────────────────────────────────────────────
    elapsed = time.time() - start
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

    rule = "=" * 60
    print()
    print(rule)
    print(f" Done in {elapsed:.1f}s")
    print(f" Words processed : {word_count:,}")
    print(f" Typos generated : {total_typos:,}")
    print(f" Output file : {output_path}")
    print(f" File size : {file_size_mb:.1f} MB")
    print(rule)


if __name__ == '__main__':
    main()
google_collab_173MSW.ipynb ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "colab": {
22
+ "base_uri": "https://localhost:8080/",
23
+ "height": 73
24
+ },
25
+ "id": "FaSiqVnTItLq",
26
+ "outputId": "1ef8a78c-7421-41eb-8cf4-db8426edeed9"
27
+ },
28
+ "outputs": [
29
+ {
30
+ "output_type": "display_data",
31
+ "data": {
32
+ "text/plain": [
33
+ "<IPython.core.display.HTML object>"
34
+ ],
35
+ "text/html": [
36
+ "\n",
37
+ " <input type=\"file\" id=\"files-a9b68a68-38ec-4a0a-8037-171b8cfec796\" name=\"files[]\" multiple disabled\n",
38
+ " style=\"border:none\" />\n",
39
+ " <output id=\"result-a9b68a68-38ec-4a0a-8037-171b8cfec796\">\n",
40
+ " Upload widget is only available when the cell has been executed in the\n",
41
+ " current browser session. Please rerun this cell to enable.\n",
42
+ " </output>\n",
43
+ " <script>// Copyright 2017 Google LLC\n",
44
+ "//\n",
45
+ "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
46
+ "// you may not use this file except in compliance with the License.\n",
47
+ "// You may obtain a copy of the License at\n",
48
+ "//\n",
49
+ "// http://www.apache.org/licenses/LICENSE-2.0\n",
50
+ "//\n",
51
+ "// Unless required by applicable law or agreed to in writing, software\n",
52
+ "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
53
+ "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
54
+ "// See the License for the specific language governing permissions and\n",
55
+ "// limitations under the License.\n",
56
+ "\n",
57
+ "/**\n",
58
+ " * @fileoverview Helpers for google.colab Python module.\n",
59
+ " */\n",
60
+ "(function(scope) {\n",
61
+ "function span(text, styleAttributes = {}) {\n",
62
+ " const element = document.createElement('span');\n",
63
+ " element.textContent = text;\n",
64
+ " for (const key of Object.keys(styleAttributes)) {\n",
65
+ " element.style[key] = styleAttributes[key];\n",
66
+ " }\n",
67
+ " return element;\n",
68
+ "}\n",
69
+ "\n",
70
+ "// Max number of bytes which will be uploaded at a time.\n",
71
+ "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
72
+ "\n",
73
+ "function _uploadFiles(inputId, outputId) {\n",
74
+ " const steps = uploadFilesStep(inputId, outputId);\n",
75
+ " const outputElement = document.getElementById(outputId);\n",
76
+ " // Cache steps on the outputElement to make it available for the next call\n",
77
+ " // to uploadFilesContinue from Python.\n",
78
+ " outputElement.steps = steps;\n",
79
+ "\n",
80
+ " return _uploadFilesContinue(outputId);\n",
81
+ "}\n",
82
+ "\n",
83
+ "// This is roughly an async generator (not supported in the browser yet),\n",
84
+ "// where there are multiple asynchronous steps and the Python side is going\n",
85
+ "// to poll for completion of each step.\n",
86
+ "// This uses a Promise to block the python side on completion of each step,\n",
87
+ "// then passes the result of the previous step as the input to the next step.\n",
88
+ "function _uploadFilesContinue(outputId) {\n",
89
+ " const outputElement = document.getElementById(outputId);\n",
90
+ " const steps = outputElement.steps;\n",
91
+ "\n",
92
+ " const next = steps.next(outputElement.lastPromiseValue);\n",
93
+ " return Promise.resolve(next.value.promise).then((value) => {\n",
94
+ " // Cache the last promise value to make it available to the next\n",
95
+ " // step of the generator.\n",
96
+ " outputElement.lastPromiseValue = value;\n",
97
+ " return next.value.response;\n",
98
+ " });\n",
99
+ "}\n",
100
+ "\n",
101
+ "/**\n",
102
+ " * Generator function which is called between each async step of the upload\n",
103
+ " * process.\n",
104
+ " * @param {string} inputId Element ID of the input file picker element.\n",
105
+ " * @param {string} outputId Element ID of the output display.\n",
106
+ " * @return {!Iterable<!Object>} Iterable of next steps.\n",
107
+ " */\n",
108
+ "function* uploadFilesStep(inputId, outputId) {\n",
109
+ " const inputElement = document.getElementById(inputId);\n",
110
+ " inputElement.disabled = false;\n",
111
+ "\n",
112
+ " const outputElement = document.getElementById(outputId);\n",
113
+ " outputElement.innerHTML = '';\n",
114
+ "\n",
115
+ " const pickedPromise = new Promise((resolve) => {\n",
116
+ " inputElement.addEventListener('change', (e) => {\n",
117
+ " resolve(e.target.files);\n",
118
+ " });\n",
119
+ " });\n",
120
+ "\n",
121
+ " const cancel = document.createElement('button');\n",
122
+ " inputElement.parentElement.appendChild(cancel);\n",
123
+ " cancel.textContent = 'Cancel upload';\n",
124
+ " const cancelPromise = new Promise((resolve) => {\n",
125
+ " cancel.onclick = () => {\n",
126
+ " resolve(null);\n",
127
+ " };\n",
128
+ " });\n",
129
+ "\n",
130
+ " // Wait for the user to pick the files.\n",
131
+ " const files = yield {\n",
132
+ " promise: Promise.race([pickedPromise, cancelPromise]),\n",
133
+ " response: {\n",
134
+ " action: 'starting',\n",
135
+ " }\n",
136
+ " };\n",
137
+ "\n",
138
+ " cancel.remove();\n",
139
+ "\n",
140
+ " // Disable the input element since further picks are not allowed.\n",
141
+ " inputElement.disabled = true;\n",
142
+ "\n",
143
+ " if (!files) {\n",
144
+ " return {\n",
145
+ " response: {\n",
146
+ " action: 'complete',\n",
147
+ " }\n",
148
+ " };\n",
149
+ " }\n",
150
+ "\n",
151
+ " for (const file of files) {\n",
152
+ " const li = document.createElement('li');\n",
153
+ " li.append(span(file.name, {fontWeight: 'bold'}));\n",
154
+ " li.append(span(\n",
155
+ " `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
156
+ " `last modified: ${\n",
157
+ " file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
158
+ " 'n/a'} - `));\n",
159
+ " const percent = span('0% done');\n",
160
+ " li.appendChild(percent);\n",
161
+ "\n",
162
+ " outputElement.appendChild(li);\n",
163
+ "\n",
164
+ " const fileDataPromise = new Promise((resolve) => {\n",
165
+ " const reader = new FileReader();\n",
166
+ " reader.onload = (e) => {\n",
167
+ " resolve(e.target.result);\n",
168
+ " };\n",
169
+ " reader.readAsArrayBuffer(file);\n",
170
+ " });\n",
171
+ " // Wait for the data to be ready.\n",
172
+ " let fileData = yield {\n",
173
+ " promise: fileDataPromise,\n",
174
+ " response: {\n",
175
+ " action: 'continue',\n",
176
+ " }\n",
177
+ " };\n",
178
+ "\n",
179
+ " // Use a chunked sending to avoid message size limits. See b/62115660.\n",
180
+ " let position = 0;\n",
181
+ " do {\n",
182
+ " const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
183
+ " const chunk = new Uint8Array(fileData, position, length);\n",
184
+ " position += length;\n",
185
+ "\n",
186
+ " const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
187
+ " yield {\n",
188
+ " response: {\n",
189
+ " action: 'append',\n",
190
+ " file: file.name,\n",
191
+ " data: base64,\n",
192
+ " },\n",
193
+ " };\n",
194
+ "\n",
195
+ " let percentDone = fileData.byteLength === 0 ?\n",
196
+ " 100 :\n",
197
+ " Math.round((position / fileData.byteLength) * 100);\n",
198
+ " percent.textContent = `${percentDone}% done`;\n",
199
+ "\n",
200
+ " } while (position < fileData.byteLength);\n",
201
+ " }\n",
202
+ "\n",
203
+ " // All done.\n",
204
+ " yield {\n",
205
+ " response: {\n",
206
+ " action: 'complete',\n",
207
+ " }\n",
208
+ " };\n",
209
+ "}\n",
210
+ "\n",
211
+ "scope.google = scope.google || {};\n",
212
+ "scope.google.colab = scope.google.colab || {};\n",
213
+ "scope.google.colab._files = {\n",
214
+ " _uploadFiles,\n",
215
+ " _uploadFilesContinue,\n",
216
+ "};\n",
217
+ "})(self);\n",
218
+ "</script> "
219
+ ]
220
+ },
221
+ "metadata": {}
222
+ },
223
+ {
224
+ "output_type": "stream",
225
+ "name": "stdout",
226
+ "text": [
227
+ "Saving words.txt to words (1).txt\n"
228
+ ]
229
+ }
230
+ ],
231
+ "source": [
232
+ "# Cell 1\n",
233
+ "from google.colab import files\n",
234
+ "uploaded = files.upload() # select words.txt from your PC\n"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "source": [
240
+ "\"\"\"\n",
241
+ "=============================================================================\n",
242
+ " FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition)\n",
243
+ "=============================================================================\n",
244
+ "\n",
245
+ "Purpose:\n",
246
+ " Generate ALL possible letter permutations of each word from words.txt\n",
247
+ " and write them as misspelling=correction pairs.\n",
248
+ "\n",
249
+ "⚠️ WARNING — READ BEFORE RUNNING ⚠️\n",
250
+ " This is computationally EXTREME. A single 10-letter word has 3,628,800\n",
251
+ " permutations. A 12-letter word has 479,001,600. For 466k words, the full\n",
252
+ " output could be PETABYTES. You WILL need to limit word length.\n",
253
+ "\n",
254
+ "=============================================================================\n",
255
+ " HOW TO USE ON GOOGLE COLAB\n",
256
+ "=============================================================================\n",
257
+ "\n",
258
+ "1. Open Google Colab → https://colab.research.google.com\n",
259
+ "2. Create a new notebook (Python 3)\n",
260
+ "\n",
261
+ "3. Upload your words.txt:\n",
262
+ " ─────────────────────────────────────\n",
263
+ " # CELL 1: Upload words.txt\n",
264
+ " from google.colab import files\n",
265
+ " uploaded = files.upload() # click \"Choose Files\" → select words.txt\n",
266
+ " ─────────────────────────────────────\n",
267
+ "\n",
268
+ "4. Copy-paste this ENTIRE script into a new cell and run it.\n",
269
+ "\n",
270
+ "5. Download the result:\n",
271
+ " ─────────────────────────────────────\n",
272
+ " # CELL 3: Download the output\n",
273
+ " files.download('misspellings_permutations.txt')\n",
274
+ " ─────────────────────────────────────\n",
275
+ "\n",
276
+ "=============================================================================\n",
277
+ " OR: Use Google Drive for large files\n",
278
+ "=============================================================================\n",
279
+ "\n",
280
+ " # Mount Google Drive (you get 15 GB free)\n",
281
+ " from google.colab import drive\n",
282
+ " drive.mount('/content/drive')\n",
283
+ "\n",
284
+ " # Then set OUTPUT_PATH below to:\n",
285
+ " OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'\n",
286
+ "\n",
287
+ "=============================================================================\n",
288
+ " CONFIGURATION — Adjust these before running!\n",
289
+ "=============================================================================\n",
290
+ "\"\"\"\n",
291
+ "\n",
292
+ "import os\n",
293
+ "import sys\n",
294
+ "import time\n",
295
+ "import math\n",
296
+ "from itertools import permutations\n",
297
+ "\n",
298
+ "# ── CONFIGURATION ───────────────────────────────────────────────────────────\n",
299
+ "\n",
300
+ "WORDS_PATH = 'words.txt' # path to your words.txt\n",
301
+ "OUTPUT_PATH = 'misspellings_permutations.txt' # output file path\n",
302
+ "\n",
303
+ "MIN_WORD_LEN = 3 # skip words shorter than this\n",
304
+ "MAX_WORD_LEN = 7 # ⚠️ CRITICAL: max word length to permute\n",
305
+ " # 7 → max 5,040 perms/word (manageable)\n",
306
+ " # 8 → max 40,320 perms/word (large)\n",
307
+ " # 9 → max 362,880 perms/word (very large)\n",
308
+ " # 10 → max 3,628,800 perms/word (EXTREME)\n",
309
+ " # Increase at your own risk!\n",
310
+ "\n",
311
+ "ONLY_ALPHA = True # only process pure-alphabetical words\n",
312
+ "BATCH_LOG = 5000 # print progress every N words\n",
313
+ "\n",
314
+ "# ── ESTIMATION TABLE ────────────────────────────────────────────────────────\n",
315
+ "# Here's roughly how big the output gets at each MAX_WORD_LEN setting,\n",
316
+ "# assuming ~200k qualifying words at each length bracket:\n",
317
+ "#\n",
318
+ "# MAX_WORD_LEN │ Perms per word (worst) │ Rough output size\n",
319
+ "# ─────────────┼────────────────────────┼──────────────────\n",
320
+ "# 5 │ 120 │ ~200 MB\n",
321
+ "# 6 │ 720 │ ~1-2 GB\n",
322
+ "# 7 │ 5,040 │ ~5-15 GB\n",
323
+ "# 8 │ 40,320 │ ~50-150 GB\n",
324
+ "# 9 │ 362,880 │ ~500 GB - 1 TB\n",
325
+ "# 10 │ 3,628,800 │ ~5-50 TB ← won't fit anywhere\n",
326
+ "#\n",
327
+ "# Google Colab free tier gives you:\n",
328
+ "# • ~78 GB disk on the VM (temporary, lost on disconnect)\n",
329
+ "# • 15 GB Google Drive (persistent)\n",
330
+ "# • Colab Pro: 225 GB disk, longer runtimes\n",
331
+ "#\n",
332
+ "# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,\n",
333
+ "# then increase if you have space.\n",
334
+ "# ────────────────────────────────────────────────────────────────────────────\n",
335
+ "\n",
336
+ "\n",
337
+ "def estimate_output(words):\n",
338
+ " \"\"\"Estimate total permutations and file size before generating.\"\"\"\n",
339
+ " total_perms = 0\n",
340
+ " for w in words:\n",
341
+ " n = len(w)\n",
342
+ " # Account for duplicate letters: n! / (c1! * c2! * ...)\n",
343
+ " freq = {}\n",
344
+ " for ch in w.lower():\n",
345
+ " freq[ch] = freq.get(ch, 0) + 1\n",
346
+ " unique_perms = math.factorial(n)\n",
347
+ " for count in freq.values():\n",
348
+ " unique_perms //= math.factorial(count)\n",
349
+ " total_perms += unique_perms - 1 # subtract the original word\n",
350
+ "\n",
351
+ " # Estimate ~15 bytes per line (avg) → \"typo=word\\n\"\n",
352
+ " avg_bytes_per_line = 15\n",
353
+ " est_bytes = total_perms * avg_bytes_per_line\n",
354
+ " est_gb = est_bytes / (1024 ** 3)\n",
355
+ "\n",
356
+ " return total_perms, est_gb\n",
357
+ "\n",
358
+ "\n",
359
+ "def generate_unique_permutations(word):\n",
360
+ " \"\"\"\n",
361
+ " Generate all unique permutations of a word's letters,\n",
362
+ " excluding the original word itself.\n",
363
+ "\n",
364
+ " Uses set() to deduplicate (handles repeated letters efficiently).\n",
365
+ " \"\"\"\n",
366
+ " lower = word.lower()\n",
367
+ " perms = set(''.join(p) for p in permutations(lower))\n",
368
+ " perms.discard(lower) # remove the correctly-spelled word\n",
369
+ " return perms\n",
370
+ "\n",
371
+ "\n",
372
+ "def is_pure_alpha(word):\n",
373
+ " return word.isalpha()\n",
374
+ "\n",
375
+ "\n",
376
+ "def main():\n",
377
+ " if not os.path.exists(WORDS_PATH):\n",
378
+ " print(f\"ERROR: '{WORDS_PATH}' not found!\")\n",
379
+ " print(\"Make sure you uploaded words.txt or set WORDS_PATH correctly.\")\n",
380
+ " sys.exit(1)\n",
381
+ "\n",
382
+ " # ── Read words ──────────────────────────────────────────────\n",
383
+ " print(f\"Reading words from: {WORDS_PATH}\")\n",
384
+ " with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:\n",
385
+ " raw_words = [line.strip() for line in f if line.strip()]\n",
386
+ "\n",
387
+ " print(f\"Total raw entries: {len(raw_words):,}\")\n",
388
+ "\n",
389
+ " # Filter\n",
390
+ " words = []\n",
391
+ " for w in raw_words:\n",
392
+ " if ONLY_ALPHA and not is_pure_alpha(w):\n",
393
+ " continue\n",
394
+ " if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:\n",
395
+ " continue\n",
396
+ " words.append(w)\n",
397
+ "\n",
398
+ " print(f\"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})\")\n",
399
+ "\n",
400
+ " if len(words) == 0:\n",
401
+ " print(\"No words matched the filter. Adjust MIN/MAX_WORD_LEN.\")\n",
402
+ " sys.exit(1)\n",
403
+ "\n",
404
+ " # ── Estimate ────────────────────────────────────────────────\n",
405
+ " print(\"\\nEstimating output size (this may take a moment)...\")\n",
406
+ " total_perms, est_gb = estimate_output(words)\n",
407
+ " print(f\" Estimated permutations : {total_perms:,}\")\n",
408
+ " print(f\" Estimated file size : {est_gb:.2f} GB\")\n",
409
+ "\n",
410
+ " # Safety check\n",
411
+ " if est_gb > 70:\n",
412
+ " print(f\"\\n⚠️ WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).\")\n",
413
+ " print(\" Reduce MAX_WORD_LEN or the script will crash when disk fills up.\")\n",
414
+ " print(\" Aborting. Set MAX_WORD_LEN lower and re-run.\")\n",
415
+ " sys.exit(1)\n",
416
+ "\n",
417
+ " print(f\"\\nProceeding with generation → {OUTPUT_PATH}\")\n",
418
+ " print(\"=\" * 60)\n",
419
+ "\n",
420
+ " # ── Generate ────────────────────────────────────────────────\n",
421
+ " start = time.time()\n",
422
+ " total_written = 0\n",
423
+ "\n",
424
+ " with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:\n",
425
+ " out.write(\"# Auto-generated FULL PERMUTATION misspellings\\n\")\n",
426
+ " out.write(f\"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\\n\")\n",
427
+ " out.write(\"# Format: misspelling=correction\\n\\n\")\n",
428
+ "\n",
429
+ " for idx, word in enumerate(words):\n",
430
+ " perms = generate_unique_permutations(word)\n",
431
+ "\n",
432
+ " for typo in sorted(perms):\n",
433
+ " out.write(f\"{typo}={word}\\n\")\n",
434
+ " total_written += 1\n",
435
+ "\n",
436
+ " # Progress\n",
437
+ " if (idx + 1) % BATCH_LOG == 0:\n",
438
+ " elapsed = time.time() - start\n",
439
+ " pct = (idx + 1) / len(words) * 100\n",
440
+ " rate = (idx + 1) / elapsed if elapsed > 0 else 0\n",
441
+ " cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)\n",
442
+ " print(f\" [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |\"\n",
443
+ " f\" {total_written:>12,} lines | {cur_size:.2f} GB |\"\n",
444
+ " f\" {rate:.0f} words/sec\")\n",
445
+ "\n",
446
+ " elapsed = time.time() - start\n",
447
+ " final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)\n",
448
+ "\n",
449
+ " print()\n",
450
+ " print(\"=\" * 60)\n",
451
+ " print(f\" ✅ DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)\")\n",
452
+ " print(f\" Words processed : {len(words):,}\")\n",
453
+ " print(f\" Lines written : {total_written:,}\")\n",
454
+ " print(f\" Output file : {OUTPUT_PATH}\")\n",
455
+ " print(f\" File size : {final_size:.2f} GB\")\n",
456
+ " print(\"=\" * 60)\n",
457
+ "\n",
458
+ "\n",
459
+ "if __name__ == '__main__':\n",
460
+ " main()\n"
461
+ ],
462
+ "metadata": {
463
+ "colab": {
464
+ "base_uri": "https://localhost:8080/"
465
+ },
466
+ "id": "Et0QfIxpJz_5",
467
+ "outputId": "e7e72965-f709-45c0-ae56-abf76b89d714"
468
+ },
469
+ "execution_count": null,
470
+ "outputs": [
471
+ {
472
+ "output_type": "stream",
473
+ "name": "stdout",
474
+ "text": [
475
+ "Reading words from: words.txt\n",
476
+ "Total raw entries: 466,550\n",
477
+ "Filtered to 125,414 words (alpha-only, len 3-7)\n",
478
+ "\n",
479
+ "Estimating output size (this may take a moment)...\n",
480
+ " Estimated permutations : 173,110,626\n",
481
+ " Estimated file size : 2.42 GB\n",
482
+ "\n",
483
+ "Proceeding with generation → misspellings_permutations.txt\n",
484
+ "============================================================\n",
485
+ " [ 4.0%] 5,000/125,414 words | 5,810,553 lines | 0.08 GB | 898 words/sec\n",
486
+ " [ 8.0%] 10,000/125,414 words | 11,972,245 lines | 0.18 GB | 781 words/sec\n",
487
+ " [ 12.0%] 15,000/125,414 words | 19,094,747 lines | 0.28 GB | 775 words/sec\n",
488
+ " [ 15.9%] 20,000/125,414 words | 26,800,249 lines | 0.39 GB | 721 words/sec\n",
489
+ " [ 19.9%] 25,000/125,414 words | 35,047,153 lines | 0.51 GB | 690 words/sec\n",
490
+ " [ 23.9%] 30,000/125,414 words | 42,273,166 lines | 0.62 GB | 695 words/sec\n",
491
+ " [ 27.9%] 35,000/125,414 words | 48,702,338 lines | 0.71 GB | 692 words/sec\n",
492
+ " [ 31.9%] 40,000/125,414 words | 55,295,151 lines | 0.81 GB | 703 words/sec\n",
493
+ " [ 35.9%] 45,000/125,414 words | 62,710,327 lines | 0.92 GB | 690 words/sec\n",
494
+ " [ 39.9%] 50,000/125,414 words | 69,722,485 lines | 1.02 GB | 690 words/sec\n",
495
+ " [ 43.9%] 55,000/125,414 words | 76,146,526 lines | 1.12 GB | 674 words/sec\n",
496
+ " [ 47.8%] 60,000/125,414 words | 81,994,038 lines | 1.20 GB | 686 words/sec\n",
497
+ " [ 51.8%] 65,000/125,414 words | 88,058,594 lines | 1.29 GB | 683 words/sec\n",
498
+ " [ 55.8%] 70,000/125,414 words | 94,651,291 lines | 1.39 GB | 688 words/sec\n",
499
+ " [ 59.8%] 75,000/125,414 words | 101,636,647 lines | 1.49 GB | 679 words/sec\n",
500
+ " [ 63.8%] 80,000/125,414 words | 107,086,424 lines | 1.57 GB | 691 words/sec\n",
501
+ " [ 67.8%] 85,000/125,414 words | 114,898,717 lines | 1.68 GB | 678 words/sec\n",
502
+ " [ 71.8%] 90,000/125,414 words | 123,278,791 lines | 1.80 GB | 675 words/sec\n",
503
+ " [ 75.7%] 95,000/125,414 words | 129,821,900 lines | 1.90 GB | 669 words/sec\n",
504
+ " [ 79.7%] 100,000/125,414 words | 136,429,269 lines | 2.00 GB | 673 words/sec\n",
505
+ " [ 83.7%] 105,000/125,414 words | 143,342,171 lines | 2.10 GB | 667 words/sec\n",
506
+ " [ 87.7%] 110,000/125,414 words | 150,701,210 lines | 2.21 GB | 666 words/sec\n",
507
+ " [ 91.7%] 115,000/125,414 words | 157,479,616 lines | 2.31 GB | 665 words/sec\n",
508
+ " [ 95.7%] 120,000/125,414 words | 165,619,673 lines | 2.43 GB | 662 words/sec\n",
509
+ " [ 99.7%] 125,000/125,414 words | 172,558,768 lines | 2.53 GB | 661 words/sec\n",
510
+ "\n",
511
+ "============================================================\n",
512
+ " ✅ DONE in 189.5s (3.2 min)\n",
513
+ " Words processed : 125,414\n",
514
+ " Lines written : 173,110,626\n",
515
+ " Output file : misspellings_permutations.txt\n",
516
+ " File size : 2.53 GB\n",
517
+ "============================================================\n"
518
+ ]
519
+ }
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "source": [
525
+ "# If saved to VM disk:\n",
526
+ "files.download('misspellings_permutations.txt')\n",
527
+ "\n",
528
+ "# If saved to Google Drive: just access it from drive.google.com\n",
529
+ "\n"
530
+ ],
531
+ "metadata": {
532
+ "id": "y9jWxvv8LWoH",
533
+ "outputId": "d8d754d3-234e-4020-bcc7-a19f3fc5fb26",
534
+ "colab": {
535
+ "base_uri": "https://localhost:8080/",
536
+ "height": 34
537
+ }
538
+ },
539
+ "execution_count": null,
540
+ "outputs": [
541
+ {
542
+ "output_type": "display_data",
543
+ "data": {
544
+ "text/plain": [
545
+ "<IPython.core.display.Javascript object>"
546
+ ],
547
+ "application/javascript": [
548
+ "\n",
549
+ " async function download(id, filename, size) {\n",
550
+ " if (!google.colab.kernel.accessAllowed) {\n",
551
+ " return;\n",
552
+ " }\n",
553
+ " const div = document.createElement('div');\n",
554
+ " const label = document.createElement('label');\n",
555
+ " label.textContent = `Downloading \"${filename}\": `;\n",
556
+ " div.appendChild(label);\n",
557
+ " const progress = document.createElement('progress');\n",
558
+ " progress.max = size;\n",
559
+ " div.appendChild(progress);\n",
560
+ " document.body.appendChild(div);\n",
561
+ "\n",
562
+ " const buffers = [];\n",
563
+ " let downloaded = 0;\n",
564
+ "\n",
565
+ " const channel = await google.colab.kernel.comms.open(id);\n",
566
+ " // Send a message to notify the kernel that we're ready.\n",
567
+ " channel.send({})\n",
568
+ "\n",
569
+ " for await (const message of channel.messages) {\n",
570
+ " // Send a message to notify the kernel that we're ready.\n",
571
+ " channel.send({})\n",
572
+ " if (message.buffers) {\n",
573
+ " for (const buffer of message.buffers) {\n",
574
+ " buffers.push(buffer);\n",
575
+ " downloaded += buffer.byteLength;\n",
576
+ " progress.value = downloaded;\n",
577
+ " }\n",
578
+ " }\n",
579
+ " }\n",
580
+ " const blob = new Blob(buffers, {type: 'application/binary'});\n",
581
+ " const a = document.createElement('a');\n",
582
+ " a.href = window.URL.createObjectURL(blob);\n",
583
+ " a.download = filename;\n",
584
+ " div.appendChild(a);\n",
585
+ " a.click();\n",
586
+ " div.remove();\n",
587
+ " }\n",
588
+ " "
589
+ ]
590
+ },
591
+ "metadata": {}
592
+ },
593
+ {
594
+ "output_type": "display_data",
595
+ "data": {
596
+ "text/plain": [
597
+ "<IPython.core.display.Javascript object>"
598
+ ],
599
+ "application/javascript": [
600
+ "download(\"download_10941777-78c6-4833-b8e6-093feee02e11\", \"misspellings_permutations.txt\", 2721877361)"
601
+ ]
602
+ },
603
+ "metadata": {}
604
+ }
605
+ ]
606
+ }
607
+ ]
608
+ }
google_collab_263MSW.ipynb ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {
21
+ "colab": {
22
+ "base_uri": "https://localhost:8080/",
23
+ "height": 73
24
+ },
25
+ "id": "NKDQAIA9bkTI",
26
+ "outputId": "b21b6fd4-cbe0-46f2-ae31-639ac21e04c4"
27
+ },
28
+ "outputs": [
29
+ {
30
+ "output_type": "display_data",
31
+ "data": {
32
+ "text/plain": [
33
+ "<IPython.core.display.HTML object>"
34
+ ],
35
+ "text/html": [
36
+ "\n",
37
+ " <input type=\"file\" id=\"files-27caebec-daaf-4dc1-9317-a13c04ecdb3b\" name=\"files[]\" multiple disabled\n",
38
+ " style=\"border:none\" />\n",
39
+ " <output id=\"result-27caebec-daaf-4dc1-9317-a13c04ecdb3b\">\n",
40
+ " Upload widget is only available when the cell has been executed in the\n",
41
+ " current browser session. Please rerun this cell to enable.\n",
42
+ " </output>\n",
43
+ " <script>// Copyright 2017 Google LLC\n",
44
+ "//\n",
45
+ "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
46
+ "// you may not use this file except in compliance with the License.\n",
47
+ "// You may obtain a copy of the License at\n",
48
+ "//\n",
49
+ "// http://www.apache.org/licenses/LICENSE-2.0\n",
50
+ "//\n",
51
+ "// Unless required by applicable law or agreed to in writing, software\n",
52
+ "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
53
+ "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
54
+ "// See the License for the specific language governing permissions and\n",
55
+ "// limitations under the License.\n",
56
+ "\n",
57
+ "/**\n",
58
+ " * @fileoverview Helpers for google.colab Python module.\n",
59
+ " */\n",
60
+ "(function(scope) {\n",
61
+ "function span(text, styleAttributes = {}) {\n",
62
+ " const element = document.createElement('span');\n",
63
+ " element.textContent = text;\n",
64
+ " for (const key of Object.keys(styleAttributes)) {\n",
65
+ " element.style[key] = styleAttributes[key];\n",
66
+ " }\n",
67
+ " return element;\n",
68
+ "}\n",
69
+ "\n",
70
+ "// Max number of bytes which will be uploaded at a time.\n",
71
+ "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
72
+ "\n",
73
+ "function _uploadFiles(inputId, outputId) {\n",
74
+ " const steps = uploadFilesStep(inputId, outputId);\n",
75
+ " const outputElement = document.getElementById(outputId);\n",
76
+ " // Cache steps on the outputElement to make it available for the next call\n",
77
+ " // to uploadFilesContinue from Python.\n",
78
+ " outputElement.steps = steps;\n",
79
+ "\n",
80
+ " return _uploadFilesContinue(outputId);\n",
81
+ "}\n",
82
+ "\n",
83
+ "// This is roughly an async generator (not supported in the browser yet),\n",
84
+ "// where there are multiple asynchronous steps and the Python side is going\n",
85
+ "// to poll for completion of each step.\n",
86
+ "// This uses a Promise to block the python side on completion of each step,\n",
87
+ "// then passes the result of the previous step as the input to the next step.\n",
88
+ "function _uploadFilesContinue(outputId) {\n",
89
+ " const outputElement = document.getElementById(outputId);\n",
90
+ " const steps = outputElement.steps;\n",
91
+ "\n",
92
+ " const next = steps.next(outputElement.lastPromiseValue);\n",
93
+ " return Promise.resolve(next.value.promise).then((value) => {\n",
94
+ " // Cache the last promise value to make it available to the next\n",
95
+ " // step of the generator.\n",
96
+ " outputElement.lastPromiseValue = value;\n",
97
+ " return next.value.response;\n",
98
+ " });\n",
99
+ "}\n",
100
+ "\n",
101
+ "/**\n",
102
+ " * Generator function which is called between each async step of the upload\n",
103
+ " * process.\n",
104
+ " * @param {string} inputId Element ID of the input file picker element.\n",
105
+ " * @param {string} outputId Element ID of the output display.\n",
106
+ " * @return {!Iterable<!Object>} Iterable of next steps.\n",
107
+ " */\n",
108
+ "function* uploadFilesStep(inputId, outputId) {\n",
109
+ " const inputElement = document.getElementById(inputId);\n",
110
+ " inputElement.disabled = false;\n",
111
+ "\n",
112
+ " const outputElement = document.getElementById(outputId);\n",
113
+ " outputElement.innerHTML = '';\n",
114
+ "\n",
115
+ " const pickedPromise = new Promise((resolve) => {\n",
116
+ " inputElement.addEventListener('change', (e) => {\n",
117
+ " resolve(e.target.files);\n",
118
+ " });\n",
119
+ " });\n",
120
+ "\n",
121
+ " const cancel = document.createElement('button');\n",
122
+ " inputElement.parentElement.appendChild(cancel);\n",
123
+ " cancel.textContent = 'Cancel upload';\n",
124
+ " const cancelPromise = new Promise((resolve) => {\n",
125
+ " cancel.onclick = () => {\n",
126
+ " resolve(null);\n",
127
+ " };\n",
128
+ " });\n",
129
+ "\n",
130
+ " // Wait for the user to pick the files.\n",
131
+ " const files = yield {\n",
132
+ " promise: Promise.race([pickedPromise, cancelPromise]),\n",
133
+ " response: {\n",
134
+ " action: 'starting',\n",
135
+ " }\n",
136
+ " };\n",
137
+ "\n",
138
+ " cancel.remove();\n",
139
+ "\n",
140
+ " // Disable the input element since further picks are not allowed.\n",
141
+ " inputElement.disabled = true;\n",
142
+ "\n",
143
+ " if (!files) {\n",
144
+ " return {\n",
145
+ " response: {\n",
146
+ " action: 'complete',\n",
147
+ " }\n",
148
+ " };\n",
149
+ " }\n",
150
+ "\n",
151
+ " for (const file of files) {\n",
152
+ " const li = document.createElement('li');\n",
153
+ " li.append(span(file.name, {fontWeight: 'bold'}));\n",
154
+ " li.append(span(\n",
155
+ " `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
156
+ " `last modified: ${\n",
157
+ " file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
158
+ " 'n/a'} - `));\n",
159
+ " const percent = span('0% done');\n",
160
+ " li.appendChild(percent);\n",
161
+ "\n",
162
+ " outputElement.appendChild(li);\n",
163
+ "\n",
164
+ " const fileDataPromise = new Promise((resolve) => {\n",
165
+ " const reader = new FileReader();\n",
166
+ " reader.onload = (e) => {\n",
167
+ " resolve(e.target.result);\n",
168
+ " };\n",
169
+ " reader.readAsArrayBuffer(file);\n",
170
+ " });\n",
171
+ " // Wait for the data to be ready.\n",
172
+ " let fileData = yield {\n",
173
+ " promise: fileDataPromise,\n",
174
+ " response: {\n",
175
+ " action: 'continue',\n",
176
+ " }\n",
177
+ " };\n",
178
+ "\n",
179
+ " // Use a chunked sending to avoid message size limits. See b/62115660.\n",
180
+ " let position = 0;\n",
181
+ " do {\n",
182
+ " const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
183
+ " const chunk = new Uint8Array(fileData, position, length);\n",
184
+ " position += length;\n",
185
+ "\n",
186
+ " const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
187
+ " yield {\n",
188
+ " response: {\n",
189
+ " action: 'append',\n",
190
+ " file: file.name,\n",
191
+ " data: base64,\n",
192
+ " },\n",
193
+ " };\n",
194
+ "\n",
195
+ " let percentDone = fileData.byteLength === 0 ?\n",
196
+ " 100 :\n",
197
+ " Math.round((position / fileData.byteLength) * 100);\n",
198
+ " percent.textContent = `${percentDone}% done`;\n",
199
+ "\n",
200
+ " } while (position < fileData.byteLength);\n",
201
+ " }\n",
202
+ "\n",
203
+ " // All done.\n",
204
+ " yield {\n",
205
+ " response: {\n",
206
+ " action: 'complete',\n",
207
+ " }\n",
208
+ " };\n",
209
+ "}\n",
210
+ "\n",
211
+ "scope.google = scope.google || {};\n",
212
+ "scope.google.colab = scope.google.colab || {};\n",
213
+ "scope.google.colab._files = {\n",
214
+ " _uploadFiles,\n",
215
+ " _uploadFilesContinue,\n",
216
+ "};\n",
217
+ "})(self);\n",
218
+ "</script> "
219
+ ]
220
+ },
221
+ "metadata": {}
222
+ },
223
+ {
224
+ "output_type": "stream",
225
+ "name": "stdout",
226
+ "text": [
227
+ "Saving words.txt to words.txt\n"
228
+ ]
229
+ }
230
+ ],
231
+ "source": [
232
+ "# Cell 1\n",
233
+ "from google.colab import files\n",
234
+ "uploaded = files.upload() # select words.txt from your PC\n"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "source": [
240
+ "\"\"\"\n",
241
+ "Generate realistic typo-based misspellings from words.txt → misspellings.txt\n",
242
+ "\n",
243
+ "Colab version\n",
244
+ "Place words.txt in /content/ before running\n",
245
+ "\"\"\"\n",
246
+ "\n",
247
+ "import os\n",
248
+ "import time\n",
249
+ "\n",
250
+ "# Optional: mount Google Drive if your file is there\n",
251
+ "# from google.colab import drive\n",
252
+ "# drive.mount('/content/drive')\n",
253
+ "# words_path = '/content/drive/MyDrive/words.txt'\n",
254
+ "\n",
255
+ "words_path = '/content/words.txt'\n",
256
+ "output_path = '/content/misspellings.txt'\n",
257
+ "\n",
258
+ "KEYBOARD_NEIGHBORS = {\n",
259
+ " 'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etfs', 't': 'rygs',\n",
260
+ " 'y': 'tuhs', 'u': 'yijs', 'i': 'uoks', 'o': 'ipls', 'p': 'o',\n",
261
+ " 'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',\n",
262
+ " 'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',\n",
263
+ " 'l': 'kop', 'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',\n",
264
+ " 'b': 'vghn', 'n': 'bhjm', 'm': 'njk',\n",
265
+ "}\n",
266
+ "\n",
267
+ "def generate_adjacent_swaps(word):\n",
268
+ " typos = []\n",
269
+ " for i in range(len(word) - 1):\n",
270
+ " chars = list(word)\n",
271
+ " chars[i], chars[i + 1] = chars[i + 1], chars[i]\n",
272
+ " typo = ''.join(chars)\n",
273
+ " if typo != word:\n",
274
+ " typos.append(typo)\n",
275
+ " return typos\n",
276
+ "\n",
277
+ "def generate_deletions(word):\n",
278
+ " typos = []\n",
279
+ " for i in range(len(word)):\n",
280
+ " typo = word[:i] + word[i + 1:]\n",
281
+ " if len(typo) >= 2:\n",
282
+ " typos.append(typo)\n",
283
+ " return typos\n",
284
+ "\n",
285
+ "def generate_duplications(word):\n",
286
+ " typos = []\n",
287
+ " for i in range(len(word)):\n",
288
+ " typo = word[:i] + word[i] + word[i:]\n",
289
+ " if typo != word:\n",
290
+ " typos.append(typo)\n",
291
+ " return typos\n",
292
+ "\n",
293
+ "def generate_nearby_key_subs(word):\n",
294
+ " typos = []\n",
295
+ " lower = word.lower()\n",
296
+ " for i in range(len(word)):\n",
297
+ " ch = lower[i]\n",
298
+ " if ch in KEYBOARD_NEIGHBORS:\n",
299
+ " for neighbor in KEYBOARD_NEIGHBORS[ch]:\n",
300
+ " typo = lower[:i] + neighbor + lower[i + 1:]\n",
301
+ " if typo != lower:\n",
302
+ " typos.append(typo)\n",
303
+ " return typos\n",
304
+ "\n",
305
+ "def generate_all_typos(word):\n",
306
+ " typos = set()\n",
307
+ " typos.update(generate_adjacent_swaps(word))\n",
308
+ " typos.update(generate_deletions(word))\n",
309
+ " typos.update(generate_duplications(word))\n",
310
+ " typos.update(generate_nearby_key_subs(word))\n",
311
+ " typos.discard(word)\n",
312
+ " typos.discard(word.lower())\n",
313
+ " return typos\n",
314
+ "\n",
315
+ "def is_pure_alpha(word):\n",
316
+ " return word.isalpha()\n",
317
+ "\n",
318
+ "# ── Check file ──────────────────────────────────────────────\n",
319
+ "if not os.path.exists(words_path):\n",
320
+ " raise FileNotFoundError(f\"{words_path} not found. Upload it to /content/ first.\")\n",
321
+ "\n",
322
+ "print(f\"Reading words from: {words_path}\")\n",
323
+ "\n",
324
+ "with open(words_path, 'r', encoding='utf-8', errors='replace') as f:\n",
325
+ " raw_words = [line.strip() for line in f if line.strip()]\n",
326
+ "\n",
327
+ "print(f\"Total raw entries: {len(raw_words):,}\")\n",
328
+ "\n",
329
+ "words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]\n",
330
+ "print(f\"Filtered to {len(words):,} alphabetical words (len >= 3)\")\n",
331
+ "\n",
332
+ "start = time.time()\n",
333
+ "total_typos = 0\n",
334
+ "batch_size = 10_000\n",
335
+ "\n",
336
+ "print(f\"Generating typos → {output_path}\")\n",
337
+ "\n",
338
+ "with open(output_path, 'w', encoding='utf-8', newline='\\n') as out:\n",
339
+ " out.write(\"# Auto-generated misspellings database\\n\")\n",
340
+ " out.write(\"# Format: misspelling=correction\\n\\n\")\n",
341
+ "\n",
342
+ " for idx, word in enumerate(words):\n",
343
+ " correction = word\n",
344
+ " typos = generate_all_typos(word.lower())\n",
345
+ "\n",
346
+ " for typo in sorted(typos):\n",
347
+ " out.write(f\"{typo}={correction}\\n\")\n",
348
+ " total_typos += 1\n",
349
+ "\n",
350
+ " if (idx + 1) % batch_size == 0:\n",
351
+ " elapsed = time.time() - start\n",
352
+ " pct = (idx + 1) / len(words) * 100\n",
353
+ " rate = (idx + 1) / elapsed if elapsed > 0 else 0\n",
354
+ " print(f\"[{pct:5.1f}%] {idx + 1:,}/{len(words):,} words | \"\n",
355
+ " f\"{total_typos:,} typos | {rate:.0f} words/sec\")\n",
356
+ "\n",
357
+ "elapsed = time.time() - start\n",
358
+ "file_size_mb = os.path.getsize(output_path) / (1024 * 1024)\n",
359
+ "\n",
360
+ "print(\"\\n\" + \"=\" * 60)\n",
361
+ "print(f\"Done in {elapsed:.1f}s\")\n",
362
+ "print(f\"Words processed : {len(words):,}\")\n",
363
+ "print(f\"Typos generated : {total_typos:,}\")\n",
364
+ "print(f\"Output file : {output_path}\")\n",
365
+ "print(f\"File size : {file_size_mb:.1f} MB\")\n",
366
+ "print(\"=\" * 60)"
367
+ ],
368
+ "metadata": {
369
+ "colab": {
370
+ "base_uri": "https://localhost:8080/"
371
+ },
372
+ "id": "8wpfrH2Rev6c",
373
+ "outputId": "c5b782a4-01e2-46e9-cf19-628f0315eb03"
374
+ },
375
+ "execution_count": 3,
376
+ "outputs": [
377
+ {
378
+ "output_type": "stream",
379
+ "name": "stdout",
380
+ "text": [
381
+ "Reading words from: /content/words.txt\n",
382
+ "Total raw entries: 466,550\n",
383
+ "Filtered to 415,701 alphabetical words (len >= 3)\n",
384
+ "Generating typos → /content/misspellings.txt\n",
385
+ "[ 2.4%] 10,000/415,701 words | 606,939 typos | 25472 words/sec\n",
386
+ "[ 4.8%] 20,000/415,701 words | 1,280,904 typos | 24508 words/sec\n",
387
+ "[ 7.2%] 30,000/415,701 words | 1,896,445 typos | 24634 words/sec\n",
388
+ "[ 9.6%] 40,000/415,701 words | 2,472,636 typos | 25175 words/sec\n",
389
+ "[ 12.0%] 50,000/415,701 words | 3,046,929 typos | 25615 words/sec\n",
390
+ "[ 14.4%] 60,000/415,701 words | 3,658,494 typos | 25610 words/sec\n",
391
+ "[ 16.8%] 70,000/415,701 words | 4,310,538 typos | 25453 words/sec\n",
392
+ "[ 19.2%] 80,000/415,701 words | 4,990,356 typos | 25166 words/sec\n",
393
+ "[ 21.7%] 90,000/415,701 words | 5,607,705 typos | 25045 words/sec\n",
394
+ "[ 24.1%] 100,000/415,701 words | 6,313,297 typos | 24478 words/sec\n",
395
+ "[ 26.5%] 110,000/415,701 words | 6,924,705 typos | 24476 words/sec\n",
396
+ "[ 28.9%] 120,000/415,701 words | 7,551,152 typos | 24435 words/sec\n",
397
+ "[ 31.3%] 130,000/415,701 words | 8,173,721 typos | 24412 words/sec\n",
398
+ "[ 33.7%] 140,000/415,701 words | 8,784,574 typos | 24411 words/sec\n",
399
+ "[ 36.1%] 150,000/415,701 words | 9,371,986 typos | 24565 words/sec\n",
400
+ "[ 38.5%] 160,000/415,701 words | 10,066,265 typos | 24395 words/sec\n",
401
+ "[ 40.9%] 170,000/415,701 words | 10,683,848 typos | 24422 words/sec\n",
402
+ "[ 43.3%] 180,000/415,701 words | 11,419,079 typos | 24226 words/sec\n",
403
+ "[ 45.7%] 190,000/415,701 words | 11,935,360 typos | 24456 words/sec\n",
404
+ "[ 48.1%] 200,000/415,701 words | 12,506,920 typos | 24350 words/sec\n",
405
+ "[ 50.5%] 210,000/415,701 words | 13,082,705 typos | 23918 words/sec\n",
406
+ "[ 52.9%] 220,000/415,701 words | 13,740,979 typos | 23111 words/sec\n",
407
+ "[ 55.3%] 230,000/415,701 words | 14,339,517 typos | 23098 words/sec\n",
408
+ "[ 57.7%] 240,000/415,701 words | 15,158,921 typos | 22855 words/sec\n",
409
+ "[ 60.1%] 250,000/415,701 words | 15,771,208 typos | 22941 words/sec\n",
410
+ "[ 62.5%] 260,000/415,701 words | 16,479,864 typos | 22901 words/sec\n",
411
+ "[ 65.0%] 270,000/415,701 words | 17,144,444 typos | 22915 words/sec\n",
412
+ "[ 67.4%] 280,000/415,701 words | 17,764,197 typos | 23001 words/sec\n",
413
+ "[ 69.8%] 290,000/415,701 words | 18,511,700 typos | 22932 words/sec\n",
414
+ "[ 72.2%] 300,000/415,701 words | 19,126,791 typos | 22983 words/sec\n",
415
+ "[ 74.6%] 310,000/415,701 words | 19,770,597 typos | 22941 words/sec\n",
416
+ "[ 77.0%] 320,000/415,701 words | 20,369,517 typos | 23014 words/sec\n",
417
+ "[ 79.4%] 330,000/415,701 words | 21,019,600 typos | 23035 words/sec\n",
418
+ "[ 81.8%] 340,000/415,701 words | 21,631,279 typos | 23071 words/sec\n",
419
+ "[ 84.2%] 350,000/415,701 words | 22,312,850 typos | 23047 words/sec\n",
420
+ "[ 86.6%] 360,000/415,701 words | 22,968,756 typos | 23043 words/sec\n",
421
+ "[ 89.0%] 370,000/415,701 words | 23,596,078 typos | 23056 words/sec\n",
422
+ "[ 91.4%] 380,000/415,701 words | 24,266,024 typos | 23043 words/sec\n",
423
+ "[ 93.8%] 390,000/415,701 words | 25,041,545 typos | 22925 words/sec\n",
424
+ "[ 96.2%] 400,000/415,701 words | 25,744,156 typos | 22899 words/sec\n",
425
+ "[ 98.6%] 410,000/415,701 words | 26,322,505 typos | 22958 words/sec\n",
426
+ "\n",
427
+ "============================================================\n",
428
+ "Done in 18.1s\n",
429
+ "Words processed : 415,701\n",
430
+ "Typos generated : 26,636,990\n",
431
+ "Output file : /content/misspellings.txt\n",
432
+ "File size : 566.3 MB\n",
433
+ "============================================================\n"
434
+ ]
435
+ }
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "source": [
441
+ "# If saved to VM disk:\n",
442
+ "files.download('misspellings.txt')\n",
443
+ "\n",
444
+ "# If saved to Google Drive: just access it from drive.google.com"
445
+ ],
446
+ "metadata": {
447
+ "colab": {
448
+ "base_uri": "https://localhost:8080/",
449
+ "height": 17
450
+ },
451
+ "id": "HVq_gU0qfG9u",
452
+ "outputId": "dc770f0b-76d2-4ad7-93ba-bef0e9da45e3"
453
+ },
454
+ "execution_count": 4,
455
+ "outputs": [
456
+ {
457
+ "output_type": "display_data",
458
+ "data": {
459
+ "text/plain": [
460
+ "<IPython.core.display.Javascript object>"
461
+ ],
462
+ "application/javascript": [
463
+ "\n",
464
+ " async function download(id, filename, size) {\n",
465
+ " if (!google.colab.kernel.accessAllowed) {\n",
466
+ " return;\n",
467
+ " }\n",
468
+ " const div = document.createElement('div');\n",
469
+ " const label = document.createElement('label');\n",
470
+ " label.textContent = `Downloading \"${filename}\": `;\n",
471
+ " div.appendChild(label);\n",
472
+ " const progress = document.createElement('progress');\n",
473
+ " progress.max = size;\n",
474
+ " div.appendChild(progress);\n",
475
+ " document.body.appendChild(div);\n",
476
+ "\n",
477
+ " const buffers = [];\n",
478
+ " let downloaded = 0;\n",
479
+ "\n",
480
+ " const channel = await google.colab.kernel.comms.open(id);\n",
481
+ " // Send a message to notify the kernel that we're ready.\n",
482
+ " channel.send({})\n",
483
+ "\n",
484
+ " for await (const message of channel.messages) {\n",
485
+ " // Send a message to notify the kernel that we're ready.\n",
486
+ " channel.send({})\n",
487
+ " if (message.buffers) {\n",
488
+ " for (const buffer of message.buffers) {\n",
489
+ " buffers.push(buffer);\n",
490
+ " downloaded += buffer.byteLength;\n",
491
+ " progress.value = downloaded;\n",
492
+ " }\n",
493
+ " }\n",
494
+ " }\n",
495
+ " const blob = new Blob(buffers, {type: 'application/binary'});\n",
496
+ " const a = document.createElement('a');\n",
497
+ " a.href = window.URL.createObjectURL(blob);\n",
498
+ " a.download = filename;\n",
499
+ " div.appendChild(a);\n",
500
+ " a.click();\n",
501
+ " div.remove();\n",
502
+ " }\n",
503
+ " "
504
+ ]
505
+ },
506
+ "metadata": {}
507
+ },
508
+ {
509
+ "output_type": "display_data",
510
+ "data": {
511
+ "text/plain": [
512
+ "<IPython.core.display.Javascript object>"
513
+ ],
514
+ "application/javascript": [
515
+ "download(\"download_ef5c634e-3ae3-4a85-a7b4-8f9422b11298\", \"misspellings.txt\", 593809553)"
516
+ ]
517
+ },
518
+ "metadata": {}
519
+ }
520
+ ]
521
+ }
522
+ ]
523
+ }