Upload 4 files
Browse files- generate_typos_colab.py +127 -0
- generate_typos_local.py +157 -0
- google_collab_173MSW.ipynb +608 -0
- google_collab_263MSW.ipynb +523 -0
generate_typos_colab.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate realistic typo-based misspellings from words.txt → misspellings.txt
|
| 3 |
+
|
| 4 |
+
Colab version
|
| 5 |
+
Place words.txt in /content/ before running
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import time
|
| 10 |
+
|
| 11 |
+
# Optional: mount Google Drive if your file is there
|
| 12 |
+
# from google.colab import drive
|
| 13 |
+
# drive.mount('/content/drive')
|
| 14 |
+
# words_path = '/content/drive/MyDrive/words.txt'
|
| 15 |
+
|
| 16 |
+
words_path = '/content/words.txt'
output_path = '/content/misspellings.txt'

# QWERTY adjacency map: each lowercase letter maps to the letters on the
# physically neighbouring keys (left/right on the same row plus the staggered
# keys on the rows directly above/below).  The relation is symmetric: whenever
# 'a' lists 'q', 'q' lists 'a'.
KEYBOARD_NEIGHBORS = {
    # top row
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    # home row
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop',
    # bottom row
    'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
|
| 27 |
+
|
| 28 |
+
def generate_adjacent_swaps(word):
    """Return every variant of ``word`` with one adjacent pair transposed.

    Transpositions that leave the word unchanged (a doubled letter) are
    skipped.  Results are ordered by the position of the swapped pair.
    """
    transposed = (
        word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
        for pos in range(len(word) - 1)
    )
    return [candidate for candidate in transposed if candidate != word]
|
| 37 |
+
|
| 38 |
+
def generate_deletions(word, min_length=2):
    """Return the distinct strings made by deleting one character of ``word``.

    Args:
        word: The correctly spelled word.
        min_length: Shortest result worth keeping; shorter fragments are
            dropped.  Defaults to 2.

    Returns:
        A list of unique typos in order of first appearance.
    """
    shortened = (word[:i] + word[i + 1:] for i in range(len(word)))
    # Deleting either letter of a doubled pair gives the same string
    # ("hello" -> "helo" twice); dict.fromkeys drops the repeat, keeps order.
    return list(dict.fromkeys(s for s in shortened if len(s) >= min_length))
|
| 45 |
+
|
| 46 |
+
def generate_duplications(word):
    """Return the distinct strings made by doubling one character of ``word``.

    Returns:
        A list of unique typos in order of first appearance.
    """
    # Doubling either letter of an existing pair yields the same string
    # ("hello" -> "helllo" twice), so de-duplicate while keeping order.  Each
    # result is one character longer than ``word`` and so can never equal it.
    return list(dict.fromkeys(
        word[:i] + ch + word[i:] for i, ch in enumerate(word)
    ))
|
| 53 |
+
|
| 54 |
+
def generate_nearby_key_subs(word, neighbors=None):
    """Return lowercase variants of ``word`` with one key replaced by a neighbour.

    Args:
        word: The correctly spelled word; it is lowercased before substitution.
        neighbors: Optional mapping from a character to a string of adjacent
            characters.  Defaults to the module-level ``KEYBOARD_NEIGHBORS``.

    Returns:
        A list of typos ordered by position, then by neighbour order.
        Characters without an entry in the mapping are left untouched.
    """
    if neighbors is None:
        neighbors = KEYBOARD_NEIGHBORS
    lower = word.lower()
    typos = []
    # Walk the lowercased text itself: lower() can change the length of some
    # Unicode strings, so indexing it with range(len(word)) would skip the
    # trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in neighbors.get(ch, ''):
            typo = lower[:i] + neighbor + lower[i + 1:]
            if typo != lower:
                typos.append(typo)
    return typos
|
| 65 |
+
|
| 66 |
+
def generate_all_typos(word):
    """Return the set of every typo variant produced for ``word``.

    Combines the adjacent-swap, deletion, duplication and nearby-key
    generators, then removes ``word`` and its lowercase form so a word is
    never listed as its own misspelling.
    """
    generators = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    variants = {typo for make in generators for typo in make(word)}
    return variants - {word, word.lower()}
|
| 75 |
+
|
| 76 |
+
def is_pure_alpha(word):
    """Return True when ``word`` is non-empty and consists only of letters.

    This is ``str.isalpha``: letters of any alphabet count, while digits,
    hyphens, apostrophes and whitespace do not.
    """
    only_letters = word.isalpha()
    return only_letters
|
| 78 |
+
|
| 79 |
+
# ── Check file ──────────────────────────────────────────────
if not os.path.exists(words_path):
    raise FileNotFoundError(f"{words_path} not found. Upload it to /content/ first.")

print(f"Reading words from: {words_path}")

# errors='replace' keeps the run alive on stray non-UTF-8 bytes instead of
# aborting on the first undecodable line.
with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
    raw_words = [line.strip() for line in f if line.strip()]

print(f"Total raw entries: {len(raw_words):,}")

words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

# A "typo" that is itself an entry of the word list (e.g. "form" for "from")
# is a correctly spelled word, not a misspelling, so it must not be mapped to
# a correction.  Compare against every raw entry, including the short ones
# that were filtered out above.
known_words = {w.lower() for w in raw_words}

# perf_counter is monotonic, so the elapsed time cannot go negative or jump
# if the system clock is adjusted mid-run.
start = time.perf_counter()
total_typos = 0
batch_size = 10_000

print(f"Generating typos → {output_path}")

with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
    out.write("# Auto-generated misspellings database\n")
    out.write("# Format: misspelling=correction\n\n")

    for done, word in enumerate(words, start=1):
        # Typos are generated from the lowercase form; the correction keeps
        # the casing found in the word list.
        typos = sorted(
            t for t in generate_all_typos(word.lower()) if t not in known_words
        )
        out.writelines(f"{typo}={word}\n" for typo in typos)
        total_typos += len(typos)

        if done % batch_size == 0:
            elapsed = time.perf_counter() - start
            pct = done / len(words) * 100
            rate = done / elapsed if elapsed > 0 else 0
            print(f"[{pct:5.1f}%] {done:,}/{len(words):,} words | "
                  f"{total_typos:,} typos | {rate:.0f} words/sec")

elapsed = time.perf_counter() - start
file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

print("\n" + "=" * 60)
print(f"Done in {elapsed:.1f}s")
print(f"Words processed : {len(words):,}")
print(f"Typos generated : {total_typos:,}")
print(f"Output file : {output_path}")
print(f"File size : {file_size_mb:.1f} MB")
print("=" * 60)
|
generate_typos_local.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate realistic typo-based misspellings from words.txt → misspellings.txt
|
| 3 |
+
|
| 4 |
+
Typo strategies:
|
| 5 |
+
1. Adjacent letter swaps ("hello" → "hlelo", "helol")
|
| 6 |
+
2. Single character deletion ("hello" → "hllo", "helo")
|
| 7 |
+
3. Single character duplication ("hello" → "hhello", "heello")
|
| 8 |
+
4. Nearby keyboard key sub ("hello" → "gello", "jello")
|
| 9 |
+
|
| 10 |
+
Output format: misspelling=correction (one per line)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import sys
|
| 14 |
+
import os
|
| 15 |
+
import time
|
| 16 |
+
|
| 17 |
+
# QWERTY keyboard proximity map: each lowercase letter maps to the letters on
# the physically neighbouring keys (left/right on the same row plus the
# staggered keys on the rows directly above/below).  The relation is
# symmetric: whenever 'a' lists 'q', 'q' lists 'a'.
KEYBOARD_NEIGHBORS = {
    # top row
    'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etdf', 't': 'ryfg',
    'y': 'tugh', 'u': 'yihj', 'i': 'uojk', 'o': 'ipkl', 'p': 'ol',
    # home row
    'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',
    'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',
    'l': 'kop',
    # bottom row
    'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',
    'b': 'vghn', 'n': 'bhjm', 'm': 'njk',
}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def generate_adjacent_swaps(word):
    """Transpose each neighbouring pair of characters in turn.

    A swap that reproduces ``word`` (two identical neighbours) is left out.
    The returned list is ordered by the index of the swapped pair.
    """
    results = []
    for left in range(len(word) - 1):
        right = left + 1
        swapped = word[:left] + word[right] + word[left] + word[right + 1:]
        if swapped == word:
            continue
        results.append(swapped)
    return results
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def generate_deletions(word, min_length=2):
    """Delete one character at a time and return the distinct results.

    Args:
        word: The correctly spelled word.
        min_length: Shortest result worth keeping; shorter fragments are
            dropped.  Defaults to 2.

    Returns:
        A list of unique typos in order of first appearance.
    """
    shortened = (word[:i] + word[i + 1:] for i in range(len(word)))
    # Deleting either letter of a doubled pair gives the same string
    # ("hello" -> "helo" twice); dict.fromkeys drops the repeat, keeps order.
    return list(dict.fromkeys(s for s in shortened if len(s) >= min_length))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def generate_duplications(word):
    """Duplicate one character at a time and return the distinct results.

    Returns:
        A list of unique typos in order of first appearance.
    """
    # Doubling either letter of an existing pair yields the same string
    # ("hello" -> "helllo" twice), so de-duplicate while keeping order.  Each
    # result is one character longer than ``word`` and so can never equal it.
    return list(dict.fromkeys(
        word[:i] + ch + word[i:] for i, ch in enumerate(word)
    ))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def generate_nearby_key_subs(word, neighbors=None):
    """Replace one character at a time with a nearby keyboard key.

    Args:
        word: The correctly spelled word; it is lowercased before substitution.
        neighbors: Optional mapping from a character to a string of adjacent
            characters.  Defaults to the module-level ``KEYBOARD_NEIGHBORS``.

    Returns:
        A list of lowercase typos ordered by position, then by neighbour
        order.  Characters without an entry in the mapping are left untouched.
    """
    if neighbors is None:
        neighbors = KEYBOARD_NEIGHBORS
    lower = word.lower()
    typos = []
    # Walk the lowercased text itself: lower() can change the length of some
    # Unicode strings, so indexing it with range(len(word)) would skip the
    # trailing characters.
    for i, ch in enumerate(lower):
        for neighbor in neighbors.get(ch, ''):
            typo = lower[:i] + neighbor + lower[i + 1:]
            if typo != lower:
                typos.append(typo)
    return typos
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def generate_all_typos(word):
    """Collect the typo variants of ``word`` from every strategy into one set.

    Runs the adjacent-swap, deletion, duplication and nearby-key generators
    in that order, then removes ``word`` and its lowercase form so a word is
    never mapped to itself.
    """
    strategies = (
        generate_adjacent_swaps,
        generate_deletions,
        generate_duplications,
        generate_nearby_key_subs,
    )
    collected = set()
    for strategy in strategies:
        collected |= set(strategy(word))
    return collected - {word, word.lower()}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def is_pure_alpha(word):
    """Return True when ``word`` is non-empty and consists only of letters.

    This is ``str.isalpha``: letters of any alphabet count (not only a-z),
    while digits, hyphens, apostrophes and whitespace do not.
    """
    only_letters = word.isalpha()
    return only_letters
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main():
    """Read ``data/words.txt`` and write ``data/misspellings.txt``.

    Both paths are resolved relative to this script's directory.  Every
    purely alphabetic word of length >= 3 is expanded with
    ``generate_all_typos`` and each typo is written as ``typo=word``.  Typos
    that are themselves entries of the word list are skipped, because they
    are correctly spelled words rather than misspellings.

    Exits with status 1 when the word list is missing.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    words_path = os.path.join(base_dir, 'data', 'words.txt')
    output_path = os.path.join(base_dir, 'data', 'misspellings.txt')

    if not os.path.exists(words_path):
        print(f"ERROR: {words_path} not found.")
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {words_path}")
    # errors='replace' keeps the run alive on stray non-UTF-8 bytes instead of
    # aborting on the first undecodable line.
    with open(words_path, 'r', encoding='utf-8', errors='replace') as f:
        raw_words = [line.strip() for line in f if line.strip()]

    print(f"Total raw entries: {len(raw_words):,}")

    # Filter to pure-alpha words with length >= 3
    words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]
    print(f"Filtered to {len(words):,} alphabetical words (len >= 3)")

    # A "typo" that is itself an entry of the word list (e.g. "form" for
    # "from") must not be mapped to a correction.  Compare against every raw
    # entry, including the short ones that were filtered out above.
    known_words = {w.lower() for w in raw_words}

    # ── Generate typos ──────────────────────────────────────────
    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the system clock is adjusted mid-run.
    start = time.perf_counter()
    total_typos = 0
    batch_size = 10_000

    print(f"Generating typos → {output_path}")
    print(f"This may take a few minutes for {len(words):,} words...")

    with open(output_path, 'w', encoding='utf-8', newline='\n') as out:
        out.write("# Auto-generated misspellings database\n")
        out.write("# Format: misspelling=correction\n")
        # Name the script that actually ran rather than a hard-coded filename.
        out.write(f"# Generated by {os.path.basename(__file__)}\n")
        out.write("#\n")
        out.write("# Strategies: adjacent swaps, deletions, duplications, keyboard proximity\n")
        out.write("\n")

        for done, word in enumerate(words, start=1):
            # Typos are generated from the lowercase form; the correction
            # keeps the casing found in the word list.
            typos = sorted(
                t for t in generate_all_typos(word.lower())
                if t not in known_words
            )
            out.writelines(f"{typo}={word}\n" for typo in typos)
            total_typos += len(typos)

            # Progress reporting
            if done % batch_size == 0:
                elapsed = time.perf_counter() - start
                pct = done / len(words) * 100
                rate = done / elapsed if elapsed > 0 else 0
                print(f" [{pct:5.1f}%] {done:>7,} / {len(words):,} words |"
                      f" {total_typos:>10,} typos | {rate:.0f} words/sec")

    elapsed = time.perf_counter() - start
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

    print()
    print("=" * 60)
    print(f" Done in {elapsed:.1f}s")
    print(f" Words processed : {len(words):,}")
    print(f" Typos generated : {total_typos:,}")
    print(f" Output file : {output_path}")
    print(f" File size : {file_size_mb:.1f} MB")
    print("=" * 60)


if __name__ == '__main__':
    main()
|
google_collab_173MSW.ipynb
ADDED
|
@@ -0,0 +1,608 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": null,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"colab": {
|
| 22 |
+
"base_uri": "https://localhost:8080/",
|
| 23 |
+
"height": 73
|
| 24 |
+
},
|
| 25 |
+
"id": "FaSiqVnTItLq",
|
| 26 |
+
"outputId": "1ef8a78c-7421-41eb-8cf4-db8426edeed9"
|
| 27 |
+
},
|
| 28 |
+
"outputs": [
|
| 29 |
+
{
|
| 30 |
+
"output_type": "display_data",
|
| 31 |
+
"data": {
|
| 32 |
+
"text/plain": [
|
| 33 |
+
"<IPython.core.display.HTML object>"
|
| 34 |
+
],
|
| 35 |
+
"text/html": [
|
| 36 |
+
"\n",
|
| 37 |
+
" <input type=\"file\" id=\"files-a9b68a68-38ec-4a0a-8037-171b8cfec796\" name=\"files[]\" multiple disabled\n",
|
| 38 |
+
" style=\"border:none\" />\n",
|
| 39 |
+
" <output id=\"result-a9b68a68-38ec-4a0a-8037-171b8cfec796\">\n",
|
| 40 |
+
" Upload widget is only available when the cell has been executed in the\n",
|
| 41 |
+
" current browser session. Please rerun this cell to enable.\n",
|
| 42 |
+
" </output>\n",
|
| 43 |
+
" <script>// Copyright 2017 Google LLC\n",
|
| 44 |
+
"//\n",
|
| 45 |
+
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
|
| 46 |
+
"// you may not use this file except in compliance with the License.\n",
|
| 47 |
+
"// You may obtain a copy of the License at\n",
|
| 48 |
+
"//\n",
|
| 49 |
+
"// http://www.apache.org/licenses/LICENSE-2.0\n",
|
| 50 |
+
"//\n",
|
| 51 |
+
"// Unless required by applicable law or agreed to in writing, software\n",
|
| 52 |
+
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
|
| 53 |
+
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
|
| 54 |
+
"// See the License for the specific language governing permissions and\n",
|
| 55 |
+
"// limitations under the License.\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"/**\n",
|
| 58 |
+
" * @fileoverview Helpers for google.colab Python module.\n",
|
| 59 |
+
" */\n",
|
| 60 |
+
"(function(scope) {\n",
|
| 61 |
+
"function span(text, styleAttributes = {}) {\n",
|
| 62 |
+
" const element = document.createElement('span');\n",
|
| 63 |
+
" element.textContent = text;\n",
|
| 64 |
+
" for (const key of Object.keys(styleAttributes)) {\n",
|
| 65 |
+
" element.style[key] = styleAttributes[key];\n",
|
| 66 |
+
" }\n",
|
| 67 |
+
" return element;\n",
|
| 68 |
+
"}\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"// Max number of bytes which will be uploaded at a time.\n",
|
| 71 |
+
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"function _uploadFiles(inputId, outputId) {\n",
|
| 74 |
+
" const steps = uploadFilesStep(inputId, outputId);\n",
|
| 75 |
+
" const outputElement = document.getElementById(outputId);\n",
|
| 76 |
+
" // Cache steps on the outputElement to make it available for the next call\n",
|
| 77 |
+
" // to uploadFilesContinue from Python.\n",
|
| 78 |
+
" outputElement.steps = steps;\n",
|
| 79 |
+
"\n",
|
| 80 |
+
" return _uploadFilesContinue(outputId);\n",
|
| 81 |
+
"}\n",
|
| 82 |
+
"\n",
|
| 83 |
+
"// This is roughly an async generator (not supported in the browser yet),\n",
|
| 84 |
+
"// where there are multiple asynchronous steps and the Python side is going\n",
|
| 85 |
+
"// to poll for completion of each step.\n",
|
| 86 |
+
"// This uses a Promise to block the python side on completion of each step,\n",
|
| 87 |
+
"// then passes the result of the previous step as the input to the next step.\n",
|
| 88 |
+
"function _uploadFilesContinue(outputId) {\n",
|
| 89 |
+
" const outputElement = document.getElementById(outputId);\n",
|
| 90 |
+
" const steps = outputElement.steps;\n",
|
| 91 |
+
"\n",
|
| 92 |
+
" const next = steps.next(outputElement.lastPromiseValue);\n",
|
| 93 |
+
" return Promise.resolve(next.value.promise).then((value) => {\n",
|
| 94 |
+
" // Cache the last promise value to make it available to the next\n",
|
| 95 |
+
" // step of the generator.\n",
|
| 96 |
+
" outputElement.lastPromiseValue = value;\n",
|
| 97 |
+
" return next.value.response;\n",
|
| 98 |
+
" });\n",
|
| 99 |
+
"}\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"/**\n",
|
| 102 |
+
" * Generator function which is called between each async step of the upload\n",
|
| 103 |
+
" * process.\n",
|
| 104 |
+
" * @param {string} inputId Element ID of the input file picker element.\n",
|
| 105 |
+
" * @param {string} outputId Element ID of the output display.\n",
|
| 106 |
+
" * @return {!Iterable<!Object>} Iterable of next steps.\n",
|
| 107 |
+
" */\n",
|
| 108 |
+
"function* uploadFilesStep(inputId, outputId) {\n",
|
| 109 |
+
" const inputElement = document.getElementById(inputId);\n",
|
| 110 |
+
" inputElement.disabled = false;\n",
|
| 111 |
+
"\n",
|
| 112 |
+
" const outputElement = document.getElementById(outputId);\n",
|
| 113 |
+
" outputElement.innerHTML = '';\n",
|
| 114 |
+
"\n",
|
| 115 |
+
" const pickedPromise = new Promise((resolve) => {\n",
|
| 116 |
+
" inputElement.addEventListener('change', (e) => {\n",
|
| 117 |
+
" resolve(e.target.files);\n",
|
| 118 |
+
" });\n",
|
| 119 |
+
" });\n",
|
| 120 |
+
"\n",
|
| 121 |
+
" const cancel = document.createElement('button');\n",
|
| 122 |
+
" inputElement.parentElement.appendChild(cancel);\n",
|
| 123 |
+
" cancel.textContent = 'Cancel upload';\n",
|
| 124 |
+
" const cancelPromise = new Promise((resolve) => {\n",
|
| 125 |
+
" cancel.onclick = () => {\n",
|
| 126 |
+
" resolve(null);\n",
|
| 127 |
+
" };\n",
|
| 128 |
+
" });\n",
|
| 129 |
+
"\n",
|
| 130 |
+
" // Wait for the user to pick the files.\n",
|
| 131 |
+
" const files = yield {\n",
|
| 132 |
+
" promise: Promise.race([pickedPromise, cancelPromise]),\n",
|
| 133 |
+
" response: {\n",
|
| 134 |
+
" action: 'starting',\n",
|
| 135 |
+
" }\n",
|
| 136 |
+
" };\n",
|
| 137 |
+
"\n",
|
| 138 |
+
" cancel.remove();\n",
|
| 139 |
+
"\n",
|
| 140 |
+
" // Disable the input element since further picks are not allowed.\n",
|
| 141 |
+
" inputElement.disabled = true;\n",
|
| 142 |
+
"\n",
|
| 143 |
+
" if (!files) {\n",
|
| 144 |
+
" return {\n",
|
| 145 |
+
" response: {\n",
|
| 146 |
+
" action: 'complete',\n",
|
| 147 |
+
" }\n",
|
| 148 |
+
" };\n",
|
| 149 |
+
" }\n",
|
| 150 |
+
"\n",
|
| 151 |
+
" for (const file of files) {\n",
|
| 152 |
+
" const li = document.createElement('li');\n",
|
| 153 |
+
" li.append(span(file.name, {fontWeight: 'bold'}));\n",
|
| 154 |
+
" li.append(span(\n",
|
| 155 |
+
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
|
| 156 |
+
" `last modified: ${\n",
|
| 157 |
+
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
|
| 158 |
+
" 'n/a'} - `));\n",
|
| 159 |
+
" const percent = span('0% done');\n",
|
| 160 |
+
" li.appendChild(percent);\n",
|
| 161 |
+
"\n",
|
| 162 |
+
" outputElement.appendChild(li);\n",
|
| 163 |
+
"\n",
|
| 164 |
+
" const fileDataPromise = new Promise((resolve) => {\n",
|
| 165 |
+
" const reader = new FileReader();\n",
|
| 166 |
+
" reader.onload = (e) => {\n",
|
| 167 |
+
" resolve(e.target.result);\n",
|
| 168 |
+
" };\n",
|
| 169 |
+
" reader.readAsArrayBuffer(file);\n",
|
| 170 |
+
" });\n",
|
| 171 |
+
" // Wait for the data to be ready.\n",
|
| 172 |
+
" let fileData = yield {\n",
|
| 173 |
+
" promise: fileDataPromise,\n",
|
| 174 |
+
" response: {\n",
|
| 175 |
+
" action: 'continue',\n",
|
| 176 |
+
" }\n",
|
| 177 |
+
" };\n",
|
| 178 |
+
"\n",
|
| 179 |
+
" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
|
| 180 |
+
" let position = 0;\n",
|
| 181 |
+
" do {\n",
|
| 182 |
+
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
|
| 183 |
+
" const chunk = new Uint8Array(fileData, position, length);\n",
|
| 184 |
+
" position += length;\n",
|
| 185 |
+
"\n",
|
| 186 |
+
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
|
| 187 |
+
" yield {\n",
|
| 188 |
+
" response: {\n",
|
| 189 |
+
" action: 'append',\n",
|
| 190 |
+
" file: file.name,\n",
|
| 191 |
+
" data: base64,\n",
|
| 192 |
+
" },\n",
|
| 193 |
+
" };\n",
|
| 194 |
+
"\n",
|
| 195 |
+
" let percentDone = fileData.byteLength === 0 ?\n",
|
| 196 |
+
" 100 :\n",
|
| 197 |
+
" Math.round((position / fileData.byteLength) * 100);\n",
|
| 198 |
+
" percent.textContent = `${percentDone}% done`;\n",
|
| 199 |
+
"\n",
|
| 200 |
+
" } while (position < fileData.byteLength);\n",
|
| 201 |
+
" }\n",
|
| 202 |
+
"\n",
|
| 203 |
+
" // All done.\n",
|
| 204 |
+
" yield {\n",
|
| 205 |
+
" response: {\n",
|
| 206 |
+
" action: 'complete',\n",
|
| 207 |
+
" }\n",
|
| 208 |
+
" };\n",
|
| 209 |
+
"}\n",
|
| 210 |
+
"\n",
|
| 211 |
+
"scope.google = scope.google || {};\n",
|
| 212 |
+
"scope.google.colab = scope.google.colab || {};\n",
|
| 213 |
+
"scope.google.colab._files = {\n",
|
| 214 |
+
" _uploadFiles,\n",
|
| 215 |
+
" _uploadFilesContinue,\n",
|
| 216 |
+
"};\n",
|
| 217 |
+
"})(self);\n",
|
| 218 |
+
"</script> "
|
| 219 |
+
]
|
| 220 |
+
},
|
| 221 |
+
"metadata": {}
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"output_type": "stream",
|
| 225 |
+
"name": "stdout",
|
| 226 |
+
"text": [
|
| 227 |
+
"Saving words.txt to words (1).txt\n"
|
| 228 |
+
]
|
| 229 |
+
}
|
| 230 |
+
],
|
| 231 |
+
"source": [
|
| 232 |
+
"# Cell 1\n",
|
| 233 |
+
"from google.colab import files\n",
|
| 234 |
+
"uploaded = files.upload() # select words.txt from your PC\n"
|
| 235 |
+
]
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"cell_type": "code",
|
| 239 |
+
"source": [
|
| 240 |
+
"\"\"\"\n",
|
| 241 |
+
"=============================================================================\n",
|
| 242 |
+
" FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition)\n",
|
| 243 |
+
"=============================================================================\n",
|
| 244 |
+
"\n",
|
| 245 |
+
"Purpose:\n",
|
| 246 |
+
" Generate ALL possible letter permutations of each word from words.txt\n",
|
| 247 |
+
" and write them as misspelling=correction pairs.\n",
|
| 248 |
+
"\n",
|
| 249 |
+
"⚠️ WARNING — READ BEFORE RUNNING ⚠️\n",
|
| 250 |
+
" This is computationally EXTREME. A single 10-letter word has 3,628,800\n",
|
| 251 |
+
" permutations. A 12-letter word has 479,001,600. For 466k words, the full\n",
|
| 252 |
+
" output could be PETABYTES. You WILL need to limit word length.\n",
|
| 253 |
+
"\n",
|
| 254 |
+
"=============================================================================\n",
|
| 255 |
+
" HOW TO USE ON GOOGLE COLAB\n",
|
| 256 |
+
"=============================================================================\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"1. Open Google Colab → https://colab.research.google.com\n",
|
| 259 |
+
"2. Create a new notebook (Python 3)\n",
|
| 260 |
+
"\n",
|
| 261 |
+
"3. Upload your words.txt:\n",
|
| 262 |
+
" ─────────────────────────────────────\n",
|
| 263 |
+
" # CELL 1: Upload words.txt\n",
|
| 264 |
+
" from google.colab import files\n",
|
| 265 |
+
" uploaded = files.upload() # click \"Choose Files\" → select words.txt\n",
|
| 266 |
+
" ─────────────────────────────────────\n",
|
| 267 |
+
"\n",
|
| 268 |
+
"4. Copy-paste this ENTIRE script into a new cell and run it.\n",
|
| 269 |
+
"\n",
|
| 270 |
+
"5. Download the result:\n",
|
| 271 |
+
" ─────────────────────────────────────\n",
|
| 272 |
+
" # CELL 3: Download the output\n",
|
| 273 |
+
" files.download('misspellings_permutations.txt')\n",
|
| 274 |
+
" ─────────────────────────────────────\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"=============================================================================\n",
|
| 277 |
+
" OR: Use Google Drive for large files\n",
|
| 278 |
+
"=============================================================================\n",
|
| 279 |
+
"\n",
|
| 280 |
+
" # Mount Google Drive (you get 15 GB free)\n",
|
| 281 |
+
" from google.colab import drive\n",
|
| 282 |
+
" drive.mount('/content/drive')\n",
|
| 283 |
+
"\n",
|
| 284 |
+
" # Then set OUTPUT_PATH below to:\n",
|
| 285 |
+
" OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"=============================================================================\n",
|
| 288 |
+
" CONFIGURATION — Adjust these before running!\n",
|
| 289 |
+
"=============================================================================\n",
|
| 290 |
+
"\"\"\"\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"import os\n",
|
| 293 |
+
"import sys\n",
|
| 294 |
+
"import time\n",
|
| 295 |
+
"import math\n",
|
| 296 |
+
"from itertools import permutations\n",
|
| 297 |
+
"\n",
|
| 298 |
+
"# ── CONFIGURATION ───────────────────────────────────────────────────────────\n",
|
| 299 |
+
"\n",
|
| 300 |
+
"WORDS_PATH = 'words.txt' # path to your words.txt\n",
|
| 301 |
+
"OUTPUT_PATH = 'misspellings_permutations.txt' # output file path\n",
|
| 302 |
+
"\n",
|
| 303 |
+
"MIN_WORD_LEN = 3 # skip words shorter than this\n",
|
| 304 |
+
"MAX_WORD_LEN = 7 # ⚠️ CRITICAL: max word length to permute\n",
|
| 305 |
+
" # 7 → max 5,040 perms/word (manageable)\n",
|
| 306 |
+
" # 8 → max 40,320 perms/word (large)\n",
|
| 307 |
+
" # 9 → max 362,880 perms/word (very large)\n",
|
| 308 |
+
" # 10 → max 3,628,800 perms/word (EXTREME)\n",
|
| 309 |
+
" # Increase at your own risk!\n",
|
| 310 |
+
"\n",
|
| 311 |
+
"ONLY_ALPHA = True # only process pure-alphabetical words\n",
|
| 312 |
+
"BATCH_LOG = 5000 # print progress every N words\n",
|
| 313 |
+
"\n",
|
| 314 |
+
"# ── ESTIMATION TABLE ────────────────────────────────────────────────────────\n",
|
| 315 |
+
"# Here's roughly how big the output gets at each MAX_WORD_LEN setting,\n",
|
| 316 |
+
"# assuming ~200k qualifying words at each length bracket:\n",
|
| 317 |
+
"#\n",
|
| 318 |
+
"# MAX_WORD_LEN │ Perms per word (worst) │ Rough output size\n",
|
| 319 |
+
"# ─────────────┼────────────────────────┼──────────────────\n",
|
| 320 |
+
"# 5 │ 120 │ ~200 MB\n",
|
| 321 |
+
"# 6 │ 720 │ ~1-2 GB\n",
|
| 322 |
+
"# 7 │ 5,040 │ ~5-15 GB\n",
|
| 323 |
+
"# 8 │ 40,320 │ ~50-150 GB\n",
|
| 324 |
+
"# 9 │ 362,880 │ ~500 GB - 1 TB\n",
|
| 325 |
+
"# 10 │ 3,628,800 │ ~5-50 TB ← won't fit anywhere\n",
|
| 326 |
+
"#\n",
|
| 327 |
+
"# Google Colab free tier gives you:\n",
|
| 328 |
+
"# • ~78 GB disk on the VM (temporary, lost on disconnect)\n",
|
| 329 |
+
"# • 15 GB Google Drive (persistent)\n",
|
| 330 |
+
"# • Colab Pro: 225 GB disk, longer runtimes\n",
|
| 331 |
+
"#\n",
|
| 332 |
+
"# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,\n",
|
| 333 |
+
"# then increase if you have space.\n",
|
| 334 |
+
"# ────────────────────────────────────────────────────────────────────────────\n",
|
| 335 |
+
"\n",
|
| 336 |
+
"\n",
|
| 337 |
+
"def estimate_output(words):\n",
|
| 338 |
+
" \"\"\"Estimate total permutations and file size before generating.\"\"\"\n",
|
| 339 |
+
" total_perms = 0\n",
|
| 340 |
+
" for w in words:\n",
|
| 341 |
+
" n = len(w)\n",
|
| 342 |
+
" # Account for duplicate letters: n! / (c1! * c2! * ...)\n",
|
| 343 |
+
" freq = {}\n",
|
| 344 |
+
" for ch in w.lower():\n",
|
| 345 |
+
" freq[ch] = freq.get(ch, 0) + 1\n",
|
| 346 |
+
" unique_perms = math.factorial(n)\n",
|
| 347 |
+
" for count in freq.values():\n",
|
| 348 |
+
" unique_perms //= math.factorial(count)\n",
|
| 349 |
+
" total_perms += unique_perms - 1 # subtract the original word\n",
|
| 350 |
+
"\n",
|
| 351 |
+
" # Estimate ~15 bytes per line (avg) → \"typo=word\\n\"\n",
|
| 352 |
+
" avg_bytes_per_line = 15\n",
|
| 353 |
+
" est_bytes = total_perms * avg_bytes_per_line\n",
|
| 354 |
+
" est_gb = est_bytes / (1024 ** 3)\n",
|
| 355 |
+
"\n",
|
| 356 |
+
" return total_perms, est_gb\n",
|
| 357 |
+
"\n",
|
| 358 |
+
"\n",
|
| 359 |
+
"def generate_unique_permutations(word):\n",
|
| 360 |
+
" \"\"\"\n",
|
| 361 |
+
" Generate all unique permutations of a word's letters,\n",
|
| 362 |
+
" excluding the original word itself.\n",
|
| 363 |
+
"\n",
|
| 364 |
+
"    Collects every ordering in a set() so repeated letters yield no duplicate entries.\n",
|
| 365 |
+
" \"\"\"\n",
|
| 366 |
+
" lower = word.lower()\n",
|
| 367 |
+
" perms = set(''.join(p) for p in permutations(lower))\n",
|
| 368 |
+
" perms.discard(lower) # remove the correctly-spelled word\n",
|
| 369 |
+
" return perms\n",
|
| 370 |
+
"\n",
|
| 371 |
+
"\n",
|
| 372 |
+
"def is_pure_alpha(word):\n",
|
| 373 |
+
" return word.isalpha()\n",
|
| 374 |
+
"\n",
|
| 375 |
+
"\n",
|
| 376 |
+
"def main():\n",
|
| 377 |
+
" if not os.path.exists(WORDS_PATH):\n",
|
| 378 |
+
" print(f\"ERROR: '{WORDS_PATH}' not found!\")\n",
|
| 379 |
+
" print(\"Make sure you uploaded words.txt or set WORDS_PATH correctly.\")\n",
|
| 380 |
+
" sys.exit(1)\n",
|
| 381 |
+
"\n",
|
| 382 |
+
" # ── Read words ──────────────────────────────────────────────\n",
|
| 383 |
+
" print(f\"Reading words from: {WORDS_PATH}\")\n",
|
| 384 |
+
" with open(WORDS_PATH, 'r', encoding='utf-8', errors='replace') as f:\n",
|
| 385 |
+
" raw_words = [line.strip() for line in f if line.strip()]\n",
|
| 386 |
+
"\n",
|
| 387 |
+
" print(f\"Total raw entries: {len(raw_words):,}\")\n",
|
| 388 |
+
"\n",
|
| 389 |
+
" # Filter\n",
|
| 390 |
+
" words = []\n",
|
| 391 |
+
" for w in raw_words:\n",
|
| 392 |
+
" if ONLY_ALPHA and not is_pure_alpha(w):\n",
|
| 393 |
+
" continue\n",
|
| 394 |
+
" if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:\n",
|
| 395 |
+
" continue\n",
|
| 396 |
+
" words.append(w)\n",
|
| 397 |
+
"\n",
|
| 398 |
+
" print(f\"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})\")\n",
|
| 399 |
+
"\n",
|
| 400 |
+
" if len(words) == 0:\n",
|
| 401 |
+
" print(\"No words matched the filter. Adjust MIN/MAX_WORD_LEN.\")\n",
|
| 402 |
+
" sys.exit(1)\n",
|
| 403 |
+
"\n",
|
| 404 |
+
" # ── Estimate ────────────────────────────────────────────────\n",
|
| 405 |
+
" print(\"\\nEstimating output size (this may take a moment)...\")\n",
|
| 406 |
+
" total_perms, est_gb = estimate_output(words)\n",
|
| 407 |
+
" print(f\" Estimated permutations : {total_perms:,}\")\n",
|
| 408 |
+
" print(f\" Estimated file size : {est_gb:.2f} GB\")\n",
|
| 409 |
+
"\n",
|
| 410 |
+
" # Safety check\n",
|
| 411 |
+
" if est_gb > 70:\n",
|
| 412 |
+
" print(f\"\\n⚠️ WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).\")\n",
|
| 413 |
+
" print(\" Reduce MAX_WORD_LEN or the script will crash when disk fills up.\")\n",
|
| 414 |
+
" print(\" Aborting. Set MAX_WORD_LEN lower and re-run.\")\n",
|
| 415 |
+
" sys.exit(1)\n",
|
| 416 |
+
"\n",
|
| 417 |
+
" print(f\"\\nProceeding with generation → {OUTPUT_PATH}\")\n",
|
| 418 |
+
" print(\"=\" * 60)\n",
|
| 419 |
+
"\n",
|
| 420 |
+
" # ── Generate ────────────────────────────────────────────────\n",
|
| 421 |
+
" start = time.time()\n",
|
| 422 |
+
" total_written = 0\n",
|
| 423 |
+
"\n",
|
| 424 |
+
" with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:\n",
|
| 425 |
+
" out.write(\"# Auto-generated FULL PERMUTATION misspellings\\n\")\n",
|
| 426 |
+
" out.write(f\"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\\n\")\n",
|
| 427 |
+
" out.write(\"# Format: misspelling=correction\\n\\n\")\n",
|
| 428 |
+
"\n",
|
| 429 |
+
" for idx, word in enumerate(words):\n",
|
| 430 |
+
" perms = generate_unique_permutations(word)\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" for typo in sorted(perms):\n",
|
| 433 |
+
" out.write(f\"{typo}={word}\\n\")\n",
|
| 434 |
+
" total_written += 1\n",
|
| 435 |
+
"\n",
|
| 436 |
+
" # Progress\n",
|
| 437 |
+
" if (idx + 1) % BATCH_LOG == 0:\n",
|
| 438 |
+
" elapsed = time.time() - start\n",
|
| 439 |
+
" pct = (idx + 1) / len(words) * 100\n",
|
| 440 |
+
" rate = (idx + 1) / elapsed if elapsed > 0 else 0\n",
|
| 441 |
+
" cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)\n",
|
| 442 |
+
" print(f\" [{pct:5.1f}%] {idx+1:>7,}/{len(words):,} words |\"\n",
|
| 443 |
+
" f\" {total_written:>12,} lines | {cur_size:.2f} GB |\"\n",
|
| 444 |
+
" f\" {rate:.0f} words/sec\")\n",
|
| 445 |
+
"\n",
|
| 446 |
+
" elapsed = time.time() - start\n",
|
| 447 |
+
" final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)\n",
|
| 448 |
+
"\n",
|
| 449 |
+
" print()\n",
|
| 450 |
+
" print(\"=\" * 60)\n",
|
| 451 |
+
" print(f\" ✅ DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)\")\n",
|
| 452 |
+
" print(f\" Words processed : {len(words):,}\")\n",
|
| 453 |
+
" print(f\" Lines written : {total_written:,}\")\n",
|
| 454 |
+
" print(f\" Output file : {OUTPUT_PATH}\")\n",
|
| 455 |
+
" print(f\" File size : {final_size:.2f} GB\")\n",
|
| 456 |
+
" print(\"=\" * 60)\n",
|
| 457 |
+
"\n",
|
| 458 |
+
"\n",
|
| 459 |
+
"if __name__ == '__main__':\n",
|
| 460 |
+
" main()\n"
|
| 461 |
+
],
|
| 462 |
+
"metadata": {
|
| 463 |
+
"colab": {
|
| 464 |
+
"base_uri": "https://localhost:8080/"
|
| 465 |
+
},
|
| 466 |
+
"id": "Et0QfIxpJz_5",
|
| 467 |
+
"outputId": "e7e72965-f709-45c0-ae56-abf76b89d714"
|
| 468 |
+
},
|
| 469 |
+
"execution_count": null,
|
| 470 |
+
"outputs": [
|
| 471 |
+
{
|
| 472 |
+
"output_type": "stream",
|
| 473 |
+
"name": "stdout",
|
| 474 |
+
"text": [
|
| 475 |
+
"Reading words from: words.txt\n",
|
| 476 |
+
"Total raw entries: 466,550\n",
|
| 477 |
+
"Filtered to 125,414 words (alpha-only, len 3-7)\n",
|
| 478 |
+
"\n",
|
| 479 |
+
"Estimating output size (this may take a moment)...\n",
|
| 480 |
+
" Estimated permutations : 173,110,626\n",
|
| 481 |
+
" Estimated file size : 2.42 GB\n",
|
| 482 |
+
"\n",
|
| 483 |
+
"Proceeding with generation → misspellings_permutations.txt\n",
|
| 484 |
+
"============================================================\n",
|
| 485 |
+
" [ 4.0%] 5,000/125,414 words | 5,810,553 lines | 0.08 GB | 898 words/sec\n",
|
| 486 |
+
" [ 8.0%] 10,000/125,414 words | 11,972,245 lines | 0.18 GB | 781 words/sec\n",
|
| 487 |
+
" [ 12.0%] 15,000/125,414 words | 19,094,747 lines | 0.28 GB | 775 words/sec\n",
|
| 488 |
+
" [ 15.9%] 20,000/125,414 words | 26,800,249 lines | 0.39 GB | 721 words/sec\n",
|
| 489 |
+
" [ 19.9%] 25,000/125,414 words | 35,047,153 lines | 0.51 GB | 690 words/sec\n",
|
| 490 |
+
" [ 23.9%] 30,000/125,414 words | 42,273,166 lines | 0.62 GB | 695 words/sec\n",
|
| 491 |
+
" [ 27.9%] 35,000/125,414 words | 48,702,338 lines | 0.71 GB | 692 words/sec\n",
|
| 492 |
+
" [ 31.9%] 40,000/125,414 words | 55,295,151 lines | 0.81 GB | 703 words/sec\n",
|
| 493 |
+
" [ 35.9%] 45,000/125,414 words | 62,710,327 lines | 0.92 GB | 690 words/sec\n",
|
| 494 |
+
" [ 39.9%] 50,000/125,414 words | 69,722,485 lines | 1.02 GB | 690 words/sec\n",
|
| 495 |
+
" [ 43.9%] 55,000/125,414 words | 76,146,526 lines | 1.12 GB | 674 words/sec\n",
|
| 496 |
+
" [ 47.8%] 60,000/125,414 words | 81,994,038 lines | 1.20 GB | 686 words/sec\n",
|
| 497 |
+
" [ 51.8%] 65,000/125,414 words | 88,058,594 lines | 1.29 GB | 683 words/sec\n",
|
| 498 |
+
" [ 55.8%] 70,000/125,414 words | 94,651,291 lines | 1.39 GB | 688 words/sec\n",
|
| 499 |
+
" [ 59.8%] 75,000/125,414 words | 101,636,647 lines | 1.49 GB | 679 words/sec\n",
|
| 500 |
+
" [ 63.8%] 80,000/125,414 words | 107,086,424 lines | 1.57 GB | 691 words/sec\n",
|
| 501 |
+
" [ 67.8%] 85,000/125,414 words | 114,898,717 lines | 1.68 GB | 678 words/sec\n",
|
| 502 |
+
" [ 71.8%] 90,000/125,414 words | 123,278,791 lines | 1.80 GB | 675 words/sec\n",
|
| 503 |
+
" [ 75.7%] 95,000/125,414 words | 129,821,900 lines | 1.90 GB | 669 words/sec\n",
|
| 504 |
+
" [ 79.7%] 100,000/125,414 words | 136,429,269 lines | 2.00 GB | 673 words/sec\n",
|
| 505 |
+
" [ 83.7%] 105,000/125,414 words | 143,342,171 lines | 2.10 GB | 667 words/sec\n",
|
| 506 |
+
" [ 87.7%] 110,000/125,414 words | 150,701,210 lines | 2.21 GB | 666 words/sec\n",
|
| 507 |
+
" [ 91.7%] 115,000/125,414 words | 157,479,616 lines | 2.31 GB | 665 words/sec\n",
|
| 508 |
+
" [ 95.7%] 120,000/125,414 words | 165,619,673 lines | 2.43 GB | 662 words/sec\n",
|
| 509 |
+
" [ 99.7%] 125,000/125,414 words | 172,558,768 lines | 2.53 GB | 661 words/sec\n",
|
| 510 |
+
"\n",
|
| 511 |
+
"============================================================\n",
|
| 512 |
+
" ✅ DONE in 189.5s (3.2 min)\n",
|
| 513 |
+
" Words processed : 125,414\n",
|
| 514 |
+
" Lines written : 173,110,626\n",
|
| 515 |
+
" Output file : misspellings_permutations.txt\n",
|
| 516 |
+
" File size : 2.53 GB\n",
|
| 517 |
+
"============================================================\n"
|
| 518 |
+
]
|
| 519 |
+
}
|
| 520 |
+
]
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"cell_type": "code",
|
| 524 |
+
"source": [
|
| 525 |
+
"# If saved to VM disk:\n",
|
| 526 |
+
"files.download('misspellings_permutations.txt')\n",
|
| 527 |
+
"\n",
|
| 528 |
+
"# If saved to Google Drive: just access it from drive.google.com\n",
|
| 529 |
+
"\n"
|
| 530 |
+
],
|
| 531 |
+
"metadata": {
|
| 532 |
+
"id": "y9jWxvv8LWoH",
|
| 533 |
+
"outputId": "d8d754d3-234e-4020-bcc7-a19f3fc5fb26",
|
| 534 |
+
"colab": {
|
| 535 |
+
"base_uri": "https://localhost:8080/",
|
| 536 |
+
"height": 34
|
| 537 |
+
}
|
| 538 |
+
},
|
| 539 |
+
"execution_count": null,
|
| 540 |
+
"outputs": [
|
| 541 |
+
{
|
| 542 |
+
"output_type": "display_data",
|
| 543 |
+
"data": {
|
| 544 |
+
"text/plain": [
|
| 545 |
+
"<IPython.core.display.Javascript object>"
|
| 546 |
+
],
|
| 547 |
+
"application/javascript": [
|
| 548 |
+
"\n",
|
| 549 |
+
" async function download(id, filename, size) {\n",
|
| 550 |
+
" if (!google.colab.kernel.accessAllowed) {\n",
|
| 551 |
+
" return;\n",
|
| 552 |
+
" }\n",
|
| 553 |
+
" const div = document.createElement('div');\n",
|
| 554 |
+
" const label = document.createElement('label');\n",
|
| 555 |
+
" label.textContent = `Downloading \"${filename}\": `;\n",
|
| 556 |
+
" div.appendChild(label);\n",
|
| 557 |
+
" const progress = document.createElement('progress');\n",
|
| 558 |
+
" progress.max = size;\n",
|
| 559 |
+
" div.appendChild(progress);\n",
|
| 560 |
+
" document.body.appendChild(div);\n",
|
| 561 |
+
"\n",
|
| 562 |
+
" const buffers = [];\n",
|
| 563 |
+
" let downloaded = 0;\n",
|
| 564 |
+
"\n",
|
| 565 |
+
" const channel = await google.colab.kernel.comms.open(id);\n",
|
| 566 |
+
" // Send a message to notify the kernel that we're ready.\n",
|
| 567 |
+
" channel.send({})\n",
|
| 568 |
+
"\n",
|
| 569 |
+
" for await (const message of channel.messages) {\n",
|
| 570 |
+
" // Send a message to notify the kernel that we're ready.\n",
|
| 571 |
+
" channel.send({})\n",
|
| 572 |
+
" if (message.buffers) {\n",
|
| 573 |
+
" for (const buffer of message.buffers) {\n",
|
| 574 |
+
" buffers.push(buffer);\n",
|
| 575 |
+
" downloaded += buffer.byteLength;\n",
|
| 576 |
+
" progress.value = downloaded;\n",
|
| 577 |
+
" }\n",
|
| 578 |
+
" }\n",
|
| 579 |
+
" }\n",
|
| 580 |
+
" const blob = new Blob(buffers, {type: 'application/binary'});\n",
|
| 581 |
+
" const a = document.createElement('a');\n",
|
| 582 |
+
" a.href = window.URL.createObjectURL(blob);\n",
|
| 583 |
+
" a.download = filename;\n",
|
| 584 |
+
" div.appendChild(a);\n",
|
| 585 |
+
" a.click();\n",
|
| 586 |
+
" div.remove();\n",
|
| 587 |
+
" }\n",
|
| 588 |
+
" "
|
| 589 |
+
]
|
| 590 |
+
},
|
| 591 |
+
"metadata": {}
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"output_type": "display_data",
|
| 595 |
+
"data": {
|
| 596 |
+
"text/plain": [
|
| 597 |
+
"<IPython.core.display.Javascript object>"
|
| 598 |
+
],
|
| 599 |
+
"application/javascript": [
|
| 600 |
+
"download(\"download_10941777-78c6-4833-b8e6-093feee02e11\", \"misspellings_permutations.txt\", 2721877361)"
|
| 601 |
+
]
|
| 602 |
+
},
|
| 603 |
+
"metadata": {}
|
| 604 |
+
}
|
| 605 |
+
]
|
| 606 |
+
}
|
| 607 |
+
]
|
| 608 |
+
}
|
google_collab_263MSW.ipynb
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 1,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"colab": {
|
| 22 |
+
"base_uri": "https://localhost:8080/",
|
| 23 |
+
"height": 73
|
| 24 |
+
},
|
| 25 |
+
"id": "NKDQAIA9bkTI",
|
| 26 |
+
"outputId": "b21b6fd4-cbe0-46f2-ae31-639ac21e04c4"
|
| 27 |
+
},
|
| 28 |
+
"outputs": [
|
| 29 |
+
{
|
| 30 |
+
"output_type": "display_data",
|
| 31 |
+
"data": {
|
| 32 |
+
"text/plain": [
|
| 33 |
+
"<IPython.core.display.HTML object>"
|
| 34 |
+
],
|
| 35 |
+
"text/html": [
|
| 36 |
+
"\n",
|
| 37 |
+
" <input type=\"file\" id=\"files-27caebec-daaf-4dc1-9317-a13c04ecdb3b\" name=\"files[]\" multiple disabled\n",
|
| 38 |
+
" style=\"border:none\" />\n",
|
| 39 |
+
" <output id=\"result-27caebec-daaf-4dc1-9317-a13c04ecdb3b\">\n",
|
| 40 |
+
" Upload widget is only available when the cell has been executed in the\n",
|
| 41 |
+
" current browser session. Please rerun this cell to enable.\n",
|
| 42 |
+
" </output>\n",
|
| 43 |
+
" <script>// Copyright 2017 Google LLC\n",
|
| 44 |
+
"//\n",
|
| 45 |
+
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
|
| 46 |
+
"// you may not use this file except in compliance with the License.\n",
|
| 47 |
+
"// You may obtain a copy of the License at\n",
|
| 48 |
+
"//\n",
|
| 49 |
+
"// http://www.apache.org/licenses/LICENSE-2.0\n",
|
| 50 |
+
"//\n",
|
| 51 |
+
"// Unless required by applicable law or agreed to in writing, software\n",
|
| 52 |
+
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
|
| 53 |
+
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
|
| 54 |
+
"// See the License for the specific language governing permissions and\n",
|
| 55 |
+
"// limitations under the License.\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"/**\n",
|
| 58 |
+
" * @fileoverview Helpers for google.colab Python module.\n",
|
| 59 |
+
" */\n",
|
| 60 |
+
"(function(scope) {\n",
|
| 61 |
+
"function span(text, styleAttributes = {}) {\n",
|
| 62 |
+
" const element = document.createElement('span');\n",
|
| 63 |
+
" element.textContent = text;\n",
|
| 64 |
+
" for (const key of Object.keys(styleAttributes)) {\n",
|
| 65 |
+
" element.style[key] = styleAttributes[key];\n",
|
| 66 |
+
" }\n",
|
| 67 |
+
" return element;\n",
|
| 68 |
+
"}\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"// Max number of bytes which will be uploaded at a time.\n",
|
| 71 |
+
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"function _uploadFiles(inputId, outputId) {\n",
|
| 74 |
+
" const steps = uploadFilesStep(inputId, outputId);\n",
|
| 75 |
+
" const outputElement = document.getElementById(outputId);\n",
|
| 76 |
+
" // Cache steps on the outputElement to make it available for the next call\n",
|
| 77 |
+
" // to uploadFilesContinue from Python.\n",
|
| 78 |
+
" outputElement.steps = steps;\n",
|
| 79 |
+
"\n",
|
| 80 |
+
" return _uploadFilesContinue(outputId);\n",
|
| 81 |
+
"}\n",
|
| 82 |
+
"\n",
|
| 83 |
+
"// This is roughly an async generator (not supported in the browser yet),\n",
|
| 84 |
+
"// where there are multiple asynchronous steps and the Python side is going\n",
|
| 85 |
+
"// to poll for completion of each step.\n",
|
| 86 |
+
"// This uses a Promise to block the python side on completion of each step,\n",
|
| 87 |
+
"// then passes the result of the previous step as the input to the next step.\n",
|
| 88 |
+
"function _uploadFilesContinue(outputId) {\n",
|
| 89 |
+
" const outputElement = document.getElementById(outputId);\n",
|
| 90 |
+
" const steps = outputElement.steps;\n",
|
| 91 |
+
"\n",
|
| 92 |
+
" const next = steps.next(outputElement.lastPromiseValue);\n",
|
| 93 |
+
" return Promise.resolve(next.value.promise).then((value) => {\n",
|
| 94 |
+
" // Cache the last promise value to make it available to the next\n",
|
| 95 |
+
" // step of the generator.\n",
|
| 96 |
+
" outputElement.lastPromiseValue = value;\n",
|
| 97 |
+
" return next.value.response;\n",
|
| 98 |
+
" });\n",
|
| 99 |
+
"}\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"/**\n",
|
| 102 |
+
" * Generator function which is called between each async step of the upload\n",
|
| 103 |
+
" * process.\n",
|
| 104 |
+
" * @param {string} inputId Element ID of the input file picker element.\n",
|
| 105 |
+
" * @param {string} outputId Element ID of the output display.\n",
|
| 106 |
+
" * @return {!Iterable<!Object>} Iterable of next steps.\n",
|
| 107 |
+
" */\n",
|
| 108 |
+
"function* uploadFilesStep(inputId, outputId) {\n",
|
| 109 |
+
" const inputElement = document.getElementById(inputId);\n",
|
| 110 |
+
" inputElement.disabled = false;\n",
|
| 111 |
+
"\n",
|
| 112 |
+
" const outputElement = document.getElementById(outputId);\n",
|
| 113 |
+
" outputElement.innerHTML = '';\n",
|
| 114 |
+
"\n",
|
| 115 |
+
" const pickedPromise = new Promise((resolve) => {\n",
|
| 116 |
+
" inputElement.addEventListener('change', (e) => {\n",
|
| 117 |
+
" resolve(e.target.files);\n",
|
| 118 |
+
" });\n",
|
| 119 |
+
" });\n",
|
| 120 |
+
"\n",
|
| 121 |
+
" const cancel = document.createElement('button');\n",
|
| 122 |
+
" inputElement.parentElement.appendChild(cancel);\n",
|
| 123 |
+
" cancel.textContent = 'Cancel upload';\n",
|
| 124 |
+
" const cancelPromise = new Promise((resolve) => {\n",
|
| 125 |
+
" cancel.onclick = () => {\n",
|
| 126 |
+
" resolve(null);\n",
|
| 127 |
+
" };\n",
|
| 128 |
+
" });\n",
|
| 129 |
+
"\n",
|
| 130 |
+
" // Wait for the user to pick the files.\n",
|
| 131 |
+
" const files = yield {\n",
|
| 132 |
+
" promise: Promise.race([pickedPromise, cancelPromise]),\n",
|
| 133 |
+
" response: {\n",
|
| 134 |
+
" action: 'starting',\n",
|
| 135 |
+
" }\n",
|
| 136 |
+
" };\n",
|
| 137 |
+
"\n",
|
| 138 |
+
" cancel.remove();\n",
|
| 139 |
+
"\n",
|
| 140 |
+
" // Disable the input element since further picks are not allowed.\n",
|
| 141 |
+
" inputElement.disabled = true;\n",
|
| 142 |
+
"\n",
|
| 143 |
+
" if (!files) {\n",
|
| 144 |
+
" return {\n",
|
| 145 |
+
" response: {\n",
|
| 146 |
+
" action: 'complete',\n",
|
| 147 |
+
" }\n",
|
| 148 |
+
" };\n",
|
| 149 |
+
" }\n",
|
| 150 |
+
"\n",
|
| 151 |
+
" for (const file of files) {\n",
|
| 152 |
+
" const li = document.createElement('li');\n",
|
| 153 |
+
" li.append(span(file.name, {fontWeight: 'bold'}));\n",
|
| 154 |
+
" li.append(span(\n",
|
| 155 |
+
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
|
| 156 |
+
" `last modified: ${\n",
|
| 157 |
+
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
|
| 158 |
+
" 'n/a'} - `));\n",
|
| 159 |
+
" const percent = span('0% done');\n",
|
| 160 |
+
" li.appendChild(percent);\n",
|
| 161 |
+
"\n",
|
| 162 |
+
" outputElement.appendChild(li);\n",
|
| 163 |
+
"\n",
|
| 164 |
+
" const fileDataPromise = new Promise((resolve) => {\n",
|
| 165 |
+
" const reader = new FileReader();\n",
|
| 166 |
+
" reader.onload = (e) => {\n",
|
| 167 |
+
" resolve(e.target.result);\n",
|
| 168 |
+
" };\n",
|
| 169 |
+
" reader.readAsArrayBuffer(file);\n",
|
| 170 |
+
" });\n",
|
| 171 |
+
" // Wait for the data to be ready.\n",
|
| 172 |
+
" let fileData = yield {\n",
|
| 173 |
+
" promise: fileDataPromise,\n",
|
| 174 |
+
" response: {\n",
|
| 175 |
+
" action: 'continue',\n",
|
| 176 |
+
" }\n",
|
| 177 |
+
" };\n",
|
| 178 |
+
"\n",
|
| 179 |
+
" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
|
| 180 |
+
" let position = 0;\n",
|
| 181 |
+
" do {\n",
|
| 182 |
+
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
|
| 183 |
+
" const chunk = new Uint8Array(fileData, position, length);\n",
|
| 184 |
+
" position += length;\n",
|
| 185 |
+
"\n",
|
| 186 |
+
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
|
| 187 |
+
" yield {\n",
|
| 188 |
+
" response: {\n",
|
| 189 |
+
" action: 'append',\n",
|
| 190 |
+
" file: file.name,\n",
|
| 191 |
+
" data: base64,\n",
|
| 192 |
+
" },\n",
|
| 193 |
+
" };\n",
|
| 194 |
+
"\n",
|
| 195 |
+
" let percentDone = fileData.byteLength === 0 ?\n",
|
| 196 |
+
" 100 :\n",
|
| 197 |
+
" Math.round((position / fileData.byteLength) * 100);\n",
|
| 198 |
+
" percent.textContent = `${percentDone}% done`;\n",
|
| 199 |
+
"\n",
|
| 200 |
+
" } while (position < fileData.byteLength);\n",
|
| 201 |
+
" }\n",
|
| 202 |
+
"\n",
|
| 203 |
+
" // All done.\n",
|
| 204 |
+
" yield {\n",
|
| 205 |
+
" response: {\n",
|
| 206 |
+
" action: 'complete',\n",
|
| 207 |
+
" }\n",
|
| 208 |
+
" };\n",
|
| 209 |
+
"}\n",
|
| 210 |
+
"\n",
|
| 211 |
+
"scope.google = scope.google || {};\n",
|
| 212 |
+
"scope.google.colab = scope.google.colab || {};\n",
|
| 213 |
+
"scope.google.colab._files = {\n",
|
| 214 |
+
" _uploadFiles,\n",
|
| 215 |
+
" _uploadFilesContinue,\n",
|
| 216 |
+
"};\n",
|
| 217 |
+
"})(self);\n",
|
| 218 |
+
"</script> "
|
| 219 |
+
]
|
| 220 |
+
},
|
| 221 |
+
"metadata": {}
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"output_type": "stream",
|
| 225 |
+
"name": "stdout",
|
| 226 |
+
"text": [
|
| 227 |
+
"Saving words.txt to words.txt\n"
|
| 228 |
+
]
|
| 229 |
+
}
|
| 230 |
+
],
|
| 231 |
+
"source": [
|
| 232 |
+
"# Cell 1\n",
|
| 233 |
+
"from google.colab import files\n",
|
| 234 |
+
"uploaded = files.upload() # select words.txt from your PC\n"
|
| 235 |
+
]
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"cell_type": "code",
|
| 239 |
+
"source": [
|
| 240 |
+
"\"\"\"\n",
|
| 241 |
+
"Generate realistic typo-based misspellings from words.txt → misspellings.txt\n",
|
| 242 |
+
"\n",
|
| 243 |
+
"Colab version\n",
|
| 244 |
+
"Place words.txt in /content/ before running\n",
|
| 245 |
+
"\"\"\"\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"import os\n",
|
| 248 |
+
"import time\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"# Optional: mount Google Drive if your file is there\n",
|
| 251 |
+
"# from google.colab import drive\n",
|
| 252 |
+
"# drive.mount('/content/drive')\n",
|
| 253 |
+
"# words_path = '/content/drive/MyDrive/words.txt'\n",
|
| 254 |
+
"\n",
|
| 255 |
+
"words_path = '/content/words.txt'\n",
|
| 256 |
+
"output_path = '/content/misspellings.txt'\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"KEYBOARD_NEIGHBORS = {\n",
|
| 259 |
+
" 'q': 'wa', 'w': 'qeas', 'e': 'wrds', 'r': 'etfs', 't': 'rygs',\n",
|
| 260 |
+
" 'y': 'tuhs', 'u': 'yijs', 'i': 'uoks', 'o': 'ipls', 'p': 'o',\n",
|
| 261 |
+
" 'a': 'qwsz', 's': 'awedxz', 'd': 'serfcx', 'f': 'drtgvc',\n",
|
| 262 |
+
" 'g': 'ftyhbv', 'h': 'gyujnb', 'j': 'huikmn', 'k': 'jiolm',\n",
|
| 263 |
+
" 'l': 'kop', 'z': 'asx', 'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb',\n",
|
| 264 |
+
" 'b': 'vghn', 'n': 'bhjm', 'm': 'njk',\n",
|
| 265 |
+
"}\n",
|
| 266 |
+
"\n",
|
| 267 |
+
"def generate_adjacent_swaps(word):\n",
|
| 268 |
+
" typos = []\n",
|
| 269 |
+
" for i in range(len(word) - 1):\n",
|
| 270 |
+
" chars = list(word)\n",
|
| 271 |
+
" chars[i], chars[i + 1] = chars[i + 1], chars[i]\n",
|
| 272 |
+
" typo = ''.join(chars)\n",
|
| 273 |
+
" if typo != word:\n",
|
| 274 |
+
" typos.append(typo)\n",
|
| 275 |
+
" return typos\n",
|
| 276 |
+
"\n",
|
| 277 |
+
"def generate_deletions(word):\n",
|
| 278 |
+
" typos = []\n",
|
| 279 |
+
" for i in range(len(word)):\n",
|
| 280 |
+
" typo = word[:i] + word[i + 1:]\n",
|
| 281 |
+
" if len(typo) >= 2:\n",
|
| 282 |
+
" typos.append(typo)\n",
|
| 283 |
+
" return typos\n",
|
| 284 |
+
"\n",
|
| 285 |
+
"def generate_duplications(word):\n",
|
| 286 |
+
" typos = []\n",
|
| 287 |
+
" for i in range(len(word)):\n",
|
| 288 |
+
" typo = word[:i] + word[i] + word[i:]\n",
|
| 289 |
+
" if typo != word:\n",
|
| 290 |
+
" typos.append(typo)\n",
|
| 291 |
+
" return typos\n",
|
| 292 |
+
"\n",
|
| 293 |
+
"def generate_nearby_key_subs(word):\n",
|
| 294 |
+
" typos = []\n",
|
| 295 |
+
" lower = word.lower()\n",
|
| 296 |
+
" for i in range(len(word)):\n",
|
| 297 |
+
" ch = lower[i]\n",
|
| 298 |
+
" if ch in KEYBOARD_NEIGHBORS:\n",
|
| 299 |
+
" for neighbor in KEYBOARD_NEIGHBORS[ch]:\n",
|
| 300 |
+
" typo = lower[:i] + neighbor + lower[i + 1:]\n",
|
| 301 |
+
" if typo != lower:\n",
|
| 302 |
+
" typos.append(typo)\n",
|
| 303 |
+
" return typos\n",
|
| 304 |
+
"\n",
|
| 305 |
+
"def generate_all_typos(word):\n",
|
| 306 |
+
" typos = set()\n",
|
| 307 |
+
" typos.update(generate_adjacent_swaps(word))\n",
|
| 308 |
+
" typos.update(generate_deletions(word))\n",
|
| 309 |
+
" typos.update(generate_duplications(word))\n",
|
| 310 |
+
" typos.update(generate_nearby_key_subs(word))\n",
|
| 311 |
+
" typos.discard(word)\n",
|
| 312 |
+
" typos.discard(word.lower())\n",
|
| 313 |
+
" return typos\n",
|
| 314 |
+
"\n",
|
| 315 |
+
"def is_pure_alpha(word):\n",
|
| 316 |
+
" return word.isalpha()\n",
|
| 317 |
+
"\n",
|
| 318 |
+
"# ── Check file ──────────────────────────────────────────────\n",
|
| 319 |
+
"if not os.path.exists(words_path):\n",
|
| 320 |
+
" raise FileNotFoundError(f\"{words_path} not found. Upload it to /content/ first.\")\n",
|
| 321 |
+
"\n",
|
| 322 |
+
"print(f\"Reading words from: {words_path}\")\n",
|
| 323 |
+
"\n",
|
| 324 |
+
"with open(words_path, 'r', encoding='utf-8', errors='replace') as f:\n",
|
| 325 |
+
" raw_words = [line.strip() for line in f if line.strip()]\n",
|
| 326 |
+
"\n",
|
| 327 |
+
"print(f\"Total raw entries: {len(raw_words):,}\")\n",
|
| 328 |
+
"\n",
|
| 329 |
+
"words = [w for w in raw_words if is_pure_alpha(w) and len(w) >= 3]\n",
|
| 330 |
+
"print(f\"Filtered to {len(words):,} alphabetical words (len >= 3)\")\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"start = time.time()\n",
|
| 333 |
+
"total_typos = 0\n",
|
| 334 |
+
"batch_size = 10_000\n",
|
| 335 |
+
"\n",
|
| 336 |
+
"print(f\"Generating typos → {output_path}\")\n",
|
| 337 |
+
"\n",
|
| 338 |
+
"with open(output_path, 'w', encoding='utf-8', newline='\\n') as out:\n",
|
| 339 |
+
" out.write(\"# Auto-generated misspellings database\\n\")\n",
|
| 340 |
+
" out.write(\"# Format: misspelling=correction\\n\\n\")\n",
|
| 341 |
+
"\n",
|
| 342 |
+
" for idx, word in enumerate(words):\n",
|
| 343 |
+
" correction = word\n",
|
| 344 |
+
" typos = generate_all_typos(word.lower())\n",
|
| 345 |
+
"\n",
|
| 346 |
+
" for typo in sorted(typos):\n",
|
| 347 |
+
" out.write(f\"{typo}={correction}\\n\")\n",
|
| 348 |
+
" total_typos += 1\n",
|
| 349 |
+
"\n",
|
| 350 |
+
" if (idx + 1) % batch_size == 0:\n",
|
| 351 |
+
" elapsed = time.time() - start\n",
|
| 352 |
+
" pct = (idx + 1) / len(words) * 100\n",
|
| 353 |
+
" rate = (idx + 1) / elapsed if elapsed > 0 else 0\n",
|
| 354 |
+
" print(f\"[{pct:5.1f}%] {idx + 1:,}/{len(words):,} words | \"\n",
|
| 355 |
+
" f\"{total_typos:,} typos | {rate:.0f} words/sec\")\n",
|
| 356 |
+
"\n",
|
| 357 |
+
"elapsed = time.time() - start\n",
|
| 358 |
+
"file_size_mb = os.path.getsize(output_path) / (1024 * 1024)\n",
|
| 359 |
+
"\n",
|
| 360 |
+
"print(\"\\n\" + \"=\" * 60)\n",
|
| 361 |
+
"print(f\"Done in {elapsed:.1f}s\")\n",
|
| 362 |
+
"print(f\"Words processed : {len(words):,}\")\n",
|
| 363 |
+
"print(f\"Typos generated : {total_typos:,}\")\n",
|
| 364 |
+
"print(f\"Output file : {output_path}\")\n",
|
| 365 |
+
"print(f\"File size : {file_size_mb:.1f} MB\")\n",
|
| 366 |
+
"print(\"=\" * 60)"
|
| 367 |
+
],
|
| 368 |
+
"metadata": {
|
| 369 |
+
"colab": {
|
| 370 |
+
"base_uri": "https://localhost:8080/"
|
| 371 |
+
},
|
| 372 |
+
"id": "8wpfrH2Rev6c",
|
| 373 |
+
"outputId": "c5b782a4-01e2-46e9-cf19-628f0315eb03"
|
| 374 |
+
},
|
| 375 |
+
"execution_count": 3,
|
| 376 |
+
"outputs": [
|
| 377 |
+
{
|
| 378 |
+
"output_type": "stream",
|
| 379 |
+
"name": "stdout",
|
| 380 |
+
"text": [
|
| 381 |
+
"Reading words from: /content/words.txt\n",
|
| 382 |
+
"Total raw entries: 466,550\n",
|
| 383 |
+
"Filtered to 415,701 alphabetical words (len >= 3)\n",
|
| 384 |
+
"Generating typos → /content/misspellings.txt\n",
|
| 385 |
+
"[ 2.4%] 10,000/415,701 words | 606,939 typos | 25472 words/sec\n",
|
| 386 |
+
"[ 4.8%] 20,000/415,701 words | 1,280,904 typos | 24508 words/sec\n",
|
| 387 |
+
"[ 7.2%] 30,000/415,701 words | 1,896,445 typos | 24634 words/sec\n",
|
| 388 |
+
"[ 9.6%] 40,000/415,701 words | 2,472,636 typos | 25175 words/sec\n",
|
| 389 |
+
"[ 12.0%] 50,000/415,701 words | 3,046,929 typos | 25615 words/sec\n",
|
| 390 |
+
"[ 14.4%] 60,000/415,701 words | 3,658,494 typos | 25610 words/sec\n",
|
| 391 |
+
"[ 16.8%] 70,000/415,701 words | 4,310,538 typos | 25453 words/sec\n",
|
| 392 |
+
"[ 19.2%] 80,000/415,701 words | 4,990,356 typos | 25166 words/sec\n",
|
| 393 |
+
"[ 21.7%] 90,000/415,701 words | 5,607,705 typos | 25045 words/sec\n",
|
| 394 |
+
"[ 24.1%] 100,000/415,701 words | 6,313,297 typos | 24478 words/sec\n",
|
| 395 |
+
"[ 26.5%] 110,000/415,701 words | 6,924,705 typos | 24476 words/sec\n",
|
| 396 |
+
"[ 28.9%] 120,000/415,701 words | 7,551,152 typos | 24435 words/sec\n",
|
| 397 |
+
"[ 31.3%] 130,000/415,701 words | 8,173,721 typos | 24412 words/sec\n",
|
| 398 |
+
"[ 33.7%] 140,000/415,701 words | 8,784,574 typos | 24411 words/sec\n",
|
| 399 |
+
"[ 36.1%] 150,000/415,701 words | 9,371,986 typos | 24565 words/sec\n",
|
| 400 |
+
"[ 38.5%] 160,000/415,701 words | 10,066,265 typos | 24395 words/sec\n",
|
| 401 |
+
"[ 40.9%] 170,000/415,701 words | 10,683,848 typos | 24422 words/sec\n",
|
| 402 |
+
"[ 43.3%] 180,000/415,701 words | 11,419,079 typos | 24226 words/sec\n",
|
| 403 |
+
"[ 45.7%] 190,000/415,701 words | 11,935,360 typos | 24456 words/sec\n",
|
| 404 |
+
"[ 48.1%] 200,000/415,701 words | 12,506,920 typos | 24350 words/sec\n",
|
| 405 |
+
"[ 50.5%] 210,000/415,701 words | 13,082,705 typos | 23918 words/sec\n",
|
| 406 |
+
"[ 52.9%] 220,000/415,701 words | 13,740,979 typos | 23111 words/sec\n",
|
| 407 |
+
"[ 55.3%] 230,000/415,701 words | 14,339,517 typos | 23098 words/sec\n",
|
| 408 |
+
"[ 57.7%] 240,000/415,701 words | 15,158,921 typos | 22855 words/sec\n",
|
| 409 |
+
"[ 60.1%] 250,000/415,701 words | 15,771,208 typos | 22941 words/sec\n",
|
| 410 |
+
"[ 62.5%] 260,000/415,701 words | 16,479,864 typos | 22901 words/sec\n",
|
| 411 |
+
"[ 65.0%] 270,000/415,701 words | 17,144,444 typos | 22915 words/sec\n",
|
| 412 |
+
"[ 67.4%] 280,000/415,701 words | 17,764,197 typos | 23001 words/sec\n",
|
| 413 |
+
"[ 69.8%] 290,000/415,701 words | 18,511,700 typos | 22932 words/sec\n",
|
| 414 |
+
"[ 72.2%] 300,000/415,701 words | 19,126,791 typos | 22983 words/sec\n",
|
| 415 |
+
"[ 74.6%] 310,000/415,701 words | 19,770,597 typos | 22941 words/sec\n",
|
| 416 |
+
"[ 77.0%] 320,000/415,701 words | 20,369,517 typos | 23014 words/sec\n",
|
| 417 |
+
"[ 79.4%] 330,000/415,701 words | 21,019,600 typos | 23035 words/sec\n",
|
| 418 |
+
"[ 81.8%] 340,000/415,701 words | 21,631,279 typos | 23071 words/sec\n",
|
| 419 |
+
"[ 84.2%] 350,000/415,701 words | 22,312,850 typos | 23047 words/sec\n",
|
| 420 |
+
"[ 86.6%] 360,000/415,701 words | 22,968,756 typos | 23043 words/sec\n",
|
| 421 |
+
"[ 89.0%] 370,000/415,701 words | 23,596,078 typos | 23056 words/sec\n",
|
| 422 |
+
"[ 91.4%] 380,000/415,701 words | 24,266,024 typos | 23043 words/sec\n",
|
| 423 |
+
"[ 93.8%] 390,000/415,701 words | 25,041,545 typos | 22925 words/sec\n",
|
| 424 |
+
"[ 96.2%] 400,000/415,701 words | 25,744,156 typos | 22899 words/sec\n",
|
| 425 |
+
"[ 98.6%] 410,000/415,701 words | 26,322,505 typos | 22958 words/sec\n",
|
| 426 |
+
"\n",
|
| 427 |
+
"============================================================\n",
|
| 428 |
+
"Done in 18.1s\n",
|
| 429 |
+
"Words processed : 415,701\n",
|
| 430 |
+
"Typos generated : 26,636,990\n",
|
| 431 |
+
"Output file : /content/misspellings.txt\n",
|
| 432 |
+
"File size : 566.3 MB\n",
|
| 433 |
+
"============================================================\n"
|
| 434 |
+
]
|
| 435 |
+
}
|
| 436 |
+
]
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"cell_type": "code",
|
| 440 |
+
"source": [
|
| 441 |
+
"# If saved to VM disk:\n",
|
| 442 |
+
"files.download('misspellings.txt')\n",
|
| 443 |
+
"\n",
|
| 444 |
+
"# If saved to Google Drive: just access it from drive.google.com"
|
| 445 |
+
],
|
| 446 |
+
"metadata": {
|
| 447 |
+
"colab": {
|
| 448 |
+
"base_uri": "https://localhost:8080/",
|
| 449 |
+
"height": 17
|
| 450 |
+
},
|
| 451 |
+
"id": "HVq_gU0qfG9u",
|
| 452 |
+
"outputId": "dc770f0b-76d2-4ad7-93ba-bef0e9da45e3"
|
| 453 |
+
},
|
| 454 |
+
"execution_count": 4,
|
| 455 |
+
"outputs": [
|
| 456 |
+
{
|
| 457 |
+
"output_type": "display_data",
|
| 458 |
+
"data": {
|
| 459 |
+
"text/plain": [
|
| 460 |
+
"<IPython.core.display.Javascript object>"
|
| 461 |
+
],
|
| 462 |
+
"application/javascript": [
|
| 463 |
+
"\n",
|
| 464 |
+
" async function download(id, filename, size) {\n",
|
| 465 |
+
" if (!google.colab.kernel.accessAllowed) {\n",
|
| 466 |
+
" return;\n",
|
| 467 |
+
" }\n",
|
| 468 |
+
" const div = document.createElement('div');\n",
|
| 469 |
+
" const label = document.createElement('label');\n",
|
| 470 |
+
" label.textContent = `Downloading \"${filename}\": `;\n",
|
| 471 |
+
" div.appendChild(label);\n",
|
| 472 |
+
" const progress = document.createElement('progress');\n",
|
| 473 |
+
" progress.max = size;\n",
|
| 474 |
+
" div.appendChild(progress);\n",
|
| 475 |
+
" document.body.appendChild(div);\n",
|
| 476 |
+
"\n",
|
| 477 |
+
" const buffers = [];\n",
|
| 478 |
+
" let downloaded = 0;\n",
|
| 479 |
+
"\n",
|
| 480 |
+
" const channel = await google.colab.kernel.comms.open(id);\n",
|
| 481 |
+
" // Send a message to notify the kernel that we're ready.\n",
|
| 482 |
+
" channel.send({})\n",
|
| 483 |
+
"\n",
|
| 484 |
+
" for await (const message of channel.messages) {\n",
|
| 485 |
+
" // Send a message to notify the kernel that we're ready.\n",
|
| 486 |
+
" channel.send({})\n",
|
| 487 |
+
" if (message.buffers) {\n",
|
| 488 |
+
" for (const buffer of message.buffers) {\n",
|
| 489 |
+
" buffers.push(buffer);\n",
|
| 490 |
+
" downloaded += buffer.byteLength;\n",
|
| 491 |
+
" progress.value = downloaded;\n",
|
| 492 |
+
" }\n",
|
| 493 |
+
" }\n",
|
| 494 |
+
" }\n",
|
| 495 |
+
" const blob = new Blob(buffers, {type: 'application/binary'});\n",
|
| 496 |
+
" const a = document.createElement('a');\n",
|
| 497 |
+
" a.href = window.URL.createObjectURL(blob);\n",
|
| 498 |
+
" a.download = filename;\n",
|
| 499 |
+
" div.appendChild(a);\n",
|
| 500 |
+
" a.click();\n",
|
| 501 |
+
" div.remove();\n",
|
| 502 |
+
" }\n",
|
| 503 |
+
" "
|
| 504 |
+
]
|
| 505 |
+
},
|
| 506 |
+
"metadata": {}
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"output_type": "display_data",
|
| 510 |
+
"data": {
|
| 511 |
+
"text/plain": [
|
| 512 |
+
"<IPython.core.display.Javascript object>"
|
| 513 |
+
],
|
| 514 |
+
"application/javascript": [
|
| 515 |
+
"download(\"download_ef5c634e-3ae3-4a85-a7b4-8f9422b11298\", \"misspellings.txt\", 593809553)"
|
| 516 |
+
]
|
| 517 |
+
},
|
| 518 |
+
"metadata": {}
|
| 519 |
+
}
|
| 520 |
+
]
|
| 521 |
+
}
|
| 522 |
+
]
|
| 523 |
+
}
|