File size: 9,618 Bytes
2b97944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""

=============================================================================

  FULL PERMUTATION MISSPELLINGS GENERATOR  (Google Colab Edition)

=============================================================================



Purpose:

  Generate ALL possible letter permutations of each word from words.txt

  and write them as misspelling=correction pairs.



  WARNING — READ BEFORE RUNNING

  This is computationally EXTREME. A single 10-letter word has 3,628,800

  permutations. A 12-letter word has 479,001,600. For 466k words, the full

  output could be PETABYTES. You WILL need to limit word length.



=============================================================================

  HOW TO USE ON GOOGLE COLAB

=============================================================================



1. Open Google Colab  →  https://colab.research.google.com

2. Create a new notebook (Python 3)



3. Upload your words.txt:

   ─────────────────────────────────────

   # CELL 1: Upload words.txt

   from google.colab import files

   uploaded = files.upload()     # click "Choose Files" → select words.txt

   ─────────────────────────────────────



4. Copy-paste this ENTIRE script into a new cell and run it.



5. Download the result:

   ─────────────────────────────────────

   # CELL 3: Download the output

   files.download('misspellings_permutations.txt')

   ─────────────────────────────────────



=============================================================================

  OR: Use Google Drive for large files

=============================================================================



   # Mount Google Drive (you get 15 GB free)

   from google.colab import drive

   drive.mount('/content/drive')



   # Then set OUTPUT_PATH below to:

   OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'



=============================================================================

  CONFIGURATION β€” Adjust these before running!

=============================================================================

"""

import math
import os
import sys
import time
from collections import Counter
from itertools import permutations

# ── CONFIGURATION ───────────────────────────────────────────────────────────

WORDS_PATH   = 'words.txt'                          # path to your words.txt
OUTPUT_PATH  = 'misspellings_permutations.txt'       # output file path

MIN_WORD_LEN = 3     # skip words shorter than this
MAX_WORD_LEN = 7     # CRITICAL: max word length to permute
                      # 7  → max 5,040 perms/word   (manageable)
                      # 8  → max 40,320 perms/word  (large)
                      # 9  → max 362,880 perms/word (very large)
                      # 10 → max 3,628,800 perms/word (EXTREME)
                      # Increase at your own risk!

ONLY_ALPHA   = True   # only process pure-alphabetical words
BATCH_LOG    = 5000   # print progress every N words

# ── ESTIMATION TABLE ────────────────────────────────────────────────────────
# Here's roughly how big the output gets at each MAX_WORD_LEN setting,
# assuming ~200k qualifying words at each length bracket:
#
# MAX_WORD_LEN │ Perms per word (worst) │ Rough output size
# ─────────────┼────────────────────────┼──────────────────
#      5       │          120           │   ~200 MB
#      6       │          720           │   ~1-2 GB
#      7       │        5,040           │   ~5-15 GB
#      8       │       40,320           │   ~50-150 GB
#      9       │      362,880           │   ~500 GB - 1 TB
#     10       │    3,628,800           │   ~5-50 TB  ← won't fit anywhere
#
# Google Colab free tier gives you:
#   • ~78 GB disk on the VM (temporary, lost on disconnect)
#   • 15 GB Google Drive (persistent)
#   • Colab Pro: 225 GB disk, longer runtimes
#
# RECOMMENDATION: Start with MAX_WORD_LEN = 6 or 7, see the size,
# then increase if you have space.
# ────────────────────────────────────────────────────────────────────────────


def estimate_output(words):
    """Estimate total permutations and file size before generating.

    Args:
        words: iterable of candidate words (already filtered by length).

    Returns:
        (total_perms, est_gb): the total number of unique misspellings
        (each word's distinct letter permutations, minus the correct
        spelling) and the estimated output size in gigabytes.
    """
    total_perms = 0
    for w in words:
        # Count on the lowered word, since generate_unique_permutations
        # permutes w.lower() (lower() can change length for some
        # non-ASCII letters, so len(w) would be the wrong n there).
        lw = w.lower()
        # Multiset permutation count: n! / (c1! * c2! * ...) accounts for
        # repeated letters producing identical strings.
        unique_perms = math.factorial(len(lw))
        for count in Counter(lw).values():
            unique_perms //= math.factorial(count)
        total_perms += unique_perms - 1  # subtract the original word

    # Estimate ~15 bytes per line (avg)  →  "typo=word\n"
    avg_bytes_per_line = 15
    est_bytes = total_perms * avg_bytes_per_line
    est_gb = est_bytes / (1024 ** 3)

    return total_perms, est_gb


def generate_unique_permutations(word):
    """Return every distinct rearrangement of *word*'s letters.

    The word is lowercased before permuting, and the correct spelling is
    removed from the result. Collecting the joined permutations into a set
    deduplicates rearrangements made identical by repeated letters.
    """
    base = word.lower()
    rearrangements = {''.join(letters) for letters in permutations(base)}
    rearrangements.discard(base)  # never emit the correct spelling as a typo
    return rearrangements


def is_pure_alpha(word):
    """Return True when *word* is non-empty and entirely alphabetic."""
    return bool(word) and all(ch.isalpha() for ch in word)


def _read_words(path):
    """Return stripped, non-blank lines from the word-list file at *path*."""
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return [line.strip() for line in f if line.strip()]


def _filter_words(raw_words):
    """Apply the ONLY_ALPHA and MIN/MAX_WORD_LEN config filters."""
    words = []
    for w in raw_words:
        if ONLY_ALPHA and not is_pure_alpha(w):
            continue
        if len(w) < MIN_WORD_LEN or len(w) > MAX_WORD_LEN:
            continue
        words.append(w)
    return words


def _generate_file(words):
    """Write every misspelling=correction pair to OUTPUT_PATH.

    Logs progress every BATCH_LOG words. Returns
    (total_written, elapsed_seconds).
    """
    start = time.time()
    total_written = 0

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")

        for idx, word in enumerate(words):
            perms = generate_unique_permutations(word)

            # Sorted so output is deterministic/reproducible across runs.
            for typo in sorted(perms):
                out.write(f"{typo}={word}\n")
                total_written += 1

            if (idx + 1) % BATCH_LOG == 0:
                elapsed = time.time() - start
                pct = (idx + 1) / len(words) * 100
                rate = (idx + 1) / elapsed if elapsed > 0 else 0
                # Flush buffered writes so getsize reflects real progress.
                out.flush()
                cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
                print(f"  [{pct:5.1f}%]  {idx+1:>7,}/{len(words):,} words  |"
                      f"  {total_written:>12,} lines  |  {cur_size:.2f} GB  |"
                      f"  {rate:.0f} words/sec")

    return total_written, time.time() - start


def main():
    """Drive the pipeline: read, filter, estimate, generate, report.

    Exits with status 1 when the word list is missing, no words pass the
    filter, or the estimated output would not fit on Colab's disk.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)

    # ── Read words ──────────────────────────────────────────────
    print(f"Reading words from: {WORDS_PATH}")
    raw_words = _read_words(WORDS_PATH)
    print(f"Total raw entries: {len(raw_words):,}")

    words = _filter_words(raw_words)
    print(f"Filtered to {len(words):,} words (alpha-only, len {MIN_WORD_LEN}-{MAX_WORD_LEN})")

    if len(words) == 0:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)

    # ── Estimate ────────────────────────────────────────────────
    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f"  Estimated permutations : {total_perms:,}")
    print(f"  Estimated file size    : {est_gb:.2f} GB")

    # Safety check: refuse to start if the output cannot fit on disk.
    if est_gb > 70:
        print(f"\n  WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print("  Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print("  Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)

    print(f"\nProceeding with generation → {OUTPUT_PATH}")
    print("=" * 60)

    # ── Generate ────────────────────────────────────────────────
    total_written, elapsed = _generate_file(words)
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)

    print()
    print("=" * 60)
    print(f"  DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f"  Words processed  : {len(words):,}")
    print(f"  Lines written    : {total_written:,}")
    print(f"  Output file      : {OUTPUT_PATH}")
    print(f"  File size        : {final_size:.2f} GB")
    print("=" * 60)


if __name__ == '__main__':
    main()