File size: 22,152 Bytes
d184fb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
"""Text cleaning pipeline for preparing training data."""

import logging
import re
import unicodedata

logger = logging.getLogger(__name__)


class TextCleaner:
    """Cleans raw text for character-level language model training."""

    # Project Gutenberg header/footer patterns
    GUTENBERG_START = re.compile(
        r"\*\*\*\s*START OF (?:THE |THIS )?PROJECT GUTENBERG.*?\*\*\*",
        re.IGNORECASE,
    )
    GUTENBERG_END = re.compile(
        r"\*\*\*\s*END OF (?:THE |THIS )?PROJECT GUTENBERG.*?\*\*\*",
        re.IGNORECASE,
    )
    # Fallback for Gutenberg files that lack *** markers
    GUTENBERG_END_PLAIN = re.compile(
        r"^End of (?:the )?Project Gutenberg",
        re.IGNORECASE | re.MULTILINE,
    )

    # MIT Internet Classics Archive patterns
    MIT_HEADER = re.compile(
        r"provided by the internet classics archive\..*?-{6,}",
        re.IGNORECASE | re.DOTALL,
    )
    MIT_FOOTER = re.compile(
        r"the internet classics archive\b[^\n]*(?:web atomics)?[^\n]*",
        re.IGNORECASE,
    )
    MIT_DASH_LINE = re.compile(r"-{6,}")

    # Internet Archive patterns
    IA_HEADER = re.compile(
        r"(?:Digitized by|Book digitized by|Original from|Uploaded by)"
        r"[^\n]*",
        re.IGNORECASE,
    )
    IA_GOOGLE_MARKER = re.compile(
        r"(?:Generated (?:by|from)|Google-digitized|"
        r"This is a digital copy of a book)[^\n]*",
        re.IGNORECASE,
    )

    # Roman numeral pattern — matches standalone uppercase Roman numerals (any length; every part is optional, so it can also match the empty string)
    ROMAN_NUMERAL = re.compile(
        r"\b(M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3}))\b"
    )
    # Context words that allow single "I" to be treated as Roman numeral 1
    ROMAN_CONTEXT = re.compile(
        r"\b(?:book|chapter|prop|proposition|part|vol|volume|no|number|"
        r"section|act|scene|lib|epistle|ode|psalm|canon|lemma|corollary|"
        r"cor|def|definition|axiom|postulate)\b",
        re.IGNORECASE,
    )

    # Roman numeral value map
    ROMAN_VALUES = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}

    # Non-body section headers (for aggressive stripping)
    # NOTE: "INTRODUCTION" is deliberately excluded — it is often the author's own text
    FRONT_MATTER_HEADERS = re.compile(
        r"^\s*(?:PREFACE|FOREWORD|FORWARD|EDITOR[\u2019']?S?\s+NOTE|"
        r"TRANSLATOR[\u2019']?S?\s+NOTE|PREFATORY\s+NOTE|PRELIMINARY\s+NOTE|"
        r"BIOGRAPHICAL\s+(?:NOTE|SKETCH)|ADVERTISEMENT|DEDICAT(?:ION|ED\s+TO)|"
        r"TO\s+THE\s+READER|NOTE\s+ON\s+(?:THE\s+)?TEXT|ABOUT\s+THIS\s+EDITION|"
        r"CHRONOLOG(?:Y|ICAL))[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    BACK_MATTER_HEADERS = re.compile(
        r"^\s*(?:APPENDIX|ADDEND(?:UM|A)|INDEX|GLOSSARY|BIBLIOGRAPHY|"
        r"WORKS?\s+CITED|REFERENCES|ENDNOTES|FOOTNOTES|"
        r"ACKNOWLEDG(?:E?MENTS?)|CREDITS|COLOPHON|ERRATA|"
        r"TRANSCRIBER[\u2019']?S?\s+NOTES?|"
        r"TYPOGRAPHICAL\s+ERRORS?\s+CORRECTED|"
        r"LIST\s+OF\s+(?:ILLUSTRATIONS|FIGURES|PLATES))[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    TOC_HEADER = re.compile(
        r"^\s*(?:TABLE\s+OF\s+)?CONTENTS?[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )

    # Production/publisher patterns (for front matter cleanup)
    PRODUCTION_PATTERNS = [
        re.compile(p, re.IGNORECASE) for p in [
            r"(?:produced|prepared|transcribed|digitized|scanned)\s+(?:by|for|at)",
            r"production\s+note",
            r"transcriber[\u2019']?s?\s+note",
            r"scanner[\u2019']?s?\s+note",
            r"cornell\s+university\s+library",
            r"(?:published|printed)\s+(?:by|for|at|in)",
            r"(?:first|second|third|\d+(?:st|nd|rd|th))\s+edition",
            r"price\s+\w+[sd]\.",
            r"(?:cloth|paper|hardcover|paperback|octavo|quarto)",
            r"\bisbn\b",
            r"all\s+rights?\s+reserved",
            r"(?:copyright|copr\.?)\s*(?:\(c\)|\xa9|\d)",
            r"press\s+of\b",
            r"university\s+press",
        ]
    ]

    # Transcriber correction notes (back matter)
    TRANSCRIBER_CORRECTION = re.compile(
        r"^p\.\s*(?:\d+|\?\??|\.)\s*[.,]?\s*(?:sqq\.|in\s|the\s|as\s|heading|"
        r"reference|prop|from\s|then\s|these\s|def\.|"
        r"twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|"
        r"one\s|two\s|three|four|five|six\s|seven|eight|nine|"
        # Match quoted corrections
        r'["\u201c])',
        re.IGNORECASE,
    )

    # Separator/decoration lines
    SEPARATOR_LINE = re.compile(r"^[\s.*_=~\-#]+$")

    # Number words for 0-19 and tens
    ONES = [
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    TENS = [
        "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
        "eighty", "ninety",
    ]

    def __init__(self, config: dict):
        self.lowercase = config.get("lowercase", True)
        self.strip_gutenberg = config.get("strip_gutenberg", True)
        self.strip_mit_classics = config.get("strip_mit_classics", True)
        self.strip_internet_archive = config.get("strip_internet_archive", True)
        self.normalize_unicode = config.get("normalize_unicode", True)
        self.convert_numerals = config.get("convert_numerals", False)
        self.convert_roman_numerals = config.get("convert_roman_numerals", False)
        self.strip_non_body = config.get("strip_non_body", True)
        self.min_line_length = config.get("min_line_length", 20)
        self.remove_urls = config.get("remove_urls", True)
        self.collapse_whitespace = config.get("collapse_whitespace", True)
        self.allowed_chars = config.get("allowed_chars", r"a-z0-9 .,;:!?'\"\-\(\)")

    def clean(self, text: str) -> str:
        """Run all cleaning stages on the input text."""
        if not text.strip():
            return ""

        # Stage 1: Strip source-specific boilerplate
        if self.strip_gutenberg:
            text = self._strip_gutenberg(text)

        if self.strip_mit_classics:
            text = self._strip_mit_classics(text)

        if self.strip_internet_archive:
            text = self._strip_internet_archive(text)

        # Stage 2: Strip non-body content (before any text transforms)
        if self.strip_non_body:
            text = self._strip_non_body(text)

        # Stage 3: Normalize unicode
        if self.normalize_unicode:
            text = self._normalize_unicode(text)

        if self.remove_urls:
            text = self._remove_urls(text)

        # Stage 4: Convert Roman numerals (BEFORE lowercase β€” needs uppercase)
        if self.convert_roman_numerals:
            text = self._convert_roman_numerals(text)

        # Stage 5: Lowercase
        if self.lowercase:
            text = text.lower()

        # Stage 6: Convert Arabic numerals
        if self.convert_numerals:
            text = self._convert_numerals(text)

        # Stage 7: Character filtering
        text = self._clean_chars(text)

        # Stage 8: Collapse whitespace
        if self.collapse_whitespace:
            text = self._collapse_whitespace(text)

        return text.strip()

    # ------------------------------------------------------------------
    # Source boilerplate stripping
    # ------------------------------------------------------------------

    def _strip_gutenberg(self, text: str) -> str:
        """Remove Project Gutenberg headers and footers."""
        # Strip footer first (before positions shift)
        end_match = self.GUTENBERG_END.search(text)
        if not end_match:
            end_match = self.GUTENBERG_END_PLAIN.search(text)
        if end_match:
            text = text[:end_match.start()]

        # Strip header
        start_match = self.GUTENBERG_START.search(text)
        if start_match:
            text = text[start_match.end():]

        # Also strip common Gutenberg preamble lines
        lines = text.split("\n")
        cleaned = []
        skip = True if start_match is None else False
        for line in lines:
            stripped = line.strip()
            if skip and stripped.startswith(("Title:", "Author:", "Release Date:",
                                            "Language:", "Character set",
                                            "Produced by", "Updated editions")):
                continue
            if skip and not stripped:
                continue
            skip = False
            cleaned.append(line)

        return "\n".join(cleaned)

    def _strip_mit_classics(self, text: str) -> str:
        """Remove MIT Internet Classics Archive headers, footers, and section dividers."""
        text = self.MIT_HEADER.sub("", text)
        text = self.MIT_FOOTER.sub("", text)
        text = self.MIT_DASH_LINE.sub("", text)
        return text

    def _strip_internet_archive(self, text: str) -> str:
        """Remove Internet Archive / Google Books digitization boilerplate."""
        text = self.IA_HEADER.sub("", text)
        text = self.IA_GOOGLE_MARKER.sub("", text)
        return text

    # ------------------------------------------------------------------
    # Non-body content stripping (aggressive mode)
    # ------------------------------------------------------------------

    def _strip_non_body(self, text: str) -> str:
        """Remove front matter, back matter, and inline non-body content."""
        text = self._strip_front_matter(text)
        text = self._strip_back_matter(text)
        text = self._strip_inline_non_body(text)
        return text

    def _strip_front_matter(self, text: str) -> str:
        """Strip front matter: production notes, TOC, preface, etc.

        Order: (1) strip named sections by header, (2) skip remaining
        non-body paragraphs at the top.
        """
        # Pass 1: Remove named sections that have clear headers
        text = self._strip_section(text, self.FRONT_MATTER_HEADERS)
        text = self._strip_section(text, self.TOC_HEADER)

        # Pass 2: Skip non-body paragraphs at the beginning.
        # Body prose = substantial paragraph (>150 chars) with full sentences
        # that does NOT match production/publisher patterns.
        lines = text.split("\n")
        start_idx = 0
        i = 0
        while i < len(lines):
            # Collect next paragraph
            while i < len(lines) and not lines[i].strip():
                i += 1
            para_start = i
            para_lines = []
            while i < len(lines) and lines[i].strip():
                para_lines.append(lines[i].strip())
                i += 1

            if not para_lines:
                continue

            para_text = " ".join(para_lines)

            has_sentences = bool(re.search(r"\.\s+[A-Z]", para_text))
            is_substantial = len(para_text) > 150
            is_production = self._is_production_line(para_text)

            # Title pages / heading blocks: mostly uppercase letters
            alpha_chars = [c for c in para_text if c.isalpha()]
            is_mostly_uppercase = (
                alpha_chars
                and sum(1 for c in alpha_chars if c.isupper()) / len(alpha_chars) > 0.5
            )

            # Short average line length suggests a title/heading block
            avg_line_len = sum(len(l) for l in para_lines) / len(para_lines)
            is_short_lines = avg_line_len < 50

            if (is_substantial and has_sentences
                    and not is_production
                    and not is_mostly_uppercase
                    and not is_short_lines):
                start_idx = para_start
                break

            # Not body yet β€” skip it
            start_idx = i

        return "\n".join(lines[start_idx:])

    def _strip_back_matter(self, text: str) -> str:
        """Strip back matter: appendixes, index, transcriber notes, etc."""
        lines = text.split("\n")

        # Find the first back-matter header and truncate there
        first_back_idx = None
        for i, line in enumerate(lines):
            stripped = line.strip()
            if self.BACK_MATTER_HEADERS.match(stripped):
                first_back_idx = i
                break
            # Also detect "Typographical Errors corrected..." as back matter start
            if re.match(r"Typographical\s+Errors?\b", stripped, re.IGNORECASE):
                first_back_idx = i
                break

        if first_back_idx is not None:
            lines = lines[:first_back_idx]

        # Strip trailing transcriber correction notes (working backward)
        while lines:
            stripped = lines[-1].strip()
            if not stripped:
                lines.pop()
                continue
            if self.TRANSCRIBER_CORRECTION.match(stripped):
                lines.pop()
                continue
            if self._is_production_line(stripped):
                lines.pop()
                continue
            break

        return "\n".join(lines)

    def _strip_inline_non_body(self, text: str) -> str:
        """Strip inline non-body markers: separator lines, all-caps headings."""
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            stripped = line.strip()

            # Remove separator/decoration lines
            if stripped and self.SEPARATOR_LINE.match(stripped):
                continue

            # Remove short ALL-CAPS lines (likely section headings)
            if stripped and len(stripped) < 80 and stripped == stripped.upper() and stripped.isalpha():
                continue

            cleaned.append(line)

        return "\n".join(cleaned)

    def _strip_section(self, text: str, header_pattern: re.Pattern) -> str:
        """Remove a section identified by header_pattern until next section boundary."""
        lines = text.split("\n")
        result = []
        skipping = False

        for i, line in enumerate(lines):
            stripped = line.strip()

            if header_pattern.match(stripped):
                skipping = True
                continue

            if skipping:
                # Stop skipping at next section boundary:
                # A substantial non-empty line after a blank line, OR
                # A line that looks like a real body section start
                is_blank = not stripped
                if not is_blank and self._is_section_boundary(stripped, lines, i):
                    skipping = False
                    result.append(line)
                continue

            result.append(line)

        return "\n".join(result)

    def _is_section_boundary(self, stripped: str, lines: list[str], idx: int) -> bool:
        """Detect if a line marks the beginning of a new major section.

        Only returns True for explicit section headers/markers, NOT for
        long body-text lines (which can appear inside prefaces/forewords).
        """
        # Body-start keywords (these signal real content resuming)
        if re.match(
            r"(?:Book|Chapter|Part|Section|Proposition|Theorem|Definition|"
            r"Axiom|Postulate|Introduction|Definitions|Lemma|Corollary|"
            r"Contents?)\b",
            stripped, re.IGNORECASE,
        ):
            return True

        # Another named section header (front or back matter)
        if self.FRONT_MATTER_HEADERS.match(stripped):
            return True
        if self.BACK_MATTER_HEADERS.match(stripped):
            return True
        if self.TOC_HEADER.match(stripped):
            return True

        return False

    def _is_production_line(self, line: str) -> bool:
        """Check if a line is production/publisher metadata."""
        for pattern in self.PRODUCTION_PATTERNS:
            if pattern.search(line):
                return True
        return False

    # ------------------------------------------------------------------
    # Unicode normalization
    # ------------------------------------------------------------------

    def _normalize_unicode(self, text: str) -> str:
        """Normalize unicode characters to their closest ASCII equivalents."""
        text = unicodedata.normalize("NFKD", text)
        replacements = {
            "\u2018": "'", "\u2019": "'",  # smart quotes
            "\u201c": '"', "\u201d": '"',
            "\u2013": "-", "\u2014": "-",  # en/em dash
            "\u2026": "...",               # ellipsis
            "\u00a0": " ",                 # non-breaking space
            "\u00b6": "",                  # pilcrow
            "\u00a7": "",                  # section sign
        }
        for old, new in replacements.items():
            text = text.replace(old, new)

        # Strip remaining non-ASCII
        text = text.encode("ascii", errors="ignore").decode("ascii")
        return text

    def _remove_urls(self, text: str) -> str:
        """Remove URLs and email addresses."""
        text = re.sub(r"https?://\S+", "", text)
        text = re.sub(r"www\.\S+", "", text)
        text = re.sub(r"\S+@\S+\.\S+", "", text)
        return text

    # ------------------------------------------------------------------
    # Roman numeral conversion
    # ------------------------------------------------------------------

    def _roman_to_int(self, s: str) -> int:
        """Convert a Roman numeral string to an integer."""
        result = 0
        prev = 0
        for char in reversed(s.upper()):
            val = self.ROMAN_VALUES.get(char, 0)
            if val < prev:
                result -= val
            else:
                result += val
            prev = val
        return result

    def _is_valid_roman(self, s: str) -> bool:
        """Check if a string is a valid Roman numeral (not just random letters)."""
        if not s:
            return False
        # Must only contain valid Roman numeral characters
        if not all(c in "IVXLCDM" for c in s.upper()):
            return False
        # Must convert to a positive number
        val = self._roman_to_int(s)
        return val > 0

    def _convert_roman_numerals(self, text: str) -> str:
        """Convert Roman numerals to English words.

        Handles multi-character Roman numerals (II, IV, XIV, etc.) directly.
        Single 'I' is only converted when preceded by a context word.
        """
        def replace_roman(m):
            numeral = m.group(1)
            # Skip single-char matches that aren't clearly Roman numerals
            if len(numeral) == 1:
                # Single 'I' β€” only convert after context words
                if numeral.upper() == "I":
                    # Check the text before this match for context words
                    before = text[max(0, m.start() - 30):m.start()]
                    if not self.ROMAN_CONTEXT.search(before):
                        return m.group(0)
                else:
                    # Single V, X, L, C, D, M β€” convert them
                    pass

            if not self._is_valid_roman(numeral):
                return m.group(0)

            val = self._roman_to_int(numeral)
            return self._number_to_words(val)

        return self.ROMAN_NUMERAL.sub(replace_roman, text)

    # ------------------------------------------------------------------
    # Arabic numeral conversion
    # ------------------------------------------------------------------

    def _number_to_words(self, n: int) -> str:
        """Convert an integer to English words."""
        if n < 0:
            return "negative " + self._number_to_words(-n)
        if n == 0:
            return self.ONES[0]
        if n < 20:
            return self.ONES[n]
        if n < 100:
            tens, ones = divmod(n, 10)
            return self.TENS[tens] + (" " + self.ONES[ones] if ones else "")
        if n < 1000:
            hundreds, remainder = divmod(n, 100)
            result = self.ONES[hundreds] + " hundred"
            if remainder:
                result += " " + self._number_to_words(remainder)
            return result
        if n < 1000000:
            thousands, remainder = divmod(n, 1000)
            result = self._number_to_words(thousands) + " thousand"
            if remainder:
                result += " " + self._number_to_words(remainder)
            return result
        return str(n)

    def _convert_numerals(self, text: str) -> str:
        """Replace standalone digit sequences with their English word equivalents.

        Only converts digit groups that are standalone words (surrounded by
        whitespace or punctuation), preventing garbled output from codes
        like Z39.48-1984.
        """
        def replace_match(m):
            # Ensure digits are not part of a larger alphanumeric token
            start, end = m.start(), m.end()
            if start > 0 and text[start - 1].isalnum():
                return m.group()
            if end < len(text) and text[end].isalnum():
                return m.group()
            try:
                n = int(m.group())
                if n < 1000000:
                    return self._number_to_words(n)
            except ValueError:
                pass
            return m.group()
        return re.sub(r"\d+", replace_match, text)

    # ------------------------------------------------------------------
    # Character filtering and whitespace
    # ------------------------------------------------------------------

    def _clean_chars(self, text: str) -> str:
        """Remove characters not in the allowed set."""
        pattern = f"[^{self.allowed_chars}\n]"
        text = re.sub(pattern, " ", text)
        # Remove lines that are only dots and/or spaces (separator lines)
        text = re.sub(r"^[. ]+$", "", text, flags=re.MULTILINE)
        return text

    def _collapse_whitespace(self, text: str) -> str:
        """Collapse multiple spaces/newlines into single spaces."""
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r" *\n *", "\n", text)
        return text