LisaMegaWatts committed on
Commit
d184fb7
·
verified ·
1 Parent(s): d97776b

Upload cleaner.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. cleaner.py +581 -0
cleaner.py ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text cleaning pipeline for preparing training data."""
2
+
3
+ import logging
4
+ import re
5
+ import unicodedata
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class TextCleaner:
    """Cleans raw text for character-level language model training."""

    # Project Gutenberg header/footer patterns
    # NOTE: ".*?" does not cross newlines (no DOTALL), so each marker must
    # sit on a single line to match.
    GUTENBERG_START = re.compile(
        r"\*\*\*\s*START OF (?:THE |THIS )?PROJECT GUTENBERG.*?\*\*\*",
        re.IGNORECASE,
    )
    GUTENBERG_END = re.compile(
        r"\*\*\*\s*END OF (?:THE |THIS )?PROJECT GUTENBERG.*?\*\*\*",
        re.IGNORECASE,
    )
    # Fallback for Gutenberg files that lack *** markers
    GUTENBERG_END_PLAIN = re.compile(
        r"^End of (?:the )?Project Gutenberg",
        re.IGNORECASE | re.MULTILINE,
    )

    # MIT Internet Classics Archive patterns
    # DOTALL lets the header span from the credit line through the first
    # run of 6+ dashes, however many lines that covers.
    MIT_HEADER = re.compile(
        r"provided by the internet classics archive\..*?-{6,}",
        re.IGNORECASE | re.DOTALL,
    )
    # NOTE(review): "(?:web atomics)?" is optional and sits between two
    # "[^\n]*" wildcards, so it never constrains the match — the pattern
    # effectively removes the rest of the line after the archive name.
    MIT_FOOTER = re.compile(
        r"the internet classics archive\b[^\n]*(?:web atomics)?[^\n]*",
        re.IGNORECASE,
    )
    MIT_DASH_LINE = re.compile(r"-{6,}")

    # Internet Archive patterns (credit lines are removed to end-of-line)
    IA_HEADER = re.compile(
        r"(?:Digitized by|Book digitized by|Original from|Uploaded by)"
        r"[^\n]*",
        re.IGNORECASE,
    )
    IA_GOOGLE_MARKER = re.compile(
        r"(?:Generated (?:by|from)|Google-digitized|"
        r"This is a digital copy of a book)[^\n]*",
        re.IGNORECASE,
    )

    # Roman numeral pattern — matches standalone uppercase Roman numerals (2+ chars)
    # NOTE(review): every sub-group is optional, so the group can also match
    # an empty string at a word boundary; downstream code rejects those via
    # _is_valid_roman, leaving the text unchanged.
    ROMAN_NUMERAL = re.compile(
        r"\b(M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3}))\b"
    )
    # Context words that allow single "I" to be treated as Roman numeral 1
    ROMAN_CONTEXT = re.compile(
        r"\b(?:book|chapter|prop|proposition|part|vol|volume|no|number|"
        r"section|act|scene|lib|epistle|ode|psalm|canon|lemma|corollary|"
        r"cor|def|definition|axiom|postulate)\b",
        re.IGNORECASE,
    )

    # Roman numeral value map (single-symbol values; subtractive pairs are
    # handled by _roman_to_int)
    ROMAN_VALUES = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}

    # Non-body section headers (for aggressive stripping)
    # NOTE: "INTRODUCTION" is deliberately excluded — it is often the author's own text
    FRONT_MATTER_HEADERS = re.compile(
        r"^\s*(?:PREFACE|FOREWORD|FORWARD|EDITOR[\u2019']?S?\s+NOTE|"
        r"TRANSLATOR[\u2019']?S?\s+NOTE|PREFATORY\s+NOTE|PRELIMINARY\s+NOTE|"
        r"BIOGRAPHICAL\s+(?:NOTE|SKETCH)|ADVERTISEMENT|DEDICAT(?:ION|ED\s+TO)|"
        r"TO\s+THE\s+READER|NOTE\s+ON\s+(?:THE\s+)?TEXT|ABOUT\s+THIS\s+EDITION|"
        r"CHRONOLOG(?:Y|ICAL))[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    BACK_MATTER_HEADERS = re.compile(
        r"^\s*(?:APPENDIX|ADDEND(?:UM|A)|INDEX|GLOSSARY|BIBLIOGRAPHY|"
        r"WORKS?\s+CITED|REFERENCES|ENDNOTES|FOOTNOTES|"
        r"ACKNOWLEDG(?:E?MENTS?)|CREDITS|COLOPHON|ERRATA|"
        r"TRANSCRIBER[\u2019']?S?\s+NOTES?|"
        r"TYPOGRAPHICAL\s+ERRORS?\s+CORRECTED|"
        r"LIST\s+OF\s+(?:ILLUSTRATIONS|FIGURES|PLATES))[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    TOC_HEADER = re.compile(
        r"^\s*(?:TABLE\s+OF\s+)?CONTENTS?[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )

    # Production/publisher patterns (for front matter cleanup)
    # NOTE(review): alternatives like "paper" have no word boundaries, so
    # they also match inside larger words (e.g. "papers") — confirm intended.
    PRODUCTION_PATTERNS = [
        re.compile(p, re.IGNORECASE) for p in [
            r"(?:produced|prepared|transcribed|digitized|scanned)\s+(?:by|for|at)",
            r"production\s+note",
            r"transcriber[\u2019']?s?\s+note",
            r"scanner[\u2019']?s?\s+note",
            r"cornell\s+university\s+library",
            r"(?:published|printed)\s+(?:by|for|at|in)",
            r"(?:first|second|third|\d+(?:st|nd|rd|th))\s+edition",
            r"price\s+\w+[sd]\.",
            r"(?:cloth|paper|hardcover|paperback|octavo|quarto)",
            r"\bisbn\b",
            r"all\s+rights?\s+reserved",
            r"(?:copyright|copr\.?)\s*(?:\(c\)|\xa9|\d)",
            r"press\s+of\b",
            r"university\s+press",
        ]
    ]

    # Transcriber correction notes (back matter), e.g. "p. 12, the ..."
    TRANSCRIBER_CORRECTION = re.compile(
        r"^p\.\s*(?:\d+|\?\??|\.)\s*[.,]?\s*(?:sqq\.|in\s|the\s|as\s|heading|"
        r"reference|prop|from\s|then\s|these\s|def\.|"
        r"twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|"
        r"one\s|two\s|three|four|five|six\s|seven|eight|nine|"
        # Match quoted corrections
        r'["\u201c])',
        re.IGNORECASE,
    )

    # Separator/decoration lines (whole line of punctuation/whitespace)
    SEPARATOR_LINE = re.compile(r"^[\s.*_=~\-#]+$")

    # Number words for 0-19 and tens (used by _number_to_words)
    ONES = [
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    TENS = [
        "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
        "eighty", "ninety",
    ]
134
+
135
+ def __init__(self, config: dict):
136
+ self.lowercase = config.get("lowercase", True)
137
+ self.strip_gutenberg = config.get("strip_gutenberg", True)
138
+ self.strip_mit_classics = config.get("strip_mit_classics", True)
139
+ self.strip_internet_archive = config.get("strip_internet_archive", True)
140
+ self.normalize_unicode = config.get("normalize_unicode", True)
141
+ self.convert_numerals = config.get("convert_numerals", False)
142
+ self.convert_roman_numerals = config.get("convert_roman_numerals", False)
143
+ self.strip_non_body = config.get("strip_non_body", True)
144
+ self.min_line_length = config.get("min_line_length", 20)
145
+ self.remove_urls = config.get("remove_urls", True)
146
+ self.collapse_whitespace = config.get("collapse_whitespace", True)
147
+ self.allowed_chars = config.get("allowed_chars", r"a-z0-9 .,;:!?'\"\-\(\)")
148
+
149
+ def clean(self, text: str) -> str:
150
+ """Run all cleaning stages on the input text."""
151
+ if not text.strip():
152
+ return ""
153
+
154
+ # Stage 1: Strip source-specific boilerplate
155
+ if self.strip_gutenberg:
156
+ text = self._strip_gutenberg(text)
157
+
158
+ if self.strip_mit_classics:
159
+ text = self._strip_mit_classics(text)
160
+
161
+ if self.strip_internet_archive:
162
+ text = self._strip_internet_archive(text)
163
+
164
+ # Stage 2: Strip non-body content (before any text transforms)
165
+ if self.strip_non_body:
166
+ text = self._strip_non_body(text)
167
+
168
+ # Stage 3: Normalize unicode
169
+ if self.normalize_unicode:
170
+ text = self._normalize_unicode(text)
171
+
172
+ if self.remove_urls:
173
+ text = self._remove_urls(text)
174
+
175
+ # Stage 4: Convert Roman numerals (BEFORE lowercase — needs uppercase)
176
+ if self.convert_roman_numerals:
177
+ text = self._convert_roman_numerals(text)
178
+
179
+ # Stage 5: Lowercase
180
+ if self.lowercase:
181
+ text = text.lower()
182
+
183
+ # Stage 6: Convert Arabic numerals
184
+ if self.convert_numerals:
185
+ text = self._convert_numerals(text)
186
+
187
+ # Stage 7: Character filtering
188
+ text = self._clean_chars(text)
189
+
190
+ # Stage 8: Collapse whitespace
191
+ if self.collapse_whitespace:
192
+ text = self._collapse_whitespace(text)
193
+
194
+ return text.strip()
195
+
196
+ # ------------------------------------------------------------------
197
+ # Source boilerplate stripping
198
+ # ------------------------------------------------------------------
199
+
200
+ def _strip_gutenberg(self, text: str) -> str:
201
+ """Remove Project Gutenberg headers and footers."""
202
+ # Strip footer first (before positions shift)
203
+ end_match = self.GUTENBERG_END.search(text)
204
+ if not end_match:
205
+ end_match = self.GUTENBERG_END_PLAIN.search(text)
206
+ if end_match:
207
+ text = text[:end_match.start()]
208
+
209
+ # Strip header
210
+ start_match = self.GUTENBERG_START.search(text)
211
+ if start_match:
212
+ text = text[start_match.end():]
213
+
214
+ # Also strip common Gutenberg preamble lines
215
+ lines = text.split("\n")
216
+ cleaned = []
217
+ skip = True if start_match is None else False
218
+ for line in lines:
219
+ stripped = line.strip()
220
+ if skip and stripped.startswith(("Title:", "Author:", "Release Date:",
221
+ "Language:", "Character set",
222
+ "Produced by", "Updated editions")):
223
+ continue
224
+ if skip and not stripped:
225
+ continue
226
+ skip = False
227
+ cleaned.append(line)
228
+
229
+ return "\n".join(cleaned)
230
+
231
+ def _strip_mit_classics(self, text: str) -> str:
232
+ """Remove MIT Internet Classics Archive headers, footers, and section dividers."""
233
+ text = self.MIT_HEADER.sub("", text)
234
+ text = self.MIT_FOOTER.sub("", text)
235
+ text = self.MIT_DASH_LINE.sub("", text)
236
+ return text
237
+
238
+ def _strip_internet_archive(self, text: str) -> str:
239
+ """Remove Internet Archive / Google Books digitization boilerplate."""
240
+ text = self.IA_HEADER.sub("", text)
241
+ text = self.IA_GOOGLE_MARKER.sub("", text)
242
+ return text
243
+
244
+ # ------------------------------------------------------------------
245
+ # Non-body content stripping (aggressive mode)
246
+ # ------------------------------------------------------------------
247
+
248
+ def _strip_non_body(self, text: str) -> str:
249
+ """Remove front matter, back matter, and inline non-body content."""
250
+ text = self._strip_front_matter(text)
251
+ text = self._strip_back_matter(text)
252
+ text = self._strip_inline_non_body(text)
253
+ return text
254
+
255
    def _strip_front_matter(self, text: str) -> str:
        """Strip front matter: production notes, TOC, preface, etc.

        Order: (1) strip named sections by header, (2) skip remaining
        non-body paragraphs at the top.
        """
        # Pass 1: Remove named sections that have clear headers
        text = self._strip_section(text, self.FRONT_MATTER_HEADERS)
        text = self._strip_section(text, self.TOC_HEADER)

        # Pass 2: Skip non-body paragraphs at the beginning.
        # Body prose = substantial paragraph (>150 chars) with full sentences
        # that does NOT match production/publisher patterns.
        lines = text.split("\n")
        start_idx = 0
        i = 0
        while i < len(lines):
            # Collect next paragraph: skip blank lines, then gather the
            # following run of non-blank lines (stripped).
            while i < len(lines) and not lines[i].strip():
                i += 1
            para_start = i
            para_lines = []
            while i < len(lines) and lines[i].strip():
                para_lines.append(lines[i].strip())
                i += 1

            # Only possible when i reached end-of-input, so the outer loop
            # condition terminates on the next check (no infinite loop).
            if not para_lines:
                continue

            para_text = " ".join(para_lines)

            # "sentence" heuristic: a period followed by a capitalized word
            has_sentences = bool(re.search(r"\.\s+[A-Z]", para_text))
            is_substantial = len(para_text) > 150
            is_production = self._is_production_line(para_text)

            # Title pages / heading blocks: mostly uppercase letters
            alpha_chars = [c for c in para_text if c.isalpha()]
            is_mostly_uppercase = (
                alpha_chars
                and sum(1 for c in alpha_chars if c.isupper()) / len(alpha_chars) > 0.5
            )

            # Short average line length suggests a title/heading block
            avg_line_len = sum(len(l) for l in para_lines) / len(para_lines)
            is_short_lines = avg_line_len < 50

            # First paragraph that passes every heuristic marks the start
            # of the body; keep from its first line onward.
            if (is_substantial and has_sentences
                    and not is_production
                    and not is_mostly_uppercase
                    and not is_short_lines):
                start_idx = para_start
                break

            # Not body yet — skip it
            start_idx = i

        # NOTE(review): when no paragraph qualifies as body, start_idx ends
        # at len(lines) and the entire text is dropped — confirm this
        # aggressive behavior is intended for all-front-matter inputs.
        return "\n".join(lines[start_idx:])
312
+
313
+ def _strip_back_matter(self, text: str) -> str:
314
+ """Strip back matter: appendixes, index, transcriber notes, etc."""
315
+ lines = text.split("\n")
316
+
317
+ # Find the first back-matter header and truncate there
318
+ first_back_idx = None
319
+ for i, line in enumerate(lines):
320
+ stripped = line.strip()
321
+ if self.BACK_MATTER_HEADERS.match(stripped):
322
+ first_back_idx = i
323
+ break
324
+ # Also detect "Typographical Errors corrected..." as back matter start
325
+ if re.match(r"Typographical\s+Errors?\b", stripped, re.IGNORECASE):
326
+ first_back_idx = i
327
+ break
328
+
329
+ if first_back_idx is not None:
330
+ lines = lines[:first_back_idx]
331
+
332
+ # Strip trailing transcriber correction notes (working backward)
333
+ while lines:
334
+ stripped = lines[-1].strip()
335
+ if not stripped:
336
+ lines.pop()
337
+ continue
338
+ if self.TRANSCRIBER_CORRECTION.match(stripped):
339
+ lines.pop()
340
+ continue
341
+ if self._is_production_line(stripped):
342
+ lines.pop()
343
+ continue
344
+ break
345
+
346
+ return "\n".join(lines)
347
+
348
+ def _strip_inline_non_body(self, text: str) -> str:
349
+ """Strip inline non-body markers: separator lines, all-caps headings."""
350
+ lines = text.split("\n")
351
+ cleaned = []
352
+ for line in lines:
353
+ stripped = line.strip()
354
+
355
+ # Remove separator/decoration lines
356
+ if stripped and self.SEPARATOR_LINE.match(stripped):
357
+ continue
358
+
359
+ # Remove short ALL-CAPS lines (likely section headings)
360
+ if stripped and len(stripped) < 80 and stripped == stripped.upper() and stripped.isalpha():
361
+ continue
362
+
363
+ cleaned.append(line)
364
+
365
+ return "\n".join(cleaned)
366
+
367
    def _strip_section(self, text: str, header_pattern: re.Pattern) -> str:
        """Remove a section identified by header_pattern until next section boundary.

        Line-oriented state machine: once a header matches, lines are
        dropped until _is_section_boundary fires, which both ends the skip
        and keeps the boundary line itself.
        """
        lines = text.split("\n")
        result = []
        skipping = False

        for i, line in enumerate(lines):
            stripped = line.strip()

            # A matching header starts (or restarts) skip mode; the header
            # line itself is dropped.
            if header_pattern.match(stripped):
                skipping = True
                continue

            if skipping:
                # Stop skipping at next section boundary:
                # A substantial non-empty line after a blank line, OR
                # A line that looks like a real body section start
                is_blank = not stripped
                if not is_blank and self._is_section_boundary(stripped, lines, i):
                    skipping = False
                    result.append(line)
                # Everything else inside the section is discarded.
                continue

            result.append(line)

        return "\n".join(result)
393
+
394
+ def _is_section_boundary(self, stripped: str, lines: list[str], idx: int) -> bool:
395
+ """Detect if a line marks the beginning of a new major section.
396
+
397
+ Only returns True for explicit section headers/markers, NOT for
398
+ long body-text lines (which can appear inside prefaces/forewords).
399
+ """
400
+ # Body-start keywords (these signal real content resuming)
401
+ if re.match(
402
+ r"(?:Book|Chapter|Part|Section|Proposition|Theorem|Definition|"
403
+ r"Axiom|Postulate|Introduction|Definitions|Lemma|Corollary|"
404
+ r"Contents?)\b",
405
+ stripped, re.IGNORECASE,
406
+ ):
407
+ return True
408
+
409
+ # Another named section header (front or back matter)
410
+ if self.FRONT_MATTER_HEADERS.match(stripped):
411
+ return True
412
+ if self.BACK_MATTER_HEADERS.match(stripped):
413
+ return True
414
+ if self.TOC_HEADER.match(stripped):
415
+ return True
416
+
417
+ return False
418
+
419
+ def _is_production_line(self, line: str) -> bool:
420
+ """Check if a line is production/publisher metadata."""
421
+ for pattern in self.PRODUCTION_PATTERNS:
422
+ if pattern.search(line):
423
+ return True
424
+ return False
425
+
426
+ # ------------------------------------------------------------------
427
+ # Unicode normalization
428
+ # ------------------------------------------------------------------
429
+
430
+ def _normalize_unicode(self, text: str) -> str:
431
+ """Normalize unicode characters to their closest ASCII equivalents."""
432
+ text = unicodedata.normalize("NFKD", text)
433
+ replacements = {
434
+ "\u2018": "'", "\u2019": "'", # smart quotes
435
+ "\u201c": '"', "\u201d": '"',
436
+ "\u2013": "-", "\u2014": "-", # en/em dash
437
+ "\u2026": "...", # ellipsis
438
+ "\u00a0": " ", # non-breaking space
439
+ "\u00b6": "", # pilcrow
440
+ "\u00a7": "", # section sign
441
+ }
442
+ for old, new in replacements.items():
443
+ text = text.replace(old, new)
444
+
445
+ # Strip remaining non-ASCII
446
+ text = text.encode("ascii", errors="ignore").decode("ascii")
447
+ return text
448
+
449
+ def _remove_urls(self, text: str) -> str:
450
+ """Remove URLs and email addresses."""
451
+ text = re.sub(r"https?://\S+", "", text)
452
+ text = re.sub(r"www\.\S+", "", text)
453
+ text = re.sub(r"\S+@\S+\.\S+", "", text)
454
+ return text
455
+
456
+ # ------------------------------------------------------------------
457
+ # Roman numeral conversion
458
+ # ------------------------------------------------------------------
459
+
460
+ def _roman_to_int(self, s: str) -> int:
461
+ """Convert a Roman numeral string to an integer."""
462
+ result = 0
463
+ prev = 0
464
+ for char in reversed(s.upper()):
465
+ val = self.ROMAN_VALUES.get(char, 0)
466
+ if val < prev:
467
+ result -= val
468
+ else:
469
+ result += val
470
+ prev = val
471
+ return result
472
+
473
+ def _is_valid_roman(self, s: str) -> bool:
474
+ """Check if a string is a valid Roman numeral (not just random letters)."""
475
+ if not s:
476
+ return False
477
+ # Must only contain valid Roman numeral characters
478
+ if not all(c in "IVXLCDM" for c in s.upper()):
479
+ return False
480
+ # Must convert to a positive number
481
+ val = self._roman_to_int(s)
482
+ return val > 0
483
+
484
    def _convert_roman_numerals(self, text: str) -> str:
        """Convert Roman numerals to English words.

        Handles multi-character Roman numerals (II, IV, XIV, etc.) directly.
        Single 'I' is only converted when preceded by a context word.
        """
        def replace_roman(m):
            numeral = m.group(1)
            # Skip single-char matches that aren't clearly Roman numerals
            if len(numeral) == 1:
                # Single 'I' — only convert after context words
                if numeral.upper() == "I":
                    # Check up to 30 chars of text before this match for a
                    # context word ("chapter", "book", ...).
                    before = text[max(0, m.start() - 30):m.start()]
                    if not self.ROMAN_CONTEXT.search(before):
                        return m.group(0)
                else:
                    # Single V, X, L, C, D, M — convert them
                    pass

            # Also rejects the zero-width matches that ROMAN_NUMERAL can
            # produce at word boundaries (all its sub-groups are optional),
            # leaving the text unchanged for those.
            if not self._is_valid_roman(numeral):
                return m.group(0)

            val = self._roman_to_int(numeral)
            return self._number_to_words(val)

        return self.ROMAN_NUMERAL.sub(replace_roman, text)
511
+
512
+ # ------------------------------------------------------------------
513
+ # Arabic numeral conversion
514
+ # ------------------------------------------------------------------
515
+
516
+ def _number_to_words(self, n: int) -> str:
517
+ """Convert an integer to English words."""
518
+ if n < 0:
519
+ return "negative " + self._number_to_words(-n)
520
+ if n == 0:
521
+ return self.ONES[0]
522
+ if n < 20:
523
+ return self.ONES[n]
524
+ if n < 100:
525
+ tens, ones = divmod(n, 10)
526
+ return self.TENS[tens] + (" " + self.ONES[ones] if ones else "")
527
+ if n < 1000:
528
+ hundreds, remainder = divmod(n, 100)
529
+ result = self.ONES[hundreds] + " hundred"
530
+ if remainder:
531
+ result += " " + self._number_to_words(remainder)
532
+ return result
533
+ if n < 1000000:
534
+ thousands, remainder = divmod(n, 1000)
535
+ result = self._number_to_words(thousands) + " thousand"
536
+ if remainder:
537
+ result += " " + self._number_to_words(remainder)
538
+ return result
539
+ return str(n)
540
+
541
+ def _convert_numerals(self, text: str) -> str:
542
+ """Replace standalone digit sequences with their English word equivalents.
543
+
544
+ Only converts digit groups that are standalone words (surrounded by
545
+ whitespace or punctuation), preventing garbled output from codes
546
+ like Z39.48-1984.
547
+ """
548
+ def replace_match(m):
549
+ # Ensure digits are not part of a larger alphanumeric token
550
+ start, end = m.start(), m.end()
551
+ if start > 0 and text[start - 1].isalnum():
552
+ return m.group()
553
+ if end < len(text) and text[end].isalnum():
554
+ return m.group()
555
+ try:
556
+ n = int(m.group())
557
+ if n < 1000000:
558
+ return self._number_to_words(n)
559
+ except ValueError:
560
+ pass
561
+ return m.group()
562
+ return re.sub(r"\d+", replace_match, text)
563
+
564
+ # ------------------------------------------------------------------
565
+ # Character filtering and whitespace
566
+ # ------------------------------------------------------------------
567
+
568
+ def _clean_chars(self, text: str) -> str:
569
+ """Remove characters not in the allowed set."""
570
+ pattern = f"[^{self.allowed_chars}\n]"
571
+ text = re.sub(pattern, " ", text)
572
+ # Remove lines that are only dots and/or spaces (separator lines)
573
+ text = re.sub(r"^[. ]+$", "", text, flags=re.MULTILINE)
574
+ return text
575
+
576
+ def _collapse_whitespace(self, text: str) -> str:
577
+ """Collapse multiple spaces/newlines into single spaces."""
578
+ text = re.sub(r"\n{3,}", "\n\n", text)
579
+ text = re.sub(r" {2,}", " ", text)
580
+ text = re.sub(r" *\n *", "\n", text)
581
+ return text