Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

App Files Files Community

LisaMegaWatts committed on Feb 20

Commit

d184fb7

verified ·

1 Parent(s): d97776b

Upload cleaner.py with huggingface_hub

Browse files

Files changed (1) hide show

cleaner.py +581 -0

cleaner.py ADDED Viewed

	@@ -0,0 +1,581 @@

+"""Text cleaning pipeline for preparing training data."""
+import logging
+import re
+import unicodedata
+logger = logging.getLogger(__name__)
class TextCleaner:
    """Cleans raw text for character-level language model training.

    The cleaner is configured with a plain ``dict`` (see ``__init__``) and
    exposes a single public entry point, :meth:`clean`, which runs the
    stages in a fixed order: source boilerplate stripping, non-body
    stripping, unicode normalization, URL removal, Roman numeral
    conversion, lowercasing, Arabic numeral conversion, character
    filtering, and whitespace collapsing.

    All stripping stages are heuristics driven by the regular expressions
    defined as class attributes below.
    """

    # Project Gutenberg header/footer patterns
    GUTENBERG_START = re.compile(
        r"\*\*\*\s*START OF (?:THE |THIS )?PROJECT GUTENBERG.*?\*\*\*",
        re.IGNORECASE,
    )
    GUTENBERG_END = re.compile(
        r"\*\*\*\s*END OF (?:THE |THIS )?PROJECT GUTENBERG.*?\*\*\*",
        re.IGNORECASE,
    )
    # Fallback for Gutenberg files that lack *** markers
    GUTENBERG_END_PLAIN = re.compile(
        r"^End of (?:the )?Project Gutenberg",
        re.IGNORECASE | re.MULTILINE,
    )
    # Metadata line prefixes skipped at the top of a Gutenberg file when no
    # "*** START OF ..." marker was found.
    GUTENBERG_PREAMBLE_PREFIXES = (
        "Title:", "Author:", "Release Date:",
        "Language:", "Character set",
        "Produced by", "Updated editions",
    )

    # MIT Internet Classics Archive patterns
    MIT_HEADER = re.compile(
        r"provided by the internet classics archive\..*?-{6,}",
        re.IGNORECASE | re.DOTALL,
    )
    MIT_FOOTER = re.compile(
        r"the internet classics archive\b[^\n]*(?:web atomics)?[^\n]*",
        re.IGNORECASE,
    )
    MIT_DASH_LINE = re.compile(r"-{6,}")

    # Internet Archive patterns
    IA_HEADER = re.compile(
        r"(?:Digitized by|Book digitized by|Original from|Uploaded by)"
        r"[^\n]*",
        re.IGNORECASE,
    )
    IA_GOOGLE_MARKER = re.compile(
        r"(?:Generated (?:by|from)|Google-digitized|"
        r"This is a digital copy of a book)[^\n]*",
        re.IGNORECASE,
    )

    # Roman numeral pattern — matches standalone uppercase Roman numerals of
    # any length (single letters included; single "I" is filtered later).
    # The lookahead restricts matching to tokens made only of numeral
    # letters, so the substitution callback is not invoked with an empty
    # match at every word boundary of the text.
    ROMAN_NUMERAL = re.compile(
        r"\b(?=[MDCLXVI]+\b)"
        r"(M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3}))\b"
    )
    # Context words that allow single "I" to be treated as Roman numeral 1
    ROMAN_CONTEXT = re.compile(
        r"\b(?:book|chapter|prop|proposition|part|vol|volume|no|number|"
        r"section|act|scene|lib|epistle|ode|psalm|canon|lemma|corollary|"
        r"cor|def|definition|axiom|postulate)\b",
        re.IGNORECASE,
    )
    # Same vocabulary as ROMAN_CONTEXT, but anchored so the context word must
    # sit directly before the numeral ("Book I", "Prop. I"). Only the
    # abbreviations may carry a trailing period, so a sentence boundary such
    # as "...this chapter. I went" is not mistaken for a heading.
    ROMAN_CONTEXT_ADJACENT = re.compile(
        r"\b(?:(?:prop|vol|no|lib|cor|def)\.?|"
        r"book|chapter|proposition|part|volume|number|section|act|scene|"
        r"epistle|ode|psalm|canon|lemma|corollary|definition|axiom|"
        r"postulate)\s+$",
        re.IGNORECASE,
    )
    # Roman numeral value map
    ROMAN_VALUES = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}

    # Non-body section headers (for aggressive stripping)
    # NOTE: "INTRODUCTION" is deliberately excluded — it is often the author's own text
    FRONT_MATTER_HEADERS = re.compile(
        r"^\s*(?:PREFACE|FOREWORD|FORWARD|EDITOR[\u2019']?S?\s+NOTE|"
        r"TRANSLATOR[\u2019']?S?\s+NOTE|PREFATORY\s+NOTE|PRELIMINARY\s+NOTE|"
        r"BIOGRAPHICAL\s+(?:NOTE|SKETCH)|ADVERTISEMENT|DEDICAT(?:ION|ED\s+TO)|"
        r"TO\s+THE\s+READER|NOTE\s+ON\s+(?:THE\s+)?TEXT|ABOUT\s+THIS\s+EDITION|"
        r"CHRONOLOG(?:Y|ICAL))[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    BACK_MATTER_HEADERS = re.compile(
        r"^\s*(?:APPENDIX|ADDEND(?:UM|A)|INDEX|GLOSSARY|BIBLIOGRAPHY|"
        r"WORKS?\s+CITED|REFERENCES|ENDNOTES|FOOTNOTES|"
        r"ACKNOWLEDG(?:E?MENTS?)|CREDITS|COLOPHON|ERRATA|"
        r"TRANSCRIBER[\u2019']?S?\s+NOTES?|"
        r"TYPOGRAPHICAL\s+ERRORS?\s+CORRECTED|"
        r"LIST\s+OF\s+(?:ILLUSTRATIONS|FIGURES|PLATES))[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    TOC_HEADER = re.compile(
        r"^\s*(?:TABLE\s+OF\s+)?CONTENTS?[.:\-\u2014]*\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    # Line prefix that also marks the start of back matter.
    TYPOGRAPHICAL_ERRORS = re.compile(r"Typographical\s+Errors?\b", re.IGNORECASE)
    # Line prefixes that end the skipping started by a stripped section header.
    BODY_START = re.compile(
        r"(?:Book|Chapter|Part|Section|Proposition|Theorem|Definition|"
        r"Axiom|Postulate|Introduction|Definitions|Lemma|Corollary|"
        r"Contents?)\b",
        re.IGNORECASE,
    )
    # A period followed by a capitalized word: evidence of running prose.
    SENTENCE_BREAK = re.compile(r"\.\s+[A-Z]")

    # Production/publisher patterns (for front matter cleanup)
    PRODUCTION_PATTERNS = [
        re.compile(p, re.IGNORECASE) for p in [
            r"(?:produced|prepared|transcribed|digitized|scanned)\s+(?:by|for|at)",
            r"production\s+note",
            r"transcriber[\u2019']?s?\s+note",
            r"scanner[\u2019']?s?\s+note",
            r"cornell\s+university\s+library",
            r"(?:published|printed)\s+(?:by|for|at|in)",
            r"(?:first|second|third|\d+(?:st|nd|rd|th))\s+edition",
            r"price\s+\w+[sd]\.",
            # Word boundaries keep "newspaper" / "clothes" in body prose from
            # being classified as binding/format metadata.
            r"\b(?:cloth|paper|hardcover|paperback|octavo|quarto)\b",
            r"\bisbn\b",
            r"all\s+rights?\s+reserved",
            r"(?:copyright|copr\.?)\s*(?:\(c\)|\xa9|\d)",
            r"press\s+of\b",
            r"university\s+press",
        ]
    ]

    # Transcriber correction notes (back matter)
    TRANSCRIBER_CORRECTION = re.compile(
        r"^p\.\s*(?:\d+|\?\??|\.)\s*[.,]?\s*(?:sqq\.|in\s|the\s|as\s|heading|"
        r"reference|prop|from\s|then\s|these\s|def\.|"
        r"twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|"
        r"one\s|two\s|three|four|five|six\s|seven|eight|nine|"
        # Match quoted corrections
        r'["\u201c])',
        re.IGNORECASE,
    )

    # Separator/decoration lines
    SEPARATOR_LINE = re.compile(r"^[\s.*_=~\-#]+$")

    # URL / e-mail patterns removed by _remove_urls (applied in this order).
    URL_PATTERNS = (
        re.compile(r"https?://\S+"),
        re.compile(r"www\.\S+"),
        re.compile(r"\S+@\S+\.\S+"),
    )

    # Digit token: either comma-grouped thousands ("1,250") or a plain run.
    NUMBER_TOKEN = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+")

    # Non-ASCII characters mapped to ASCII before the final ASCII-only encode.
    # Characters without an entry (and without an NFKD decomposition to
    # ASCII) are dropped by that encode step.
    UNICODE_REPLACEMENTS = str.maketrans({
        "\u2018": "'", "\u2019": "'",  # smart single quotes
        "\u201c": '"', "\u201d": '"',  # smart double quotes
        "\u2010": "-", "\u2011": "-",  # hyphen / non-breaking hyphen
        "\u2012": "-", "\u2013": "-",  # figure dash / en dash
        "\u2014": "-", "\u2015": "-",  # em dash / horizontal bar
        "\u2212": "-",                 # minus sign
        "\u2026": "...",               # ellipsis
        "\u00a0": " ",                 # non-breaking space
        "\u00b6": "",                  # pilcrow
        "\u00a7": "",                  # section sign
        "\u00e6": "ae", "\u00c6": "AE",  # ae ligature
        "\u0153": "oe", "\u0152": "OE",  # oe ligature
        "\u00df": "ss",                # sharp s
    })

    # Number words for 0-19 and tens
    ONES = [
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    TENS = [
        "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
        "eighty", "ninety",
    ]

    def __init__(self, config: "dict | None" = None):
        """Read cleaning options from ``config``.

        Args:
            config: Mapping of option name to value. Missing keys fall back
                to the defaults below; ``None`` is treated as an empty dict.

        Recognized keys (default in parentheses): ``lowercase`` (True),
        ``strip_gutenberg`` (True), ``strip_mit_classics`` (True),
        ``strip_internet_archive`` (True), ``normalize_unicode`` (True),
        ``convert_numerals`` (False), ``convert_roman_numerals`` (False),
        ``strip_non_body`` (True), ``min_line_length`` (20),
        ``remove_urls`` (True), ``collapse_whitespace`` (True),
        ``allowed_chars`` (regex character-class body, see below).
        """
        config = config or {}
        self.lowercase = config.get("lowercase", True)
        self.strip_gutenberg = config.get("strip_gutenberg", True)
        self.strip_mit_classics = config.get("strip_mit_classics", True)
        self.strip_internet_archive = config.get("strip_internet_archive", True)
        self.normalize_unicode = config.get("normalize_unicode", True)
        self.convert_numerals = config.get("convert_numerals", False)
        self.convert_roman_numerals = config.get("convert_roman_numerals", False)
        self.strip_non_body = config.get("strip_non_body", True)
        # NOTE(review): min_line_length is stored but no method in this class
        # reads it — confirm whether a line-length filter is still intended.
        self.min_line_length = config.get("min_line_length", 20)
        self.remove_urls = config.get("remove_urls", True)
        self.collapse_whitespace = config.get("collapse_whitespace", True)
        # The default set only lists lowercase letters, which is correct when
        # text is lowercased first. When lowercasing is disabled, capitals
        # must be allowed too or every uppercase letter becomes a space.
        default_allowed = r"a-z0-9 .,;:!?'\"\-\(\)"
        if not self.lowercase:
            default_allowed = "A-Z" + default_allowed
        self.allowed_chars = config.get("allowed_chars", default_allowed)

    def clean(self, text: str) -> str:
        """Run all enabled cleaning stages on ``text`` and return the result.

        Returns an empty string for blank input. The stage order matters:
        non-body stripping and Roman numeral conversion rely on the original
        casing, so both run before lowercasing.
        """
        if not text.strip():
            return ""
        # Stage 1: Strip source-specific boilerplate
        if self.strip_gutenberg:
            text = self._strip_gutenberg(text)
        if self.strip_mit_classics:
            text = self._strip_mit_classics(text)
        if self.strip_internet_archive:
            text = self._strip_internet_archive(text)
        # Stage 2: Strip non-body content (before any text transforms)
        if self.strip_non_body:
            text = self._strip_non_body(text)
        # Stage 3: Normalize unicode
        if self.normalize_unicode:
            text = self._normalize_unicode(text)
        if self.remove_urls:
            text = self._remove_urls(text)
        # Stage 4: Convert Roman numerals (BEFORE lowercase — needs uppercase)
        if self.convert_roman_numerals:
            text = self._convert_roman_numerals(text)
        # Stage 5: Lowercase
        if self.lowercase:
            text = text.lower()
        # Stage 6: Convert Arabic numerals
        if self.convert_numerals:
            text = self._convert_numerals(text)
        # Stage 7: Character filtering
        text = self._clean_chars(text)
        # Stage 8: Collapse whitespace
        if self.collapse_whitespace:
            text = self._collapse_whitespace(text)
        return text.strip()

    # ------------------------------------------------------------------
    # Source boilerplate stripping
    # ------------------------------------------------------------------
    def _strip_gutenberg(self, text: str) -> str:
        """Remove Project Gutenberg headers and footers.

        Everything from the end marker onward and everything up to and
        including the start marker is dropped. When no start marker exists,
        leading blank lines and known metadata lines are skipped instead.
        """
        # Strip footer first (before positions shift)
        end_match = self.GUTENBERG_END.search(text) or self.GUTENBERG_END_PLAIN.search(text)
        if end_match:
            text = text[:end_match.start()]
        # Strip header
        start_match = self.GUTENBERG_START.search(text)
        if start_match:
            text = text[start_match.end():]
        # Only hunt for preamble lines when the start marker was missing;
        # skipping stops at the first line that is neither blank nor metadata.
        skip = start_match is None
        kept = []
        for line in text.split("\n"):
            if skip:
                stripped = line.strip()
                if not stripped or stripped.startswith(self.GUTENBERG_PREAMBLE_PREFIXES):
                    continue
                skip = False
            kept.append(line)
        return "\n".join(kept)

    def _strip_mit_classics(self, text: str) -> str:
        """Remove MIT Internet Classics Archive headers, footers, and section dividers."""
        text = self.MIT_HEADER.sub("", text)
        text = self.MIT_FOOTER.sub("", text)
        text = self.MIT_DASH_LINE.sub("", text)
        return text

    def _strip_internet_archive(self, text: str) -> str:
        """Remove Internet Archive / Google Books digitization boilerplate."""
        text = self.IA_HEADER.sub("", text)
        text = self.IA_GOOGLE_MARKER.sub("", text)
        return text

    # ------------------------------------------------------------------
    # Non-body content stripping (aggressive mode)
    # ------------------------------------------------------------------
    def _strip_non_body(self, text: str) -> str:
        """Remove front matter, back matter, and inline non-body content."""
        text = self._strip_front_matter(text)
        text = self._strip_back_matter(text)
        text = self._strip_inline_non_body(text)
        return text

    def _looks_like_body(self, para_lines: list[str]) -> bool:
        """Return True if a paragraph (list of stripped lines) reads as body prose.

        Body prose is a substantial paragraph (>150 chars) containing a
        sentence break, that is not production metadata, not mostly
        uppercase (title pages), and not made of short lines (heading blocks).
        """
        para_text = " ".join(para_lines)
        if len(para_text) <= 150:
            return False
        if not self.SENTENCE_BREAK.search(para_text):
            return False
        if self._is_production_line(para_text):
            return False
        letters = [c for c in para_text if c.isalpha()]
        if letters and sum(1 for c in letters if c.isupper()) / len(letters) > 0.5:
            return False
        avg_line_len = sum(len(line) for line in para_lines) / len(para_lines)
        return avg_line_len >= 50

    def _strip_front_matter(self, text: str) -> str:
        """Strip front matter: production notes, TOC, preface, etc.

        Order: (1) strip named sections by header, (2) skip remaining
        non-body paragraphs at the top, returning the text from the first
        paragraph that looks like body prose.

        If no paragraph qualifies as body prose (short texts, verse, texts
        wrapped in short lines), the text from step (1) is returned as is
        rather than discarding the whole document.
        """
        # Pass 1: Remove named sections that have clear headers
        text = self._strip_section(text, self.FRONT_MATTER_HEADERS)
        text = self._strip_section(text, self.TOC_HEADER)
        # Pass 2: Walk paragraphs (blank-line separated) from the top.
        lines = text.split("\n")
        total = len(lines)
        i = 0
        while i < total:
            while i < total and not lines[i].strip():
                i += 1
            para_start = i
            para_lines = []
            while i < total and lines[i].strip():
                para_lines.append(lines[i].strip())
                i += 1
            if not para_lines:
                break  # only trailing blank lines were left
            if self._looks_like_body(para_lines):
                return "\n".join(lines[para_start:])
        logger.debug("No body paragraph detected; keeping text without front-matter trimming")
        return text

    def _strip_back_matter(self, text: str) -> str:
        """Strip back matter: appendixes, index, transcriber notes, etc.

        The text is truncated at the first back-matter header line; then
        trailing blank lines, transcriber correction notes, and production
        lines are removed from the end.
        """
        lines = text.split("\n")
        # Find the first back-matter header and truncate there.
        # "Typographical Errors corrected..." also counts as a back matter start.
        for i, line in enumerate(lines):
            stripped = line.strip()
            if (self.BACK_MATTER_HEADERS.match(stripped)
                    or self.TYPOGRAPHICAL_ERRORS.match(stripped)):
                lines = lines[:i]
                break
        # Strip trailing transcriber correction notes (working backward)
        while lines:
            stripped = lines[-1].strip()
            if (stripped
                    and not self.TRANSCRIBER_CORRECTION.match(stripped)
                    and not self._is_production_line(stripped)):
                break
            lines.pop()
        return "\n".join(lines)

    def _strip_inline_non_body(self, text: str) -> str:
        """Strip inline non-body markers: separator lines and all-caps headings.

        A heading is only removed when the stripped line is shorter than 80
        characters, purely alphabetic (so a single word, no spaces or
        punctuation), and unchanged by ``upper()``.
        """
        cleaned = []
        for line in text.split("\n"):
            stripped = line.strip()
            if stripped:
                # Remove separator/decoration lines
                if self.SEPARATOR_LINE.match(stripped):
                    continue
                # Remove short ALL-CAPS single-word lines (likely section headings)
                if len(stripped) < 80 and stripped == stripped.upper() and stripped.isalpha():
                    continue
            cleaned.append(line)
        return "\n".join(cleaned)

    def _strip_section(self, text: str, header_pattern: re.Pattern) -> str:
        """Remove a section identified by header_pattern until next section boundary.

        The header line and every following line are dropped until a line
        accepted by ``_is_section_boundary`` is reached; that boundary line
        is kept. If no boundary follows, the rest of the text is dropped.
        """
        lines = text.split("\n")
        result = []
        skipping = False
        for i, line in enumerate(lines):
            stripped = line.strip()
            if header_pattern.match(stripped):
                # A repeated header of the same family keeps (or starts) skipping.
                skipping = True
                continue
            if skipping:
                if not stripped or not self._is_section_boundary(stripped, lines, i):
                    continue
                skipping = False
            result.append(line)
        return "\n".join(result)

    def _is_section_boundary(self, stripped: str, lines: list[str], idx: int) -> bool:
        """Detect if a line marks the beginning of a new major section.

        Only returns True for explicit section headers/markers, NOT for
        long body-text lines (which can appear inside prefaces/forewords).
        ``lines`` and ``idx`` are accepted for interface stability but are
        not consulted.
        """
        # Body-start keywords (these signal real content resuming)
        if self.BODY_START.match(stripped):
            return True
        # Another named section header (front or back matter, or TOC)
        return bool(
            self.FRONT_MATTER_HEADERS.match(stripped)
            or self.BACK_MATTER_HEADERS.match(stripped)
            or self.TOC_HEADER.match(stripped)
        )

    def _is_production_line(self, line: str) -> bool:
        """Check if a line is production/publisher metadata."""
        return any(pattern.search(line) for pattern in self.PRODUCTION_PATTERNS)

    # ------------------------------------------------------------------
    # Unicode normalization
    # ------------------------------------------------------------------
    def _normalize_unicode(self, text: str) -> str:
        """Normalize unicode characters to their closest ASCII equivalents.

        NFKD decomposition splits accented letters into base letter plus
        combining mark; typographic quotes, dashes, and ligatures are mapped
        explicitly; anything still non-ASCII afterwards is dropped.
        """
        text = unicodedata.normalize("NFKD", text)
        text = text.translate(self.UNICODE_REPLACEMENTS)
        # Strip remaining non-ASCII (including the detached combining marks)
        return text.encode("ascii", errors="ignore").decode("ascii")

    def _remove_urls(self, text: str) -> str:
        """Remove URLs and email addresses."""
        for pattern in self.URL_PATTERNS:
            text = pattern.sub("", text)
        return text

    # ------------------------------------------------------------------
    # Roman numeral conversion
    # ------------------------------------------------------------------
    def _roman_to_int(self, s: str) -> int:
        """Convert a Roman numeral string to an integer.

        Scans right to left, subtracting a symbol when it is smaller than
        the one to its right. Unknown characters count as 0.
        """
        result = 0
        prev = 0
        for char in reversed(s.upper()):
            val = self.ROMAN_VALUES.get(char, 0)
            if val < prev:
                result -= val
            else:
                result += val
            prev = val
        return result

    def _is_valid_roman(self, s: str) -> bool:
        """Check if a string is a valid Roman numeral (not just random letters)."""
        if not s:
            return False
        # Must only contain valid Roman numeral characters
        if not all(c in "IVXLCDM" for c in s.upper()):
            return False
        # Must convert to a positive number
        return self._roman_to_int(s) > 0

    def _convert_roman_numerals(self, text: str) -> str:
        """Convert uppercase Roman numerals to English words.

        Multi-character numerals (II, IV, XIV, etc.) and the single letters
        V, X, L, C, D, M are converted directly. A single 'I' is converted
        only when a context word (Book, Chapter, Prop., ...) directly
        precedes it, so the pronoun in "Chapter XIV. I am ..." is left alone.
        """
        def replace_roman(m):
            numeral = m.group(1)
            if numeral == "I":
                # Look only at the text immediately before the match.
                before = m.string[max(0, m.start() - 30):m.start()]
                if not self.ROMAN_CONTEXT_ADJACENT.search(before):
                    return m.group(0)
            if not self._is_valid_roman(numeral):
                return m.group(0)
            return self._number_to_words(self._roman_to_int(numeral))

        return self.ROMAN_NUMERAL.sub(replace_roman, text)

    # ------------------------------------------------------------------
    # Arabic numeral conversion
    # ------------------------------------------------------------------
    def _number_to_words(self, n: int) -> str:
        """Convert an integer to English words.

        Handles negative numbers and magnitudes below one million; larger
        values are returned as their decimal string.
        """
        if n < 0:
            return "negative " + self._number_to_words(-n)
        if n < 20:
            return self.ONES[n]
        if n < 100:
            tens, ones = divmod(n, 10)
            return self.TENS[tens] + (" " + self.ONES[ones] if ones else "")
        if n < 1000:
            hundreds, remainder = divmod(n, 100)
            result = self.ONES[hundreds] + " hundred"
        elif n < 1000000:
            thousands, remainder = divmod(n, 1000)
            result = self._number_to_words(thousands) + " thousand"
        else:
            return str(n)
        if remainder:
            result += " " + self._number_to_words(remainder)
        return result

    def _convert_numerals(self, text: str) -> str:
        """Replace standalone digit sequences with their English word equivalents.

        A digit group is converted only when it is not glued to a letter or
        digit on either side. Comma-grouped thousands such as "1,250" are
        read as one number. Values of one million or more are left as
        digits.
        """
        def replace_match(m):
            token = m.group()
            # Ensure digits are not part of a larger alphanumeric token
            start, end = m.span()
            if start > 0 and text[start - 1].isalnum():
                return token
            if end < len(text) and text[end].isalnum():
                return token
            try:
                n = int(token.replace(",", ""))
            except ValueError:
                # e.g. a digit run longer than the interpreter's int-parsing limit
                return token
            if n < 1000000:
                return self._number_to_words(n)
            return token

        return self.NUMBER_TOKEN.sub(replace_match, text)

    # ------------------------------------------------------------------
    # Character filtering and whitespace
    # ------------------------------------------------------------------
    def _clean_chars(self, text: str) -> str:
        """Replace characters outside the allowed set with spaces.

        ``self.allowed_chars`` is interpolated into a regex character class;
        newlines are always kept. Lines left with only dots and/or spaces
        are emptied.
        """
        pattern = f"[^{self.allowed_chars}\n]"
        text = re.sub(pattern, " ", text)
        # Remove lines that are only dots and/or spaces (separator lines)
        text = re.sub(r"^[. ]+$", "", text, flags=re.MULTILINE)
        return text

    def _collapse_whitespace(self, text: str) -> str:
        """Collapse runs of spaces to one space and 3+ newlines to a blank line.

        Spaces around newlines are removed before newline runs are
        collapsed, so blank lines that contain only spaces are counted as
        blank.
        """
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r" *\n *", "\n", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text