Spaces:
Paused
Paused
| """Document cutter — extracts 300-600 word paragraphs from .docx files. | |
| Implements Requirements 1.1 and 1.2 of the GRPO Humanizer v2 spec: | |
| * Read plain text from ``.docx`` files via ``python-docx``. | |
| * Split on double newlines, normalize whitespace, drop noise paragraphs | |
| (empty, formulas-only, tables) by filtering out anything shorter than | |
| ``MIN_FILTER_WORDS`` words. | |
| * Greedily merge consecutive short paragraphs until the accumulated block | |
| falls into the ``[MIN_WORDS, MAX_WORDS]`` target range. Blocks that | |
| overshoot ``MAX_WORDS`` are dropped with a warning. | |
| This module is a pure library — CLI orchestration lives in ``scripts/``. | |
| """ | |
| import logging | |
| import re | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| import docx | |
| logger = logging.getLogger(__name__) | |
| _WHITESPACE_RE = re.compile(r"\s+") | |
@dataclass
class ExtractedParagraph:
    """A normalized text block extracted from a single ``.docx`` file.

    Declared as a dataclass so that instances can be built with the
    keyword arguments used by :meth:`Document_Cutter.extract` and so that
    they get value-based ``__eq__`` and a readable ``__repr__``.

    Attributes:
        source_file: Basename of the source ``.docx`` file.
        paragraph_index: Zero-based position of this block within the
            list returned by :meth:`Document_Cutter.extract` for the
            same source file.
        text: Normalized plain text (single-spaced). When the block is
            assembled from multiple raw paragraphs, they are rejoined
            with ``"\\n\\n"`` to preserve paragraph structure.
        word_count: ``len(text.split())`` at the time of extraction.
    """

    source_file: str
    paragraph_index: int
    text: str
    word_count: int
class Document_Cutter:
    """Cut ``.docx`` documents into blocks of roughly 300-600 words.

    Class-level tunables:
        MIN_WORDS: Smallest word count (inclusive) a block may have to be
            emitted.
        MAX_WORDS: Largest word count (inclusive) a block may have to be
            emitted.
        MIN_FILTER_WORDS: Raw paragraphs with fewer words than this are
            discarded as noise before any merging takes place.
    """

    MIN_WORDS = 300
    MAX_WORDS = 600
    MIN_FILTER_WORDS = 20

    def extract(self, docx_path: Path) -> list[ExtractedParagraph]:
        """Return every valid block found in one ``.docx`` file.

        The pipeline is:

        1. Open the file with ``python-docx``; on any failure log a
           warning and return an empty list.
        2. Join all paragraph texts with ``"\\n\\n"`` and split on the
           same separator again.
        3. Collapse whitespace runs in each piece to a single space and
           strip the ends.
        4. Discard empty pieces and pieces shorter than
           ``MIN_FILTER_WORDS`` words.
        5. Merge consecutive survivors greedily. As soon as the running
           word total is inside ``[MIN_WORDS, MAX_WORDS]`` the merged
           block is emitted; if the total jumps past ``MAX_WORDS`` the
           pending pieces are thrown away with a warning.

        Args:
            docx_path: Location of the ``.docx`` file to read.

        Returns:
            The emitted blocks, with ``paragraph_index`` numbered from 0
            in emission order.
        """
        try:
            document = docx.Document(str(docx_path))
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("failed to read %s: %s", docx_path, exc)
            return []

        joined = "\n\n".join(para.text for para in document.paragraphs)

        # (cleaned text, word count) for every paragraph that survives the
        # noise filter.
        candidates: list[tuple[str, int]] = []
        for chunk in joined.split("\n\n"):
            cleaned = _WHITESPACE_RE.sub(" ", chunk).strip()
            n_words = len(cleaned.split())
            if cleaned and n_words >= self.MIN_FILTER_WORDS:
                candidates.append((cleaned, n_words))

        file_name = docx_path.name
        blocks: list[ExtractedParagraph] = []
        pending: list[str] = []
        pending_words = 0

        for cleaned, n_words in candidates:
            pending.append(cleaned)
            pending_words += n_words

            if self.MIN_WORDS <= pending_words <= self.MAX_WORDS:
                # Target range reached: emit the merged block.
                blocks.append(
                    ExtractedParagraph(
                        source_file=file_name,
                        paragraph_index=len(blocks),
                        text="\n\n".join(pending),
                        word_count=pending_words,
                    )
                )
            elif pending_words <= self.MAX_WORDS:
                # Still too short: keep accumulating.
                continue
            else:
                logger.warning(
                    "dropping over-long accumulator in %s: %d words > MAX_WORDS=%d",
                    file_name,
                    pending_words,
                    self.MAX_WORDS,
                )

            # Reached after either an emit or a drop: start a fresh block.
            pending = []
            pending_words = 0

        # Whatever is still pending here is shorter than MIN_WORDS and is
        # deliberately not emitted.
        return blocks

    def extract_all(self, docx_dir: Path) -> list[ExtractedParagraph]:
        """Run :meth:`extract` over every ``*.docx`` directly in ``docx_dir``.

        Files are visited in sorted path order so repeated runs yield the
        same output ordering. An exception raised while handling one file
        is logged as a warning and does not stop the remaining files from
        being processed.

        Args:
            docx_dir: Directory whose ``*.docx`` entries are processed.

        Returns:
            The blocks of all files, concatenated in processing order.
        """
        collected: list[ExtractedParagraph] = []
        for docx_file in sorted(docx_dir.glob("*.docx")):
            try:
                collected += self.extract(docx_file)
            except Exception as exc:
                logger.warning("failed to read %s: %s", docx_file, exc)
        return collected