# Source: path-d-humanizer/training/data/document_cutter.py
# Cloned from LevArtesa/grpo-humanizer-training (training/data/document_cutter.py), commit b868740.
"""Document cutter — extracts 300-600 word paragraphs from .docx files.
Implements Requirements 1.1 and 1.2 of the GRPO Humanizer v2 spec:
* Read plain text from ``.docx`` files via ``python-docx``.
* Split on double newlines, normalize whitespace, drop noise paragraphs
(empty, formulas-only, tables) by filtering out anything shorter than
``MIN_FILTER_WORDS`` words.
* Greedily merge consecutive short paragraphs until the accumulated block
falls into the ``[MIN_WORDS, MAX_WORDS]`` target range. Blocks that
overshoot ``MAX_WORDS`` are dropped with a warning.
This module is a pure library — CLI orchestration lives in ``scripts/``.
"""
import logging
import re
from dataclasses import dataclass
from pathlib import Path
import docx
# Module-level logger named after this module; used for read-failure and
# over-long-block warnings.
logger = logging.getLogger(__name__)
# Matches any run of whitespace. Precompiled once because it is applied to
# every raw paragraph to collapse the run into a single space.
_WHITESPACE_RE = re.compile(r"\s+")
@dataclass
class ExtractedParagraph:
    """A whitespace-normalized text block produced from one ``.docx`` file.

    Attributes:
        source_file: File name (without directories) of the ``.docx`` the
            block came from.
        paragraph_index: Zero-based ordinal of the block among all blocks
            that :meth:`Document_Cutter.extract` returned for that file.
        text: The block's plain text. Whitespace inside each source
            paragraph is collapsed to single spaces; when several source
            paragraphs were merged into the block they are separated by
            ``"\\n\\n"`` so the paragraph boundaries survive.
        word_count: Number of whitespace-separated words in ``text``,
            recorded when the block was built.
    """

    source_file: str
    paragraph_index: int
    text: str
    word_count: int
class Document_Cutter:
    """Extract 300-600 word paragraph blocks from ``.docx`` files.

    The thresholds are class attributes read through ``self``, so a
    subclass can override them without touching the methods.

    Constants:
        MIN_WORDS: Lower bound (inclusive) of the target block length.
        MAX_WORDS: Upper bound (inclusive) of the target block length.
        MIN_FILTER_WORDS: Paragraphs shorter than this are treated as
            noise (empty lines, formula-only fragments, table cells)
            and dropped before greedy accumulation.
    """

    MIN_WORDS = 300
    MAX_WORDS = 600
    MIN_FILTER_WORDS = 20

    def extract(self, docx_path: "Path | str") -> "list[ExtractedParagraph]":
        """Extract all valid paragraph blocks from a single ``.docx``.

        Steps:
            1. Read all paragraphs via ``python-docx``.
            2. Join with ``"\\n\\n"`` and split again by ``"\\n\\n"`` so
               blank-line runs inside paragraph text also act as
               separators.
            3. Normalize each raw paragraph (strip + collapse whitespace)
               and drop those with fewer than ``MIN_FILTER_WORDS`` words.
            4. Greedy accumulation: append paragraphs to an accumulator
               until the total word count lands in
               ``[MIN_WORDS, MAX_WORDS]``; flush on hit. If the
               accumulator overshoots ``MAX_WORDS``, drop it with a
               warning and reset.

        Args:
            docx_path: Location of the document, as a ``Path`` or a plain
                string.

        Returns:
            List of :class:`ExtractedParagraph`. ``paragraph_index`` is
            assigned 0-based in the order blocks are flushed. An empty
            list is returned (after a warning) when the file cannot be
            opened.
        """
        try:
            document = docx.Document(str(docx_path))
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("failed to read %s: %s", docx_path, exc)
            return []

        raw_joined = "\n\n".join(p.text for p in document.paragraphs)
        paragraphs = self._normalize_paragraphs(raw_joined.split("\n\n"))
        # Path(...) makes ``.name`` work for string inputs too.
        return self._merge_blocks(paragraphs, Path(docx_path).name)

    def _normalize_paragraphs(self, raw_paragraphs: "list[str]") -> "list[tuple[str, int]]":
        """Return ``(text, word_count)`` for each paragraph that is not noise."""
        kept = []
        for raw in raw_paragraphs:
            text = _WHITESPACE_RE.sub(" ", raw).strip()
            if not text:
                continue
            word_count = len(text.split())
            if word_count < self.MIN_FILTER_WORDS:
                continue
            kept.append((text, word_count))
        return kept

    def _merge_blocks(
        self, paragraphs: "list[tuple[str, int]]", source_file: str
    ) -> "list[ExtractedParagraph]":
        """Greedily merge consecutive paragraphs into target-sized blocks."""
        results = []
        acc_texts = []
        acc_words = 0
        for text, word_count in paragraphs:
            acc_texts.append(text)
            acc_words += word_count
            if self.MIN_WORDS <= acc_words <= self.MAX_WORDS:
                results.append(
                    ExtractedParagraph(
                        source_file=source_file,
                        paragraph_index=len(results),
                        text="\n\n".join(acc_texts),
                        word_count=acc_words,
                    )
                )
            elif acc_words > self.MAX_WORDS:
                # The whole accumulator is discarded, including the
                # paragraph that pushed it over the limit.
                logger.warning(
                    "dropping over-long accumulator in %s: %d words > MAX_WORDS=%d",
                    source_file,
                    acc_words,
                    self.MAX_WORDS,
                )
            else:
                # Still below MIN_WORDS: keep accumulating.
                continue
            acc_texts = []
            acc_words = 0
        # A trailing accumulator below MIN_WORDS is intentionally discarded.
        return results

    def extract_all(self, docx_dir: "Path | str") -> "list[ExtractedParagraph]":
        """Extract paragraph blocks from every ``.docx`` in ``docx_dir``.

        Files are processed in lexicographic order for deterministic
        dataset builds. Per-file errors (corrupt archives, unreadable
        content) are logged as warnings and the iteration continues.

        Args:
            docx_dir: Directory to scan (non-recursive), as a ``Path`` or
                a plain string.

        Returns:
            Blocks from all files, concatenated in file order.
            ``paragraph_index`` restarts at 0 for each source file.
        """
        aggregated = []
        for path in sorted(Path(docx_dir).glob("*.docx")):
            # Word keeps a "~$name.docx" owner/lock file next to an open
            # document. It matches the glob but is not a real archive, so
            # skip it instead of logging a spurious read failure.
            if path.name.startswith("~$"):
                continue
            try:
                aggregated.extend(self.extract(path))
            except Exception as exc:
                logger.warning("failed to read %s: %s", path, exc)
        return aggregated