Spaces:

LevArtesa
/

path-d-humanizer

Paused

App Files Files Community

path-d-humanizer / training /data /document_cutter.py

LevArtesa

clone training/data/document_cutter.py from LevArtesa/grpo-humanizer-training

b868740 verified about 2 months ago

Raw

History Blame Contribute Delete

5.66 kB

	"""Document cutter — extracts 300-600 word paragraphs from .docx files.

	Implements Requirements 1.1 and 1.2 of the GRPO Humanizer v2 spec:

	* Read plain text from ``.docx`` files via ``python-docx``.
	* Split on double newlines, normalize whitespace, drop noise paragraphs
	(empty, formulas-only, tables) by filtering out anything shorter than
	``MIN_FILTER_WORDS`` words.
	* Greedily merge consecutive short paragraphs until the accumulated block
	falls into the ``[MIN_WORDS, MAX_WORDS]`` target range. Blocks that
	overshoot ``MAX_WORDS`` are dropped with a warning.

	This module is a pure library — CLI orchestration lives in ``scripts/``.
	"""

	import logging
	import re
	from dataclasses import dataclass
	from pathlib import Path

	import docx

	# Module-level logger, named after this module. Warnings about unreadable
	# files and dropped over-long blocks are emitted through it.
	logger = logging.getLogger(__name__)

	# Matches any run of whitespace (spaces, tabs, newlines); used to collapse
	# each run to a single space when normalizing paragraph text.
	_WHITESPACE_RE = re.compile(r"\s+")


	@dataclass
	class ExtractedParagraph:
	    """One text block produced by the cutter from a single ``.docx`` file.

	    Attributes:
	        source_file: File name (basename only) of the originating
	            ``.docx`` document.
	        paragraph_index: Zero-based ordinal of this block among the
	            blocks that :meth:`Document_Cutter.extract` returned for
	            the same source file.
	        text: Whitespace-normalized plain text. A block merged from
	            several raw paragraphs keeps them separated by
	            ``"\\n\\n"`` so the paragraph structure survives.
	        word_count: Number of whitespace-separated words in ``text``
	            (``len(text.split())``), computed at extraction time.
	    """

	    # Field order is part of the positional constructor interface.
	    source_file: str
	    paragraph_index: int
	    text: str
	    word_count: int


	class Document_Cutter:
	    """Extract 300-600 word paragraph blocks from ``.docx`` files.

	    Constants:
	        MIN_WORDS: Lower bound (inclusive) of the target block length.
	        MAX_WORDS: Upper bound (inclusive) of the target block length.
	        MIN_FILTER_WORDS: Paragraphs shorter than this are treated as
	            noise (empty lines, formula-only fragments, table cells)
	            and dropped before greedy accumulation.

	    The constants are read through ``self``, so a subclass or an
	    instance attribute may override them.
	    """

	    MIN_WORDS = 300
	    MAX_WORDS = 600
	    MIN_FILTER_WORDS = 20

	    def extract(self, docx_path: Path) -> list[ExtractedParagraph]:
	        """Extract all valid paragraph blocks from a single ``.docx``.

	        Steps:
	            1. Read all paragraphs via ``python-docx``.
	            2. Join with ``"\\n\\n"`` and split again by ``"\\n\\n"`` so
	               that blank lines embedded inside a Word paragraph also
	               act as paragraph separators.
	            3. Normalize each raw paragraph (strip + collapse whitespace).
	            4. Drop paragraphs with fewer than ``MIN_FILTER_WORDS`` words.
	            5. Greedy accumulation: append paragraphs to an accumulator
	               until the total word count lands in
	               ``[MIN_WORDS, MAX_WORDS]``; flush on hit. If the
	               accumulator overshoots ``MAX_WORDS``, drop it with a
	               warning and reset.

	        Args:
	            docx_path: Location of the ``.docx`` file. A ``str`` is
	                accepted as well and is converted to :class:`Path`.

	        Returns:
	            List of :class:`ExtractedParagraph`. ``paragraph_index`` is
	            assigned 0-based in the order blocks are flushed. An empty
	            list is returned when the file cannot be opened (a warning
	            is logged instead of raising).
	        """
	        # Coerce first so that ``.name`` below also works for ``str`` input.
	        docx_path = Path(docx_path)
	        try:
	            document = docx.Document(str(docx_path))
	        except Exception as exc:  # pragma: no cover - defensive
	            logger.warning("failed to read %s: %s", docx_path, exc)
	            return []

	        raw_joined = "\n\n".join(p.text for p in document.paragraphs)
	        paragraphs = self._normalize_paragraphs(raw_joined)
	        return self._merge_blocks(paragraphs, docx_path.name)

	    def _normalize_paragraphs(self, raw_text: str) -> list[tuple[str, int]]:
	        """Split ``raw_text`` on blank lines and keep the non-noise paragraphs.

	        Returns:
	            ``(text, word_count)`` pairs in document order, where ``text``
	            is single-spaced and stripped, and ``word_count`` is at least
	            ``MIN_FILTER_WORDS``.
	        """
	        kept: list[tuple[str, int]] = []
	        for raw in raw_text.split("\n\n"):
	            text = _WHITESPACE_RE.sub(" ", raw).strip()
	            if not text:
	                continue
	            word_count = len(text.split())
	            if word_count < self.MIN_FILTER_WORDS:
	                continue
	            kept.append((text, word_count))
	        return kept

	    def _merge_blocks(
	        self, paragraphs: list[tuple[str, int]], source_file: str
	    ) -> list[ExtractedParagraph]:
	        """Greedily merge consecutive paragraphs into target-sized blocks.

	        Args:
	            paragraphs: ``(text, word_count)`` pairs in document order.
	            source_file: Value stored in each block's ``source_file``.

	        Returns:
	            The blocks whose accumulated word count landed inside
	            ``[MIN_WORDS, MAX_WORDS]``, indexed in flush order.
	        """
	        blocks: list[ExtractedParagraph] = []
	        pending_texts: list[str] = []
	        pending_words = 0

	        for text, word_count in paragraphs:
	            pending_texts.append(text)
	            pending_words += word_count

	            if self.MIN_WORDS <= pending_words <= self.MAX_WORDS:
	                blocks.append(
	                    ExtractedParagraph(
	                        source_file=source_file,
	                        paragraph_index=len(blocks),
	                        text="\n\n".join(pending_texts),
	                        word_count=pending_words,
	                    )
	                )
	            elif pending_words > self.MAX_WORDS:
	                # Overshoot: the whole accumulator is discarded, including
	                # the paragraph that was just appended to it.
	                logger.warning(
	                    "dropping over-long accumulator in %s: %d words > MAX_WORDS=%d",
	                    source_file,
	                    pending_words,
	                    self.MAX_WORDS,
	                )
	            else:
	                # Still below MIN_WORDS: keep accumulating.
	                continue

	            # Reached after either a flush or a drop: start a fresh block.
	            pending_texts = []
	            pending_words = 0

	        # A trailing accumulator below MIN_WORDS is intentionally discarded.
	        return blocks

	    def extract_all(self, docx_dir: Path) -> list[ExtractedParagraph]:
	        """Extract paragraph blocks from every ``.docx`` in ``docx_dir``.

	        Files are processed in lexicographic order for deterministic
	        dataset builds. Only the directory itself is scanned (no
	        recursion). Word owner/lock files (names starting with ``~$``)
	        are skipped because they are not readable documents. Per-file
	        errors (corrupt archives, unreadable content) are logged as
	        warnings and the iteration continues.

	        Args:
	            docx_dir: Directory to scan. A ``str`` is accepted as well
	                and is converted to :class:`Path`.

	        Returns:
	            Blocks from all files, concatenated in file order. Each
	            file's ``paragraph_index`` numbering starts again at 0.
	        """
	        aggregated: list[ExtractedParagraph] = []
	        candidates = sorted(
	            path
	            for path in Path(docx_dir).glob("*.docx")
	            if not path.name.startswith("~$")
	        )
	        for path in candidates:
	            try:
	                aggregated.extend(self.extract(path))
	            except Exception as exc:
	                # Best-effort batch run: one bad file must not abort the rest.
	                logger.warning("failed to read %s: %s", path, exc)
	        return aggregated