Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Claude

feat: corpus triplet, post-correction LLM et modèles dynamiques avec capacités

63e236b unverified 13 days ago

6.71 kB

	"""Chargement et gestion des corpus de documents.

	Formats supportés :
	- Paires classiques : image + .gt.txt
	- Triplets post-correction : image + .gt.txt + .ocr.txt

	Convention :
	mon_document.jpg ←→ mon_document.gt.txt (paire)
	mon_document.jpg ←→ mon_document.gt.txt + mon_document.ocr.txt (triplet)

	Le fichier ``.ocr.txt`` contient le texte OCR bruité (sortie d'un moteur OCR)
	qui sera utilisé comme entrée pour les benchmarks de post-correction LLM.
	Il est optionnel — un corpus sans ``.ocr.txt`` reste un corpus classique.

	Extensions d'images acceptées : .jpg, .jpeg, .png, .tif, .tiff, .bmp, .webp
	"""

	from __future__ import annotations

	import logging
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Iterator, Optional

	logger = logging.getLogger(__name__)

	# Extensions image reconnues
	IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}


	@dataclass
	class Document:
	    """One corpus entry: an image, its ground truth and, optionally, noisy OCR text.

	    When ``ocr_text`` is set (triplet corpus), the LLM post-correction
	    benchmark can use that text instead of running an OCR engine.
	    """

	    image_path: Path
	    ground_truth: str
	    doc_id: str = ""
	    # Pre-computed noisy OCR text; ``None`` for classic corpora that ship
	    # no ``.ocr.txt`` file.
	    ocr_text: Optional[str] = None
	    metadata: dict = field(default_factory=dict)

	    def __post_init__(self) -> None:
	        """Fall back to the image file stem when no identifier was given."""
	        # ``or`` short-circuits, so ``image_path`` is only consulted when
	        # ``doc_id`` is empty.
	        self.doc_id = self.doc_id or self.image_path.stem


	@dataclass
	class Corpus:
	    """Named collection of documents together with free-form metadata."""

	    name: str
	    documents: list[Document]
	    source_path: Optional[str] = None
	    metadata: dict = field(default_factory=dict)

	    def __len__(self) -> int:
	        """Return the number of documents held by the corpus."""
	        return len(self.documents)

	    def __iter__(self) -> Iterator[Document]:
	        """Iterate over the documents in their stored order."""
	        return iter(self.documents)

	    def __repr__(self) -> str:
	        """Return a short summary showing the name and the document count."""
	        return f"Corpus(name={self.name!r}, documents={len(self.documents)})"

	    @property
	    def has_ocr_text(self) -> bool:
	        """True when at least one document carries pre-computed OCR text."""
	        for entry in self.documents:
	            if entry.ocr_text is not None:
	                return True
	        return False

	    @property
	    def ocr_text_count(self) -> int:
	        """Number of documents that carry pre-computed OCR text."""
	        return len([entry for entry in self.documents if entry.ocr_text is not None])

	    @property
	    def stats(self) -> dict:
	        """Summary statistics over the ground-truth lengths.

	        An empty corpus yields only ``{"document_count": 0}``; otherwise the
	        dict also holds the mean, median, min and max ground-truth length
	        and the OCR-text indicators.
	        """
	        lengths = [len(entry.ground_truth) for entry in self.documents]
	        if len(lengths) == 0:
	            return {"document_count": 0}

	        import statistics

	        mean_length = round(statistics.mean(lengths), 1)
	        median_length = round(statistics.median(lengths), 1)
	        return {
	            "document_count": len(lengths),
	            "gt_length_mean": mean_length,
	            "gt_length_median": median_length,
	            "gt_length_min": min(lengths),
	            "gt_length_max": max(lengths),
	            "has_ocr_text": self.has_ocr_text,
	            "ocr_text_count": self.ocr_text_count,
	        }


	def load_corpus_from_directory(
	    directory: str | Path,
	    name: Optional[str] = None,
	    gt_suffix: str = ".gt.txt",
	    ocr_suffix: str = ".ocr.txt",
	    encoding: str = "utf-8",
	) -> Corpus:
	    """Load a corpus from a local directory.

	    Two layouts are supported:
	    - Pairs: ``image + .gt.txt``
	    - Triplets: ``image + .gt.txt + .ocr.txt`` (LLM post-correction)

	    The ``.ocr.txt`` file is optional. When present, ``Document.ocr_text``
	    is filled in so the benchmark can use it as noisy OCR input for LLM
	    post-correction without re-running an OCR engine.

	    Images without a ground-truth file are skipped. Text files that cannot
	    be read or decoded are reported with a warning: an unreadable
	    ground-truth file drops the document, an unreadable OCR file only
	    leaves ``ocr_text`` at ``None``.

	    Parameters
	    ----------
	    directory:
	        Path of the directory holding the pairs/triplets.
	    name:
	        Corpus name (defaults to the directory name).
	    gt_suffix:
	        Suffix of the ground-truth files (default: ``.gt.txt``).
	    ocr_suffix:
	        Suffix of the noisy OCR files (default: ``.ocr.txt``).
	    encoding:
	        Encoding of the text files (default: utf-8).

	    Returns
	    -------
	    Corpus

	    Raises
	    ------
	    FileNotFoundError
	        If the directory does not exist.
	    ValueError
	        If no valid document is found.
	    """
	    directory = Path(directory)
	    if not directory.is_dir():
	        raise FileNotFoundError(f"Dossier introuvable : {directory}")

	    corpus_name = name or directory.name
	    documents: list[Document] = []
	    missing_gt = 0
	    ocr_text_loaded = 0

	    # Collect regular image files only. Hidden entries (macOS ``._*``
	    # resource forks and other dot-files) are excluded, as are directories
	    # that merely carry an image extension.
	    image_paths = sorted(
	        p
	        for p in directory.iterdir()
	        if p.suffix.lower() in IMAGE_EXTENSIONS
	        and not p.name.startswith(".")
	        and p.is_file()
	    )

	    for image_path in image_paths:
	        gt_path = image_path.with_name(image_path.stem + gt_suffix)
	        if not gt_path.exists():
	            logger.debug("Pas de fichier GT pour %s — ignoré.", image_path.name)
	            missing_gt += 1
	            continue

	        # UnicodeError covers files that exist but are not valid in the
	        # requested encoding; one bad file must not abort the whole load.
	        try:
	            ground_truth = gt_path.read_text(encoding=encoding).strip()
	        except (OSError, UnicodeError) as exc:
	            # Not counted in ``missing_gt``: the file exists and this
	            # warning already reports the problem.
	            logger.warning("Impossible de lire %s : %s — ignoré.", gt_path, exc)
	            continue

	        # Optional noisy OCR text (.ocr.txt); a failure here keeps the document.
	        ocr_text: Optional[str] = None
	        ocr_path = image_path.with_name(image_path.stem + ocr_suffix)
	        if ocr_path.exists():
	            try:
	                ocr_text = ocr_path.read_text(encoding=encoding).strip()
	            except (OSError, UnicodeError) as exc:
	                logger.warning(
	                    "Impossible de lire %s : %s — OCR bruité ignoré.", ocr_path, exc
	                )
	            else:
	                ocr_text_loaded += 1

	        documents.append(
	            Document(
	                image_path=image_path,
	                ground_truth=ground_truth,
	                ocr_text=ocr_text,
	            )
	        )

	    if not documents:
	        raise ValueError(
	            f"Aucun document valide trouvé dans {directory}. "
	            f"Vérifiez que les fichiers GT portent le suffixe '{gt_suffix}'."
	        )

	    if missing_gt:
	        logger.info("%d image(s) ignorée(s) faute de fichier GT.", missing_gt)

	    if ocr_text_loaded:
	        logger.info(
	            "Corpus '%s' chargé : %d documents (%d avec OCR bruité — post-correction disponible).",
	            corpus_name, len(documents), ocr_text_loaded,
	        )
	    else:
	        logger.info("Corpus '%s' chargé : %d documents.", corpus_name, len(documents))

	    return Corpus(
	        name=corpus_name,
	        documents=documents,
	        source_path=str(directory),
	    )