# Hub upload metadata (page-scrape residue, preserved as a comment so the
# file remains valid Python):
#   simon-clmtd — Upload fraktur_other_classifier_best.pth + config + pipeline
#   commit 1d2b2a9 (verified)
"""
Fraktur/Other text-line classifier — self-contained Hugging Face pipeline.
This file is designed to live on the Hugging Face Hub alongside the model
weights. It has no dependency on the training repository; everything needed
for inference is defined here.
Usage::
import importlib.util
from huggingface_hub import hf_hub_download
spec = importlib.util.spec_from_file_location(
"pipeline",
hf_hub_download("impresso-project/frakturline-classification-cnn", "pipeline.py"),
)
m = importlib.util.module_from_spec(spec)
spec.loader.exec_module(m)
pipe = m.FrakturPipeline.from_pretrained(
"impresso-project/frakturline-classification-cnn"
)
# From a local file path
result = pipe("path/to/line.png")
# → {"label": "fraktur", "score": 0.9731}
# From an https:// URL (fetched via urllib — no extra dependencies)
result = pipe("https://example.com/line.png")
# From a PIL image (e.g. already loaded in memory)
from PIL import Image
result = pipe(Image.open("line.png"))
# Batch
results = pipe(["line1.png", "line2.png"])
"""
import math
import urllib.request
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.transforms import functional as TF
from PIL import Image
from huggingface_hub import PyTorchModelHubMixin
# ---------------------------------------------------------------------------
# Device selection
# ---------------------------------------------------------------------------
# Prefer Apple-silicon MPS, then CUDA, then fall back to CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
class ResizePad:
    """Scale an image to a fixed height, then pad (right) or center-crop the width."""

    def __init__(self, target_height: int, target_width: int, fill: int = 0):
        self.target_height = target_height
        self.target_width = target_width
        self.fill = fill

    def __call__(self, image: Image.Image) -> Image.Image:
        src_w, src_h = image.size
        # Preserve aspect ratio while forcing the target height.
        scaled_w = int(src_w * self.target_height / src_h)
        image = image.resize(
            (scaled_w, self.target_height), resample=Image.Resampling.BILINEAR
        )
        if scaled_w >= self.target_width:
            # Too wide: take a horizontally centered crop.
            left = (scaled_w - self.target_width) // 2
            return image.crop((left, 0, left + self.target_width, self.target_height))
        # Too narrow: pad the right edge with the fill value.
        return TF.pad(
            image,
            padding=(0, 0, self.target_width - scaled_w, 0),
            fill=self.fill,
        )
# Deterministic preprocessing applied at inference time:
# grayscale -> fixed 60x800 canvas -> tensor normalized to [-1, 1].
_preprocess_steps = [
    transforms.Grayscale(num_output_channels=1),
    ResizePad(target_height=60, target_width=800),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
]
inference_transforms = transforms.Compose(_preprocess_steps)
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
class BinaryClassificationCNN(nn.Module, PyTorchModelHubMixin):
    """
    Compact three-block CNN scoring a text line as Fraktur vs. Other.

    Expects a normalized grayscale tensor of shape (N, 1, 60, 800) and emits
    one raw logit per sample: positive means Fraktur, negative means Other.

    Layout::
        block 1: Conv(1->32)  + ReLU + 2x2 MaxPool, LayerNorm over (32, 30, 400)
        block 2: Conv(32->64) + ReLU + 2x2 MaxPool, LayerNorm over (64, 15, 200), Dropout(0.15)
        block 3: Conv(64->128) + LayerNorm over (128, 15, 200) + ReLU
        head:    AdaptiveMaxPool2d((1, 8)) -> flatten (1024) -> FC(1024, 128) + ReLU -> FC(128, 1)
    """

    def __init__(self):
        super().__init__()
        # NOTE: attribute names are load-bearing — they are the checkpoint's
        # state_dict keys and must not be renamed.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.ln1 = nn.LayerNorm([32, 30, 400])
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.ln2 = nn.LayerNorm([64, 15, 200])
        self.dropout_conv = nn.Dropout(p=0.15, inplace=False)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.ln3 = nn.LayerNorm([128, 15, 200])
        # Collapse the spatial map (128, 15, 200) down to (128, 1, 8).
        self.global_pool = nn.AdaptiveMaxPool2d((1, 8))
        # Classifier head: 128 channels x 8 positions = 1024 features.
        self.fc1 = nn.Linear(128 * 8, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        feat = self.ln1(self.pool(F.relu(self.conv1(x))))
        feat = self.dropout_conv(self.ln2(self.pool(F.relu(self.conv2(feat)))))
        feat = self.global_pool(F.relu(self.ln3(self.conv3(feat))))
        flat = feat.view(feat.size(0), -1)
        return self.fc2(F.relu(self.fc1(flat)))
# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
class FrakturPipeline:
    """
    End-to-end pipeline for Fraktur/Other text-line classification.

    Accepts a local file path, an ``http(s)://`` URL, a PIL image, or a list
    of any of those. Each result is a dict with:

    - ``label`` (str): ``"fraktur"`` or ``"other"``.
    - ``score`` (float): sigmoid probability of the *predicted* class,
      rounded to 4 decimal places (always >= 0.5 by construction).

    Example::

        pipe = FrakturPipeline.from_pretrained(
            "impresso-project/frakturline-classification-cnn"
        )
        pipe("line.png")
        # -> {"label": "fraktur", "score": 0.9731}
    """

    # Upper bound (seconds) for remote image downloads. Without a timeout,
    # urlopen() can block forever on a stalled server.
    _URL_TIMEOUT_S: float = 30.0

    def __init__(
        self, model: BinaryClassificationCNN, device: torch.device = device
    ) -> None:
        self.model = model.to(device)
        self.model.eval()  # inference only: disables dropout
        self._device = device

    @classmethod
    def from_pretrained(cls, repo_id: str, **kwargs) -> "FrakturPipeline":
        """Load pipeline from a Hugging Face Hub repository.

        Args:
            repo_id: HF Hub repository identifier, e.g.
                ``"impresso-project/frakturline-classification-cnn"``.
            **kwargs: Forwarded to
                :meth:`BinaryClassificationCNN.from_pretrained`.

        Returns:
            FrakturPipeline: Ready-to-use pipeline.
        """
        model = BinaryClassificationCNN.from_pretrained(repo_id, **kwargs)
        return cls(model)

    def __call__(
        self,
        image: "str | Image.Image | list[str | Image.Image]",
        **kwargs,
    ) -> "dict[str, str | float] | list[dict[str, str | float]]":
        """Classify one or more text-line images.

        Args:
            image: A local file path, an ``https://`` / ``http://`` URL
                (``str``), a :class:`PIL.Image.Image`, or a ``list`` of either.
                URLs are fetched with :mod:`urllib.request` (no extra
                dependencies; fetches are bounded by ``_URL_TIMEOUT_S``).

        Returns:
            dict or list[dict]: See the class docstring for the result schema.
        """
        if isinstance(image, list):
            return [self._classify_one(img) for img in image]
        return self._classify_one(image)

    def _load_grayscale(self, image: "str | Image.Image") -> Image.Image:
        """Resolve *image* (path / URL / PIL image) to an in-memory 'L' image."""
        if isinstance(image, str):
            if image.startswith(("http://", "https://")):
                # Bounded fetch: never hang indefinitely on a dead server.
                with urllib.request.urlopen(  # noqa: S310
                    image, timeout=self._URL_TIMEOUT_S
                ) as resp:
                    return Image.open(resp).convert("L")
            # .convert() copies the pixel data, so the file handle may close.
            with Image.open(image) as img:
                return img.convert("L")
        return image.convert("L")

    def _classify_one(self, image: "str | Image.Image") -> "dict[str, str | float]":
        """Run the model on a single image and post-process the single logit."""
        gray = self._load_grayscale(image)
        tensor = inference_transforms(gray).unsqueeze(0).to(self._device)
        with torch.no_grad():
            logit = self.model(tensor).item()
        # Positive logit -> "fraktur". Report the probability of the
        # predicted class, so scores are always >= 0.5.
        label = "fraktur" if logit > 0 else "other"
        prob = 1.0 / (1.0 + math.exp(-logit))  # sigmoid
        score = prob if label == "fraktur" else 1.0 - prob
        return {"label": label, "score": round(score, 4)}