# Hub upload metadata (page-scrape residue, preserved as a comment so the
# file remains valid Python):
#   simon-clmtd — Upload fraktur_other_classifier_best.pth + config + pipeline
#   commit 1d2b2a9 (verified)
"""
Fraktur/Other text-line classifier — self-contained Hugging Face pipeline.
This file is designed to live on the Hugging Face Hub alongside the model
weights. It has no dependency on the training repository; everything needed
for inference is defined here.
Usage::
import importlib.util
from huggingface_hub import hf_hub_download
spec = importlib.util.spec_from_file_location(
"pipeline",
hf_hub_download("impresso-project/frakturline-classification-cnn", "pipeline.py"),
)
m = importlib.util.module_from_spec(spec)
spec.loader.exec_module(m)
pipe = m.FrakturPipeline.from_pretrained(
"impresso-project/frakturline-classification-cnn"
)
# From a local file path
result = pipe("path/to/line.png")
# → {"label": "fraktur", "score": 0.9731}
# From an https:// URL (fetched via urllib — no extra dependencies)
result = pipe("https://example.com/line.png")
# From a PIL image (e.g. already loaded in memory)
from PIL import Image
result = pipe(Image.open("line.png"))
# Batch
results = pipe(["line1.png", "line2.png"])
"""
import math
import urllib.request
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.transforms import functional as TF
from PIL import Image
from huggingface_hub import PyTorchModelHubMixin
# ---------------------------------------------------------------------------
# Device selection
# ---------------------------------------------------------------------------
# Prefer Apple-silicon MPS, then CUDA, then fall back to CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
class ResizePad:
    """Scale an image to a fixed height, then pad (right) or center-crop the width."""

    def __init__(self, target_height: int, target_width: int, fill: int = 0):
        self.target_height = target_height
        self.target_width = target_width
        self.fill = fill

    def __call__(self, image: Image.Image) -> Image.Image:
        src_w, src_h = image.size
        # Preserve aspect ratio while forcing the target height.
        scaled_w = int(src_w * self.target_height / src_h)
        image = image.resize(
            (scaled_w, self.target_height), resample=Image.Resampling.BILINEAR
        )
        if scaled_w >= self.target_width:
            # Too wide: take a horizontally centered crop.
            left = (scaled_w - self.target_width) // 2
            return image.crop((left, 0, left + self.target_width, self.target_height))
        # Too narrow: pad the right edge with the fill value.
        return TF.pad(
            image,
            padding=(0, 0, self.target_width - scaled_w, 0),
            fill=self.fill,
        )
# Deterministic preprocessing applied at inference time:
# grayscale -> fixed 60x800 canvas -> tensor normalized to [-1, 1].
_preprocess_steps = [
    transforms.Grayscale(num_output_channels=1),
    ResizePad(target_height=60, target_width=800),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
]
inference_transforms = transforms.Compose(_preprocess_steps)
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
class BinaryClassificationCNN(nn.Module, PyTorchModelHubMixin):
    """
    Compact three-block CNN scoring a text line as Fraktur vs. Other.

    Expects a normalized grayscale tensor of shape (N, 1, 60, 800) and emits
    one raw logit per sample: positive means Fraktur, negative means Other.

    Layout::
        block 1: Conv(1->32)  + ReLU + 2x2 MaxPool, LayerNorm over (32, 30, 400)
        block 2: Conv(32->64) + ReLU + 2x2 MaxPool, LayerNorm over (64, 15, 200), Dropout(0.15)
        block 3: Conv(64->128) + LayerNorm over (128, 15, 200) + ReLU
        head:    AdaptiveMaxPool2d((1, 8)) -> flatten (1024) -> FC(1024, 128) + ReLU -> FC(128, 1)
    """

    def __init__(self):
        super().__init__()
        # NOTE: attribute names are load-bearing — they are the checkpoint's
        # state_dict keys and must not be renamed.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.ln1 = nn.LayerNorm([32, 30, 400])
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.ln2 = nn.LayerNorm([64, 15, 200])
        self.dropout_conv = nn.Dropout(p=0.15, inplace=False)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.ln3 = nn.LayerNorm([128, 15, 200])
        # Collapse the spatial map (128, 15, 200) down to (128, 1, 8).
        self.global_pool = nn.AdaptiveMaxPool2d((1, 8))
        # Classifier head: 128 channels x 8 positions = 1024 features.
        self.fc1 = nn.Linear(128 * 8, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        feat = self.ln1(self.pool(F.relu(self.conv1(x))))
        feat = self.dropout_conv(self.ln2(self.pool(F.relu(self.conv2(feat)))))
        feat = self.global_pool(F.relu(self.ln3(self.conv3(feat))))
        flat = feat.view(feat.size(0), -1)
        return self.fc2(F.relu(self.fc1(flat)))
# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
class FrakturPipeline:
    """
    End-to-end pipeline for Fraktur/Other text-line classification.

    Accepts a local file path, an ``http(s)://`` URL, a PIL image, or a list
    of any of those. Each result is a dict with:

    - ``label`` (str): ``"fraktur"`` or ``"other"``.
    - ``score`` (float): sigmoid probability of the *predicted* class,
      rounded to 4 decimal places (always >= 0.5 by construction).

    Example::

        pipe = FrakturPipeline.from_pretrained(
            "impresso-project/frakturline-classification-cnn"
        )
        pipe("line.png")
        # -> {"label": "fraktur", "score": 0.9731}
    """

    # Upper bound (seconds) for remote image downloads. Without a timeout,
    # urlopen() can block forever on a stalled server.
    _URL_TIMEOUT_S: float = 30.0

    def __init__(
        self, model: BinaryClassificationCNN, device: torch.device = device
    ) -> None:
        self.model = model.to(device)
        self.model.eval()  # inference only: disables dropout
        self._device = device

    @classmethod
    def from_pretrained(cls, repo_id: str, **kwargs) -> "FrakturPipeline":
        """Load pipeline from a Hugging Face Hub repository.

        Args:
            repo_id: HF Hub repository identifier, e.g.
                ``"impresso-project/frakturline-classification-cnn"``.
            **kwargs: Forwarded to
                :meth:`BinaryClassificationCNN.from_pretrained`.

        Returns:
            FrakturPipeline: Ready-to-use pipeline.
        """
        model = BinaryClassificationCNN.from_pretrained(repo_id, **kwargs)
        return cls(model)

    def __call__(
        self,
        image: "str | Image.Image | list[str | Image.Image]",
        **kwargs,
    ) -> "dict[str, str | float] | list[dict[str, str | float]]":
        """Classify one or more text-line images.

        Args:
            image: A local file path, an ``https://`` / ``http://`` URL
                (``str``), a :class:`PIL.Image.Image`, or a ``list`` of either.
                URLs are fetched with :mod:`urllib.request` (no extra
                dependencies; fetches are bounded by ``_URL_TIMEOUT_S``).

        Returns:
            dict or list[dict]: See the class docstring for the result schema.
        """
        if isinstance(image, list):
            return [self._classify_one(img) for img in image]
        return self._classify_one(image)

    def _load_grayscale(self, image: "str | Image.Image") -> Image.Image:
        """Resolve *image* (path / URL / PIL image) to an in-memory 'L' image."""
        if isinstance(image, str):
            if image.startswith(("http://", "https://")):
                # Bounded fetch: never hang indefinitely on a dead server.
                with urllib.request.urlopen(  # noqa: S310
                    image, timeout=self._URL_TIMEOUT_S
                ) as resp:
                    return Image.open(resp).convert("L")
            # .convert() copies the pixel data, so the file handle may close.
            with Image.open(image) as img:
                return img.convert("L")
        return image.convert("L")

    def _classify_one(self, image: "str | Image.Image") -> "dict[str, str | float]":
        """Run the model on a single image and post-process the single logit."""
        gray = self._load_grayscale(image)
        tensor = inference_transforms(gray).unsqueeze(0).to(self._device)
        with torch.no_grad():
            logit = self.model(tensor).item()
        # Positive logit -> "fraktur". Report the probability of the
        # predicted class, so scores are always >= 0.5.
        label = "fraktur" if logit > 0 else "other"
        prob = 1.0 / (1.0 + math.exp(-logit))  # sigmoid
        score = prob if label == "fraktur" else 1.0 - prob
        return {"label": label, "score": round(score, 4)}