| """ |
| Fraktur/Other text-line classifier β self-contained Hugging Face pipeline. |
| |
| This file is designed to live on the Hugging Face Hub alongside the model |
| weights. It has no dependency on the training repository; everything needed |
| for inference is defined here. |
| |
| Usage:: |
| |
| import importlib.util |
| from huggingface_hub import hf_hub_download |
| |
| spec = importlib.util.spec_from_file_location( |
| "pipeline", |
| hf_hub_download("impresso-project/frakturline-classification-cnn", "pipeline.py"), |
| ) |
| m = importlib.util.module_from_spec(spec) |
| spec.loader.exec_module(m) |
| |
| pipe = m.FrakturPipeline.from_pretrained( |
| "impresso-project/frakturline-classification-cnn" |
| ) |
| |
| # From a local file path |
| result = pipe("path/to/line.png") |
| # β {"label": "fraktur", "score": 0.9731} |
| |
| # From an https:// URL (fetched via urllib β no extra dependencies) |
| result = pipe("https://example.com/line.png") |
| |
| # From a PIL image (e.g. already loaded in memory) |
| from PIL import Image |
| result = pipe(Image.open("line.png")) |
| |
| # Batch |
| results = pipe(["line1.png", "line2.png"]) |
| """ |
|
|
import io
import math
import urllib.request

import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from PIL import Image
from torchvision import transforms
from torchvision.transforms import functional as TF
|
|
| |
| |
| |
# Select the best available accelerator once at import time (preferring
# Apple MPS, then CUDA, then CPU); used as the default device for
# FrakturPipeline.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
|
|
| |
| |
| |
|
|
|
|
class ResizePad:
    """Scale an image to a fixed height (preserving aspect ratio), then
    right-pad narrow results or center-crop wide ones to a fixed width."""

    def __init__(self, target_height: int, target_width: int, fill: int = 0):
        # fill is the pad value used when the scaled image is narrower
        # than target_width (0 = black for grayscale input).
        self.target_height = target_height
        self.target_width = target_width
        self.fill = fill

    def __call__(self, image: Image.Image) -> Image.Image:
        src_width, src_height = image.size
        # Width that keeps the aspect ratio once height is fixed.
        scaled_width = int(src_width * self.target_height / src_height)
        resized = image.resize(
            (scaled_width, self.target_height), resample=Image.Resampling.BILINEAR
        )
        if scaled_width >= self.target_width:
            # Too wide (or exact): take a centered window of target_width.
            left = (scaled_width - self.target_width) // 2
            return resized.crop(
                (left, 0, left + self.target_width, self.target_height)
            )
        # Too narrow: pad the right edge out to target_width.
        return TF.pad(
            resized,
            padding=(0, 0, self.target_width - scaled_width, 0),
            fill=self.fill,
        )
|
|
|
|
# Preprocessing applied to every input line image before the model:
# grayscale → fixed 60×800 canvas (ResizePad) → float tensor in [0, 1]
# → normalized to [-1, 1] via mean/std 0.5 (single channel).
inference_transforms = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=1),
        ResizePad(target_height=60, target_width=800),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)
|
|
| |
| |
| |
|
|
|
|
class BinaryClassificationCNN(nn.Module, PyTorchModelHubMixin):
    """
    Three-layer CNN that classifies a text line as Fraktur vs. Other.

    Input: grayscale image tensor of shape (1, 60, 800).
    Output: one logit per sample — positive → Fraktur, negative → Other.

    Architecture::

        Conv1(1→32) + ReLU + MaxPool → LayerNorm[32,30,400]
        Conv2(32→64) + ReLU + MaxPool → LayerNorm[64,15,200] + Dropout(0.15)
        Conv3(64→128) + LayerNorm[128,15,200] + ReLU + AdaptiveMaxPool2d((1,8))
        Flatten → FC(1024→128) + ReLU → FC(128→1)
    """

    def __init__(self):
        super().__init__()
        # NOTE: attribute names are the keys of the published pretrained
        # state dict — they must not be renamed.
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.ln1 = nn.LayerNorm([32, 30, 400])
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.ln2 = nn.LayerNorm([64, 15, 200])
        self.dropout_conv = nn.Dropout(p=0.15, inplace=False)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.ln3 = nn.LayerNorm([128, 15, 200])
        self.global_pool = nn.AdaptiveMaxPool2d((1, 8))
        self.fc1 = nn.Linear(128 * 8, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return a raw (pre-sigmoid) logit for each sample in the batch."""
        # Stage 1: conv → ReLU → 2×2 max-pool → layer norm.
        feat = self.ln1(self.pool(F.relu(self.conv1(x))))
        # Stage 2: same pattern, then dropout for regularization.
        feat = self.dropout_conv(self.ln2(self.pool(F.relu(self.conv2(feat)))))
        # Stage 3: norm before the nonlinearity, then collapse the spatial
        # grid to a fixed 1×8 footprint via adaptive max-pooling.
        feat = self.global_pool(F.relu(self.ln3(self.conv3(feat))))
        # Classifier head over the flattened 128 * 8 = 1024 features.
        hidden = F.relu(self.fc1(feat.flatten(start_dim=1)))
        return self.fc2(hidden)
|
|
|
|
| |
| |
| |
|
|
|
|
class FrakturPipeline:
    """
    End-to-end pipeline for Fraktur/Other classification.

    Accepts file paths, ``http(s)`` URLs, PIL Images, or lists thereof.
    Returns a dict (or list of dicts) with keys ``label`` (``"fraktur"`` or
    ``"other"``) and ``score`` (sigmoid probability of the predicted class,
    in [0, 1]).

    Example::

        pipe = FrakturPipeline.from_pretrained(
            "impresso-project/frakturline-classification-cnn"
        )
        pipe("line.png")
        # → {"label": "fraktur", "score": 0.9731}
    """

    def __init__(
        self, model: BinaryClassificationCNN, device: torch.device = device
    ) -> None:
        """Move *model* to *device* and put it in eval mode for inference."""
        self.model = model.to(device)
        self.model.eval()
        self._device = device

    @classmethod
    def from_pretrained(cls, repo_id: str, **kwargs) -> "FrakturPipeline":
        """Load pipeline from a Hugging Face Hub repository.

        Args:
            repo_id: HF Hub repository identifier, e.g.
                ``"impresso-project/frakturline-classification-cnn"``.
            **kwargs: Forwarded to
                :meth:`BinaryClassificationCNN.from_pretrained`.

        Returns:
            FrakturPipeline: Ready-to-use pipeline.
        """
        model = BinaryClassificationCNN.from_pretrained(repo_id, **kwargs)
        return cls(model)

    def __call__(
        self,
        image: "str | Image.Image | list[str | Image.Image]",
        **kwargs,
    ) -> "dict[str, str | float] | list[dict[str, str | float]]":
        """Classify one or more text-line images.

        Args:
            image: A local file path, an ``https://`` / ``http://`` URL
                (``str``), a :class:`PIL.Image.Image`, or a ``list`` of either.
                URLs are fetched with :mod:`urllib.request` — no extra
                dependencies required.

        Returns:
            dict or list[dict]: Each dict has keys:

            - ``label`` (``str``): ``"fraktur"`` or ``"other"``.
            - ``score`` (``float``): Sigmoid probability of the predicted
              class, rounded to 4 decimal places.
        """
        if isinstance(image, list):
            return [self._classify_one(img) for img in image]
        return self._classify_one(image)

    @staticmethod
    def _open_grayscale(image: "str | Image.Image") -> Image.Image:
        """Resolve *image* (path, URL, or PIL image) to a grayscale PIL image."""
        if isinstance(image, str):
            if image.startswith(("http://", "https://")):
                # Buffer the whole response before decoding: PIL requires a
                # seekable file object (read/seek/tell), and the raw urlopen
                # response does not support seek(), which breaks decoding of
                # some image formats.
                with urllib.request.urlopen(image) as resp:
                    return Image.open(io.BytesIO(resp.read())).convert("L")
            with Image.open(image) as img:
                return img.convert("L")
        return image.convert("L")

    def _classify_one(self, image: "str | Image.Image") -> "dict[str, str | float]":
        """Run the model on a single image and post-process the logit."""
        pil_image = self._open_grayscale(image)
        tensor = inference_transforms(pil_image).unsqueeze(0).to(self._device)
        with torch.no_grad():
            logit = self.model(tensor).item()
        # Positive logit → Fraktur. Report the probability of the
        # *predicted* class, so the score is always >= 0.5.
        label = "fraktur" if logit > 0 else "other"
        prob = 1.0 / (1.0 + math.exp(-logit))
        score = prob if label == "fraktur" else 1.0 - prob
        return {"label": label, "score": round(score, 4)}
|
|