Upload folder using huggingface_hub
Browse files- README.md +98 -0
- config.json +22 -0
- doclayout_yolo/__init__.py +93 -0
- doclayout_yolo/__pycache__/__init__.cpython-312.pyc +0 -0
- doclayout_yolo/__pycache__/__init__.cpython-313.pyc +0 -0
- doclayout_yolo/__pycache__/g2l_crm.cpython-312.pyc +0 -0
- doclayout_yolo/__pycache__/model.cpython-312.pyc +0 -0
- doclayout_yolo/g2l_crm.py +118 -0
- doclayout_yolo/model.py +154 -0
- model.onnx +3 -0
- model.pt +3 -0
README.md
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- document-layout
|
| 5 |
+
- object-detection
|
| 6 |
+
- yolo
|
| 7 |
+
- document-analysis
|
| 8 |
+
library_name: ultralytics
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# DocLayout-YOLO - Docstructbench
|
| 12 |
+
|
| 13 |
+
Document layout detection model based on [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO).
|
| 14 |
+
|
| 15 |
+
## Model Description
|
| 16 |
+
|
| 17 |
+
- **Architecture**: YOLOv10m with G2L_CRM (Global-to-Local Context Refining Module)
|
| 18 |
+
- **Classes**: 10 document layout elements
|
| 19 |
+
- **Input Size**: 1024x1024
|
| 20 |
+
- **Paper**: [DocLayout-YOLO](https://arxiv.org/abs/2410.12628)
|
| 21 |
+
|
| 22 |
+
### Classes
|
| 23 |
+
|
| 24 |
+
- `title`
|
| 25 |
+
- `plain_text`
|
| 26 |
+
- `abandon`
|
| 27 |
+
- `figure`
|
| 28 |
+
- `figure_caption`
|
| 29 |
+
- `table`
|
| 30 |
+
- `table_caption`
|
| 31 |
+
- `table_footnote`
|
| 32 |
+
- `isolate_formula`
|
| 33 |
+
- `formula_caption`
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
### PyTorch
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
from huggingface_hub import snapshot_download
|
| 41 |
+
import sys
|
| 42 |
+
|
| 43 |
+
# Download model (includes code + weights)
|
| 44 |
+
repo_path = snapshot_download("anyformat-ai/doclayout-yolo-docstructbench")
|
| 45 |
+
|
| 46 |
+
# Import and use
|
| 47 |
+
sys.path.insert(0, repo_path)
|
| 48 |
+
from doclayout_yolo import DocLayoutModel
|
| 49 |
+
|
| 50 |
+
model = DocLayoutModel(f"{repo_path}/model.pt")
|
| 51 |
+
results = model.predict("document.png")
|
| 52 |
+
|
| 53 |
+
for det in results:
|
| 54 |
+
print(f"{det['class_name']}: {det['confidence']:.2f} at {det['bbox']}")
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
### ONNX
|
| 58 |
+
|
| 59 |
+
```python
|
| 60 |
+
import onnxruntime as ort
|
| 61 |
+
import numpy as np
|
| 62 |
+
from huggingface_hub import hf_hub_download
|
| 63 |
+
import json
|
| 64 |
+
|
| 65 |
+
# Download ONNX model and config
|
| 66 |
+
model_path = hf_hub_download("anyformat-ai/doclayout-yolo-docstructbench", "model.onnx")
|
| 67 |
+
config_path = hf_hub_download("anyformat-ai/doclayout-yolo-docstructbench", "config.json")
|
| 68 |
+
|
| 69 |
+
with open(config_path) as f:
|
| 70 |
+
config = json.load(f)
|
| 71 |
+
|
| 72 |
+
session = ort.InferenceSession(model_path)
|
| 73 |
+
# Preprocess image to (1, 3, 1024, 1024) float32, normalized to [0, 1]
|
| 74 |
+
# Run inference and post-process outputs
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Requirements
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
ultralytics
|
| 81 |
+
huggingface-hub
|
| 82 |
+
onnxruntime # for ONNX inference
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
## Citation
|
| 86 |
+
|
| 87 |
+
```bibtex
|
| 88 |
+
@article{zhao2024doclayout,
|
| 89 |
+
title={DocLayout-YOLO: Enhancing Document Layout Analysis through Diverse Synthetic Data and Global-to-Local Adaptive Perception},
|
| 90 |
+
author={Zhao, Zhiyuan and Kang, Hengrui and Wang, Bin and He, Conghui},
|
| 91 |
+
journal={arXiv preprint arXiv:2410.12628},
|
| 92 |
+
year={2024}
|
| 93 |
+
}
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
## License
|
| 97 |
+
|
| 98 |
+
Apache 2.0
|
config.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "doclayout-yolo",
|
| 3 |
+
"model_name": "docstructbench",
|
| 4 |
+
"architecture": "yolov10m-g2l-crm",
|
| 5 |
+
"num_classes": 10,
|
| 6 |
+
"class_names": [
|
| 7 |
+
"title",
|
| 8 |
+
"plain_text",
|
| 9 |
+
"abandon",
|
| 10 |
+
"figure",
|
| 11 |
+
"figure_caption",
|
| 12 |
+
"table",
|
| 13 |
+
"table_caption",
|
| 14 |
+
"table_footnote",
|
| 15 |
+
"isolate_formula",
|
| 16 |
+
"formula_caption"
|
| 17 |
+
],
|
| 18 |
+
"input_size": 1024,
|
| 19 |
+
"description": "Document layout detection for financial documents",
|
| 20 |
+
"source": "https://github.com/opendatalab/DocLayout-YOLO",
|
| 21 |
+
"paper": "https://arxiv.org/abs/2410.12628"
|
| 22 |
+
}
|
doclayout_yolo/__init__.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DocLayout-YOLO inference module.
|
| 3 |
+
|
| 4 |
+
This module patches ultralytics to support G2L_CRM custom layers,
|
| 5 |
+
enabling loading of DocLayout-YOLO model weights.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
from doclayout_yolo import DocLayoutModel
|
| 9 |
+
|
| 10 |
+
model = DocLayoutModel("model.pt")
|
| 11 |
+
results = model.predict("document.png")
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
|
| 16 |
+
# Import our custom modules first
|
| 17 |
+
from .g2l_crm import G2L_CRM, DilatedBlock, DilatedBottleneck
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _patch_ultralytics():
    """
    Patch ultralytics to recognize the G2L_CRM custom module.

    Must be called before loading any model that uses G2L_CRM. It:

    1. injects ``G2L_CRM``/``DilatedBlock``/``DilatedBottleneck`` into
       ``ultralytics.nn.modules`` and ``ultralytics.nn.tasks``,
    2. registers a fake ``ultralytics.nn.modules.g2l_crm`` module so PyTorch
       can unpickle weight files that reference that import path,
    3. wraps ``tasks.parse_model`` so YAML configs naming G2L_CRM resolve.

    Raises
    ------
    ImportError
        If ultralytics is not installed.
    """
    from types import ModuleType

    try:
        import ultralytics.nn.modules as modules
        import ultralytics.nn.tasks as tasks
    except ImportError as e:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "ultralytics is required. Install with: pip install ultralytics"
        ) from e

    # Already patched, or a fork that ships G2L_CRM natively -- nothing to do.
    if hasattr(modules, "G2L_CRM"):
        return

    # Inject the custom layers into ultralytics.nn.modules.
    modules.G2L_CRM = G2L_CRM
    modules.DilatedBlock = DilatedBlock
    modules.DilatedBottleneck = DilatedBottleneck

    # Create a fake ultralytics.nn.modules.g2l_crm module for PyTorch
    # unpickling -- the weights file references this path, so it must exist.
    fake_module = ModuleType("ultralytics.nn.modules.g2l_crm")
    fake_module.G2L_CRM = G2L_CRM
    fake_module.DilatedBlock = DilatedBlock
    fake_module.DilatedBottleneck = DilatedBottleneck
    sys.modules["ultralytics.nn.modules.g2l_crm"] = fake_module

    # parse_model resolves layer names through its globals(), so inject there too.
    tasks.G2L_CRM = G2L_CRM

    # Monkey-patch parse_model so G2L_CRM resolves during model construction.
    _patch_parse_model(tasks)


def _patch_parse_model(tasks):
    """Wrap ``tasks.parse_model`` so G2L_CRM is resolvable in its globals."""
    import functools

    original_parse_model = tasks.parse_model
    # Record once whether the name pre-existed, instead of copying the whole
    # globals dict on every call like a naive snapshot/restore would.
    had_g2l_crm = "G2L_CRM" in original_parse_model.__globals__

    @functools.wraps(original_parse_model)
    def patched_parse_model(d, ch, verbose=True):
        # Temporarily inject G2L_CRM into the function's globals. This is a
        # hack, but necessary because parse_model looks modules up via
        # globals()[m].
        original_parse_model.__globals__["G2L_CRM"] = G2L_CRM
        try:
            return original_parse_model(d, ch, verbose)
        finally:
            # Restore the original state only if we introduced the name.
            if not had_g2l_crm:
                original_parse_model.__globals__.pop("G2L_CRM", None)

    tasks.parse_model = patched_parse_model


# Apply patch on import
_patch_ultralytics()
|
| 88 |
+
|
| 89 |
+
# Public API
|
| 90 |
+
from .model import DocLayoutModel
|
| 91 |
+
|
| 92 |
+
__all__ = ["DocLayoutModel", "G2L_CRM"]
|
| 93 |
+
__version__ = "0.1.0"
|
doclayout_yolo/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (2.94 kB). View file
|
|
|
doclayout_yolo/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (1.21 kB). View file
|
|
|
doclayout_yolo/__pycache__/g2l_crm.cpython-312.pyc
ADDED
|
Binary file (8.73 kB). View file
|
|
|
doclayout_yolo/__pycache__/model.cpython-312.pyc
ADDED
|
Binary file (5.65 kB). View file
|
|
|
doclayout_yolo/g2l_crm.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
G2L_CRM (Global-to-Local Context Refining Module) for document layout analysis.
|
| 3 |
+
|
| 4 |
+
Based on DocLayout-YOLO: https://github.com/opendatalab/DocLayout-YOLO
|
| 5 |
+
Paper: https://arxiv.org/abs/2410.12628
|
| 6 |
+
Original Authors: Zhiyuan Zhao, Hengrui Kang, Bin Wang, Conghui He
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
from torch import nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
|
| 13 |
+
from ultralytics.nn.modules.conv import Conv
|
| 14 |
+
from ultralytics.nn.modules.block import CIB
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DilatedBlock(nn.Module):
    """
    Dilated convolution block with multi-scale fusion.

    Reapplies the weights of one shared conv (``self.dcv``) at several
    dilation rates in parallel, projects each branch with a shared 1x1 conv,
    then fuses the branches either by elementwise sum ("sum") or with a
    gated linear unit ("glu").

    Parameters
    ----------
    c : int
        Number of input/output channels.
    dilation : sequence of int
        Dilation rates applied in parallel.
    k : int
        Kernel size of the shared dilated convolution.
    fuse : str, default="sum"
        Fusion mode: "sum" or "glu".
    shortcut : bool, default=True
        Add the input back onto the fused output (residual connection).
    """

    def __init__(self, c, dilation, k, fuse="sum", shortcut=True):
        super().__init__()
        self.dilation = dilation
        self.k = k
        self.cv2 = Conv(c, c, k=1, s=1)
        self.add = shortcut

        self.fuse = fuse
        if fuse == "glu":
            # Depthwise gating over the concatenated branches, then a grouped
            # 1x1 projection back down to c channels.
            self.conv_gating = Conv(
                c * len(self.dilation), c * len(self.dilation), k=1, s=1, g=c * len(self.dilation)
            )
            self.conv1x1 = Conv(c * len(self.dilation), c, k=1, s=1, g=c)
        elif fuse == "sum":
            self.conv1x1 = Conv(c, c, k=1, s=1, g=c)

        # Shared conv whose weights are reused at every dilation rate.
        self.dcv = Conv(c, c, k=self.k, s=1)

    def dilated_conv(self, x, dilation):
        """Apply self.dcv's weights at the given dilation rate.

        Handles both the unfused layout (conv + BN + act) and the fused
        layout (BN folded into the conv, whose bias then carries it).
        """
        act = self.dcv.act
        weight = self.dcv.conv.weight
        # "same" padding for a dilated kernel of size k.
        padding = dilation * (self.k // 2)

        bn = getattr(self.dcv, "bn", None)
        if bn is not None:
            out = F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation)
            return act(bn(out))
        # Fused case: nn.Conv2d always has a .bias attribute (possibly None),
        # so a plain getattr suffices -- no hasattr dance needed.
        bias = getattr(self.dcv.conv, "bias", None)
        return act(F.conv2d(x, weight, bias=bias, stride=1, padding=padding, dilation=dilation))

    def forward(self, x):
        """Run every dilation branch, fuse, and optionally add x back."""
        dx = [self.cv2(self.dilated_conv(x, d)) for d in self.dilation]
        if self.fuse == "glu":
            dx = torch.cat(dx, dim=1)
            gate = torch.sigmoid(self.conv_gating(dx))
            dx = self.conv1x1(dx * gate)
        elif self.fuse == "sum":
            # Elementwise sum of the branches, then the grouped projection.
            dx = self.conv1x1(torch.stack(dx, dim=0).sum(dim=0))

        return x + dx if self.add else dx
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class DilatedBottleneck(nn.Module):
    """
    Bottleneck with a dilated multi-scale block between its two convs.

    Parameters
    ----------
    c1, c2 : int
        Input / output channels.
    shortcut : bool, default=True
        Residual add (only applied when c1 == c2).
    dilation : sequence of int, default=(1, 2, 3)
        Dilation rates forwarded to DilatedBlock. (Default changed from a
        mutable list to a tuple; it is only iterated and len()'d downstream,
        so behavior is unchanged.)
    block_k : int, default=3
        Kernel size of the dilated block.
    fuse : str, default="sum"
        Fusion mode of the dilated block ("sum" or "glu").
    g : int, default=1
        Groups for the second conv.
    k : tuple, default=(3, 3)
        Kernel sizes for the first and second convs.
    e : float, default=0.5
        Hidden-channel expansion ratio.
    """

    def __init__(self, c1, c2, shortcut=True, dilation=(1, 2, 3), block_k=3, fuse="sum", g=1, k=(3, 3), e=0.5):
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, k[0], 1)
        self.cv2 = Conv(c_, c2, k[1], 1, g=g)
        self.dilated_block = DilatedBlock(c_, dilation, block_k, fuse)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """cv1 -> dilated block -> cv2, with optional residual add."""
        out = self.cv2(self.dilated_block(self.cv1(x)))
        return x + out if self.add else out
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class G2L_CRM(nn.Module):
    """
    Global-to-Local Context Refining Module.

    CSP-style bottleneck: cv1 splits the input into two halves, one half is
    refined by a chain of ``n`` bottlenecks (DilatedBottleneck or CIB), and
    the two halves plus every intermediate output are concatenated and
    projected to ``c2`` channels by cv2. With ``use_dilated=True`` the
    bottlenecks use multi-scale dilated convolutions for wider context.

    Parameters
    ----------
    c1, c2 : int
        Input / output channels.
    n : int, default=1
        Number of bottleneck blocks in the chain.
    shortcut : bool, default=False
        Residual add inside each bottleneck.
    use_dilated : bool, default=False
        Use DilatedBottleneck instead of CIB.
    dilation : sequence of int, default=(1, 2, 3)
        Dilation rates (dilated mode only). Default changed from a mutable
        list to a tuple; it is only read downstream, so behavior is unchanged.
    block_k : int, default=3
        Dilated-block kernel size.
    fuse : str, default="sum"
        Dilated-block fusion mode ("sum" or "glu").
    g : int, default=1
        Conv groups inside the dilated bottleneck.
    e : float, default=0.5
        Hidden-channel expansion ratio.
    """

    def __init__(
        self, c1, c2, n=1, shortcut=False, use_dilated=False, dilation=(1, 2, 3), block_k=3, fuse="sum", g=1, e=0.5
    ):
        super().__init__()
        self.c = int(c2 * e)  # hidden channels per branch
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        # The 2 halves from cv1 plus the n bottleneck outputs get concatenated.
        self.cv2 = Conv((2 + n) * self.c, c2, 1)

        if use_dilated:
            self.m = nn.ModuleList(
                DilatedBottleneck(self.c, self.c, shortcut, dilation, block_k, fuse, g, k=((3, 3), (3, 3)), e=1.0)
                for _ in range(n)
            )
        else:
            self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0) for _ in range(n))

    def forward(self, x):
        """Split via chunk, refine one half through the chain, concat, project."""
        y = list(self.cv1(x).chunk(2, 1))
        for m in self.m:
            y.append(m(y[-1]))
        return self.cv2(torch.cat(y, 1))

    def forward_split(self, x):
        """Same as forward but splits with split() instead of chunk().

        NOTE(review): presumably provided as an export-friendly variant, as
        in other ultralytics CSP blocks -- confirm before relying on it.
        """
        y = list(self.cv1(x).split((self.c, self.c), 1))
        for m in self.m:
            y.append(m(y[-1]))
        return self.cv2(torch.cat(y, 1))
|
doclayout_yolo/model.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Simple DocLayout model for inference."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, List, Union
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from PIL import Image
|
| 9 |
+
from ultralytics import YOLO
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DocLayoutModel:
    """
    Document layout detection model.

    Wraps an ultralytics YOLO checkpoint, loading the weights lazily on first
    use and mapping predicted class ids to human-readable layout labels.

    Examples
    --------
    >>> model = DocLayoutModel("model.pt")
    >>> results = model.predict("document.png")
    >>> for det in results:
    ...     print(f"{det['class_name']}: {det['confidence']:.2f}")
    """

    # Default class mappings
    DOCSTRUCTBENCH_CLASSES = {
        0: "title",
        1: "plain_text",
        2: "abandon",
        3: "figure",
        4: "figure_caption",
        5: "table",
        6: "table_caption",
        7: "table_footnote",
        8: "isolate_formula",
        9: "formula_caption",
    }

    DOCLAYNET_CLASSES = {
        0: "Caption",
        1: "Footnote",
        2: "Formula",
        3: "List-item",
        4: "Page-footer",
        5: "Page-header",
        6: "Picture",
        7: "Section-header",
        8: "Table",
        9: "Text",
        10: "Title",
    }

    def __init__(
        self,
        weights_path: Union[str, Path],
        config_path: Union[str, Path, None] = None,
        model_type: str = "auto",
    ):
        """
        Initialize model.

        Parameters
        ----------
        weights_path : str or Path
            Path to model weights (.pt file)
        config_path : str or Path, optional
            Path to config.json with class names. If None, auto-detects from weights filename.
        model_type : str, default="auto"
            Model type: "docstructbench", "doclaynet", or "auto" (detect from filename)
        """
        self.weights_path = Path(weights_path)
        self._model = None  # populated lazily by the `model` property

        if config_path:
            # Explicit config wins: class ids follow list order in the file.
            with open(config_path) as f:
                cfg = json.load(f)
            self.class_names = dict(enumerate(cfg["class_names"]))
        else:
            self.class_names = self._get_class_names(model_type)

    def _get_class_names(self, model_type: str) -> Dict[int, str]:
        """Resolve the id -> label mapping for the given model type."""
        if model_type == "auto":
            # Sniff the checkpoint filename; docstructbench is the default.
            stem = self.weights_path.stem.lower()
            return self.DOCLAYNET_CLASSES if "doclaynet" in stem else self.DOCSTRUCTBENCH_CLASSES
        if model_type == "doclaynet":
            return self.DOCLAYNET_CLASSES
        if model_type == "docstructbench":
            return self.DOCSTRUCTBENCH_CLASSES
        raise ValueError(f"Unknown model type: {model_type}")

    @property
    def model(self) -> YOLO:
        """Lazy-load the YOLO model."""
        if self._model is None:
            self._model = YOLO(str(self.weights_path))
        return self._model

    def predict(
        self,
        source: Union[str, Path, Image.Image, np.ndarray],
        confidence: float = 0.2,
        image_size: int = 1024,
        device: str = "cpu",
    ) -> List[Dict]:
        """
        Run inference on an image.

        Parameters
        ----------
        source : str, Path, PIL.Image, or np.ndarray
            Input image
        confidence : float, default=0.2
            Confidence threshold
        image_size : int, default=1024
            Input image size
        device : str, default="cpu"
            Device to run on ("cpu", "cuda", "mps")

        Returns
        -------
        List[Dict]
            List of detections, each with keys:
            - class_id: int
            - class_name: str
            - confidence: float
            - bbox: [x1, y1, x2, y2]
        """
        raw_results = self.model.predict(
            source=str(source) if isinstance(source, Path) else source,
            imgsz=image_size,
            conf=confidence,
            device=device,
            save=False,
            verbose=False,
        )

        detections: List[Dict] = []
        for result in raw_results:
            for box in result.boxes:
                class_id = int(box.cls[0])
                detections.append(
                    {
                        "class_id": class_id,
                        "class_name": self.class_names.get(class_id, f"class_{class_id}"),
                        "confidence": float(box.conf[0]),
                        "bbox": box.xyxy[0].tolist(),
                    }
                )
        return detections
|
model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0142c1154f5f4fcb5eb14d5f29d9cebfaee96433b6d2a99c36bb07779cd7a388
|
| 3 |
+
size 75823701
|
model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1457fe54bb1dedc4b1d1b7b07348288ab63c730569343f3e7a8194e69d39266
|
| 3 |
+
size 40597687
|