# File size: 5,237 Bytes

# 7952bce

# Install dependencies:
# pip install numpy opencv-python onnxruntime

import numpy as np
import cv2
import onnxruntime as ort
from pathlib import Path

def preprocess_image_doclayout(image, target_input_size=(800, 800)):
    """
    Turn one BGR image into a normalized CHW float32 tensor for DocLayoutV3.

    The image is stretched to ``target_input_size`` (aspect ratio is not
    preserved), converted BGR -> RGB, scaled to [0, 1] and normalized with
    fixed per-channel mean/std constants.

    Args:
        image: HxWxC array in BGR channel order (as returned by cv2.imread).
        target_input_size: (height, width) of the network input.

    Returns:
        (tensor, scale_h, scale_w) where ``tensor`` has shape (3, H, W) with
        no batch dimension, and the scales are target / original size.
    """
    target_h, target_w = target_input_size
    src_h, src_w = image.shape[:2]
    ratio_h = target_h / src_h
    ratio_w = target_w / src_w

    # cv2.resize expects the destination size as (width, height).
    stretched = cv2.resize(image, (target_w, target_h), interpolation=cv2.INTER_LINEAR)
    pixels = cv2.cvtColor(stretched, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0

    channel_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    channel_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    normalized = (pixels - channel_mean) / channel_std

    # HWC -> CHW; the batch axis is added later when the caller stacks tensors.
    chw = np.transpose(normalized, (2, 0, 1))
    return chw, ratio_h, ratio_w


def preprocess_batch(image_paths, target_input_size=(800, 800)):
    """
    Load and preprocess a collection of image paths into one batch.

    Args:
        image_paths: iterable of paths (str or Path) readable by cv2.imread.
        target_input_size: (height, width) every image is resized to.

    Returns:
        input_blob  : (N, 3, H, W)  float32
        shape_list  : (N, 2)        float32  [[H, W], ...]  (the resized size)
        scale_list  : (N, 2)        float32  [[scale_h, scale_w], ...]
        images      : list of original BGR images (for debug / visualisation)

    Raises:
        FileNotFoundError: an image could not be read.
        ValueError: ``image_paths`` yielded no paths at all.
    """
    blobs, shapes, scales, images = [], [], [], []

    for path in image_paths:
        img = cv2.imread(str(path))
        if img is None:
            # cv2.imread reports failure by returning None rather than raising.
            raise FileNotFoundError(f"Could not read image: {path}")

        blob, scale_h, scale_w = preprocess_image_doclayout(img, target_input_size)
        blobs.append(blob)
        shapes.append(target_input_size)          # (H, W)
        scales.append((scale_h, scale_w))
        images.append(img)

    if not blobs:
        # Fail with an explicit message instead of np.stack's generic
        # "need at least one array to stack".
        raise ValueError("image_paths is empty; at least one image is required")

    # copy=False: the stacked array is already float32, so no second copy of
    # the whole batch is made.
    input_blob = np.stack(blobs, axis=0).astype(np.float32, copy=False)  # (N, 3, H, W)
    shape_arr = np.array(shapes, dtype=np.float32)                       # (N, 2)
    scale_arr = np.array(scales, dtype=np.float32)                       # (N, 2)

    return input_blob, shape_arr, scale_arr, images


def run_doclayout_onnx_batch(image_paths, model_path, conf_thresh=0.5):
    """
    Run DocLayoutV3 on a batch of images.

    Tensors are fed to the model by input name:
        "im_shape"     : resized image shape  – (N, 2)
        "image"        : image tensor         – (N, 3, H, W)
        "scale_factor" : resize scale factors – (N, 2)
    Only names the loaded model actually declares are fed, so an export that
    lacks one of these inputs is not handed an unknown input name.

    Args:
        image_paths: iterable of image paths to load and preprocess.
        model_path: location of the ONNX model file.
        conf_thresh: detections with score <= this value are dropped.

    Returns:
        List with one detection array per image, as produced by
        ``postprocess_batch`` from the model's first output (a flat 2-D
        detection array whose column layout is interpreted there).
    """
    model = ort.InferenceSession(model_path)
    input_names = [i.name for i in model.get_inputs()]
    output_names = [o.name for o in model.get_outputs()]

    input_blob, shape_arr, scale_arr, _ = preprocess_batch(image_paths)
    # Count from the stacked batch rather than len(image_paths) so that any
    # iterable of paths (not just a sized sequence) is accepted.
    n = input_blob.shape[0]

    tensors = {
        "im_shape": shape_arr,  # (N, 2)
        "image": input_blob,  # (N, 3, H, W)
        "scale_factor": scale_arr,  # (N, 2)
    }
    # Feed only what the model declares; any declared input we cannot supply
    # is left out and reported by ONNX Runtime itself.
    input_feed = {name: tensors[name] for name in input_names if name in tensors}

    raw_output = model.run(output_names, input_feed)[0]  # flat 2-D detections

    return postprocess_batch(raw_output, n, conf_thresh)


def postprocess_batch(raw_output, n_images, conf_thresh=0.5):
    """
    Split flat detection output back into per-image results.

    Column layouts handled:
        [img_idx, label, score, x0, y0, x1, y1, read_order]   (8 cols)
        [label, score, x0, y0, x1, y1, read_order]            (7 cols)
        [label, score, x0, y0, x1, y1]                        (6 cols)

    With 8 columns the first column assigns each row to an image. Otherwise
    rows are assumed to be grouped by image in equal-sized consecutive chunks.

    Args:
        raw_output: 2-D array of detections, one row per detection.
        n_images: number of images in the batch.
        conf_thresh: rows with score <= this value are dropped.

    Returns:
        List of ``n_images`` arrays (img_idx column removed). Each is filtered
        by confidence and, when a read_order column exists, sorted by it;
        without that column the model's row order is kept.

    Raises:
        ValueError: ``raw_output`` is not 2-D, or it has no img_idx column and
            its row count cannot be split evenly across ``n_images``.
    """
    raw_output = np.asarray(raw_output)
    if raw_output.ndim != 2:
        raise ValueError(
            f"raw_output must be 2-D (rows x columns), got shape {raw_output.shape}"
        )
    if n_images <= 0:
        # Nothing to split into; also avoids dividing by zero below.
        return []

    n_cols = raw_output.shape[1]

    if n_cols == 8:
        # Batched export: first column is the image index
        img_idx_col = raw_output[:, 0].astype(int)
        detections = raw_output[:, 1:]   # drop img_idx → 7 cols
    else:
        # No image index available: distribute rows evenly, in order
        dets_per_image, leftover = divmod(len(raw_output), n_images)
        if leftover:
            # An uneven split would silently assign rows to the wrong image.
            raise ValueError(
                f"cannot split {len(raw_output)} detections evenly across "
                f"{n_images} images"
            )
        img_idx_col = np.repeat(np.arange(n_images), dets_per_image)
        detections = raw_output

    # read_order is the 7th detection column; 6-column outputs do not have it.
    has_read_order = detections.shape[1] >= 7

    results = []
    for i in range(n_images):
        boxes = detections[img_idx_col == i]
        boxes = boxes[boxes[:, 1] > conf_thresh]          # confidence filter
        if has_read_order:
            # Stable sort keeps the model's order among equal read_order values.
            boxes = boxes[np.argsort(boxes[:, 6], kind="stable")]
        results.append(boxes)

    return results


def print_doclayout_res(boxes, image_label=""):
    """
    Print one image's detections to stdout as a tab-separated table.

    Args:
        boxes: iterable of rows indexable as
            [cls_id, score, xmin, ymin, xmax, ymax, read_order].
        image_label: text shown in the banner line; a generic banner is
            printed when it is empty.
    """
    if image_label:
        title = f"--- {image_label} ---"
    else:
        title = "--- Results ---"
    print(title)
    print("cls_id\tscore\txmin\tymin\txmax\tymax\tread_order")

    for det in boxes:
        # Class id and read order are printed as whole numbers, the score
        # with three decimals and the box corners with two.
        cls_and_score = f"{det[0]:.0f}\t\t{det[1]:.3f}\t"
        corners = f"{det[2]:.2f}\t{det[3]:.2f}\t{det[4]:.2f}\t{det[5]:.2f}\t"
        print(cls_and_score + corners + f"{det[6]:.0f}")


if __name__ == "__main__":
    # Placeholder locations: point these at a real model file and real images.
    MODEL_PATH = "your/path/to/PP-DocLayoutV3.onnx"
    image_paths = [
        "your_test_image_1.png",
        "your_test_image_2.png",
        "your_test_image_3.png",
        "your_test_image_4.png",
    ]

    # One filtered detection array per input image, in input order.
    results = run_doclayout_onnx_batch(image_paths, MODEL_PATH, conf_thresh=0.5)

    for image_path, detections in zip(image_paths, results):
        file_name = Path(image_path).name
        print_doclayout_res(detections, image_label=file_name)
        print()  # blank line between images