File size: 5,237 Bytes
7952bce | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | # Install dependencies:
# pip install numpy opencv-python onnxruntime
import numpy as np
import cv2
import onnxruntime as ort
from pathlib import Path
def preprocess_image_doclayout(image, target_input_size=(800, 800)):
"""
Preprocessing for DocLayoutV3 with 800x800 input.
Returns CHW tensor (no batch dim) + scale factors.
"""
orig_h, orig_w = image.shape[:2]
target_h, target_w = target_input_size
scale_h = target_h / orig_h
scale_w = target_w / orig_w
resized = cv2.resize(image, (target_w, target_h), interpolation=cv2.INTER_LINEAR)
rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
blob = rgb.astype(np.float32) / 255.0
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
blob = (blob - mean) / std
# CHW — no batch dim yet; caller stacks the batch
blob = blob.transpose(2, 0, 1)
return blob, scale_h, scale_w
def preprocess_batch(image_paths, target_input_size=(800, 800)):
"""
Load and preprocess a list of image paths.
Returns:
input_blob : (N, 3, H, W) float32
shape_list : (N, 2) float32 [[H, W], ...]
scale_list : (N, 2) float32 [[scale_h, scale_w], ...]
images : list of original BGR images (for debug / visualisation)
"""
blobs, shapes, scales, images = [], [], [], []
for path in image_paths:
img = cv2.imread(str(path))
if img is None:
raise FileNotFoundError(f"Could not read image: {path}")
blob, scale_h, scale_w = preprocess_image_doclayout(img, target_input_size)
blobs.append(blob)
shapes.append(target_input_size) # (H, W)
scales.append((scale_h, scale_w))
images.append(img)
input_blob = np.stack(blobs, axis=0).astype(np.float32) # (N, 3, H, W)
shape_arr = np.array(shapes, dtype=np.float32) # (N, 2)
scale_arr = np.array(scales, dtype=np.float32) # (N, 2)
return input_blob, shape_arr, scale_arr, images
def run_doclayout_onnx_batch(image_paths, model_path, conf_thresh=0.5):
"""
Run DocLayoutV3 on a batch of images.
The model's three inputs are:
input_names[0] : image shape – expected shape (N, 2)
input_names[1] : image tensor – expected shape (N, 3, H, W)
input_names[2] : scale factors – expected shape (N, 2)
Output shape: (N * max_dets, 7)
Values: [image_index, label_index, score, xmin, ymin, xmax, ymax]
(Some ONNX exports omit image_index — see note in post-processing.)
"""
model = ort.InferenceSession(model_path)
input_names = [i.name for i in model.get_inputs()]
output_names = [o.name for o in model.get_outputs()]
input_blob, shape_arr, scale_arr, images = preprocess_batch(image_paths)
n = len(image_paths)
input_feed = {
"im_shape": shape_arr, # (N, 2)
"image": input_blob, # (N, 3, 800, 800)
"scale_factor": scale_arr, # (N, 2)
}
raw_output = model.run(output_names, input_feed)[0] # (N*dets, 7) or (N*dets, 6)
return postprocess_batch(raw_output, n, conf_thresh)
def postprocess_batch(raw_output, n_images, conf_thresh=0.5):
"""
Split flat detection output back into per-image results.
PP-DocLayout ONNX output columns:
[img_idx, label, score, x0, y0, x1, y1, read_order] (8 cols)
or
[label, score, x0, y0, x1, y1, read_order] (7 cols — single-image compat)
We handle both layouts automatically.
"""
n_cols = raw_output.shape[1]
if n_cols == 8:
# Batched export: first column is the image index
img_idx_col = raw_output[:, 0].astype(int)
detections = raw_output[:, 1:] # drop img_idx → 7 cols
else:
# Single-image export used for a batch: distribute evenly
dets_per_image = len(raw_output) // n_images
img_idx_col = np.repeat(np.arange(n_images), dets_per_image)
detections = raw_output
results = []
for i in range(n_images):
mask = img_idx_col == i
boxes = detections[mask]
boxes = boxes[boxes[:, 1] > conf_thresh] # confidence filter
boxes = boxes[np.argsort(boxes[:, 6])] # sort by read_order
results.append(boxes)
return results
def print_doclayout_res(boxes, image_label=""):
header = f"--- {image_label} ---" if image_label else "--- Results ---"
print(header)
print("cls_id\tscore\txmin\tymin\txmax\tymax\tread_order")
for box in boxes:
print(
f"{box[0]:.0f}\t\t{box[1]:.3f}\t"
f"{box[2]:.2f}\t{box[3]:.2f}\t"
f"{box[4]:.2f}\t{box[5]:.2f}\t{box[6]:.0f}"
)
if __name__ == '__main__':
MODEL_PATH = "your/path/to/PP-DocLayoutV3.onnx"
image_paths = [
"your_test_image_1.png",
"your_test_image_2.png",
"your_test_image_3.png",
"your_test_image_4.png",
]
results = run_doclayout_onnx_batch(image_paths, MODEL_PATH, conf_thresh=0.5)
for path, boxes in zip(image_paths, results):
print_doclayout_res(boxes, image_label=Path(path).name)
print()
|