add code

Files changed (13) hide show

model.py +219 -0
post_processing/table_struct_pp.py +230 -0
post_processing/wbf.py +321 -0
table_structure_v1.py +78 -0
utils.py +201 -0
yolox/__init__.py +7 -0
yolox/boxes.py +55 -0
yolox/darknet.py +179 -0
yolox/network_blocks.py +210 -0
yolox/yolo_fpn.py +84 -0
yolox/yolo_head.py +235 -0
yolox/yolo_pafpn.py +116 -0
yolox/yolox.py +32 -0

model.py ADDED Viewed

	@@ -0,0 +1,219 @@

+import os
+import sys
+import torch
+import importlib
+import numpy as np
+import numpy.typing as npt
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, List, Tuple, Union
+from yolox.boxes import postprocess
+def define_model(config_name: str = "page_element_v3", verbose: bool = True) -> nn.Module:
+    """
+    Defines and initializes the model based on the configuration.
+    The configuration is a Python module exposing an ``Exp`` class. Its parent
+    directory is added to ``sys.path`` and the module is imported by name.
+    Args:
+        config_name (str): Name of, or path to, the configuration module. Anything after
+            the first "." of the base name is ignored. Defaults to "page_element_v3".
+        verbose (bool): Whether to print verbose output. Defaults to True.
+    Returns:
+        torch.nn.Module: The wrapped YOLOX model, in eval mode, moved to ``config.device``.
+    """
+    # Load model from exp_file.
+    # Only extend sys.path when needed so repeated calls do not pile up duplicate entries.
+    config_dir = os.path.dirname(config_name)
+    if config_dir not in sys.path:
+        sys.path.append(config_dir)
+    module_name = os.path.basename(config_name).split(".")[0]
+    exp_module = importlib.import_module(module_name)
+    config = exp_module.Exp()
+    model = config.get_model()
+    # Load weights
+    if verbose:
+        print(" -> Loading weights from", config.ckpt)
+    # NOTE(security): weights_only=False lets torch.load unpickle arbitrary objects.
+    # Only load checkpoints from a trusted source.
+    ckpt = torch.load(config.ckpt, map_location="cpu", weights_only=False)
+    model.load_state_dict(ckpt["model"], strict=True)
+    model = YoloXWrapper(model, config)
+    return model.eval().to(config.device)
+def resize_pad(img: torch.Tensor, size: tuple) -> torch.Tensor:
+    """
+    Letterboxes an image to a given size.
+    The image is rescaled (bilinear) so that it fits inside `size` with its aspect
+    ratio preserved, then padded on the bottom and right with the value 114.
+    Args:
+        img (torch.Tensor[C x H x W]): The image to resize and pad.
+        size (tuple[2]): Target (height, width).
+    Returns:
+        torch.Tensor[C x size[0] x size[1]]: The resized and padded float image.
+    """
+    target_h, target_w = size[0], size[1]
+    _, src_h, src_w = img.shape
+    # The smaller per-axis ratio is the largest scale that keeps the whole image inside
+    scale = min(target_h / src_h, target_w / src_w)
+    new_h, new_w = int(src_h * scale), int(src_w * scale)
+    # F.interpolate expects a batch dimension
+    batched = img.float().unsqueeze(0)
+    resized = F.interpolate(
+        batched, size=(new_h, new_w), mode="bilinear", align_corners=False
+    )
+    resized = torch.clamp(resized.squeeze(0), 0, 255)
+    # Padding order for the last two dims is (left, right, top, bottom)
+    padding = (0, target_w - new_w, 0, target_h - new_h)
+    return F.pad(resized, padding, value=114.0)
+class YoloXWrapper(nn.Module):
+    """
+    Wrapper for YoloX models.
+    Bundles the network with its config and provides preprocessing, NMS and
+    rescaling of the predicted boxes back to the original image coordinates.
+    """
+    def __init__(self, model: nn.Module, config) -> None:
+        """
+        Constructor
+        Args:
+            model (torch model): Yolo model.
+            config (Config): Config object containing model parameters.
+        """
+        super().__init__()
+        self.model = model
+        self.config = config
+        # Copy config parameters
+        self.device = config.device
+        self.img_size = config.size
+        self.min_bbox_size = config.min_bbox_size
+        self.normalize_boxes = config.normalize_boxes
+        self.conf_thresh = config.conf_thresh
+        self.iou_thresh = config.iou_thresh
+        self.class_agnostic = config.class_agnostic
+        self.threshold = config.threshold
+        self.labels = config.labels
+        self.num_classes = config.num_classes
+    def reformat_input(
+        self,
+        x: torch.Tensor,
+        orig_sizes: Union[torch.Tensor, List, Tuple, npt.NDArray]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Reformats the input data and original sizes to the correct format.
+        Args:
+            x (torch.Tensor[BS x C x H x W]): Input image batch.
+            orig_sizes (torch.Tensor or list or np.ndarray): Original image sizes.
+        Returns:
+            torch tensor [BS x C x H x W]: Input image batch.
+            torch tensor [BS x 2]: Original image sizes (before resizing and padding).
+        """
+        # Convert image size to tensor
+        if isinstance(orig_sizes, (list, tuple)):
+            orig_sizes = np.array(orig_sizes)
+        if orig_sizes.shape[-1] == 3:  # remove channel
+            orig_sizes = orig_sizes[..., :2]
+        if isinstance(orig_sizes, np.ndarray):
+            orig_sizes = torch.from_numpy(orig_sizes).to(self.device)
+        # Add batch dimension if not present
+        if len(x.size()) == 3:
+            x = x.unsqueeze(0)
+        if len(orig_sizes.size()) == 1:
+            orig_sizes = orig_sizes.unsqueeze(0)
+        return x, orig_sizes
+    def preprocess(self, image: Union[torch.Tensor, npt.NDArray]) -> torch.Tensor:
+        """
+        YoloX preprocessing function:
+        - Resizes to the longest edge to img_size while preserving the aspect ratio
+        - Pads the shortest edge to img_size
+        Args:
+            image (torch tensor or np array [H x W x 3]): Input images in uint8 format.
+        Returns:
+            torch tensor [3 x H x W]: Processed image.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = torch.from_numpy(image)
+        image = image.permute(2, 0, 1)  # [H, W, 3] -> [3, H, W]
+        image = resize_pad(image, self.img_size)
+        return image.float()
+    def forward(
+        self,
+        x: torch.Tensor,
+        orig_sizes: Union[torch.Tensor, List, Tuple, npt.NDArray]
+    ) -> List[Dict[str, torch.Tensor]]:
+        """
+        Forward pass of the model.
+        Applies NMS and reformats the predictions.
+        The input tensor is never modified in place.
+        Args:
+            x (torch.Tensor[BS x C x H x W]): Input image batch.
+            orig_sizes (torch.Tensor or list or np.ndarray): Original image sizes.
+        Returns:
+            list[dict]: List of prediction dictionaries. Each dictionary contains:
+                - labels (torch.Tensor[N]): Class labels
+                - boxes (torch.Tensor[N x 4]): Bounding boxes in original image
+                  coordinates, divided by the image size if normalize_boxes is set.
+                - scores (torch.Tensor[N]): Confidence scores.
+        """
+        x, orig_sizes = self.reformat_input(x, orig_sizes)
+        # Scale to 0-255 if in range 0-1.
+        # Out-of-place on purpose: `x *= 255` would rescale the caller's tensor as well.
+        if x.max() <= 1:
+            x = x * 255
+        pred_boxes = self.model(x.to(self.device))
+        # NMS
+        pred_boxes = postprocess(
+            pred_boxes,
+            self.config.num_classes,
+            self.conf_thresh,
+            self.iou_thresh,
+            class_agnostic=self.class_agnostic,
+        )
+        # Reformat output
+        preds = []
+        for p, size in zip(pred_boxes, orig_sizes):
+            if p is None:  # No detections
+                preds.append({
+                    "labels": torch.empty(0),
+                    "boxes": torch.empty((0, 4)),
+                    "scores": torch.empty(0),
+                })
+                continue
+            p = p.view(-1, p.size(-1))
+            # Undo the resize applied in preprocessing (same ratio as resize_pad)
+            ratio = min(self.img_size[0] / size[0], self.img_size[1] / size[1])
+            boxes = p[:, :4] / ratio
+            # Clip
+            boxes[:, [0, 2]] = torch.clamp(boxes[:, [0, 2]], 0, size[1])
+            boxes[:, [1, 3]] = torch.clamp(boxes[:, [1, 3]], 0, size[0])
+            # Remove too small
+            kept = (
+                (boxes[:, 2] - boxes[:, 0] > self.min_bbox_size) &
+                (boxes[:, 3] - boxes[:, 1] > self.min_bbox_size)
+            )
+            boxes = boxes[kept]
+            p = p[kept]
+            # Normalize to 0-1
+            if self.normalize_boxes:
+                boxes[:, [0, 2]] /= size[1]
+                boxes[:, [1, 3]] /= size[0]
+            # Columns 4 and 5 are multiplied into the final score, column 6 is the label
+            scores = p[:, 4] * p[:, 5]
+            labels = p[:, 6]
+            preds.append({"labels": labels, "boxes": boxes, "scores": scores})
+        return preds

post_processing/table_struct_pp.py ADDED Viewed

	@@ -0,0 +1,230 @@

+import numpy as np
+import numpy.typing as npt
+from typing import List, Tuple, Optional
+def expand_boxes(
+    boxes: npt.NDArray[np.float64],
+    r_x: Tuple[float, float] = (1, 1),
+    r_y: Tuple[float, float] = (1, 1),
+    size_agnostic: bool = True,
+) -> npt.NDArray[np.float64]:
+    """
+    Expands bounding boxes by a specified ratio.
+    Expected box format is normalized [x_min, y_min, x_max, y_max].
+    The input array is left untouched; a new array is returned.
+    After expansion, pairs of boxes that barely overlapped before (IoU < 0.1) but
+    overlap after (IoU > 0.05) have their vertical edges pulled back.
+    Args:
+        boxes (numpy.ndarray): Array of bounding boxes with shape (N, 4).
+        r_x (tuple, optional): Left, right expansion ratios. Defaults to (1, 1) (no expansion).
+        r_y (tuple, optional): Up, down expansion ratios. Defaults to (1, 1) (no expansion).
+        size_agnostic (bool, optional): Expand independently of the box shape. Defaults to True.
+    Returns:
+        numpy.ndarray: Adjusted bounding boxes clipped to the [0, 1] range.
+    """
+    old_boxes = boxes.copy()
+    # Work on a private copy: the in-place arithmetic below would otherwise
+    # modify the caller's array before np.clip creates the returned one.
+    boxes = boxes.copy()
+    if not size_agnostic:
+        h = boxes[:, 3] - boxes[:, 1]
+        w = boxes[:, 2] - boxes[:, 0]
+    else:
+        # Ratios are applied as absolute offsets in normalized coordinates
+        h, w = 1, 1
+    boxes[:, 0] -= w * (r_x[0] - 1)  # left
+    boxes[:, 2] += w * (r_x[1] - 1)  # right
+    boxes[:, 1] -= h * (r_y[0] - 1)  # up
+    boxes[:, 3] += h * (r_y[1] - 1)  # down
+    boxes = np.clip(boxes, 0, 1)
+    # Enforce non-overlapping boxes
+    for i in range(len(boxes)):
+        for j in range(i + 1, len(boxes)):
+            iou = bb_iou_array(boxes[i][None], boxes[j])[0]
+            old_iou = bb_iou_array(old_boxes[i][None], old_boxes[j])[0]
+            # Only undo overlaps that were introduced (or worsened) by the expansion
+            if iou > 0.05 and old_iou < 0.1:
+                if boxes[i, 1] < boxes[j, 1]:  # i above j
+                    boxes[j, 1] = min(old_boxes[j, 1], boxes[i, 3])
+                    if old_iou > 0:
+                        boxes[i, 3] = max(old_boxes[i, 3], boxes[j, 1])
+                else:
+                    boxes[i, 1] = min(old_boxes[i, 1], boxes[j, 3])
+                    if old_iou > 0:
+                        boxes[j, 3] = max(old_boxes[j, 3], boxes[i, 1])
+    return boxes
+def merge_boxes(
+    b1: npt.NDArray[np.float64], b2: npt.NDArray[np.float64]
+) -> npt.NDArray[np.float64]:
+    """
+    Computes the smallest box enclosing two bounding boxes.
+    Args:
+        b1 (numpy.ndarray): First bounding box [x_min, y_min, x_max, y_max].
+        b2 (numpy.ndarray): Second bounding box [x_min, y_min, x_max, y_max].
+    Returns:
+        numpy.ndarray: A copy of b1 whose corners are stretched to also cover b2.
+    """
+    merged = b1.copy()
+    # Top-left corner: smallest coordinates
+    for k in (0, 1):
+        merged[k] = min(b1[k], b2[k])
+    # Bottom-right corner: largest coordinates
+    for k in (2, 3):
+        merged[k] = max(b1[k], b2[k])
+    return merged
+def bb_iou_array(
+    boxes: npt.NDArray[np.float64], new_box: npt.NDArray[np.float64]
+) -> npt.NDArray[np.float64]:
+    """
+    Computes the Intersection over Union (IoU) of one box against an array of boxes.
+    Args:
+        boxes (numpy.ndarray): Array of bounding boxes with shape (N, 4).
+        new_box (numpy.ndarray): A single bounding box [x_min, y_min, x_max, y_max].
+    Returns:
+        numpy.ndarray: IoU between new_box and each of the N boxes.
+    """
+    x_min, y_min, x_max, y_max = new_box[0], new_box[1], new_box[2], new_box[3]
+    # Extent of the overlap rectangle, floored at zero for disjoint boxes
+    overlap_w = np.maximum(
+        np.minimum(boxes[:, 2], x_max) - np.maximum(boxes[:, 0], x_min), 0
+    )
+    overlap_h = np.maximum(
+        np.minimum(boxes[:, 3], y_max) - np.maximum(boxes[:, 1], y_min), 0
+    )
+    intersection = overlap_w * overlap_h
+    # Areas of each candidate box and of the reference box
+    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    new_area = (x_max - x_min) * (y_max - y_min)
+    union = areas + new_area - intersection
+    return intersection / union
+def match_with_title(
+    box: npt.NDArray[np.float64],
+    title_boxes: npt.NDArray[np.float64],
+    match_dist: float = 0.1,
+    delta: float = 1.,
+    already_matched: Optional[List[int]] = None,
+) -> Tuple[Optional[npt.NDArray[np.float64]], Optional[List[int]]]:
+    """
+    Matches a bounding box with a title bounding box based on IoU or proximity.
+    Args:
+        box (numpy.ndarray): Bounding box to match with title [x_min, y_min, x_max, y_max].
+        title_boxes (numpy.ndarray): Array of title bounding boxes with shape (N, 4).
+        match_dist (float, optional): Maximum distance for matching. Defaults to 0.1.
+        delta (float, optional): Multiplier for matching several titles. Defaults to 1..
+        already_matched (list, optional): Indices (into title_boxes) of titles that were
+            already matched and must be ignored. Defaults to None (no title excluded).
+    Returns:
+        tuple: If matched, returns (merged_bbox, matched_title_indices), where the
+               indices refer to rows of title_boxes.
+               If no match is found, returns (None, None).
+    """
+    if not len(title_boxes):
+        return None, None
+    # Vertical gap between the box and a title sitting above / below it
+    dist_above = np.abs(title_boxes[:, 3] - box[1])
+    dist_below = np.abs(box[3] - title_boxes[:, 1])
+    # Horizontal misalignment, measured on left edges or on centers
+    dist_left = np.abs(title_boxes[:, 0] - box[0])
+    dist_center = np.abs(title_boxes[:, 0] + title_boxes[:, 2] - box[0] - box[2]) / 2
+    dists = np.min([dist_above, dist_below], 0)
+    dists += np.min([dist_left, dist_center], 0) / 2
+    # Titles overlapping the box are given a distance that always passes the threshold
+    ious = bb_iou_array(title_boxes, box)
+    dists = np.where(ious > 0, min(match_dist - 0.01, np.min(dists)) / delta, dists)
+    if already_matched is not None and len(already_matched):
+        dists[already_matched] = match_dist * 10  # Remove already matched titles
+    if np.min(dists) > match_dist:
+        return None, None
+    matches = np.where(
+        dists <= min(match_dist, np.min(dists) * delta)
+    )[0]
+    new_bbox = box
+    for match in matches:
+        new_bbox = merge_boxes(new_bbox, title_boxes[match])
+    return new_bbox, list(matches)
+def match_boxes_with_title(
+    boxes: npt.NDArray[np.float64],
+    confs: npt.NDArray[np.float64],
+    labels: npt.NDArray[np.int_],
+    classes: List[str],
+    to_match_labels: Optional[List[str]] = None,
+    remove_matched_titles: bool = False,
+    match_dist: float = 0.1,
+) -> Tuple[
+    npt.NDArray[np.float64],
+    npt.NDArray[np.float64],
+    npt.NDArray[np.int_],
+    List[int],
+]:
+    """
+    Matches charts with title.
+    Boxes are reordered so that titles come last, then every box whose class is in
+    to_match_labels is merged with the title(s) it matches.
+    Args:
+        boxes (numpy.ndarray): Array of bounding boxes with shape (N, 4).
+        confs (numpy.ndarray): Array of confidence scores with shape (N,).
+        labels (numpy.ndarray): Array of labels with shape (N,).
+        classes (list): List of class names. Must contain "title".
+        to_match_labels (list, optional): List of class names to match with titles.
+            Defaults to None, which means ["chart"].
+        remove_matched_titles (bool): Whether to remove matched titles from the boxes.
+        match_dist (float, optional): Maximum distance for matching. Defaults to 0.1.
+    Returns:
+        boxes (numpy.ndarray): Array of bounding boxes with shape (M, 4).
+        confs (numpy.ndarray): Array of confidence scores with shape (M,).
+        labels (numpy.ndarray): Array of labels with shape (M,).
+        found_title (list): Indices (in the returned arrays) of the boxes that were
+            merged with at least one title.
+    Raises:
+        ValueError: If "title" or one of to_match_labels is not in classes.
+    """
+    if to_match_labels is None:
+        to_match_labels = ["chart"]
+    title_label = classes.index("title")
+    # Put titles at the end
+    title_ids = np.where(labels == title_label)[0]
+    order = np.concatenate([np.delete(np.arange(len(boxes)), title_ids), title_ids])
+    boxes = boxes[order]
+    confs = confs[order]
+    labels = labels[order]
+    # Ids
+    title_ids = np.where(labels == title_label)[0]
+    to_match = np.where(np.isin(labels, [classes.index(c) for c in to_match_labels]))[0]
+    # Matching
+    found_title, already_matched = [], []
+    # np.where returns ascending indices, so this visits candidates in box order
+    for i in to_match:
+        merged_box, matched_title_ids = match_with_title(
+            boxes[i],
+            boxes[title_ids],
+            already_matched=already_matched,
+            match_dist=match_dist,
+        )
+        if matched_title_ids is None:
+            continue
+        boxes[i] = merged_box
+        already_matched += matched_title_ids
+        found_title.append(int(i))
+    # Titles sit at the end, so deleting them keeps the found_title indices valid
+    if remove_matched_titles and len(already_matched):
+        boxes = np.delete(boxes, title_ids[already_matched], axis=0)
+        confs = np.delete(confs, title_ids[already_matched], axis=0)
+        labels = np.delete(labels, title_ids[already_matched], axis=0)
+    return boxes, confs, labels, found_title

post_processing/wbf.py ADDED Viewed

	@@ -0,0 +1,321 @@

+# Adapted from:
+# https://github.com/ZFTurbo/Weighted-Boxes-Fusion/blob/master/ensemble_boxes/ensemble_boxes_wbf.py
+import warnings
+from typing import Dict, List, Tuple, Union, Literal
+import numpy as np
+import numpy.typing as npt
+def prefilter_boxes(
+    boxes: List[npt.NDArray[np.float64]],
+    scores: List[npt.NDArray[np.float64]],
+    labels: List[npt.NDArray[np.int_]],
+    weights: List[float],
+    thr: float,
+    class_agnostic: bool = False,
+) -> Dict[Union[str, int], npt.NDArray[np.float64]]:
+    """
+    Validates, filters and groups the boxes of every model.
+    Each kept box becomes a row [label, score * weight, weight, model index, x1, y1, x2, y2].
+    Rows are grouped per label (or under the single key "*" in class-agnostic mode)
+    and sorted by decreasing weighted score.
+    Args:
+        boxes (list[np array[n x 4]]): List of boxes. One list per model.
+        scores (list[np array[n]]): List of confidences.
+        labels (list[np array[n]]): List of labels.
+        weights (list): Model weights.
+        thr (float): Confidence threshold. Boxes scoring below it are dropped.
+        class_agnostic (bool, optional): Merge boxes from different classes. Defaults to False.
+    Returns:
+        dict[np array [? x 8]]: Filtered boxes.
+    """
+    grouped = {}
+    for t, model_boxes in enumerate(boxes):
+        n_boxes = len(model_boxes)
+        assert n_boxes == len(scores[t]), "len(boxes) != len(scores)"
+        assert n_boxes == len(labels[t]), "len(boxes) != len(labels)"
+        for j in range(n_boxes):
+            score = scores[t][j]
+            if score < thr:
+                continue
+            label = int(labels[t][j])
+            box_part = model_boxes[j]
+            x1, y1, x2, y2 = (float(box_part[k]) for k in range(4))
+            # Repair inverted corners
+            if x2 < x1:
+                warnings.warn("X2 < X1 value in box. Swap them.")
+                x1, x2 = x2, x1
+            if y2 < y1:
+                warnings.warn("Y2 < Y1 value in box. Swap them.")
+                y1, y2 = y2, y1
+            # Force coordinates into the normalized range
+            coords = np.array([x1, x2, y1, y2])
+            if coords.min() < 0 or coords.max() > 1:
+                warnings.warn("Coordinates outside [0, 1]")
+                x1, x2, y1, y2 = np.clip(coords, 0, 1)
+            # Degenerate boxes cannot be matched by IoU
+            if (x2 - x1) * (y2 - y1) == 0.0:
+                warnings.warn("Zero area box skipped: {}.".format(box_part))
+                continue
+            key = "*" if class_agnostic else label
+            # [label, score, weight, model index, x1, y1, x2, y2]
+            row = [int(label), float(score) * weights[t], weights[t], t, x1, y1, x2, y2]
+            grouped.setdefault(key, []).append(row)
+    # Stack each group and order it by decreasing score (column 1)
+    sorted_groups = {}
+    for key, rows in grouped.items():
+        stacked = np.array(rows)
+        sorted_groups[key] = stacked[stacked[:, 1].argsort()[::-1]]
+    return sorted_groups
+def merge_labels(
+    labels: npt.NDArray[np.int_], confs: npt.NDArray[np.float64]
+) -> int:
+    """
+    Custom function for merging labels.
+    If all labels are the same, return the unique value.
+    Else, return the label of the most confident non-title (class 2) box.
+    Args:
+        labels (np array [n]): Labels.
+        confs (np array [n]): Confidence, aligned with labels.
+    Returns:
+        int: Label.
+    """
+    if len(np.unique(labels)) == 1:
+        return labels[0]
+    # Most confident and not a title.
+    # The same label-based mask must filter both arrays so they stay aligned;
+    # masking confs on its own values would pair scores with the wrong labels.
+    not_title = labels != 2
+    return labels[not_title][np.argmax(confs[not_title])]
+def get_weighted_box(
+    boxes: npt.NDArray[np.float64], conf_type: Literal["avg", "max"] = "avg"
+) -> npt.NDArray[np.float64]:
+    """
+    Fuses boxes into one by averaging their coordinates, weighted by confidence.
+    Rows use the layout [label, score, weight, model index, x1, y1, x2, y2].
+    Args:
+        boxes (np array [n x 8]): Boxes to merge.
+        conf_type (str, optional): Confidence merging type. Defaults to "avg".
+    Returns:
+        np array [8]: Merged box.
+    """
+    fused = np.zeros(8, dtype=np.float32)
+    confidences = [b[1] for b in boxes]
+    # Confidence-weighted sum of the coordinates, normalized at the end
+    for b in boxes:
+        fused[4:] += b[1] * b[4:]
+    fused[0] = merge_labels(np.array([b[0] for b in boxes]), np.array(confidences))
+    if conf_type == "max":
+        fused[1] = np.max(confidences)
+    else:
+        fused[1] = np.mean(confidences)
+    fused[2] = sum(b[2] for b in boxes)
+    fused[3] = -1  # model index field is retained for consistency but is not used.
+    fused[4:] /= sum(confidences)
+    return fused
+def get_biggest_box(
+    boxes: npt.NDArray[np.float64], conf_type: Literal["avg", "max"] = "avg"
+) -> npt.NDArray[np.float64]:
+    """
+    Fuses boxes into the smallest box that encloses all of them.
+    Rows use the layout [label, score, weight, model index, x1, y1, x2, y2].
+    Args:
+        boxes (np array [n x 8]): Boxes to merge.
+        conf_type (str, optional): Confidence merging type. Defaults to "avg".
+    Returns:
+        np array [8]: Merged box.
+    """
+    merged = np.zeros(8, dtype=np.float32)
+    # Start from the first box and grow to cover every other one
+    merged[4:] = boxes[0][4:]
+    for b in boxes:
+        for k in (4, 5):  # x1, y1
+            merged[k] = min(merged[k], b[k])
+        for k in (6, 7):  # x2, y2
+            merged[k] = max(merged[k], b[k])
+    confidences = [b[1] for b in boxes]
+    merged[0] = merge_labels(np.array([b[0] for b in boxes]), np.array(confidences))
+    if conf_type == "max":
+        merged[1] = np.max(confidences)
+    else:
+        merged[1] = np.mean(confidences)
+    merged[2] = sum(b[2] for b in boxes)
+    merged[3] = -1  # model index field is retained for consistency but is not used.
+    return merged
+def find_matching_box_fast(
+    boxes_list: npt.NDArray[np.float64],
+    new_box: npt.NDArray[np.float64],
+    match_iou: float,
+) -> Tuple[int, float]:
+    """
+    Finds the box with the highest IoU against new_box, using vectorized numpy.
+    Only columns 4: (x1, y1, x2, y2) of the rows are used.
+    Args:
+        boxes_list (np.ndarray): Array of boxes with shape (N, 8).
+        new_box (np.ndarray): New box to match with shape (8,).
+        match_iou (float): IoU threshold for matching.
+    Returns:
+        Tuple[int, float]: Index of the best matching box and its IoU, or
+        (-1, match_iou) when no box exceeds the threshold.
+    """
+    if boxes_list.shape[0] == 0:
+        return -1, match_iou
+    candidates = boxes_list[:, 4:]
+    target = new_box[4:]
+    # Intersection rectangle, floored at zero extent for disjoint boxes
+    inter_w = np.maximum(
+        np.minimum(candidates[:, 2], target[2]) - np.maximum(candidates[:, 0], target[0]), 0
+    )
+    inter_h = np.maximum(
+        np.minimum(candidates[:, 3], target[3]) - np.maximum(candidates[:, 1], target[1]), 0
+    )
+    inter_area = inter_w * inter_h
+    candidate_areas = (candidates[:, 2] - candidates[:, 0]) * (candidates[:, 3] - candidates[:, 1])
+    target_area = (target[2] - target[0]) * (target[3] - target[1])
+    ious = inter_area / (candidate_areas + target_area - inter_area)
+    best_idx = np.argmax(ious)
+    best_iou = ious[best_idx]
+    # A match must strictly exceed the threshold
+    if best_iou <= match_iou:
+        return -1, match_iou
+    return best_idx, best_iou
+def weighted_boxes_fusion(
+    boxes_list: List[npt.NDArray[np.float64]],
+    labels_list: List[npt.NDArray[np.int_]],
+    scores_list: List[npt.NDArray[np.float64]],
+    iou_thr: float = 0.5,
+    skip_box_thr: float = 0.0,
+    conf_type: Literal["avg", "max"] = "avg",
+    merge_type: Literal["weighted", "biggest"] = "weighted",
+    class_agnostic: bool = False,
+) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.int_], npt.NDArray[np.float64]]:
+    """
+    Custom WBF implementation that supports a class_agnostic mode and a biggest box fusion.
+    Boxes are expected to be in normalized (x0, y0, x1, y1) format.
+    All models are given the same weight (1).
+    Args:
+        boxes_list (list[np.ndarray[n x 4]]): List of boxes. One list per model.
+        labels_list (list[np.ndarray[n]]): List of labels.
+        scores_list (list[np.ndarray[n]]): List of confidences.
+        iou_thr (float, optional): IoU threshold for matching. Defaults to 0.5.
+        skip_box_thr (float, optional): Exclude boxes with score < skip_box_thr. Defaults to 0.0.
+        conf_type (str, optional): Confidence merging type ("avg" or "max"). Defaults to "avg".
+        merge_type (str, optional): Merge type ("weighted" or "biggest"). Defaults to "weighted".
+        class_agnostic (bool, optional): Merge boxes from different classes. Defaults to False.
+    Returns:
+        numpy.ndarray [N x 4]: Array of bounding boxes, sorted by decreasing score.
+        numpy.ndarray [N]: Array of labels (stored as floats in the fused array).
+        numpy.ndarray [N]: Array of scores.
+    """
+    weights = np.ones(len(boxes_list))
+    assert conf_type in ["avg", "max"], 'Conf type must be "avg" or "max"'
+    assert merge_type in ["weighted", "biggest"], 'Merge type must be "weighted" or "biggest"'
+    filtered_boxes = prefilter_boxes(
+        boxes_list,
+        scores_list,
+        labels_list,
+        weights,
+        skip_box_thr,
+        class_agnostic=class_agnostic,
+    )
+    if len(filtered_boxes) == 0:
+        return np.zeros((0, 4)), np.zeros((0,)), np.zeros((0,))
+    overall_boxes = []
+    for label in filtered_boxes:
+        boxes = filtered_boxes[label]
+        clusters = []
+        # Clusterize boxes
+        for j in range(len(boxes)):
+            # Compare box j against every other box of the group
+            ids = [i for i in range(len(boxes)) if i != j]
+            index, _ = find_matching_box_fast(boxes[ids], boxes[j], iou_thr)
+            if index != -1:
+                index = ids[index]  # back to an index into `boxes`
+                cluster_idx = [
+                    clust_idx
+                    for clust_idx, clust in enumerate(clusters)
+                    if (j in clust or index in clust)
+                ]
+                if len(cluster_idx):
+                    # Join the first cluster that already holds either box
+                    cluster_idx = cluster_idx[0]
+                    clusters[cluster_idx] = list(
+                        set(clusters[cluster_idx] + [index, j])
+                    )
+                else:
+                    clusters.append([index, j])
+            else:
+                clusters.append([j])
+        # Fuse each cluster into a single box
+        for c in clusters:
+            if merge_type == "weighted":
+                weighted_box = get_weighted_box(boxes[c], conf_type)
+            elif merge_type == "biggest":
+                weighted_box = get_biggest_box(boxes[c], conf_type)
+            if conf_type == "max":
+                weighted_box[1] = weighted_box[1] / weights.max()
+            else:  # avg
+                # Clusters supported by fewer boxes than there are models are penalized
+                weighted_box[1] = weighted_box[1] * len(c) / weights.sum()
+            overall_boxes.append(weighted_box)
+    overall_boxes = np.array(overall_boxes)
+    overall_boxes = overall_boxes[overall_boxes[:, 1].argsort()[::-1]]
+    boxes = overall_boxes[:, 4:]
+    scores = overall_boxes[:, 1]
+    labels = overall_boxes[:, 0]
+    return boxes, labels, scores

table_structure_v1.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import torch
+import torch.nn as nn
+from typing import List, Tuple
+class Exp:
+    """
+    Configuration class for the table structure model.
+    Holds the YOLOX architecture settings, the inference parameters and the
+    NMS / thresholding values, and builds the network on demand.
+    """
+    def __init__(self) -> None:
+        """Initialize the configuration with default parameters."""
+        # Identification, checkpoint and device
+        # NOTE(review): the name reads "page-element-v3" although this config describes
+        # the table structure model -- confirm whether this is intended.
+        self.name: str = "page-element-v3"
+        self.ckpt: str = "weights.pth"
+        if torch.cuda.is_available():
+            self.device: str = "cuda:0"
+        else:
+            self.device: str = "cpu"
+        # YOLOX architecture parameters
+        self.act: str = "silu"
+        self.depth: float = 1.00
+        self.width: float = 1.00
+        # Classes predicted by the model
+        self.labels: List[str] = [
+            "border",  # not used
+            "cell",
+            "row",
+            "column",
+            "header"  # not used
+        ]
+        self.num_classes: int = len(self.labels)
+        # Inference parameters
+        self.size: Tuple[int, int] = (1024, 1024)
+        self.min_bbox_size: int = 0
+        self.normalize_boxes: bool = True
+        # NMS & thresholding. These can be updated
+        self.conf_thresh: float = 0.01
+        self.iou_thresh: float = 0.25
+        self.class_agnostic: bool = False
+        self.threshold: float = 0.05
+    def get_model(self) -> nn.Module:
+        """
+        Get the YOLOX model.
+        The model is built on first call and cached on the instance; every call
+        (re)applies the batch-norm hyper-parameters before returning it.
+        Returns:
+            nn.Module: The YOLOX model with configured parameters.
+        """
+        from yolox import YOLOX, YOLOPAFPN, YOLOXHead
+        def _set_bn_hyperparams(module: nn.Module) -> None:
+            # Overrides eps / momentum on every BatchNorm2d below `module`
+            for sub in module.modules():
+                if not isinstance(sub, nn.BatchNorm2d):
+                    continue
+                sub.eps = 1e-3
+                sub.momentum = 0.03
+        # Build model (only once)
+        if getattr(self, "model", None) is None:
+            channels = [256, 512, 1024]
+            self.model = YOLOX(
+                YOLOPAFPN(self.depth, self.width, in_channels=channels, act=self.act),
+                YOLOXHead(self.num_classes, self.width, in_channels=channels, act=self.act),
+            )
+        # Update batch-norm parameters
+        self.model.apply(_set_bn_hyperparams)
+        return self.model

utils.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import numpy as np
+import pandas as pd
+import numpy.typing as npt
+import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
+from typing import Dict, List, Tuple, Optional, Union
+# Hex colors used when drawing boxes. plot_sample zips this list with the per-class
+# box lists and class labels, so color k is used for class k; because zip stops at
+# the shortest input, classes beyond len(COLORS) are not drawn.
+COLORS = [
+    "#003EFF",
+    "#FF8F00",
+    "#079700",
+    "#A123FF",
+    "#87CEEB",
+    "#FF5733",
+    "#C70039",
+    "#900C3F",
+    "#581845",
+    "#11998E",
+]
+def reformat_for_plotting(
+    boxes: npt.NDArray[np.float64],
+    labels: npt.NDArray[np.int_],
+    scores: npt.NDArray[np.float64],
+    shape: Tuple[int, int],
+    num_classes: int,
+) -> Tuple[List[npt.NDArray[np.int_]], List[npt.NDArray[np.float64]]]:
+    """
+    Converts normalized YOLOX predictions into per-class pixel boxes for plotting.
+    - Scales boxes to the image size and truncates them to integers.
+    - Converts [xmin, ymin, xmax, ymax] to [xmin, ymin, width, height].
+    - Splits boxes and scores into one array per class.
+    The input arrays are not modified.
+    Args:
+        boxes (np.ndarray [N, 4]): Array of bounding boxes in format [xmin, ymin, xmax, ymax].
+        labels (np.ndarray [N]): Array of labels.
+        scores (np.ndarray [N]): Array of confidence scores.
+        shape (tuple [2]): Shape of the image (height, width).
+        num_classes (int): Number of classes.
+    Returns:
+        list[np.ndarray]: Boxes of each class, in [xmin, ymin, width, height] pixels.
+        list[np.ndarray]: Confidence scores of each class.
+    """
+    img_h, img_w = shape[0], shape[1]
+    pixel_boxes = boxes.copy()
+    pixel_boxes[:, [0, 2]] *= img_w
+    pixel_boxes[:, [1, 3]] *= img_h
+    pixel_boxes = pixel_boxes.astype(int)
+    # Corner format -> (xmin, ymin, width, height)
+    pixel_boxes[:, 2] -= pixel_boxes[:, 0]
+    pixel_boxes[:, 3] -= pixel_boxes[:, 1]
+    boxes_per_class, scores_per_class = [], []
+    for c in range(num_classes):
+        in_class = labels == c
+        boxes_per_class.append(pixel_boxes[in_class])
+        scores_per_class.append(scores[in_class])
+    return boxes_per_class, scores_per_class
+def plot_sample(
+    img: npt.NDArray[np.uint8],
+    boxes_list: List[npt.NDArray[np.int_]],
+    confs_list: List[npt.NDArray[np.float64]],
+    labels: List[str],
+    show_text: bool = True,
+) -> None:
+    """
+    Plots an image with bounding boxes.
+    Coordinates are expected in format [x_min, y_min, width, height].
+    Draws on the current matplotlib axes; does not call plt.show().
+    Args:
+        img (numpy.ndarray): The input image to be plotted, [H x W] or [H x W x C].
+        boxes_list (list[np.ndarray]): List of box bounding boxes per class.
+        confs_list (list[np.ndarray]): List of confidence scores per class.
+        labels (list): List of class labels.
+        show_text (bool, optional): Whether to show the text. Defaults to True.
+    """
+    plt.imshow(img, cmap="gray")
+    plt.axis(False)
+    # Only the two leading dimensions are needed; slicing (rather than unpacking
+    # three values) also accepts 2-D grayscale images.
+    h, w = img.shape[:2]
+    for boxes, confs, col, label in zip(boxes_list, confs_list, COLORS, labels):
+        for box_idx, box in enumerate(boxes):
+            # Better display around boundaries
+            box = np.copy(box)
+            box[:2] = np.clip(box[:2], 2, max(h, w))
+            box[2] = min(box[2], w - 2 - box[0])
+            box[3] = min(box[3], h - 2 - box[1])
+            rect = Rectangle(
+                (box[0], box[1]),
+                box[2],
+                box[3],
+                linewidth=2,
+                facecolor="none",
+                edgecolor=col,
+            )
+            plt.gca().add_patch(rect)
+            # Add class and index label with proper alignment
+            if show_text:
+                plt.text(
+                    box[0], box[1],
+                    f"{label}_{box_idx}   conf={confs[box_idx]:.3f}",
+                    color='white',
+                    fontsize=8,
+                    bbox=dict(facecolor=col, alpha=1, edgecolor=col, pad=0, linewidth=2),
+                    verticalalignment='bottom',
+                    horizontalalignment='left'
+                )
def reorder_boxes(
    boxes: npt.NDArray[np.float64],
    labels: npt.NDArray[np.int_],
    classes: Optional[List[str]] = None,
    scores: Optional[npt.NDArray[np.float64]] = None,
) -> Union[
    Tuple[npt.NDArray[np.float64], npt.NDArray[np.int_]],
    Tuple[npt.NDArray[np.float64], npt.NDArray[np.int_], npt.NDArray[np.float64]],
]:
    """
    Reorder boxes, labels and scores by box coordinates.

    Boxes are grouped by class index (ascending). Within a class, boxes of the
    class named "column" are sorted by x first, all others by y first.
    Coordinates are made relative to the class minimum and rounded to two
    decimals before sorting.

    Args:
        boxes (np.ndarray [N, 4]): Array of bounding boxes in format [xmin, ymin, xmax, ymax].
        labels (np.ndarray [N]): Array of labels.
        classes (list, optional): List of class labels. When None, every class
            index from 0 to labels.max() is visited and sorted by y first.
            When given, only labels below len(classes) are kept.
        scores (np.ndarray [N], optional): Array of confidence scores. Defaults to None.
    Returns:
        np.ndarray [N, 4]: Ordered boxes in format [xmin, ymin, xmax, ymax].
        np.ndarray [N]: Ordered labels.
        np.ndarray [N]: Ordered scores if scores is not None.
    """
    if len(labels) == 0:
        # Nothing to order; labels.max() is undefined on empty input.
        if scores is not None:
            return boxes, labels, scores
        return boxes, labels

    # labels are 0-based indices, so the class count is max + 1.
    n_classes = int(labels.max()) + 1 if classes is None else len(classes)

    ordered_boxes, ordered_labels, ordered_scores = [], [], []
    for c in range(n_classes):
        in_class = labels == c
        boxes_class = boxes[in_class]
        if not len(boxes_class):
            continue

        # Without class names there is no way to spot columns: use y-first.
        is_column = classes is not None and classes[c] == "column"
        sort = ["x0", "y0"] if is_column else ["y0", "x0"]
        df_coords = pd.DataFrame({
            "y0": np.round(boxes_class[:, 1] - boxes_class[:, 1].min(), 2),
            "x0": np.round(boxes_class[:, 0] - boxes_class[:, 0].min(), 2),
        })
        idxs = df_coords.sort_values(sort).index

        ordered_boxes.append(boxes_class[idxs])
        ordered_labels.append(labels[in_class][idxs])
        if scores is not None:
            ordered_scores.append(scores[in_class][idxs])

    if not ordered_boxes:
        # Every label fell outside the provided classes; np.concatenate
        # rejects an empty list, so return empty slices instead.
        if scores is not None:
            return boxes[:0], labels[:0], scores[:0]
        return boxes[:0], labels[:0]

    ordered_boxes = np.concatenate(ordered_boxes)
    ordered_labels = np.concatenate(ordered_labels)
    if scores is not None:
        ordered_scores = np.concatenate(ordered_scores)
        return ordered_boxes, ordered_labels, ordered_scores
    return ordered_boxes, ordered_labels
def postprocess_preds_table_structure(
    preds: Dict[str, npt.NDArray],
    threshold: float = 0.1,
    class_labels: Optional[List[str]] = None,
    reorder: bool = True,
) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.int_], npt.NDArray[np.float64]]:
    """
    Post-process raw predictions of the table structure model.

    Steps:
    - Move the predictions to CPU and convert them to numpy.
    - Keep only detections whose score is strictly above the threshold.
    - Optionally reorder the remaining boxes with reorder_boxes.

    Args:
        preds (dict): Predictions with keys "boxes", "labels" and "scores".
            Each value must support `.cpu().numpy()`.
        threshold (float, optional): Confidence threshold. Defaults to 0.1.
        class_labels (list, optional): Class names forwarded to reorder_boxes. Defaults to None.
        reorder (bool, optional): Whether to reorder the kept boxes. Defaults to True.
    Returns:
        numpy.ndarray [N x 4]: Array of bounding boxes.
        numpy.ndarray [N]: Array of labels.
        numpy.ndarray [N]: Array of scores.
    """
    boxes, labels, scores = (
        preds[key].cpu().numpy() for key in ("boxes", "labels", "scores")
    )

    # One mask, applied to all three arrays
    keep = scores > threshold
    boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

    # Reordering is skipped when nothing survived the threshold
    if len(boxes) > 0 and reorder:
        return reorder_boxes(boxes, labels, class_labels, scores)
    return boxes, labels, scores

yolox/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+from .yolox import YOLOX

yolox/boxes.py ADDED Viewed

	@@ -0,0 +1,55 @@

+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+import torch
+import torchvision
def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
    """
    Copied from YOLOX/yolox/utils/boxes.py

    Filters raw detections by confidence and applies non-maximum suppression.

    NOTE: the first four channels of `prediction` are converted from
    (cx, cy, w, h) to (x1, y1, x2, y2) in place, so the caller's tensor is
    modified.

    Args:
        prediction (Tensor [B, N, 5 + num_classes]): per-candidate
            (cx, cy, w, h, obj_conf, class scores...).
        num_classes (int): number of class score channels to read.
        conf_thre (float): minimum obj_conf * class_conf to keep a candidate.
        nms_thre (float): IoU threshold passed to NMS.
        class_agnostic (bool): if True, NMS ignores the class; otherwise NMS is
            run per class.
    Returns:
        list: one entry per image, either None (no detection kept) or a tensor
        [M, 7] ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred).
    """
    # Center/size -> corner format, written back into the input tensor.
    centers = prediction[:, :, :2]
    half_sizes = prediction[:, :, 2:4] / 2
    prediction[:, :, :4] = torch.cat((centers - half_sizes, centers + half_sizes), dim=2)

    output = [None for _ in range(len(prediction))]
    for i, image_pred in enumerate(prediction):
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue

        # Get score and class with highest confidence
        class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)

        # Squeeze only the class axis: a bare squeeze() would collapse the mask
        # to 0-dim for a single candidate and add an axis when indexing.
        conf_mask = image_pred[:, 4] * class_conf.squeeze(1) >= conf_thre

        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
        detections = detections[conf_mask]
        if not detections.size(0):
            continue

        nms_scores = detections[:, 4] * detections[:, 5]
        if class_agnostic:
            nms_out_index = torchvision.ops.nms(
                detections[:, :4],
                nms_scores,
                nms_thre,
            )
        else:
            nms_out_index = torchvision.ops.batched_nms(
                detections[:, :4],
                nms_scores,
                detections[:, 6],
                nms_thre,
            )

        # Each image index is visited once, so the slot is always still None.
        output[i] = detections[nms_out_index]
    return output

yolox/darknet.py ADDED Viewed

	@@ -0,0 +1,179 @@

+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+from torch import nn
+from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
class Darknet(nn.Module):
    """Darknet backbone made of a stem followed by four residual stages."""

    # number of blocks from dark2 to dark5.
    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}

    def __init__(
        self,
        depth,
        in_channels=3,
        stem_out_channels=32,
        out_features=("dark3", "dark4", "dark5"),
    ):
        """
        Args:
            depth (int): depth of darknet used in model; must be a key of
                `depth2blocks` (21 or 53).
            in_channels (int): number of input channels, for example, use 3 for RGB image.
            stem_out_channels (int): number of output channels of darknet stem.
                It decides channels of darknet layer2 to layer5.
            out_features (Tuple[str]): names of the stages returned by `forward`.
        """
        super().__init__()
        assert out_features, "please provide output features of Darknet"
        self.out_features = out_features

        stem_layers = [
            BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
        ]
        self.stem = nn.Sequential(*stem_layers)

        blocks_per_stage = Darknet.depth2blocks[depth]
        # Each group layer doubles its input width; stages are spelled out one
        # by one (no loop) to keep the structure easy to read.
        stage_in = stem_out_channels * 2  # 64
        self.dark2 = nn.Sequential(
            *self.make_group_layer(stage_in, blocks_per_stage[0], stride=2)
        )
        stage_in *= 2  # 128
        self.dark3 = nn.Sequential(
            *self.make_group_layer(stage_in, blocks_per_stage[1], stride=2)
        )
        stage_in *= 2  # 256
        self.dark4 = nn.Sequential(
            *self.make_group_layer(stage_in, blocks_per_stage[2], stride=2)
        )
        stage_in *= 2  # 512
        self.dark5 = nn.Sequential(
            *self.make_group_layer(stage_in, blocks_per_stage[3], stride=2),
            *self.make_spp_block([stage_in, stage_in * 2], stage_in * 2),
        )

    def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
        "starts with conv layer then has `num_blocks` `ResLayer`"
        out_channels = in_channels * 2
        layers = [BaseConv(in_channels, out_channels, ksize=3, stride=stride, act="lrelu")]
        for _ in range(num_blocks):
            layers.append(ResLayer(out_channels))
        return layers

    def make_spp_block(self, filters_list, in_filters):
        """Build the conv / SPP / conv stack appended to the last stage."""
        narrow, wide = filters_list[0], filters_list[1]
        return nn.Sequential(
            BaseConv(in_filters, narrow, 1, stride=1, act="lrelu"),
            BaseConv(narrow, wide, 3, stride=1, act="lrelu"),
            SPPBottleneck(
                in_channels=wide,
                out_channels=narrow,
                activation="lrelu",
            ),
            BaseConv(narrow, wide, 3, stride=1, act="lrelu"),
            BaseConv(wide, narrow, 1, stride=1, act="lrelu"),
        )

    def forward(self, x):
        """Run all stages in order and return those listed in `out_features`."""
        feats = {}
        for stage in ("stem", "dark2", "dark3", "dark4", "dark5"):
            x = getattr(self, stage)(x)
            feats[stage] = x
        return {name: feat for name, feat in feats.items() if name in self.out_features}
class CSPDarknet(nn.Module):
    """Backbone built from a Focus stem followed by four conv + CSPLayer stages."""

    def __init__(
        self,
        dep_mul,
        wid_mul,
        out_features=("dark3", "dark4", "dark5"),
        depthwise=False,
        act="silu",
    ):
        """
        Args:
            dep_mul (float): depth multiplier; scales the number of bottlenecks per CSPLayer.
            wid_mul (float): width multiplier; scales the channel count of every stage.
            out_features (Tuple[str]): names of the stages returned by `forward`.
            depthwise (bool): use DWConv instead of BaseConv for the strided convs.
            act (str): activation name forwarded to the conv blocks.
        """
        super().__init__()
        assert out_features, "please provide output features of Darknet"
        self.out_features = out_features
        Conv = DWConv if depthwise else BaseConv

        base_channels = int(wid_mul * 64)  # 64
        base_depth = max(round(dep_mul * 3), 1)  # 3
        # Channel widths of stem, dark2, dark3, dark4 and dark5
        c1, c2, c3, c4, c5 = (base_channels * mult for mult in (1, 2, 4, 8, 16))

        # stem
        self.stem = Focus(3, c1, ksize=3, act=act)

        # dark2
        self.dark2 = nn.Sequential(
            Conv(c1, c2, 3, 2, act=act),
            CSPLayer(c2, c2, n=base_depth, depthwise=depthwise, act=act),
        )

        # dark3
        self.dark3 = nn.Sequential(
            Conv(c2, c3, 3, 2, act=act),
            CSPLayer(c3, c3, n=base_depth * 3, depthwise=depthwise, act=act),
        )

        # dark4
        self.dark4 = nn.Sequential(
            Conv(c3, c4, 3, 2, act=act),
            CSPLayer(c4, c4, n=base_depth * 3, depthwise=depthwise, act=act),
        )

        # dark5
        self.dark5 = nn.Sequential(
            Conv(c4, c5, 3, 2, act=act),
            SPPBottleneck(c5, c5, activation=act),
            CSPLayer(
                c5,
                c5,
                n=base_depth,
                shortcut=False,
                depthwise=depthwise,
                act=act,
            ),
        )

    def forward(self, x):
        """Run all stages in order and return those listed in `out_features`."""
        feats = {}
        for stage in ("stem", "dark2", "dark3", "dark4", "dark5"):
            x = getattr(self, stage)(x)
            feats[stage] = x
        return {name: feat for name, feat in feats.items() if name in self.out_features}

yolox/network_blocks.py ADDED Viewed

	@@ -0,0 +1,210 @@

+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import torch
+import torch.nn as nn
class SiLU(nn.Module):
    """export-friendly version of nn.SiLU()"""

    @staticmethod
    def forward(x):
        """Return x * sigmoid(x)."""
        gate = torch.sigmoid(x)
        return x * gate
def get_activation(name="silu", inplace=True):
    """
    Build the activation module identified by `name`.

    Args:
        name (str): one of "silu", "relu" or "lrelu" (LeakyReLU with slope 0.1).
        inplace (bool): forwarded to the activation constructor.
    Returns:
        nn.Module: the activation module.
    Raises:
        AttributeError: if `name` is not a supported activation.
    """
    if name == "silu":
        return nn.SiLU(inplace=inplace)
    if name == "relu":
        return nn.ReLU(inplace=inplace)
    if name == "lrelu":
        return nn.LeakyReLU(0.1, inplace=inplace)
    raise AttributeError("Unsupported act type: {}".format(name))
class BaseConv(nn.Module):
    """A Conv2d -> Batchnorm -> silu/leaky relu block"""

    def __init__(
        self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"
    ):
        """
        Args:
            in_channels (int): input channels of the convolution.
            out_channels (int): output channels of the convolution.
            ksize (int): kernel size; padding is (ksize - 1) // 2.
            stride (int): convolution stride.
            groups (int): convolution groups.
            bias (bool): whether the convolution has a bias term.
            act (str): activation name passed to `get_activation`.
        """
        super().__init__()
        conv_kwargs = dict(
            kernel_size=ksize,
            stride=stride,
            padding=(ksize - 1) // 2,  # same padding
            groups=groups,
            bias=bias,
        )
        self.conv = nn.Conv2d(in_channels, out_channels, **conv_kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = get_activation(act, inplace=True)

    def forward(self, x):
        """Apply convolution, batch norm and activation."""
        x = self.conv(x)
        x = self.bn(x)
        return self.act(x)

    def fuseforward(self, x):
        """Apply convolution and activation only, skipping the batch norm."""
        x = self.conv(x)
        return self.act(x)
class DWConv(nn.Module):
    """Depthwise Conv + Conv"""

    def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
        """
        Args:
            in_channels (int): input channels; also the group count of the depthwise conv.
            out_channels (int): output channels produced by the 1x1 pointwise conv.
            ksize (int): kernel size of the depthwise conv.
            stride (int): stride of the depthwise conv.
            act (str): activation name used by both convs.
        """
        super().__init__()
        # Depthwise: one group per input channel, channel count unchanged
        self.dconv = BaseConv(
            in_channels, in_channels, ksize=ksize, stride=stride, groups=in_channels, act=act
        )
        # Pointwise: 1x1 conv that changes the channel count
        self.pconv = BaseConv(
            in_channels, out_channels, ksize=1, stride=1, groups=1, act=act
        )

    def forward(self, x):
        """Apply the depthwise conv followed by the pointwise conv."""
        return self.pconv(self.dconv(x))
class Bottleneck(nn.Module):
    """Standard bottleneck: 1x1 conv, 3x3 conv and an optional residual add."""

    def __init__(
        self,
        in_channels,
        out_channels,
        shortcut=True,
        expansion=0.5,
        depthwise=False,
        act="silu",
    ):
        """
        Args:
            in_channels (int): input channels.
            out_channels (int): output channels.
            shortcut (bool): request a residual connection; only used when
                input and output channel counts match.
            expansion (float): ratio of hidden to output channels.
            depthwise (bool): use DWConv for the 3x3 conv.
            act (str): activation name.
        """
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        conv3x3_cls = DWConv if depthwise else BaseConv
        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
        self.conv2 = conv3x3_cls(hidden_channels, out_channels, 3, stride=1, act=act)
        self.use_add = shortcut and in_channels == out_channels

    def forward(self, x):
        """Apply both convs and add the input when the shortcut is active."""
        out = self.conv2(self.conv1(x))
        return out + x if self.use_add else out
class ResLayer(nn.Module):
    "Residual layer with `in_channels` inputs."

    def __init__(self, in_channels: int):
        super().__init__()
        # Squeeze to half the channels with a 1x1 conv, then restore with a 3x3 conv
        half = in_channels // 2
        self.layer1 = BaseConv(in_channels, half, ksize=1, stride=1, act="lrelu")
        self.layer2 = BaseConv(half, in_channels, ksize=3, stride=1, act="lrelu")

    def forward(self, x):
        """Return the input plus the output of the two conv layers."""
        residual = self.layer1(x)
        residual = self.layer2(residual)
        return x + residual
class SPPBottleneck(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP"""

    def __init__(
        self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"
    ):
        """
        Args:
            in_channels (int): input channels.
            out_channels (int): output channels.
            kernel_sizes (Tuple[int]): kernel size of each stride-1 max-pool branch.
            activation (str): activation name used by the two 1x1 convs.
        """
        super().__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
        pools = []
        for ks in kernel_sizes:
            pools.append(nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2))
        self.m = nn.ModuleList(pools)
        # conv2 consumes the un-pooled features plus one map per pooling branch
        n_branches = len(kernel_sizes) + 1
        self.conv2 = BaseConv(
            hidden_channels * n_branches, out_channels, 1, stride=1, act=activation
        )

    def forward(self, x):
        """Reduce channels, concatenate pooled variants and project to out_channels."""
        reduced = self.conv1(x)
        pyramid = [reduced, *(pool(reduced) for pool in self.m)]
        return self.conv2(torch.cat(pyramid, dim=1))
class CSPLayer(nn.Module):
    """C3 in yolov5, CSP Bottleneck with 3 convolutions"""

    def __init__(
        self,
        in_channels,
        out_channels,
        n=1,
        shortcut=True,
        expansion=0.5,
        depthwise=False,
        act="silu",
    ):
        """
        Args:
            in_channels (int): input channels.
            out_channels (int): output channels.
            n (int): number of Bottlenecks. Default value: 1.
            shortcut (bool): forwarded to every Bottleneck.
            expansion (float): ratio of hidden to output channels.
            depthwise (bool): forwarded to every Bottleneck.
            act (str): activation name.
        """
        super().__init__()
        hidden = int(out_channels * expansion)  # hidden channels
        self.conv1 = BaseConv(in_channels, hidden, 1, stride=1, act=act)
        self.conv2 = BaseConv(in_channels, hidden, 1, stride=1, act=act)
        self.conv3 = BaseConv(2 * hidden, out_channels, 1, stride=1, act=act)
        bottlenecks = []
        for _ in range(n):
            bottlenecks.append(
                Bottleneck(hidden, hidden, shortcut, 1.0, depthwise, act=act)
            )
        self.m = nn.Sequential(*bottlenecks)

    def forward(self, x):
        """Run the bottleneck branch and the bypass branch, then fuse them."""
        deep = self.conv1(x)
        bypass = self.conv2(x)
        deep = self.m(deep)
        return self.conv3(torch.cat((deep, bypass), dim=1))
class Focus(nn.Module):
    """Focus width and height information into channel space."""

    def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"):
        super().__init__()
        # The four interleaved sub-images are stacked on the channel axis
        self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act)

    def forward(self, x):
        """
        Take every second element along the last two axes at the four possible
        offsets, concatenate the patches on dim 1 and apply the conv.
        """
        # Order: top-left, bottom-left, top-right, bottom-right
        patches = [
            x[..., row::2, col::2]
            for col in (0, 1)
            for row in (0, 1)
        ]
        return self.conv(torch.cat(patches, dim=1))

yolox/yolo_fpn.py ADDED Viewed

	@@ -0,0 +1,84 @@

+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import torch
+import torch.nn as nn
+from .darknet import Darknet
+from .network_blocks import BaseConv
class YOLOFPN(nn.Module):
    """
    YOLOFPN module. Darknet 53 is the default backbone of this model.
    """

    def __init__(
        self,
        depth=53,
        in_features=["dark3", "dark4", "dark5"],
    ):
        """
        Args:
            depth (int): depth passed to the Darknet backbone.
            in_features (list[str]): backbone outputs consumed by `forward`,
                ordered from the shallowest to the deepest stage.
        """
        super().__init__()
        self.backbone = Darknet(depth)
        self.in_features = in_features

        # out 1
        self.out1_cbl = self._make_cbl(512, 256, 1)
        self.out1 = self._make_embedding([256, 512], 512 + 256)

        # out 2
        self.out2_cbl = self._make_cbl(256, 128, 1)
        self.out2 = self._make_embedding([128, 256], 256 + 128)

        # upsample
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

    def _make_cbl(self, _in, _out, ks):
        """Build a stride-1 conv block with leaky-relu activation."""
        return BaseConv(_in, _out, ks, stride=1, act="lrelu")

    def _make_embedding(self, filters_list, in_filters):
        """Build five conv blocks alternating 1x1 and 3x3 kernels."""
        narrow, wide = filters_list[0], filters_list[1]
        return nn.Sequential(
            self._make_cbl(in_filters, narrow, 1),
            self._make_cbl(narrow, wide, 3),
            self._make_cbl(wide, narrow, 1),
            self._make_cbl(narrow, wide, 3),
            self._make_cbl(wide, narrow, 1),
        )

    def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"):
        """Load a state dict from `filename` (on CPU) into the backbone."""
        with open(filename, "rb") as weights_file:
            backbone_state = torch.load(weights_file, map_location="cpu")
        print("loading pretrained weights...")
        self.backbone.load_state_dict(backbone_state)

    def forward(self, inputs):
        """
        Args:
            inputs (Tensor): input image.
        Returns:
            Tuple[Tensor]: FPN output features, ordered as
            (out_dark3, out_dark4, deepest backbone feature).
        """
        #  backbone
        backbone_feats = self.backbone(inputs)
        shallow, middle, deep = [backbone_feats[name] for name in self.in_features]

        #  yolo branch 1: reduce + upsample the deepest map, merge with the middle one
        merged_mid = self.upsample(self.out1_cbl(deep))
        merged_mid = torch.cat([merged_mid, middle], 1)
        out_dark4 = self.out1(merged_mid)

        #  yolo branch 2: same pattern one level up
        merged_shallow = self.upsample(self.out2_cbl(out_dark4))
        merged_shallow = torch.cat([merged_shallow, shallow], 1)
        out_dark3 = self.out2(merged_shallow)

        return (out_dark3, out_dark4, deep)

yolox/yolo_head.py ADDED Viewed

	@@ -0,0 +1,235 @@

+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import torch
+import torch.nn as nn
+from .network_blocks import BaseConv, DWConv
+_TORCH_VER = [int(x) for x in torch.__version__.split(".")[:2]]
+def meshgrid(*tensors):
+    """
+    Copied from YOLOX/yolox/utils/compat.py
+    """
+    if _TORCH_VER >= [1, 10]:
+        return torch.meshgrid(*tensors, indexing="ij")
+    else:
+        return torch.meshgrid(*tensors)
def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
    """
    Copied from YOLOX/yolox/utils/boxes.py

    Pairwise IoU between two sets of boxes.

    Args:
        bboxes_a (Tensor [A, 4]): first set of boxes.
        bboxes_b (Tensor [B, 4]): second set of boxes.
        xyxy (bool): True if boxes are (x1, y1, x2, y2); False if they are
            (cx, cy, w, h).
    Returns:
        Tensor [A, B]: IoU of every box in `bboxes_a` with every box in `bboxes_b`.
    Raises:
        IndexError: if either input does not have 4 columns.
    """
    if not (bboxes_a.shape[1] == 4 and bboxes_b.shape[1] == 4):
        raise IndexError

    if xyxy:
        tl_a, br_a = bboxes_a[:, :2], bboxes_a[:, 2:]
        tl_b, br_b = bboxes_b[:, :2], bboxes_b[:, 2:]
        area_a = torch.prod(br_a - tl_a, 1)
        area_b = torch.prod(br_b - tl_b, 1)
    else:
        half_a = bboxes_a[:, 2:] / 2
        half_b = bboxes_b[:, 2:] / 2
        tl_a, br_a = bboxes_a[:, :2] - half_a, bboxes_a[:, :2] + half_a
        tl_b, br_b = bboxes_b[:, :2] - half_b, bboxes_b[:, :2] + half_b
        area_a = torch.prod(bboxes_a[:, 2:], 1)
        area_b = torch.prod(bboxes_b[:, 2:], 1)

    # Broadcast A against B to get the corners of every pairwise intersection
    tl = torch.max(tl_a[:, None, :], tl_b)
    br = torch.min(br_a[:, None, :], br_b)

    # 1 where the intersection is non-empty on both axes, 0 otherwise
    overlaps = (tl < br).type(tl.type()).prod(dim=2)
    area_i = torch.prod(br - tl, 2) * overlaps
    return area_i / (area_a[:, None] + area_b - area_i)
class YOLOXHead(nn.Module):
    """Detection head with separate classification and regression towers per level."""

    def __init__(
        self,
        num_classes,
        width=1.0,
        strides=[8, 16, 32],
        in_channels=[256, 512, 1024],
        act="silu",
        depthwise=False,
    ):
        """
        Args:
            num_classes (int): number of class score channels predicted per location.
            width (float): multiplier applied to all channel counts.
            strides (list[int]): stride of each input feature level.
            in_channels (list[int]): channel count of each input level before `width` scaling.
            act (str): activation type of conv. Default value: "silu".
            depthwise (bool): whether apply depthwise conv in conv branch. Default value: False.
        """
        super().__init__()

        self.num_classes = num_classes
        self.decode_in_inference = True  # for deploy, set to False

        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()
        self.obj_preds = nn.ModuleList()
        self.stems = nn.ModuleList()
        Conv = DWConv if depthwise else BaseConv
        hidden = int(256 * width)

        def conv_tower():
            # Two stacked 3x3 conv blocks at the hidden width
            return nn.Sequential(
                Conv(in_channels=hidden, out_channels=hidden, ksize=3, stride=1, act=act),
                Conv(in_channels=hidden, out_channels=hidden, ksize=3, stride=1, act=act),
            )

        def pred_conv(n_out):
            # Plain 1x1 conv producing the raw predictions
            return nn.Conv2d(
                in_channels=hidden,
                out_channels=n_out,
                kernel_size=1,
                stride=1,
                padding=0,
            )

        for level_channels in in_channels:
            self.stems.append(
                BaseConv(
                    in_channels=int(level_channels * width),
                    out_channels=hidden,
                    ksize=1,
                    stride=1,
                    act=act,
                )
            )
            self.cls_convs.append(conv_tower())
            self.reg_convs.append(conv_tower())
            self.cls_preds.append(pred_conv(self.num_classes))
            self.reg_preds.append(pred_conv(4))
            self.obj_preds.append(pred_conv(1))

        self.use_l1 = False
        self.l1_loss = nn.L1Loss(reduction="none")
        self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none")
        self.iou_loss = None
        self.strides = strides
        self.grids = [torch.zeros(1)] * len(in_channels)

    def forward(self, xin, labels=None, imgs=None):
        """
        Run the head on every feature level and concatenate the predictions.

        Args:
            xin (sequence of Tensor): one feature map per level.
            labels, imgs: unused here.
        Returns:
            Tensor [batch, n_anchors_all, 5 + num_classes]: decoded with
            `decode_outputs` when `decode_in_inference` is True, raw otherwise.
        """
        per_level = []
        levels = zip(self.cls_convs, self.reg_convs, self.strides, xin)
        for k, (cls_tower, reg_tower, _stride, feat) in enumerate(levels):
            stem_out = self.stems[k](feat)

            cls_logits = self.cls_preds[k](cls_tower(stem_out))

            reg_feat = reg_tower(stem_out)
            reg_out = self.reg_preds[k](reg_feat)
            obj_logits = self.obj_preds[k](reg_feat)

            per_level.append(
                torch.cat([reg_out, obj_logits.sigmoid(), cls_logits.sigmoid()], 1)
            )

        # Spatial size of each level, needed later to rebuild the grids
        self.hw = [level.shape[-2:] for level in per_level]
        # [batch, n_anchors_all, 5 + num_classes]
        flat = torch.cat(
            [level.flatten(start_dim=2) for level in per_level], dim=2
        ).permute(0, 2, 1)

        if not self.decode_in_inference:
            return flat
        return self.decode_outputs(flat, dtype=xin[0].type())

    def get_output_and_grid(self, output, k, stride, dtype):
        """
        Decode one level's raw output and return it with its grid.

        The grid for level `k` is cached in `self.grids` and rebuilt when the
        spatial size of `output` differs from the cached one.
        """
        cached_grid = self.grids[k]

        n_batch = output.shape[0]
        n_ch = 5 + self.num_classes
        hsize, wsize = output.shape[-2:]
        if cached_grid.shape[2:4] != output.shape[2:4]:
            yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
            cached_grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype)
            self.grids[k] = cached_grid

        output = output.view(n_batch, 1, n_ch, hsize, wsize)
        output = output.permute(0, 1, 3, 4, 2).reshape(
            n_batch, hsize * wsize, -1
        )
        flat_grid = cached_grid.view(1, -1, 2)
        # Offsets -> positions (grid + offset) * stride; log-sizes -> exp * stride
        output[..., :2] = (output[..., :2] + flat_grid) * stride
        output[..., 2:4] = torch.exp(output[..., 2:4]) * stride
        return output, flat_grid

    def decode_outputs(self, outputs, dtype):
        """
        Decode the concatenated predictions of all levels.

        Uses the per-level sizes stored in `self.hw` by `forward` to rebuild
        the grids; the first two channels become (grid + offset) * stride, the
        next two exp(value) * stride, and the rest is passed through.
        """
        level_grids, level_strides = [], []
        for (hsize, wsize), stride in zip(self.hw, self.strides):
            yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
            level_grid = torch.stack((xv, yv), 2).view(1, -1, 2)
            level_grids.append(level_grid)
            level_strides.append(torch.full((*level_grid.shape[:2], 1), stride))

        all_grids = torch.cat(level_grids, dim=1).type(dtype)
        all_strides = torch.cat(level_strides, dim=1).type(dtype)

        centers = (outputs[..., 0:2] + all_grids) * all_strides
        sizes = torch.exp(outputs[..., 2:4]) * all_strides
        return torch.cat([centers, sizes, outputs[..., 4:]], dim=-1)

yolox/yolo_pafpn.py ADDED Viewed

	@@ -0,0 +1,116 @@

+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import torch
+import torch.nn as nn
+from .darknet import CSPDarknet
+from .network_blocks import BaseConv, CSPLayer, DWConv
class YOLOPAFPN(nn.Module):
    """
    Feature-fusion neck on top of a ``CSPDarknet`` backbone.

    Three backbone feature maps are fused in two passes: a top-down pass
    (1x1 conv, upsample by 2, concatenate with the next finer map, CSP layer)
    followed by a bottom-up pass (stride-2 conv, concatenate with the
    matching top-down map, CSP layer). One fused map per level is returned.

    Args:
        depth: Depth multiplier. Forwarded to the backbone and used as
            ``round(3 * depth)`` for the third positional argument of every
            ``CSPLayer`` (presumably the number of bottleneck blocks --
            confirm against ``network_blocks.CSPLayer``).
        width: Channel multiplier applied to every entry of ``in_channels``.
        in_features: Keys used to pick the three maps out of the backbone's
            output mapping, ordered from finest to coarsest.
        in_channels: Unscaled channel counts of those three maps, in the same
            order as ``in_features``.
        depthwise: If true, the two bottom-up convolutions use ``DWConv``
            instead of ``BaseConv``; the flag is also forwarded to the
            backbone and to every ``CSPLayer``.
        act: Activation name forwarded to every sub-module.
    """

    def __init__(
        self,
        depth=1.0,
        width=1.0,
        in_features=("dark3", "dark4", "dark5"),
        in_channels=(256, 512, 1024),
        depthwise=False,
        act="silu",
    ):
        super().__init__()
        self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
        self.in_features = in_features
        # Keep a private list: the default is now an immutable tuple, and a
        # caller-supplied list is copied so instances never share state.
        self.in_channels = list(in_channels)
        Conv = DWConv if depthwise else BaseConv

        # Channel counts after width scaling, finest -> coarsest level.
        ch_fine = int(in_channels[0] * width)
        ch_mid = int(in_channels[1] * width)
        ch_coarse = int(in_channels[2] * width)
        n_blocks = round(3 * depth)

        # NOTE: sub-modules are created in the original order and under the
        # original attribute names so state_dict keys and init order match.
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

        # top-down path
        self.lateral_conv0 = BaseConv(ch_coarse, ch_mid, 1, 1, act=act)
        self.C3_p4 = CSPLayer(
            int(2 * in_channels[1] * width),
            ch_mid,
            n_blocks,
            False,
            depthwise=depthwise,
            act=act,
        )  # cat
        self.reduce_conv1 = BaseConv(ch_mid, ch_fine, 1, 1, act=act)
        self.C3_p3 = CSPLayer(
            int(2 * in_channels[0] * width),
            ch_fine,
            n_blocks,
            False,
            depthwise=depthwise,
            act=act,
        )

        # bottom-up conv
        self.bu_conv2 = Conv(ch_fine, ch_fine, 3, 2, act=act)
        self.C3_n3 = CSPLayer(
            int(2 * in_channels[0] * width),
            ch_mid,
            n_blocks,
            False,
            depthwise=depthwise,
            act=act,
        )

        # bottom-up conv
        self.bu_conv1 = Conv(ch_mid, ch_mid, 3, 2, act=act)
        self.C3_n4 = CSPLayer(
            int(2 * in_channels[1] * width),
            ch_coarse,
            n_blocks,
            False,
            depthwise=depthwise,
            act=act,
        )

    def forward(self, input):
        """
        Args:
            input: input images, passed unchanged to the backbone.
        Returns:
            Tuple[Tensor]: the three fused feature maps, ordered from the
            finest level to the coarsest.
        """
        # backbone: select the three requested maps (raises ValueError on
        # unpacking if ``in_features`` does not name exactly three of them)
        out_features = self.backbone(input)
        x2, x1, x0 = (out_features[f] for f in self.in_features)

        # Trailing comments give channels/stride for width=1.0.
        # top-down pass
        fpn_out0 = self.lateral_conv0(x0)  # 1024->512/32
        f_out0 = self.upsample(fpn_out0)  # 512/16
        f_out0 = torch.cat([f_out0, x1], 1)  # 512->1024/16
        f_out0 = self.C3_p4(f_out0)  # 1024->512/16

        fpn_out1 = self.reduce_conv1(f_out0)  # 512->256/16
        f_out1 = self.upsample(fpn_out1)  # 256/8
        f_out1 = torch.cat([f_out1, x2], 1)  # 256->512/8
        pan_out2 = self.C3_p3(f_out1)  # 512->256/8

        # bottom-up pass
        p_out1 = self.bu_conv2(pan_out2)  # 256->256/16
        p_out1 = torch.cat([p_out1, fpn_out1], 1)  # 256->512/16
        pan_out1 = self.C3_n3(p_out1)  # 512->512/16

        p_out0 = self.bu_conv1(pan_out1)  # 512->512/32
        p_out0 = torch.cat([p_out0, fpn_out0], 1)  # 512->1024/32
        pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32

        return (pan_out2, pan_out1, pan_out0)

yolox/yolox.py ADDED Viewed

	@@ -0,0 +1,32 @@

+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import torch.nn as nn
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
class YOLOX(nn.Module):
    """
    YOLOX model module: a feature-pyramid backbone followed by a detection head.

    This copy is inference-only. ``forward`` refuses to run while the module
    is in training mode; call ``.eval()`` first.

    Args:
        backbone: Module producing the feature maps consumed by ``head``.
            Defaults to ``YOLOPAFPN()``.
        head: Module turning those feature maps into detection outputs.
            Defaults to ``YOLOXHead(80)`` (presumably 80 classes -- confirm
            against ``YOLOXHead``).
    """

    def __init__(self, backbone=None, head=None):
        super().__init__()
        # Defaults are built lazily so that passing explicit modules never
        # instantiates the (large) default networks.
        self.backbone = YOLOPAFPN() if backbone is None else backbone
        self.head = YOLOXHead(80) if head is None else head

    def forward(self, x, targets=None):
        """
        Run backbone then head on ``x`` and return the head's output.

        Args:
            x: Input passed to the backbone.
            targets: Unused; kept for signature compatibility with callers.
        Raises:
            AssertionError: If the module is in training mode.
        """
        # Explicit raise instead of ``assert`` so the guard survives
        # ``python -O``; exception type and message are unchanged.
        if self.training:
            raise AssertionError("Training mode not supported, please refer to the YOLOX repo")
        return self.head(self.backbone(x))