YOLOMH

File size: 3,003 Bytes

7f44b94

from typing import Dict, List, Any
from PIL import Image
import base64
import io
import os
import torch

class EndpointHandler:
    def __init__(self, path=""):
        from doclayout_yolo import YOLOv10
        
        # Load model from repo path
        model_path = os.path.join(path, "doclayout_yolo_docstructbench_imgsz1024.pt")
        self.model = YOLOv10(model_path)
        
        # Label mapping
        self.id_to_names = {
            0: 'title',
            1: 'plain_text',
            2: 'abandon',
            3: 'figure',
            4: 'figure_caption',
            5: 'table',
            6: 'table_caption',
            7: 'table_footnote',
            8: 'isolate_formula',
            9: 'formula_caption'
        }
        
        # Set device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process image and return layout detections.
        
        Args:
            data: Dictionary with:
                - "inputs": base64 encoded image string or PIL Image
                - "parameters" (optional): {
                    "confidence": float (default 0.2),
                    "iou_threshold": float (default 0.45)
                  }
        
        Returns:
            List of detections with label, score, and bounding box
        """
        # Get image from request
        image = data.get("inputs")
        
        # Get optional parameters
        params = data.get("parameters", {})
        conf_threshold = params.get("confidence", 0.2)
        iou_threshold = params.get("iou_threshold", 0.45)
        
        # Handle base64 encoded image
        if isinstance(image, str):
            # Remove data URL prefix if present
            if "base64," in image:
                image = image.split("base64,")[1]
            image = Image.open(io.BytesIO(base64.b64decode(image)))
        
        # Run inference
        results = self.model.predict(
            image,
            imgsz=1024,
            conf=conf_threshold,
            iou=iou_threshold,
            device=self.device
        )[0]
        
        # Format output
        detections = []
        boxes = results.boxes
        
        for i in range(len(boxes)):
            box = boxes[i]
            cls_id = int(box.cls.item())
            
            detections.append({
                "label": self.id_to_names.get(cls_id, f"class_{cls_id}"),
                "score": round(float(box.conf.item()), 4),
                "box": {
                    "x1": round(float(box.xyxy[0][0].item()), 2),
                    "y1": round(float(box.xyxy[0][1].item()), 2),
                    "x2": round(float(box.xyxy[0][2].item()), 2),
                    "y2": round(float(box.xyxy[0][3].item()), 2)
                }
            })
        
        # Sort by confidence score
        detections.sort(key=lambda x: x["score"], reverse=True)
        
        return detections