Spaces:

MikeTrizna
/

doctr_demo_fork

Running

File size: 7,256 Bytes

f3270e6

# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
import logging
from typing import Any

import numpy as np
from anyascii import anyascii
from PIL import Image, ImageDraw

from .fonts import get_font

__all__ = ["synthesize_page", "synthesize_kie_page"]


# Global variable to avoid multiple warnings
ROTATION_WARNING = False


def _warn_rotation(entry: dict[str, Any]) -> None:  # pragma: no cover
    global ROTATION_WARNING
    if not ROTATION_WARNING and len(entry["geometry"]) == 4:
        logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
        ROTATION_WARNING = True


def _synthesize(
    response: Image.Image,
    entry: dict[str, Any],
    w: int,
    h: int,
    draw_proba: bool = False,
    font_family: str | None = None,
    smoothing_factor: float = 0.75,
    min_font_size: int = 6,
    max_font_size: int = 50,
) -> Image.Image:
    if len(entry["geometry"]) == 2:
        (xmin, ymin), (xmax, ymax) = entry["geometry"]
        polygon = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
    else:
        polygon = entry["geometry"]

    # Calculate the bounding box of the word
    x_coords, y_coords = zip(*polygon)
    xmin, ymin, xmax, ymax = (
        int(round(w * min(x_coords))),
        int(round(h * min(y_coords))),
        int(round(w * max(x_coords))),
        int(round(h * max(y_coords))),
    )
    word_width = xmax - xmin
    word_height = ymax - ymin

    # If lines are provided instead of words, concatenate the word entries
    if "words" in entry:
        word_text = " ".join(word["value"] for word in entry["words"])
    else:
        word_text = entry["value"]
    # Find the optimal font size
    try:
        font_size = min(word_height, max_font_size)
        font = get_font(font_family, font_size)
        text_width, text_height = font.getbbox(word_text)[2:4]

        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
            font_size = max(int(font_size * smoothing_factor), min_font_size)
            font = get_font(font_family, font_size)
            text_width, text_height = font.getbbox(word_text)[2:4]
    except ValueError:
        font = get_font(font_family, min_font_size)

    # Create a mask for the word
    mask = Image.new("L", (w, h), 0)
    ImageDraw.Draw(mask).polygon([(int(round(w * x)), int(round(h * y))) for x, y in polygon], fill=255)

    # Draw the word text
    d = ImageDraw.Draw(response)
    try:
        try:
            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
        except UnicodeEncodeError:
            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
    # Catch generic exceptions to avoid crashing the whole rendering
    except Exception:  # pragma: no cover
        logging.warning(f"Could not render word: {word_text}")

    if draw_proba:
        confidence = (
            entry["confidence"]
            if "confidence" in entry
            else sum(w["confidence"] for w in entry["words"]) / len(entry["words"])
        )
        p = int(255 * confidence)
        color = (255 - p, 0, p)  # Red to blue gradient based on probability
        d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)

        prob_font = get_font(font_family, 20)
        prob_text = f"{confidence:.2f}"
        prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]

        # Position the probability slightly above the bounding box
        prob_x_offset = (word_width - prob_text_width) // 2
        prob_y_offset = ymin - prob_text_height - 2
        prob_y_offset = max(0, prob_y_offset)

        d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")

    return response


def synthesize_page(
    page: dict[str, Any],
    draw_proba: bool = False,
    font_family: str | None = None,
    smoothing_factor: float = 0.95,
    min_font_size: int = 8,
    max_font_size: int = 50,
) -> np.ndarray:
    """Draw a the content of the element page (OCR response) on a blank page.

    Args:
        page: exported Page object to represent
        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: factor to smooth the font size
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
        the synthesized page
    """
    # Draw template
    h, w = page["dimensions"]
    response = Image.new("RGB", (w, h), color=(255, 255, 255))

    for block in page["blocks"]:
        # If lines are provided use these to get better rendering results
        if len(block["lines"]) > 1:
            for line in block["lines"]:
                _warn_rotation(block)  # pragma: no cover
                response = _synthesize(
                    response=response,
                    entry=line,
                    w=w,
                    h=h,
                    draw_proba=draw_proba,
                    font_family=font_family,
                    smoothing_factor=smoothing_factor,
                    min_font_size=min_font_size,
                    max_font_size=max_font_size,
                )
        # Otherwise, draw each word
        else:
            for line in block["lines"]:
                _warn_rotation(block)  # pragma: no cover
                for word in line["words"]:
                    response = _synthesize(
                        response=response,
                        entry=word,
                        w=w,
                        h=h,
                        draw_proba=draw_proba,
                        font_family=font_family,
                        smoothing_factor=smoothing_factor,
                        min_font_size=min_font_size,
                        max_font_size=max_font_size,
                    )

    return np.array(response, dtype=np.uint8)


def synthesize_kie_page(
    page: dict[str, Any],
    draw_proba: bool = False,
    font_family: str | None = None,
) -> np.ndarray:
    """Draw a the content of the element page (OCR response) on a blank page.

    Args:
        page: exported Page object to represent
        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: factor to smooth the font size
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
        the synthesized page
    """
    # Draw template
    h, w = page["dimensions"]
    response = Image.new("RGB", (w, h), color=(255, 255, 255))

    # Draw each word
    for predictions in page["predictions"].values():
        for prediction in predictions:
            _warn_rotation(prediction)  # pragma: no cover
            response = _synthesize(
                response=response,
                entry=prediction,
                w=w,
                h=h,
                draw_proba=draw_proba,
                font_family=font_family,
            )
    return np.array(response, dtype=np.uint8)