Spaces:
Running
Running
| # Copyright (C) 2021-2025, Mindee. | |
| # This program is licensed under the Apache License 2.0. | |
| # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details. | |
| import logging | |
| from typing import Any | |
| import numpy as np | |
| from anyascii import anyascii | |
| from PIL import Image, ImageDraw | |
| from .fonts import get_font | |
| __all__ = ["synthesize_page", "synthesize_kie_page"] | |
# Module-level latch: flipped to True after the rotation warning has been logged once
ROTATION_WARNING = False


def _warn_rotation(entry: dict[str, Any]) -> None:  # pragma: no cover
    """Log the rotation warning the first time a four-point geometry is seen.

    Args:
        entry: exported element; its "geometry" length is inspected only while the
            warning has not been emitted yet
    """
    global ROTATION_WARNING
    # Already warned: return before touching the entry at all
    if ROTATION_WARNING:
        return
    # Only four-point geometries trigger the warning
    if len(entry["geometry"]) != 4:
        return
    logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
    ROTATION_WARNING = True
def _synthesize(
    response: Image.Image,
    entry: dict[str, Any],
    w: int,
    h: int,
    draw_proba: bool = False,
    font_family: str | None = None,
    smoothing_factor: float = 0.75,
    min_font_size: int = 6,
    max_font_size: int = 50,
) -> Image.Image:
    """Render a single exported entry (word, line or prediction) onto ``response``.

    Args:
        response: image to draw on; it is drawn on in place and returned
        entry: exported element holding a "geometry" (two corner points or a polygon) and either
            a "value" string or a list of "words" (each with a "value")
        w: factor applied to the geometry x coordinates to obtain pixel positions
        h: factor applied to the geometry y coordinates to obtain pixel positions
        draw_proba: if True, outline the entry and print its confidence. Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: multiplicative step used to shrink the font until the text fits
        min_font_size: lower bound of the shrinking loop, also used as fallback size
        max_font_size: upper bound of the initial font size

    Returns:
        the image passed as ``response``, with the entry drawn on it
    """
    geometry = entry["geometry"]
    if len(geometry) == 2:
        # Straight box given as two corners: expand it to its four corners
        (xmin, ymin), (xmax, ymax) = geometry
        polygon = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
    else:
        polygon = geometry

    # Calculate the bounding box of the word
    x_coords, y_coords = zip(*polygon)
    xmin, ymin, xmax, ymax = (
        int(round(w * min(x_coords))),
        int(round(h * min(y_coords))),
        int(round(w * max(x_coords))),
        int(round(h * max(y_coords))),
    )
    word_width = xmax - xmin
    word_height = ymax - ymin

    # If lines are provided instead of words, concatenate the word entries
    if "words" in entry:
        word_text = " ".join(word["value"] for word in entry["words"])
    else:
        word_text = entry["value"]

    # Find the optimal font size: start from the box height and shrink until the text fits
    try:
        font_size = min(word_height, max_font_size)
        font = get_font(font_family, font_size)
        text_width, text_height = font.getbbox(word_text)[2:4]

        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
            font_size = max(int(font_size * smoothing_factor), min_font_size)
            font = get_font(font_family, font_size)
            text_width, text_height = font.getbbox(word_text)[2:4]
    except ValueError:
        # Fall back to the smallest allowed size when the font cannot be built or measured
        font = get_font(font_family, min_font_size)

    # Draw the word text, anchored at the top-left corner of the bounding box
    d = ImageDraw.Draw(response)
    try:
        try:
            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
        except UnicodeEncodeError:
            # Retry with an ASCII transliteration when the font cannot encode the text
            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
    # Catch generic exceptions to avoid crashing the whole rendering
    except Exception:  # pragma: no cover
        logging.warning(f"Could not render word: {word_text}")

    if not draw_proba:
        return response

    if "confidence" in entry:
        confidence = entry["confidence"]
    else:
        # Line-level entries carry no confidence of their own: average their words
        word_confidences = [word["confidence"] for word in entry["words"]]
        if not word_confidences:
            # Nothing to average: skip the overlay instead of dividing by zero
            return response
        confidence = sum(word_confidences) / len(word_confidences)

    p = int(255 * confidence)
    color = (255 - p, 0, p)  # Red to blue gradient based on probability
    d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)

    prob_font = get_font(font_family, 20)
    prob_text = f"{confidence:.2f}"
    prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]

    # Position the probability slightly above the bounding box, horizontally centred,
    # without letting it leave the top of the page
    prob_x_offset = (word_width - prob_text_width) // 2
    prob_y_offset = max(0, ymin - prob_text_height - 2)
    d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")

    return response
def synthesize_page(
    page: dict[str, Any],
    draw_proba: bool = False,
    font_family: str | None = None,
    smoothing_factor: float = 0.95,
    min_font_size: int = 8,
    max_font_size: int = 50,
) -> np.ndarray:
    """Draw the content of the element page (OCR response) on a blank page.

    Args:
        page: exported Page object to represent; "dimensions" is read as (height, width) and
            "blocks" as a list of blocks holding "lines", themselves holding "words"
        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: factor to smooth the font size
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
        the synthesized page as a uint8 array
    """
    # Draw template
    h, w = page["dimensions"]
    response = Image.new("RGB", (w, h), color=(255, 255, 255))

    for block in page["blocks"]:
        lines = block["lines"]
        if not lines:
            continue

        # The check only depends on the block, so run it once rather than once per line
        _warn_rotation(block)  # pragma: no cover

        # If several lines are provided use these to get better rendering results,
        # otherwise draw each word of the single line
        entries = lines if len(lines) > 1 else lines[0]["words"]
        for entry in entries:
            response = _synthesize(
                response=response,
                entry=entry,
                w=w,
                h=h,
                draw_proba=draw_proba,
                font_family=font_family,
                smoothing_factor=smoothing_factor,
                min_font_size=min_font_size,
                max_font_size=max_font_size,
            )

    return np.array(response, dtype=np.uint8)
def synthesize_kie_page(
    page: dict[str, Any],
    draw_proba: bool = False,
    font_family: str | None = None,
    smoothing_factor: float = 0.75,
    min_font_size: int = 6,
    max_font_size: int = 50,
) -> np.ndarray:
    """Draw the content of the element page (KIE response) on a blank page.

    Args:
        page: exported Page object to represent; "dimensions" is read as (height, width) and
            "predictions" as a mapping whose values are lists of predictions
        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: factor to smooth the font size
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
        the synthesized page as a uint8 array
    """
    # Draw template
    h, w = page["dimensions"]
    response = Image.new("RGB", (w, h), color=(255, 255, 255))

    # Draw each prediction, whatever the class it is filed under
    for predictions in page["predictions"].values():
        for prediction in predictions:
            _warn_rotation(prediction)  # pragma: no cover
            response = _synthesize(
                response=response,
                entry=prediction,
                w=w,
                h=h,
                draw_proba=draw_proba,
                font_family=font_family,
                smoothing_factor=smoothing_factor,
                min_font_size=min_font_size,
                max_font_size=max_font_size,
            )

    return np.array(response, dtype=np.uint8)