Add Magiv2 model configuration, processing, and utility functions

- Implement Magiv2Config class for model configuration, supporting detection, OCR, and crop embeddings.
- Create Magiv2Processor class for preprocessing inputs for detection, OCR, and crop embeddings.
- Add utility functions for handling bounding boxes, including cropping, sorting, and visualizing predictions.
- Introduce UnionFind class for managing connected components in bounding box graphs.
- Implement functions for converting annotation formats and managing text-to-panel mappings.

Files changed (5) hide show

configuration_magiv2_PRE.py +131 -0
processing_magiv2.py +364 -65
processing_magiv2_PRE.py +225 -0
utils.py +867 -152
utils_PRE.py +456 -0

configuration_magiv2_PRE.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from transformers import PretrainedConfig, VisionEncoderDecoderConfig
+from typing import Any, Optional
+class Magiv2Config(PretrainedConfig):
+    """
+    Configuration class for the Magiv2 model.
+    Magiv2Config inherits from the transformers PretrainedConfig and holds the
+    configuration of a vision model made of three main components:
+    - Object detection model (detection)
+    - OCR model (text recognition)
+    - Crop embedding model (embeddings of cropped image regions)
+    Attributes:
+        model_type: Model type identifier used by the transformers library
+        disable_ocr: Flag that disables the OCR module
+        disable_crop_embeddings: Flag that disables the crop embedding module
+        disable_detections: Flag that disables the object detection module
+        kwargs: Extra keyword arguments received by the constructor
+        detection_model_config: Detection model configuration (after deserialization)
+        ocr_model_config: OCR model configuration (after deserialization)
+        crop_embedding_model_config: Crop embedding model configuration (after deserialization)
+        detection_image_preprocessing_config: Image preprocessing parameters for detection
+        ocr_pretrained_processor_path: Path to the pretrained OCR processor
+        crop_embedding_image_preprocessing_config: Image preprocessing parameters for crop embeddings
+    """
+    # Model type identifier used by the transformers library
+    model_type: str = "magiv2"
+    def __init__(
+        self,
+        disable_ocr: bool = False,
+        disable_crop_embeddings: bool = False,
+        disable_detections: bool = False,
+        detection_model_config: Optional[dict[str, Any]] = None,
+        ocr_model_config: Optional[dict[str, Any]] = None,
+        crop_embedding_model_config: Optional[dict[str, Any]] = None,
+        detection_image_preprocessing_config: Optional[dict[str, Any]] = None,
+        ocr_pretrained_processor_path: Optional[str] = None,
+        crop_embedding_image_preprocessing_config: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize the Magiv2 model configuration.
+        The constructor stores the flags that say which modules are disabled and
+        the configuration of each component. Model configurations passed as
+        dictionaries are deserialized into transformers Config objects.
+        Args:
+            disable_ocr: Whether to disable the text recognition (OCR) module.
+                        Defaults to False.
+            disable_crop_embeddings: Whether to disable the module that embeds cropped
+                                    image regions. Defaults to False.
+            disable_detections: Whether to disable the object detection module.
+                               Defaults to False.
+            detection_model_config: Dictionary with the detection model configuration.
+                                   If given, it is deserialized with PretrainedConfig.from_dict.
+            ocr_model_config: Dictionary with the OCR model configuration.
+                             If given, it is deserialized with
+                             VisionEncoderDecoderConfig.from_dict.
+            crop_embedding_model_config: Dictionary with the crop embedding model
+                                        configuration. If given, it is deserialized
+                                        with PretrainedConfig.from_dict.
+            detection_image_preprocessing_config: Dictionary with image preprocessing
+                                                 parameters for the detection module;
+                                                 stored unchanged.
+            ocr_pretrained_processor_path: Path (presumably a local directory or a Hub ID -
+                                          confirm with the consumer) of the pretrained
+                                          OCR processor; stored unchanged.
+            crop_embedding_image_preprocessing_config: Dictionary with image preprocessing
+                                                      parameters for the crop embedding
+                                                      module; stored unchanged.
+            **kwargs: Extra arguments; stored on `self.kwargs` and forwarded to the
+                      PretrainedConfig base class.
+        Returns:
+            None
+        Note:
+            - Model configurations are deserialized from dict to Config objects only
+              when they were passed (are not None); otherwise the attribute stays None
+            - The disable_* flags are only stored here; consumers decide how to use them
+            - All extra kwargs are forwarded to the PretrainedConfig base class
+        """
+        # Store the flags that disable the individual modules
+        self.disable_ocr: bool = disable_ocr
+        self.disable_crop_embeddings: bool = disable_crop_embeddings
+        self.disable_detections: bool = disable_detections
+        # Keep the extra arguments that were passed to the constructor
+        self.kwargs: dict[str, Any] = kwargs
+        # Initialize the model configuration attributes to None
+        # (they may be deserialized below if the parameters are not None)
+        self.detection_model_config: Optional[PretrainedConfig] = None
+        self.ocr_model_config: Optional[VisionEncoderDecoderConfig] = None
+        self.crop_embedding_model_config: Optional[PretrainedConfig] = None
+        # Deserialize the detection model configuration from a dict into a PretrainedConfig
+        if detection_model_config is not None:
+            self.detection_model_config = PretrainedConfig.from_dict(
+                detection_model_config
+            )
+        # Deserialize the OCR model configuration from a dict into a VisionEncoderDecoderConfig
+        # (the OCR configuration is modelled as a vision encoder-decoder configuration)
+        if ocr_model_config is not None:
+            self.ocr_model_config = VisionEncoderDecoderConfig.from_dict(
+                ocr_model_config
+            )
+        # Deserialize the crop embedding model configuration from a dict into a PretrainedConfig
+        if crop_embedding_model_config is not None:
+            self.crop_embedding_model_config = PretrainedConfig.from_dict(
+                crop_embedding_model_config
+            )
+        # Store the image preprocessing configuration for the detection module
+        # (kept as a plain dict; its contents are not inspected here)
+        self.detection_image_preprocessing_config: Optional[dict[str, Any]] = (
+            detection_image_preprocessing_config
+        )
+        # Path to the pretrained OCR processor (stored as given)
+        self.ocr_pretrained_processor_path: Optional[str] = ocr_pretrained_processor_path
+        # Store the image preprocessing configuration for the crop embedding module
+        # (kept as a plain dict; its contents are not inspected here)
+        self.crop_embedding_image_preprocessing_config: Optional[dict[str, Any]] = (
+            crop_embedding_image_preprocessing_config
+        )
+        # Call the PretrainedConfig base class constructor with the extra kwargs
+        super().__init__(**kwargs)

processing_magiv2.py CHANGED Viewed

@@ -1,118 +1,325 @@
 from transformers import ConditionalDetrImageProcessor, TrOCRProcessor, ViTImageProcessor
 import torch
-from typing import List
 from shapely.geometry import box
 from .utils import x1y1x2y2_to_xywh
 import numpy as np
 class Magiv2Processor():
-    def __init__(self, config):
-        self.config = config
-        self.detection_image_preprocessor = None
-        self.ocr_preprocessor = None
-        self.crop_embedding_image_preprocessor = None
         if not config.disable_detections:
             assert config.detection_image_preprocessing_config is not None
             self.detection_image_preprocessor = ConditionalDetrImageProcessor.from_dict(
                 config.detection_image_preprocessing_config)
         if not config.disable_ocr:
             assert config.ocr_pretrained_processor_path is not None
             self.ocr_preprocessor = TrOCRProcessor.from_pretrained(
                 config.ocr_pretrained_processor_path)
         if not config.disable_crop_embeddings:
             assert config.crop_embedding_image_preprocessing_config is not None
             self.crop_embedding_image_preprocessor = ViTImageProcessor.from_dict(
                 config.crop_embedding_image_preprocessing_config)
-    def preprocess_inputs_for_detection(self, images, annotations=None):
-        images = list(images)
-        assert isinstance(images[0], np.ndarray)
-        annotations = self._convert_annotations_to_coco_format(annotations)
-        inputs = self.detection_image_preprocessor(
-            images, annotations=annotations, return_tensors="pt")
         return inputs
-    def preprocess_inputs_for_ocr(self, images):
-        images = list(images)
-        assert isinstance(images[0], np.ndarray)
-        return self.ocr_preprocessor(images, return_tensors="pt").pixel_values
-    def preprocess_inputs_for_crop_embeddings(self, images):
-        images = list(images)
-        assert isinstance(images[0], np.ndarray)
-        return self.crop_embedding_image_preprocessor(images, return_tensors="pt").pixel_values
-    def postprocess_ocr_tokens(self, generated_ids, skip_special_tokens=True):
         return self.ocr_preprocessor.batch_decode(generated_ids, skip_special_tokens=skip_special_tokens)
-    def crop_image(self, image, bboxes):
-        crops_for_image = []
         for bbox in bboxes:
             x1, y1, x2, y2 = bbox
-            # fix the bounding box in case it is out of bounds or too small
             x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-            x1, y1, x2, y2 = min(x1, x2), min(y1, y2), max(
-                x1, x2), max(y1, y2)  # just incase
             x1, y1 = max(0, x1), max(0, y1)
             x1, y1 = min(image.shape[1], x1), min(image.shape[0], y1)
             x2, y2 = max(0, x2), max(0, y2)
             x2, y2 = min(image.shape[1], x2), min(image.shape[0], y2)
             if x2 - x1 < 10:
                 if image.shape[1] - x1 > 10:
                     x2 = x1 + 10
                 else:
                     x1 = x2 - 10
             if y2 - y1 < 10:
                 if image.shape[0] - y1 > 10:
                     y2 = y1 + 10
                 else:
                     y1 = y2 - 10
-            crop = image[y1:y2, x1:x2]
             crops_for_image.append(crop)
         return crops_for_image
-    def _get_indices_of_characters_to_keep(self, batch_scores, batch_labels, batch_bboxes, character_detection_threshold):
-        indices_of_characters_to_keep = []
         for scores, labels, _ in zip(batch_scores, batch_labels, batch_bboxes):
-            indices = torch.where((labels == 0) & (
                 scores > character_detection_threshold))[0]
             indices_of_characters_to_keep.append(indices)
         return indices_of_characters_to_keep
-    def _get_indices_of_panels_to_keep(self, batch_scores, batch_labels, batch_bboxes, panel_detection_threshold):
-        indices_of_panels_to_keep = []
         for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
-            indices = torch.where(labels == 2)[0]
             bboxes = bboxes[indices]
             scores = scores[indices]
             labels = labels[indices]
             if len(indices) == 0:
                 indices_of_panels_to_keep.append([])
                 continue
             scores, labels, indices, bboxes = zip(
                 *sorted(zip(scores, labels, indices, bboxes), reverse=True))
-            panels_to_keep = []
-            union_of_panels_so_far = box(0, 0, 0, 0)
             for ps, pb, pl, pi in zip(scores, bboxes, labels, indices):
-                panel_polygon = box(pb[0], pb[1], pb[2], pb[3])
                 if ps < panel_detection_threshold:
                     continue
                 if union_of_panels_so_far.intersection(panel_polygon).area / panel_polygon.area > 0.5:
                     continue
                 panels_to_keep.append((ps, pl, pb, pi))
                 union_of_panels_so_far = union_of_panels_so_far.union(
                     panel_polygon)
             indices_of_panels_to_keep.append(
                 [p[3].item() for p in panels_to_keep])
         return indices_of_panels_to_keep
-    def _get_indices_of_texts_to_keep(self, batch_scores, batch_labels, batch_bboxes, text_detection_threshold):
-        indices_of_texts_to_keep = []
         for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
-            indices = torch.where((labels == 1) & (
                 scores > text_detection_threshold))[0]
             bboxes = bboxes[indices]
             scores = scores[indices]
@@ -120,74 +327,159 @@ class Magiv2Processor():
             if len(indices) == 0:
                 indices_of_texts_to_keep.append([])
                 continue
             scores, labels, indices, bboxes = zip(
                 *sorted(zip(scores, labels, indices, bboxes), reverse=True))
-            texts_to_keep = []
-            texts_to_keep_as_shapely_objects = []
             for ts, tb, tl, ti in zip(scores, bboxes, labels, indices):
-                text_polygon = box(tb[0], tb[1], tb[2], tb[3])
-                should_append = True
                 for t in texts_to_keep_as_shapely_objects:
                     if t.intersection(text_polygon).area / t.union(text_polygon).area > 0.5:
                         should_append = False
                         break
                 if should_append:
                     texts_to_keep.append((ts, tl, tb, ti))
                     texts_to_keep_as_shapely_objects.append(text_polygon)
             indices_of_texts_to_keep.append(
                 [t[3].item() for t in texts_to_keep])
         return indices_of_texts_to_keep
-    def _get_indices_of_tails_to_keep(self, batch_scores, batch_labels, batch_bboxes, text_detection_threshold):
-        indices_of_texts_to_keep = []
         for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
-            indices = torch.where((labels == 3) & (
                 scores > text_detection_threshold))[0]
             bboxes = bboxes[indices]
             scores = scores[indices]
             labels = labels[indices]
             if len(indices) == 0:
-                indices_of_texts_to_keep.append([])
                 continue
             scores, labels, indices, bboxes = zip(
                 *sorted(zip(scores, labels, indices, bboxes), reverse=True))
-            texts_to_keep = []
-            texts_to_keep_as_shapely_objects = []
             for ts, tb, tl, ti in zip(scores, bboxes, labels, indices):
-                text_polygon = box(tb[0], tb[1], tb[2], tb[3])
-                should_append = True
-                for t in texts_to_keep_as_shapely_objects:
-                    if t.intersection(text_polygon).area / t.union(text_polygon).area > 0.5:
                         should_append = False
                         break
                 if should_append:
-                    texts_to_keep.append((ts, tl, tb, ti))
-                    texts_to_keep_as_shapely_objects.append(text_polygon)
-            indices_of_texts_to_keep.append(
-                [t[3].item() for t in texts_to_keep])
-        return indices_of_texts_to_keep
-    def _convert_annotations_to_coco_format(self, annotations):
         if annotations is None:
             return None
         self._verify_annotations_are_in_correct_format(annotations)
-        coco_annotations = []
         for annotation in annotations:
-            coco_annotation = {
                 "image_id": annotation["image_id"],
                 "annotations": [],
             }
             for bbox, label in zip(annotation["bboxes_as_x1y1x2y2"], annotation["labels"]):
                 coco_annotation["annotations"].append({
                     "bbox": x1y1x2y2_to_xywh(bbox),
                     "category_id": label,
                     "area": (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]),
                 })
             coco_annotations.append(coco_annotation)
         return coco_annotations
-    def _verify_annotations_are_in_correct_format(self, annotations):
-        error_msg = """
         Annotations must be in the following format:
         [
             {
@@ -197,20 +489,27 @@ class Magiv2Processor():
             },
             ...
         ]
-        Labels: 0 for characters, 1 for text, 2 for panels.
         """
         if annotations is None:
             return
         if not isinstance(annotations, List) and not isinstance(annotations, tuple):
             raise ValueError(
                 f"{error_msg} Expected a List/Tuple, found {type(annotations)}."
             )
         if len(annotations) == 0:
             return
         if not isinstance(annotations[0], dict):
             raise ValueError(
-                f"{error_msg} Expected a List[Dicct], found {type(annotations[0])}."
             )
         if "image_id" not in annotations[0]:
             raise ValueError(
                 f"{error_msg} Dict must contain 'image_id'."

 from transformers import ConditionalDetrImageProcessor, TrOCRProcessor, ViTImageProcessor
 import torch
+from typing import List, Dict, Any, Optional, Tuple
 from shapely.geometry import box
+from shapely.geometry.polygon import Polygon
 from .utils import x1y1x2y2_to_xywh
 import numpy as np
+from numpy.typing import NDArray
 class Magiv2Processor():
+    """
+    Data processor for the Magiv2 model - handles preprocessing and postprocessing.
+    The class prepares input data for the individual Magiv2 modules
+    (detection, OCR, crop embeddings) and processes their outputs. It also contains
+    helper methods for filtering detections and converting annotation formats.
+    Attributes:
+        config: Magiv2 model configuration
+        detection_image_preprocessor: Preprocessor for object detection images
+        ocr_preprocessor: Preprocessor for OCR images
+        crop_embedding_image_preprocessor: Preprocessor for cropped image regions
+    """
+    def __init__(self, config: Any) -> None:
+        """
+        Initialize the processor from the given configuration.
+        Creates a preprocessor for every module that is not disabled in the config:
+        - Object detection: ConditionalDetrImageProcessor (built with from_dict)
+        - OCR: TrOCRProcessor (loaded with from_pretrained)
+        - Crop embeddings: ViTImageProcessor (built with from_dict)
+        Preprocessors of disabled modules stay None.
+        Args:
+            config: Configuration object exposing the disable_* flags and the
+                    preprocessing settings read below (presumably a Magiv2Config -
+                    confirm with callers)
+        """
+        self.config: Any = config
+        self.detection_image_preprocessor: Optional[ConditionalDetrImageProcessor] = None
+        self.ocr_preprocessor: Optional[TrOCRProcessor] = None
+        self.crop_embedding_image_preprocessor: Optional[ViTImageProcessor] = None
+        # Create the object detection preprocessor (if the module is enabled)
         if not config.disable_detections:
             assert config.detection_image_preprocessing_config is not None
             self.detection_image_preprocessor = ConditionalDetrImageProcessor.from_dict(
                 config.detection_image_preprocessing_config)
+        # Create the OCR preprocessor (if the module is enabled)
         if not config.disable_ocr:
             assert config.ocr_pretrained_processor_path is not None
             self.ocr_preprocessor = TrOCRProcessor.from_pretrained(
                 config.ocr_pretrained_processor_path)
+        # Create the crop embedding preprocessor (if the module is enabled)
         if not config.disable_crop_embeddings:
             assert config.crop_embedding_image_preprocessing_config is not None
             self.crop_embedding_image_preprocessor = ViTImageProcessor.from_dict(
                 config.crop_embedding_image_preprocessing_config)
+    def preprocess_inputs_for_detection(
+        self,
+        images: List[NDArray[np.uint8]],
+        annotations: Optional[List[Dict[str, Any]]] = None
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Prepare images (and optional annotations) for the object detection module.
+        Converts the annotations to COCO-style dictionaries and passes the images
+        together with the converted annotations to `self.detection_image_preprocessor`,
+        requesting PyTorch tensors.
+        Args:
+            images: Images to preprocess; the first element must be a numpy array
+                    (checked with an assert). Presumably HWC arrays - confirm with callers.
+            annotations: Optional ground truth annotations in the format:
+                        [{"image_id": int, "bboxes_as_x1y1x2y2": List, "labels": List}]
+        Returns:
+            The output of the detection image preprocessor. Presumably it holds
+            "pixel_values", "pixel_mask" and, when annotations are given, "labels" -
+            confirm against the ConditionalDetrImageProcessor documentation.
+        """
+        images_list: List[NDArray[np.uint8]] = list(images)
+        assert isinstance(images_list[0], np.ndarray)
+        # Convert the annotations to COCO format (None is passed through as None)
+        coco_annotations: Optional[List[Dict[str, Any]]
+                                   ] = self._convert_annotations_to_coco_format(annotations)
+        # Preprocess the images and annotations
+        inputs: Dict[str, torch.Tensor] = self.detection_image_preprocessor(
+            images_list, annotations=coco_annotations, return_tensors="pt")
         return inputs
+    def preprocess_inputs_for_ocr(self, images: List[NDArray[np.uint8]]) -> torch.Tensor:
+        """
+        Prepare images for the OCR module.
+        Passes the images to `self.ocr_preprocessor`, requesting PyTorch tensors,
+        and returns the `pixel_values` of its output.
+        Args:
+            images: Images to preprocess (presumably crops containing text - confirm
+                    with callers); the first element must be a numpy array
+                    (checked with an assert)
+        Returns:
+            The `pixel_values` tensor produced by the OCR preprocessor
+            (presumably [batch, channels, height, width] - confirm)
+        """
+        images_list: List[NDArray[np.uint8]] = list(images)
+        assert isinstance(images_list[0], np.ndarray)
+        return self.ocr_preprocessor(images_list, return_tensors="pt").pixel_values
+    def preprocess_inputs_for_crop_embeddings(self, images: List[NDArray[np.uint8]]) -> torch.Tensor:
+        """
+        Prepare cropped image regions for the crop embedding module.
+        Passes the crops to `self.crop_embedding_image_preprocessor`, requesting
+        PyTorch tensors, and returns the `pixel_values` of its output.
+        Args:
+            images: Cropped image regions; the first element must be a numpy array
+                    (checked with an assert)
+        Returns:
+            The `pixel_values` tensor produced by the crop embedding preprocessor
+            (presumably [batch, channels, height, width] - confirm)
+        """
+        images_list: List[NDArray[np.uint8]] = list(images)
+        assert isinstance(images_list[0], np.ndarray)
+        return self.crop_embedding_image_preprocessor(images_list, return_tensors="pt").pixel_values
+    def postprocess_ocr_tokens(
+        self,
+        generated_ids: torch.Tensor,
+        skip_special_tokens: bool = True
+    ) -> List[str]:
+        """
+        Decode token IDs generated by the OCR model into text.
+        Delegates to `self.ocr_preprocessor.batch_decode`.
+        Args:
+            generated_ids: Token IDs generated by the OCR model
+            skip_special_tokens: Forwarded to batch_decode; whether special tokens
+                                 are left out of the decoded text
+        Returns:
+            The result of batch_decode (presumably one string per sequence - confirm)
+        """
         return self.ocr_preprocessor.batch_decode(generated_ids, skip_special_tokens=skip_special_tokens)
+    def crop_image(
+        self,
+        image: NDArray[np.uint8],
+        bboxes: List[List[float]]
+    ) -> List[NDArray[np.uint8]]:
+        """
+        Cut regions out of an image according to the given bounding boxes.
+        Each bounding box is repaired before cropping:
+        - Coordinates are truncated to int
+        - Coordinates are swapped if they are in the wrong order
+        - Coordinates are clamped to the image bounds
+        - Boxes narrower or shorter than 10 pixels are widened to 10 pixels
+        Args:
+            image: Source image as a numpy array indexed as [row, column, ...]
+                   (shape[0] is used as the height, shape[1] as the width)
+            bboxes: Bounding boxes in [x1, y1, x2, y2] format
+        Returns:
+            List of crops (numpy slices of `image`), one per bounding box
+        """
+        crops_for_image: List[NDArray[np.uint8]] = []
         for bbox in bboxes:
+            x1: float
+            y1: float
+            x2: float
+            y2: float
             x1, y1, x2, y2 = bbox
+            # Repair the bounding box in case it is out of bounds or too small
+            # Convert to int
             x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
+            # Make sure x1<=x2 and y1<=y2 (in case the order is reversed)
+            x1, y1, x2, y2 = min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)
+            # Clamp the top-left corner to the image bounds
             x1, y1 = max(0, x1), max(0, y1)
             x1, y1 = min(image.shape[1], x1), min(image.shape[0], y1)
+            # Clamp the bottom-right corner to the image bounds
             x2, y2 = max(0, x2), max(0, y2)
             x2, y2 = min(image.shape[1], x2), min(image.shape[0], y2)
+            # Enforce a minimum width of 10 pixels: grow to the right when there is
+            # room, otherwise grow to the left
+            # NOTE(review): on images narrower than 20 px x1 can become negative here,
+            # which numpy treats as an index from the end - confirm this is intended
             if x2 - x1 < 10:
                 if image.shape[1] - x1 > 10:
                     x2 = x1 + 10
                 else:
                     x1 = x2 - 10
+            # Enforce a minimum height of 10 pixels (same strategy and caveat as above)
             if y2 - y1 < 10:
                 if image.shape[0] - y1 > 10:
                     y2 = y1 + 10
                 else:
                     y1 = y2 - 10
+            # Cut the region out of the image
+            crop: NDArray[np.uint8] = image[y1:y2, x1:x2]
             crops_for_image.append(crop)
         return crops_for_image
+    def _get_indices_of_characters_to_keep(
+        self,
+        batch_scores: torch.Tensor,
+        batch_labels: torch.Tensor,
+        batch_bboxes: torch.Tensor,
+        character_detection_threshold: float
+    ) -> List[torch.Tensor]:
+        """
+        Filter character detections by score threshold.
+        Keeps only detections with label 0 (character) and a score strictly above
+        the threshold. The bounding boxes are iterated but not used.
+        Args:
+            batch_scores: Per-image detection scores (presumably [batch, num_queries] - confirm)
+            batch_labels: Per-image class labels (presumably [batch, num_queries] - confirm)
+            batch_bboxes: Per-image bounding boxes (presumably [batch, num_queries, 4] - confirm)
+            character_detection_threshold: Score a detection must exceed to be kept
+        Returns:
+            List with one tensor of kept character indices per image
+        """
+        indices_of_characters_to_keep: List[torch.Tensor] = []
         for scores, labels, _ in zip(batch_scores, batch_labels, batch_bboxes):
+            # Filter: label == 0 (character) AND score > threshold
+            indices: torch.Tensor = torch.where((labels == 0) & (
                 scores > character_detection_threshold))[0]
             indices_of_characters_to_keep.append(indices)
         return indices_of_characters_to_keep
+    def _get_indices_of_panels_to_keep(
+        self,
+        batch_scores: torch.Tensor,
+        batch_labels: torch.Tensor,
+        batch_bboxes: torch.Tensor,
+        panel_detection_threshold: float
+    ) -> List[List[int]]:
+        """
+        Filter panel detections with a greedy overlap suppression (NMS-like).
+        Only detections with label 2 (panel) are considered. They are visited in
+        descending sorted order (score first). A panel is skipped when its score is
+        below the threshold, or when more than 50% of its own area is already
+        covered by the union of the panels accepted so far.
+        Args:
+            batch_scores: Per-image detection scores (presumably [batch, num_queries] - confirm)
+            batch_labels: Per-image class labels (presumably [batch, num_queries] - confirm)
+            batch_bboxes: Per-image bounding boxes as x1, y1, x2, y2
+                          (presumably [batch, num_queries, 4] - confirm)
+            panel_detection_threshold: Minimum score for a panel to be kept
+        Returns:
+            List with one list of kept panel indices per image (empty list when the
+            image has no panel detections)
+        """
+        indices_of_panels_to_keep: List[List[int]] = []
         for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
+            # Select only the detections with label == 2 (panel)
+            indices: torch.Tensor = torch.where(labels == 2)[0]
             bboxes = bboxes[indices]
             scores = scores[indices]
             labels = labels[indices]
             if len(indices) == 0:
                 indices_of_panels_to_keep.append([])
                 continue
+            # Sort the panels in descending order (highest score first)
             scores, labels, indices, bboxes = zip(
                 *sorted(zip(scores, labels, indices, bboxes), reverse=True))
+            panels_to_keep: List[Tuple[torch.Tensor,
+                                       torch.Tensor, torch.Tensor, torch.Tensor]] = []
+            # Union of all accepted panels (used for the overlap check); starts empty
+            union_of_panels_so_far: Polygon = box(0, 0, 0, 0)
             for ps, pb, pl, pi in zip(scores, bboxes, labels, indices):
+                # Convert the bbox to a Shapely polygon
+                panel_polygon: Polygon = box(pb[0], pb[1], pb[2], pb[3])
+                # Reject if the score is below the threshold
                 if ps < panel_detection_threshold:
                     continue
+                # Reject if >50% of the panel is covered by already accepted panels
+                # NOTE(review): a zero-area box would presumably make this division
+                # fail - confirm such boxes cannot reach this point
                 if union_of_panels_so_far.intersection(panel_polygon).area / panel_polygon.area > 0.5:
                     continue
+                # Accept the panel
                 panels_to_keep.append((ps, pl, pb, pi))
+                # Add it to the union of accepted panels
                 union_of_panels_so_far = union_of_panels_so_far.union(
                     panel_polygon)
+            # Extract the indices of the accepted panels
             indices_of_panels_to_keep.append(
                 [p[3].item() for p in panels_to_keep])
         return indices_of_panels_to_keep
+    def _get_indices_of_texts_to_keep(
+        self,
+        batch_scores: torch.Tensor,
+        batch_labels: torch.Tensor,
+        batch_bboxes: torch.Tensor,
+        text_detection_threshold: float
+    ) -> List[List[int]]:
+        """
+        Filter text detections with a greedy IoU-based suppression (NMS-like).
+        Keeps only detections with label 1 (text) and a score strictly above the
+        threshold. They are visited in descending sorted order (score first); a
+        text box is rejected as a duplicate when its IoU with any already accepted
+        text box is greater than 0.5.
+        Args:
+            batch_scores: Per-image detection scores (presumably [batch, num_queries] - confirm)
+            batch_labels: Per-image class labels (presumably [batch, num_queries] - confirm)
+            batch_bboxes: Per-image bounding boxes as x1, y1, x2, y2
+                          (presumably [batch, num_queries, 4] - confirm)
+            text_detection_threshold: Score a text detection must exceed to be kept
+        Returns:
+            List with one list of kept text indices per image (empty list when the
+            image has no text detections above the threshold)
+        """
+        indices_of_texts_to_keep: List[List[int]] = []
         for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
+            # Filter: label == 1 (text) AND score > threshold
+            indices: torch.Tensor = torch.where((labels == 1) & (
                 scores > text_detection_threshold))[0]
             bboxes = bboxes[indices]
             scores = scores[indices]
             if len(indices) == 0:
                 indices_of_texts_to_keep.append([])
                 continue
+            # Sort the texts in descending order (highest score first)
             scores, labels, indices, bboxes = zip(
                 *sorted(zip(scores, labels, indices, bboxes), reverse=True))
+            texts_to_keep: List[Tuple[torch.Tensor,
+                                      torch.Tensor, torch.Tensor, torch.Tensor]] = []
+            # Polygons of the accepted texts (used for the overlap check)
+            texts_to_keep_as_shapely_objects: List[Polygon] = []
             for ts, tb, tl, ti in zip(scores, bboxes, labels, indices):
+                # Convert the bbox to a Shapely polygon
+                text_polygon: Polygon = box(tb[0], tb[1], tb[2], tb[3])
+                should_append: bool = True
+                # Check the overlap with the already accepted texts
                 for t in texts_to_keep_as_shapely_objects:
+                    # If IoU > 0.5, reject (it is a duplicate)
                     if t.intersection(text_polygon).area / t.union(text_polygon).area > 0.5:
                         should_append = False
                         break
                 if should_append:
                     texts_to_keep.append((ts, tl, tb, ti))
                     texts_to_keep_as_shapely_objects.append(text_polygon)
+            # Extract the indices of the accepted texts
             indices_of_texts_to_keep.append(
                 [t[3].item() for t in texts_to_keep])
         return indices_of_texts_to_keep
def _get_indices_of_tails_to_keep(
    self,
    batch_scores: torch.Tensor,
    batch_labels: torch.Tensor,
    batch_bboxes: torch.Tensor,
    text_detection_threshold: float
) -> List[List[int]]:
    """
    Select speech-bubble tail detections per image using greedy IoU-based NMS.

    Only detections with label 3 and a score strictly above
    `text_detection_threshold` are considered. They are processed from the
    best score downwards, and a detection is rejected when its IoU with an
    already accepted tail is greater than 0.5.

    Args:
        batch_scores: Per-image detection scores (one row per image).
        batch_labels: Per-image class labels, aligned with `batch_scores`.
        batch_bboxes: Per-image boxes, four values per detection, passed to
            `shapely.geometry.box` as (minx, miny, maxx, maxy).
        text_detection_threshold: Minimum score (exclusive) for a tail; the
            parameter name is shared with the text filter.

    Returns:
        One list per image with the original indices of the kept tails.
    """
    kept_per_image: List[List[int]] = []
    for image_scores, image_labels, image_bboxes in zip(batch_scores, batch_labels, batch_bboxes):
        is_tail = image_labels == 3
        is_confident = image_scores > text_detection_threshold
        candidate_indices: torch.Tensor = torch.where(is_tail & is_confident)[0]
        if len(candidate_indices) == 0:
            kept_per_image.append([])
            continue
        # Highest score first; ties fall through to label, then index.
        ranked = sorted(
            zip(image_scores[candidate_indices],
                image_labels[candidate_indices],
                candidate_indices,
                image_bboxes[candidate_indices]),
            reverse=True,
        )
        accepted_polygons = []
        accepted_indices: List[int] = []
        for _, _, detection_index, detection_bbox in ranked:
            candidate = box(detection_bbox[0], detection_bbox[1],
                            detection_bbox[2], detection_bbox[3])
            # any() stops at the first accepted tail that overlaps too much.
            overlaps_accepted = any(
                accepted.intersection(candidate).area / accepted.union(candidate).area > 0.5
                for accepted in accepted_polygons
            )
            if overlaps_accepted:
                continue
            accepted_polygons.append(candidate)
            accepted_indices.append(detection_index.item())
        kept_per_image.append(accepted_indices)
    return kept_per_image
def _convert_annotations_to_coco_format(
    self,
    annotations: Optional[List[Dict[str, Any]]]
) -> Optional[List[Dict[str, Any]]]:
    """
    Turn x1y1x2y2 annotations into COCO-style per-image records.

    Each input record is validated with
    `_verify_annotations_are_in_correct_format` and then rewritten so that
    every box becomes a dict with a "bbox" (result of `x1y1x2y2_to_xywh`),
    a "category_id" (the label) and an "area" (width times height computed
    from the x1y1x2y2 corners).

    Args:
        annotations: Records with the keys "image_id",
            "bboxes_as_x1y1x2y2" and "labels", or None.

    Returns:
        A list of {"image_id": ..., "annotations": [...]} dicts, or None
        when `annotations` is None.
    """
    if annotations is None:
        return None
    self._verify_annotations_are_in_correct_format(annotations)
    return [
        {
            "image_id": record["image_id"],
            "annotations": [
                {
                    "bbox": x1y1x2y2_to_xywh(corners),
                    "category_id": category,
                    # (x2 - x1) * (y2 - y1)
                    "area": (corners[2] - corners[0]) * (corners[3] - corners[1]),
                }
                for corners, category in zip(record["bboxes_as_x1y1x2y2"], record["labels"])
            ],
        }
        for record in annotations
    ]
+    def _verify_annotations_are_in_correct_format(self, annotations: Optional[List[Dict[str, Any]]]) -> None:
+        """
+        Weryfikuje poprawność formatu anotacji.
+        Sprawdza czy anotacje są w oczekiwanym formacie:
+        - Lista/tupla słowników
+        - Każdy słownik zawiera klucze: "image_id", "bboxes_as_x1y1x2y2", "labels"
+        - Labels: 0=postać, 1=tekst, 2=panel, 3=ogon
+        Args:
+            annotations: Anotacje do weryfikacji lub None
+        Raises:
+            ValueError: Jeśli format anotacji jest nieprawidłowy
+        """
+        error_msg: str = """
         Annotations must be in the following format:
         [
             {
             },
             ...
         ]
+        Labels: 0 for characters, 1 for text, 2 for panels, 3 for tails.
         """
         if annotations is None:
             return
+        # Sprawdzenie czy to lista lub tupla
         if not isinstance(annotations, List) and not isinstance(annotations, tuple):
             raise ValueError(
                 f"{error_msg} Expected a List/Tuple, found {type(annotations)}."
             )
         if len(annotations) == 0:
             return
+        # Sprawdzenie czy elementy to słowniki
         if not isinstance(annotations[0], dict):
             raise ValueError(
+                f"{error_msg} Expected a List[Dict], found {type(annotations[0])}."
             )
+        # Sprawdzenie wymaganych kluczy w słowniku
         if "image_id" not in annotations[0]:
             raise ValueError(
                 f"{error_msg} Dict must contain 'image_id'."

processing_magiv2_PRE.py ADDED Viewed

	@@ -0,0 +1,225 @@

+from transformers import ConditionalDetrImageProcessor, TrOCRProcessor, ViTImageProcessor
+import torch
+from typing import List
+from shapely.geometry import box
+from .utils import x1y1x2y2_to_xywh
+import numpy as np
class Magiv2Processor():
    """
    Input and output processing for the Magiv2 detection, OCR and
    crop-embedding models.

    Depending on the `disable_*` flags of the config, the constructor builds a
    ConditionalDetrImageProcessor (detection), a TrOCRProcessor (OCR) and a
    ViTImageProcessor (crop embeddings). A disabled component stays `None`.

    Detection labels used by the filters below: 0 = character, 1 = text,
    2 = panel, 3 = tail.
    """

    # Smallest crop side (in pixels) produced by `crop_image` when the image allows it.
    _MIN_CROP_SIDE = 10
    # Overlap ratio above which a lower-scoring detection counts as a duplicate.
    _OVERLAP_THRESHOLD = 0.5

    def __init__(self, config):
        """
        Build the preprocessors that are enabled in `config`.

        Args:
            config: Object exposing `disable_detections`, `disable_ocr`,
                `disable_crop_embeddings` and the matching
                `detection_image_preprocessing_config`,
                `ocr_pretrained_processor_path` and
                `crop_embedding_image_preprocessing_config` attributes.
        """
        self.config = config
        self.detection_image_preprocessor = None
        self.ocr_preprocessor = None
        self.crop_embedding_image_preprocessor = None
        if not config.disable_detections:
            assert config.detection_image_preprocessing_config is not None
            self.detection_image_preprocessor = ConditionalDetrImageProcessor.from_dict(
                config.detection_image_preprocessing_config)
        if not config.disable_ocr:
            assert config.ocr_pretrained_processor_path is not None
            self.ocr_preprocessor = TrOCRProcessor.from_pretrained(
                config.ocr_pretrained_processor_path)
        if not config.disable_crop_embeddings:
            assert config.crop_embedding_image_preprocessing_config is not None
            self.crop_embedding_image_preprocessor = ViTImageProcessor.from_dict(
                config.crop_embedding_image_preprocessing_config)

    @staticmethod
    def _as_numpy_image_list(images):
        """Materialise `images` as a list and check that the first item is a numpy array."""
        images = list(images)
        assert isinstance(images[0], np.ndarray)
        return images

    def preprocess_inputs_for_detection(self, images, annotations=None):
        """
        Run the detection image preprocessor on `images`.

        Args:
            images: Iterable of numpy images.
            annotations: Optional x1y1x2y2 annotations; converted to COCO
                format before being handed to the preprocessor.

        Returns:
            The preprocessor output, requested as PyTorch tensors.
        """
        images = self._as_numpy_image_list(images)
        annotations = self._convert_annotations_to_coco_format(annotations)
        return self.detection_image_preprocessor(
            images, annotations=annotations, return_tensors="pt")

    def preprocess_inputs_for_ocr(self, images):
        """Return the `pixel_values` produced by the OCR processor for `images`."""
        images = self._as_numpy_image_list(images)
        return self.ocr_preprocessor(images, return_tensors="pt").pixel_values

    def preprocess_inputs_for_crop_embeddings(self, images):
        """Return the `pixel_values` produced by the crop-embedding image processor."""
        images = self._as_numpy_image_list(images)
        return self.crop_embedding_image_preprocessor(images, return_tensors="pt").pixel_values

    def postprocess_ocr_tokens(self, generated_ids, skip_special_tokens=True):
        """Decode generated OCR token ids with the OCR processor's `batch_decode`."""
        return self.ocr_preprocessor.batch_decode(generated_ids, skip_special_tokens=skip_special_tokens)

    @staticmethod
    def _clamp(value, upper):
        """Clamp `value` into the closed range [0, upper]."""
        return min(upper, max(0, value))

    @classmethod
    def _widen_to_min_side(cls, low, high, limit):
        """
        Stretch the interval [low, high) to the minimum crop side.

        The interval grows towards `limit` when there is room, otherwise
        towards 0. The start is clamped at 0 so that a tiny image never
        yields a negative slice index, which numpy would read as
        "count from the end".
        """
        if high - low >= cls._MIN_CROP_SIDE:
            return low, high
        if limit - low > cls._MIN_CROP_SIDE:
            return low, low + cls._MIN_CROP_SIDE
        return max(0, high - cls._MIN_CROP_SIDE), high

    def crop_image(self, image, bboxes):
        """
        Cut one crop per bounding box out of `image`.

        Boxes are truncated to integers, re-ordered so that x1 <= x2 and
        y1 <= y2, clamped to the image, and widened to at least 10 pixels
        per side where the image is large enough.

        Args:
            image: Array indexed as image[y, x, ...].
            bboxes: Iterable of (x1, y1, x2, y2) boxes.

        Returns:
            List of crops (slices of `image`), one per box, in input order.
        """
        height, width = image.shape[0], image.shape[1]
        crops_for_image = []
        for bbox in bboxes:
            x1, y1, x2, y2 = (int(value) for value in bbox)
            x1, x2 = min(x1, x2), max(x1, x2)
            y1, y2 = min(y1, y2), max(y1, y2)
            x1, x2 = self._clamp(x1, width), self._clamp(x2, width)
            y1, y2 = self._clamp(y1, height), self._clamp(y2, height)
            x1, x2 = self._widen_to_min_side(x1, x2, width)
            y1, y2 = self._widen_to_min_side(y1, y2, height)
            crops_for_image.append(image[y1:y2, x1:x2])
        return crops_for_image

    def _get_indices_of_characters_to_keep(self, batch_scores, batch_labels, batch_bboxes, character_detection_threshold):
        """
        Keep character detections (label 0) scoring above the threshold.

        No NMS is applied. Unlike the other filters, each per-image entry
        is the index tensor returned by `torch.where`, not a list of ints.
        """
        indices_of_characters_to_keep = []
        for scores, labels, _ in zip(batch_scores, batch_labels, batch_bboxes):
            indices = torch.where((labels == 0) & (
                scores > character_detection_threshold))[0]
            indices_of_characters_to_keep.append(indices)
        return indices_of_characters_to_keep

    @staticmethod
    def _ranked_candidates(scores, indices, bboxes):
        """
        Return (score, index, bbox) tuples sorted by score, best first.

        Values are converted to plain Python numbers, so equal scores are
        ordered by index (highest first) without comparing tensors.
        """
        return sorted(
            zip(scores[indices].tolist(), indices.tolist(), bboxes[indices].tolist()),
            reverse=True,
        )

    @staticmethod
    def _iou(first, second):
        """Intersection over union of two shapely polygons; 0.0 when the union is empty."""
        union_area = first.union(second).area
        if union_area <= 0:
            return 0.0
        return first.intersection(second).area / union_area

    def _select_with_iou_nms(self, batch_scores, batch_labels, batch_bboxes, label, score_threshold):
        """
        Greedy IoU-based NMS shared by the text and tail filters.

        Keeps detections of class `label` scoring strictly above
        `score_threshold`, visiting them best-first and rejecting any whose
        IoU with an already kept box exceeds 0.5.
        """
        kept_per_image = []
        for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
            indices = torch.where((labels == label) & (scores > score_threshold))[0]
            if len(indices) == 0:
                kept_per_image.append([])
                continue
            kept_indices = []
            kept_polygons = []
            for _, index, bbox in self._ranked_candidates(scores, indices, bboxes):
                polygon = box(bbox[0], bbox[1], bbox[2], bbox[3])
                if any(self._iou(kept, polygon) > self._OVERLAP_THRESHOLD for kept in kept_polygons):
                    continue
                kept_indices.append(index)
                kept_polygons.append(polygon)
            kept_per_image.append(kept_indices)
        return kept_per_image

    def _get_indices_of_panels_to_keep(self, batch_scores, batch_labels, batch_bboxes, panel_detection_threshold):
        """
        Keep panel detections (label 2) that are not mostly covered already.

        Panels scoring below `panel_detection_threshold` are ignored. The
        rest are visited best-first; a panel is rejected when more than half
        of its own area lies inside the union of the panels kept so far.

        Returns:
            One list of original detection indices per image.
        """
        indices_of_panels_to_keep = []
        for scores, labels, bboxes in zip(batch_scores, batch_labels, batch_bboxes):
            # A score equal to the threshold is kept, hence `>=`.
            indices = torch.where((labels == 2) & (
                scores >= panel_detection_threshold))[0]
            if len(indices) == 0:
                indices_of_panels_to_keep.append([])
                continue
            kept_indices = []
            union_of_panels_so_far = box(0, 0, 0, 0)
            for _, index, bbox in self._ranked_candidates(scores, indices, bboxes):
                panel_polygon = box(bbox[0], bbox[1], bbox[2], bbox[3])
                panel_area = panel_polygon.area
                # A zero-area panel has no meaningful coverage ratio (and
                # would divide by zero), so it is dropped.
                if panel_area <= 0:
                    continue
                if union_of_panels_so_far.intersection(panel_polygon).area / panel_area > self._OVERLAP_THRESHOLD:
                    continue
                kept_indices.append(index)
                union_of_panels_so_far = union_of_panels_so_far.union(panel_polygon)
            indices_of_panels_to_keep.append(kept_indices)
        return indices_of_panels_to_keep

    def _get_indices_of_texts_to_keep(self, batch_scores, batch_labels, batch_bboxes, text_detection_threshold):
        """Keep text detections (label 1) above the threshold, de-duplicated by IoU > 0.5."""
        return self._select_with_iou_nms(
            batch_scores, batch_labels, batch_bboxes, 1, text_detection_threshold)

    def _get_indices_of_tails_to_keep(self, batch_scores, batch_labels, batch_bboxes, text_detection_threshold):
        """Keep tail detections (label 3) above the threshold, de-duplicated by IoU > 0.5."""
        return self._select_with_iou_nms(
            batch_scores, batch_labels, batch_bboxes, 3, text_detection_threshold)

    def _convert_annotations_to_coco_format(self, annotations):
        """
        Convert x1y1x2y2 annotations to COCO-style records.

        Returns None for None input. Otherwise validates the input and
        returns one {"image_id", "annotations"} dict per record, where each
        annotation holds "bbox" (via `x1y1x2y2_to_xywh`), "category_id" and
        "area" (width times height).
        """
        if annotations is None:
            return None
        self._verify_annotations_are_in_correct_format(annotations)
        coco_annotations = []
        for annotation in annotations:
            coco_annotations.append({
                "image_id": annotation["image_id"],
                "annotations": [
                    {
                        "bbox": x1y1x2y2_to_xywh(bbox),
                        "category_id": label,
                        "area": (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]),
                    }
                    for bbox, label in zip(annotation["bboxes_as_x1y1x2y2"], annotation["labels"])
                ],
            })
        return coco_annotations

    def _verify_annotations_are_in_correct_format(self, annotations):
        """
        Validate the structure of `annotations`.

        Accepts None, or a list/tuple of dicts that each contain the keys
        "image_id", "bboxes_as_x1y1x2y2" and "labels". Every entry is
        checked, not only the first one.

        Raises:
            ValueError: If the container or any entry has the wrong shape.
        """
        error_msg = """
        Annotations must be in the following format:
        [
            {
                "image_id": 0,
                "bboxes_as_x1y1x2y2": [[0, 0, 10, 10], [10, 10, 20, 20], [20, 20, 30, 30]],
                "labels": [0, 1, 2],
            },
            ...
        ]
        Labels: 0 for characters, 1 for text, 2 for panels, 3 for tails.
        """
        if annotations is None:
            return
        if not isinstance(annotations, (list, tuple)):
            raise ValueError(
                f"{error_msg} Expected a List/Tuple, found {type(annotations)}."
            )
        for annotation in annotations:
            if not isinstance(annotation, dict):
                raise ValueError(
                    f"{error_msg} Expected a List[Dict], found {type(annotation)}."
                )
            for required_key in ("image_id", "bboxes_as_x1y1x2y2", "labels"):
                if required_key not in annotation:
                    raise ValueError(
                        f"{error_msg} Dict must contain '{required_key}'."
                    )

utils.py CHANGED Viewed

@@ -1,36 +1,240 @@
 import torch
 import numpy as np
 import random
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 from shapely.geometry import Point, box
 import networkx as nx
 from copy import deepcopy
 from itertools import groupby
def move_to_device(inputs, device):
    """
    Recursively move every tensor-like value in `inputs` to `device`.

    Mappings (anything with a `keys` attribute) become plain dicts, lists
    stay lists and tuples stay tuples, each with its items converted.
    Numpy arrays are turned into torch tensors first. Any other object is
    moved through its own `.to(device)` method when it has one; objects
    without it (None, numbers, strings, ...) are returned unchanged instead
    of raising AttributeError.

    Args:
        inputs: A tensor, numpy array, or arbitrarily nested
            dict/list/tuple of them.
        device: Target device, forwarded to `.to(...)`.

    Returns:
        A structure of the same layout with all movable leaves on `device`.
    """
    if hasattr(inputs, "keys"):
        return {key: move_to_device(value, device) for key, value in inputs.items()}
    if isinstance(inputs, list):
        return [move_to_device(item, device) for item in inputs]
    if isinstance(inputs, tuple):
        return tuple(move_to_device(item, device) for item in inputs)
    if isinstance(inputs, np.ndarray):
        return torch.from_numpy(inputs).to(device)
    if hasattr(inputs, "to"):
        return inputs.to(device)
    # Metadata such as None, ints or strings has no device; pass it through.
    return inputs
 class UnionFind:
-    def __init__(self, n):
-        self.parent = list(range(n))
-        self.size = [1] * n
-        self.num_components = n
     @classmethod
-    def from_adj_matrix(cls, adj_matrix):
-        ufds = cls(adj_matrix.shape[0])
         for i in range(adj_matrix.shape[0]):
             for j in range(adj_matrix.shape[1]):
                 if adj_matrix[i, j] > 0:
@@ -38,229 +242,482 @@ class UnionFind:
         return ufds
     @classmethod
-    def from_adj_list(cls, adj_list):
-        ufds = cls(len(adj_list))
         for i in range(len(adj_list)):
             for j in adj_list[i]:
                 ufds.unite(i, j)
         return ufds
     @classmethod
-    def from_edge_list(cls, edge_list, num_nodes):
-        ufds = cls(num_nodes)
         for edge in edge_list:
             ufds.unite(edge[0], edge[1])
         return ufds
-    def find(self, x):
         if self.parent[x] == x:
             return x
         self.parent[x] = self.find(self.parent[x])
         return self.parent[x]
-    def unite(self, x, y):
         x = self.find(x)
         y = self.find(y)
         if x != y:
             if self.size[x] < self.size[y]:
                 x, y = y, x
             self.parent[y] = x
             self.size[x] += self.size[y]
             self.num_components -= 1
-    def get_components_of(self, x):
         x = self.find(x)
         return [i for i in range(len(self.parent)) if self.find(i) == x]
-    def are_connected(self, x, y):
         return self.find(x) == self.find(y)
-    def get_size(self, x):
         return self.size[self.find(x)]
-    def get_num_components(self):
         return self.num_components
-    def get_labels_for_connected_components(self):
-        map_parent_to_label = {}
-        labels = []
         for i in range(len(self.parent)):
-            parent = self.find(i)
             if parent not in map_parent_to_label:
                 map_parent_to_label[parent] = len(map_parent_to_label)
             labels.append(map_parent_to_label[parent])
         return labels
def visualise_single_image_prediction(image_as_np_array, predictions, filename):
    """
    Draw the predictions for one page on top of the image and return the render.

    Panels (green), texts (red, essential ones only), characters (blue) and
    tails (purple) are drawn as rectangles. Character names are written above
    their boxes, characters of the same cluster are joined by coloured lines,
    and text-character / text-tail associations are drawn as dashed lines.

    Args:
        image_as_np_array: Image passed to `imshow`.
        predictions: Mapping providing "panels", "texts", "characters",
            "tails", "is_essential_text", "character_names",
            "character_cluster_labels", "text_character_associations" and
            "text_tail_associations". Boxes are indexed as (x1, y1, x2, y2).
        filename: Path to save the figure to, or None to skip saving.

    Returns:
        A numpy copy of the rendered canvas (RGBA buffer).
    """
    COLOURS = [
        "#b7ff51",  # green
        "#f50a8f",  # pink
        "#4b13b6",  # purple
        "#ddaa34",  # orange
        "#bea2a2",  # brown
    ]

    def centre_of(bbox):
        """Return the (x, y) centre of an x1y1x2y2 box."""
        return (bbox[0] + (bbox[2] - bbox[0]) / 2,
                bbox[1] + (bbox[3] - bbox[1]) / 2)

    figure, subplot = plt.subplots(1, 1, figsize=(10, 10))
    # try/finally guarantees the figure is released even if drawing fails.
    try:
        subplot.imshow(image_as_np_array)
        plot_bboxes(subplot, predictions["panels"], color="green")
        plot_bboxes(subplot, predictions["texts"], color="red",
                    visibility=predictions["is_essential_text"])
        plot_bboxes(subplot, predictions["characters"], color="blue")
        plot_bboxes(subplot, predictions["tails"], color="purple")

        # zip() tolerates a name list that is longer than the box list.
        for char_bbox, name in zip(predictions["characters"], predictions["character_names"]):
            x1, y1, x2, y2 = char_bbox
            subplot.text(x1, y1 - 2, name,
                         verticalalignment='bottom', horizontalalignment='left',
                         bbox=dict(facecolor='blue', alpha=1, edgecolor='none'),
                         color='white', fontsize=8)

        colour_index = 0
        character_cluster_labels = predictions["character_cluster_labels"]
        # Count once instead of calling list.count() for every sort key.
        label_counts = {}
        for label in character_cluster_labels:
            label_counts[label] = label_counts.get(label, 0) + 1
        unique_label_sorted_by_frequency = sorted(
            set(character_cluster_labels), key=label_counts.__getitem__, reverse=True)

        for label in unique_label_sorted_by_frequency:
            members = [i for i in range(len(predictions["characters"]))
                       if character_cluster_labels[i] == label]
            if not members:
                # Label without a matching character box: nothing to draw.
                continue
            root, others = members[0], members[1:]
            if colour_index >= len(COLOURS):
                # Palette exhausted: draw random colours until one is new.
                random_colour = COLOURS[0]
                while random_colour in COLOURS:
                    random_colour = "#" + \
                        "".join([random.choice("0123456789ABCDEF")
                                for j in range(6)])
            else:
                random_colour = COLOURS[colour_index]
                colour_index += 1
            x1, y1 = centre_of(predictions["characters"][root])
            subplot.plot([x1], [y1], color=random_colour, marker="o", markersize=5)
            for j in others:
                # Line from the centre of the root box to the centre of box j.
                x2, y2 = centre_of(predictions["characters"][j])
                subplot.plot([x1, x2], [y1, y2], color=random_colour, linewidth=2)
                subplot.plot([x2], [y2], color=random_colour,
                             marker="o", markersize=5)

        for (i, j) in predictions["text_character_associations"]:
            if not predictions["is_essential_text"][i]:
                continue
            x1, y1 = centre_of(predictions["texts"][i])
            x2, y2 = centre_of(predictions["characters"][j])
            subplot.plot([x1, x2], [y1, y2], color="red",
                         linewidth=2, linestyle="dashed")

        for (i, j) in predictions["text_tail_associations"]:
            x1, y1 = centre_of(predictions["texts"][i])
            x2, y2 = centre_of(predictions["tails"][j])
            subplot.plot([x1, x2], [y1, y2], color="purple",
                         linewidth=2, linestyle="dashed")

        subplot.axis("off")
        if filename is not None:
            # Save this figure explicitly rather than pyplot's "current" one.
            figure.savefig(filename, bbox_inches="tight", pad_inches=0)
        figure.canvas.draw()
        # Public replacement for the private `canvas.renderer._renderer`;
        # np.array copies the buffer so it survives closing the figure.
        # NOTE(review): buffer_rgba() requires an Agg-based canvas - confirm backend.
        image = np.array(figure.canvas.buffer_rgba())
    finally:
        plt.close(figure)
    return image
def plot_bboxes(subplot, bboxes, color="red", visibility=None):
    """
    Add one unfilled rectangle per visible bounding box to `subplot`.

    Args:
        subplot: Object with an `add_patch` method (a matplotlib axes).
        bboxes: Sequence of boxes indexed as (x1, y1, x2, y2).
        color: Edge colour of the rectangles.
        visibility: Optional per-box flags; a box whose flag equals 0 is
            skipped. When None, every box is drawn.
    """
    flags = [1] * len(bboxes) if visibility is None else visibility
    for position, bbox in enumerate(bboxes):
        if flags[position] == 0:
            continue
        width = bbox[2] - bbox[0]
        height = bbox[3] - bbox[1]
        outline = patches.Rectangle(
            bbox[:2], width, height, linewidth=1, edgecolor=color, facecolor="none", linestyle="solid"
        )
        subplot.add_patch(outline)
def sort_panels(rects):
    """
    Return the indices of `rects` in reading order.

    The rectangles are first eroded by 0.05 to tolerate imperfect
    detections. A directed graph then gets one edge per ordered pair of
    distinct panels, oriented by `is_there_a_directed_edge` and weighted by
    the distance between the two panels. While the graph still contains a
    cycle longer than one node, the heaviest edge of the first cycle (in
    sorted order) is removed. The result is a topological order of what
    remains.
    """
    eroded = [erode_rectangle(rect, 0.05)
              for rect in convert_to_list_of_lists(rects)]
    count = len(eroded)
    graph = nx.DiGraph()
    graph.add_nodes_from(range(count))
    ordered_pairs = ((a, b) for a in range(count) for b in range(count) if a != b)
    for a, b in ordered_pairs:
        # Orient the edge a -> b when the predicate says so, otherwise b -> a.
        source, target = (a, b) if is_there_a_directed_edge(a, b, eroded) else (b, a)
        graph.add_edge(source, target, weight=get_distance(eroded[a], eroded[b]))
    while True:
        proper_cycles = [found for found in sorted(nx.simple_cycles(graph)) if len(found) > 1]
        if not proper_cycles:
            break
        first_cycle = proper_cycles[0]
        # Pair each node with its successor, wrapping around at the end.
        cycle_edges = list(zip(first_cycle, first_cycle[1:] + first_cycle[:1]))
        heaviest_edge = max(cycle_edges, key=lambda edge: graph.edges[edge]["weight"])
        graph.remove_edge(*heaviest_edge)
    return list(nx.topological_sort(graph))
def is_strictly_above(rectA, rectB):
    """Return True when rectA's y2 is smaller than rectB's y1 (boxes are x1, y1, x2, y2)."""
    _, _, _, a_y2 = rectA
    _, b_y1, _, _ = rectB
    return a_y2 < b_y1
def is_strictly_below(rectA, rectB):
    """Return True when rectB's y2 is smaller than rectA's y1 (boxes are x1, y1, x2, y2)."""
    _, a_y1, _, _ = rectA
    _, _, _, b_y2 = rectB
    return b_y2 < a_y1
def is_strictly_left_of(rectA, rectB):
    """Return True when rectA's x2 is smaller than rectB's x1 (boxes are x1, y1, x2, y2)."""
    _, _, a_x2, _ = rectA
    b_x1, _, _, _ = rectB
    return a_x2 < b_x1
def is_strictly_right_of(rectA, rectB):
    """Return True when rectB's x2 is smaller than rectA's x1 (boxes are x1, y1, x2, y2)."""
    a_x1, _, _, _ = rectA
    _, _, b_x2, _ = rectB
    return b_x2 < a_x1
def intersects(rectA, rectB):
    """Return whether the shapely boxes built from the two rectangles intersect."""
    first_box = box(*rectA)
    second_box = box(*rectB)
    return first_box.intersects(second_box)
-def is_there_a_directed_edge(a, b, rects):
-    rectA = rects[a]
-    rectB = rects[b]
-    centre_of_A = [rectA[0] + (rectA[2] - rectA[0]) / 2,
-                   rectA[1] + (rectA[3] - rectA[1]) / 2]
-    centre_of_B = [rectB[0] + (rectB[2] - rectB[0]) / 2,
-                   rectB[1] + (rectB[3] - rectB[1]) / 2]
     if np.allclose(np.array(centre_of_A), np.array(centre_of_B)):
         return box(*rectA).area > (box(*rectB)).area
     copy_A = [rectA[0], rectA[1], rectA[2], rectA[3]]
@@ -283,172 +740,430 @@ def is_there_a_directed_edge(a, b, rects):
         copy_B = erode_rectangle(copy_B, 0.05)
def get_distance(rectA, rectB):
    """Return the Shapely distance between two boxes given as [x1, y1, x2, y2]."""
    shape_a = box(rectA[0], rectA[1], rectA[2], rectA[3])
    shape_b = box(rectB[0], rectB[1], rectB[2], rectB[3])
    return shape_a.distance(shape_b)
def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
    """Decide whether panel ``a`` comes before panel ``b`` by slicing the layout into bands.

    Only the panels that intersect the joint bounding box of ``a`` and ``b``
    are considered. Their y-extents are merged into rows; if ``a`` and ``b``
    fall into different rows, the one in the earlier row wins. Otherwise the
    x-extents are merged into columns, enumerated from the last merged range
    to the first, and the same comparison is made. When neither cut separates
    the two panels, every rectangle is eroded by 5% and the procedure repeats.

    Args:
        a: Index of the first panel in ``rects``.
        b: Index of the second panel in ``rects``.
        rects: Panel boxes as [x1, y1, x2, y2]; the caller's list is not modified.

    Returns:
        True when ``a`` lands in an earlier band than ``b``, False otherwise.
    """
    def band_of_each(members, bands, lo, hi):
        # Map each panel index to the band that fully contains its [lo, hi] extent.
        assignment = {}
        for band_no, (start, stop) in enumerate(bands):
            for panel_idx, rect in members:
                if start <= rect[lo] <= rect[hi] <= stop:
                    assignment[panel_idx] = band_no
        return assignment

    current = deepcopy(rects)
    while True:
        # Joint bounding box of the two panels being compared.
        window = [
            min(current[a][0], current[b][0]),
            min(current[a][1], current[b][1]),
            max(current[a][2], current[b][2]),
            max(current[a][3], current[b][3]),
        ]
        members = [(panel_idx, rect) for panel_idx, rect in enumerate(current)
                   if intersects(rect, window)]

        # First attempt: separate the panels with "horizontal" cuts (rows).
        rows = merge_overlapping_ranges(
            [(top, bottom) for _, (_, top, _, bottom) in members])
        row_of = band_of_each(members, rows, 1, 3)
        if row_of[a] != row_of[b]:
            return row_of[a] < row_of[b]

        # Second attempt: "vertical" cuts (columns), walked in reverse order.
        columns = merge_overlapping_ranges(
            [(left, right) for _, (left, _, right, _) in members])
        column_of = band_of_each(members, columns[::-1], 0, 2)
        if column_of[a] != column_of[b]:
            return column_of[a] < column_of[b]

        # No cut separated them: shrink every rectangle and try again.
        current = [erode_rectangle(rect, 0.05) for rect in current]
def erode_rectangle(bbox, erosion_factor):
    """Shrink a rectangle around its centre.

    The longer side is reduced by ``erosion_factor`` (as a fraction of its
    length); the shorter side is reduced by ``erosion_factor`` scaled by the
    short/long aspect ratio, so it shrinks proportionally less.

    Degenerate boxes whose longer side has zero length are returned
    unchanged (as floats) instead of raising ZeroDivisionError.

    Args:
        bbox: Box as [x1, y1, x2, y2].
        erosion_factor: Fraction of the longer side to remove (e.g. 0.05).

    Returns:
        The eroded box as a new list [x1, y1, x2, y2].
    """
    x1, y1, x2, y2 = bbox
    w, h = x2 - x1, y2 - y1
    cx, cy = x1 + w / 2, y1 + h / 2
    if w < h:
        # Guard the division: a zero-length longer side has no aspect ratio.
        aspect_ratio = w / h if h != 0 else 0.0
        erosion_factor_width = erosion_factor * aspect_ratio
        erosion_factor_height = erosion_factor
    else:
        aspect_ratio = h / w if w != 0 else 0.0
        erosion_factor_width = erosion_factor
        erosion_factor_height = erosion_factor * aspect_ratio
    w = w - w * erosion_factor_width
    h = h - h * erosion_factor_height
    return [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2]
def merge_overlapping_ranges(ranges):
    """Merge overlapping or touching 1-D ranges.

    Args:
        ranges: List of (start, end) tuples in any order.

    Returns:
        List of merged (start, end) tuples sorted by start; empty for empty input.
    """
    if len(ranges) == 0:
        return []
    ordered = iter(sorted(ranges, key=lambda span: span[0]))
    run_start, run_end = next(ordered)
    merged = []
    for start, end in ordered:
        if start > run_end:
            # Gap found: close the current run and open a new one.
            merged.append((run_start, run_end))
            run_start, run_end = start, end
        else:
            run_end = max(run_end, end)
    merged.append((run_start, run_end))
    return merged
def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
    """Order text boxes by the panel they belong to, then by position inside the panel.

    Each text box is mapped to a panel index via ``get_text_to_panel_mapping``;
    texts are grouped by that index in ascending order, and every group is
    ordered by ``sort_texts_within_panel``.

    Args:
        text_bboxes: Text boxes as [x1, y1, x2, y2] (anything accepted by
            ``convert_to_list_of_lists``).
        sorted_panel_bboxes: Panel boxes, already in reading order.

    Returns:
        List of indices into ``text_bboxes`` in reading order; empty when there
        are no text boxes.
    """
    text_bboxes = convert_to_list_of_lists(text_bboxes)
    sorted_panel_bboxes = convert_to_list_of_lists(sorted_panel_bboxes)
    if len(text_bboxes) == 0:
        return []

    panel_of_text = get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes)

    # Stable sort keeps the original relative order of texts sharing a panel.
    by_panel = sorted(range(len(text_bboxes)), key=lambda idx: panel_of_text[idx])

    reading_order = []
    for _, run in groupby(by_panel, key=lambda idx: panel_of_text[idx]):
        members = list(run)
        local_order = sort_texts_within_panel([text_bboxes[idx] for idx in members])
        reading_order.extend(members[pos] for pos in local_order)
    return reading_order
def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
    """Assign every text box to a panel index.

    A text goes to the panel with the largest intersection area; when it
    intersects no panel, it goes to the panel at the smallest Shapely
    distance. With no panels at all, the text is mapped to -1.

    Args:
        text_bboxes: Text boxes as [x1, y1, x2, y2].
        sorted_panel_bboxes: Panel boxes as [x1, y1, x2, y2].

    Returns:
        List with one panel index (or -1) per text box.
    """
    mapping = []
    for text_bbox in text_bboxes:
        text_shape = box(*text_bbox)
        if len(sorted_panel_bboxes) == 0:
            mapping.append(-1)
            continue
        overlaps = []
        gaps = []
        for panel_idx, panel_bbox in enumerate(sorted_panel_bboxes):
            panel_shape = box(*panel_bbox)
            if text_shape.intersects(panel_shape):
                overlaps.append(
                    (text_shape.intersection(panel_shape).area, panel_idx))
            gaps.append((text_shape.distance(panel_shape), panel_idx))
        if len(overlaps) == 0:
            winner = min(gaps, key=lambda pair: pair[0])
        else:
            winner = max(overlaps, key=lambda pair: pair[0])
        mapping.append(winner[1])
    return mapping
def sort_texts_within_panel(rects):
    """Order boxes by their distance to the (largest x2, smallest y1) corner of the group.

    Args:
        rects: Boxes as [x1, y1, x2, y2].

    Returns:
        Indices into ``rects``, nearest to the reference corner first.
    """
    # Seed with the sentinels so the fold matches a running min/max.
    smallest_y = min([float("inf")] + [y1 for _, y1, _, _ in rects])
    greatest_x = max([float("-inf")] + [x2 for _, _, x2, _ in rects])
    reference_point = Point(greatest_x, smallest_y)

    shapes = [box(x1, y1, x2, y2) for x1, y1, x2, y2 in rects]
    # sorted() is stable, so equidistant boxes keep their input order.
    return sorted(range(len(shapes)),
                  key=lambda idx: reference_point.distance(shapes[idx]))
def x1y1wh_to_x1y1x2y2(bbox):
    """Convert a box from [x1, y1, width, height] to [x1, y1, x2, y2]."""
    left, top, width, height = bbox
    right = left + width
    bottom = top + height
    return [left, top, right, bottom]
def x1y1x2y2_to_xywh(bbox):
    """Convert a box from [x1, y1, x2, y2] to [x1, y1, width, height]."""
    left, top, right, bottom = bbox
    width = right - left
    height = bottom - top
    return [left, top, width, height]
-def convert_to_list_of_lists(rects):
     if isinstance(rects, torch.Tensor):
         return rects.tolist()
     if isinstance(rects, np.ndarray):

+"""
+Funkcje pomocnicze dla modelu Magiv2.
+═══════════════════════════════════════════════════════════════════════════════
+STRESZCZENIE ZAWARTOŚCI PLIKU
+═══════════════════════════════════════════════════════════════════════════════
+Ten moduł zawiera narzędzia pomocnicze do przetwarzania i wizualizacji wyników
+modelu Magiv2 dla analizy komiksów/mangi. Plik składa się z 5 głównych kategorii:
+1. ZARZĄDZANIE URZĄDZENIAMI
+   ├─ move_to_device() - Rekurencyjne przenoszenie danych między CPU/GPU
+   │  Obsługuje: dict, list, tuple, numpy.ndarray, torch.Tensor
+   └─ Używane przy każdym wywołaniu modelu do przeniesienia danych na właściwe urządzenie
+2. STRUKTURA UNION-FIND DO KLASTROWANIA (linie ~53-190)
+   ├─ class UnionFind - Disjoint Set Union z kompresją ścieżki i union by size
+   │  ├─ __init__(n) - Inicjalizacja n rozłącznych elementów
+   │  ├─ from_adj_matrix() - Tworzenie z macierzy sąsiedztwa
+   │  ├─ from_adj_list() - Tworzenie z listy sąsiedztwa
+   │  ├─ from_edge_list() - Tworzenie z listy krawędzi
+   │  ├─ find(x) - Znajdowanie korzenia zbioru (z path compression)
+   │  ├─ unite(x, y) - Łączenie zbiorów (z union by size)
+   │  ├─ get_components_of(x) - Wszystkie elementy w zbiorze x
+   │  ├─ are_connected(x, y) - Sprawdzanie czy x i y w tym samym zbiorze
+   │  ├─ get_size(x) - Rozmiar zbioru zawierającego x
+   │  ├─ get_num_components() - Liczba rozłącznych zbiorów
+   │  └─ get_labels_for_connected_components() - Generowanie etykiet klastrów
+   └─ Używane do grupowania postaci na podstawie macierzy podobieństwa
+3. WIZUALIZACJA WYNIKÓW
+   ├─ visualise_single_image_prediction() - Główna funkcja wizualizacji
+   │  ├─ Rysuje bounding boxy: panele (zielone), tekst (czerwone),
+   │  │  postaci (niebieskie), ogony dymków (fioletowe)
+   │  ├─ Wyświetla imiona postaci nad ich bounding boxami
+   │  ├─ Rysuje klastry postaci (ta sama osoba) jako kolorowe linie w układzie gwiazdki
+   │  ├─ Pokazuje asocjacje tekst-postać (kto mówi) - czerwone przerywane linie
+   │  ├─ Pokazuje asocjacje tekst-ogon - fioletowe przerywane linie
+   │  └─ Zwraca obraz jako numpy array lub zapisuje do pliku
+   └─ plot_bboxes() - Pomocnicza funkcja do rysowania prostokątów
+4. SORTOWANIE PANELI I TEKSTÓW W KOLEJNOŚCI CZYTANIA
+   A. Sortowanie paneli (manga: prawo->lewo, góra->dół):
+   ├─ sort_panels() - Główny algorytm sortowania paneli
+   │  ├─ Buduje skierowany graf kolejności czytania
+   │  ├─ Używa erozji paneli (5%) do obsługi niedokładnych detekcji
+   │  ├─ Usuwa cykle przez eliminację najdłuższych krawędzi
+   │  └─ Zwraca sortowanie topologiczne (kolejność czytania)
+   │
+   ├─ is_there_a_directed_edge() - Określa czy panel A jest przed B
+   │  ├─ Reguły mangi: prawo ma priorytet nad górą
+   │  ├─ Obsługuje nakładające się panele przez erozję
+   │  └─ Używa heurystyk cięć (cuts) dla skomplikowanych układów
+   │
+   ├─ use_cuts_to_determine_edge_from_a_to_b() - Zaawansowane heurystyki
+   │  ├─ Dzieli panele na "wiersze" (overlapping Y ranges)
+   │  ├─ Dzieli panele na "kolumny" (overlapping X ranges)
+   │  └─ Iteracyjna erozja gdy nie można określić kolejności
+   │
+   └─ Funkcje pomocnicze geometrii:
+      ├─ is_strictly_above/below/left_of/right_of() - Relacje przestrzenne
+      ├─ intersects() - Sprawdzanie przecięcia prostokątów (Shapely)
+      ├─ get_distance() - Odległość euklidesowa między prostokątami
+      ├─ erode_rectangle() - Zmniejszanie prostokąta z zachowaniem aspect ratio
+      └─ merge_overlapping_ranges() - Scalanie nakładających się zakresów 1D
+   B. Sortowanie tekstów:
+   ├─ sort_text_boxes_in_reading_order() - Sortuje teksty według paneli
+   │  ├─ Przypisuje każdy tekst do najbliższego panelu
+   │  ├─ Sortuje teksty według kolejności paneli
+   │  └─ W każdym panelu sortuje według odległości od prawego górnego rogu
+   │
+   ├─ get_text_to_panel_mapping() - Przypisanie tekst->panel
+   │  ├─ Preferuje nakładanie się (intersection area)
+   │  └─ Fallback: najbliższy panel (distance)
+   │
+   └─ sort_texts_within_panel() - Sortowanie w obrębie jednego panelu
+      └─ Sortuje według odległości od prawego górnego rogu panelu
+5. KONWERSJE FORMATÓW BOUNDING BOXÓW
+   ├─ x1y1wh_to_x1y1x2y2() - (x, y, width, height) -> (x1, y1, x2, y2)
+   ├─ x1y1x2y2_to_xywh() - (x1, y1, x2, y2) -> (x, y, width, height)
+   │  └─ Format COCO używa xywh zamiast corners
+   └─ convert_to_list_of_lists() - Uniwersalna konwersja torch/numpy/list
+═══════════════════════════════════════════════════════════════════════════════
+KLUCZOWE ALGORYTMY
+═══════════════════════════════════════════════════════════════════════════════
+1. UNION-FIND (O(α(n)) - prawie stała):
+   - Path compression: podczas find() ustawiamy rodzica bezpośrednio na korzeń
+   - Union by size: mniejszy zbiór dołączamy do większego dla zbalansowania
+   - Używane do klastrowania postaci z macierzy podobieństwa
+2. SORTOWANIE PANELI (O(n² log n)):
+   - Graf skierowany gdzie krawędź A->B = "A przed B"
+   - Reguły: prawo > góra (manga) lub lewo > góra (komiks zachodni)
+   - Usuwanie cykli przez eliminację najdłuższych krawędzi
+   - Sortowanie topologiczne DAG dla finalnej kolejności
+   - Erozja progresywna (5% na iterację) dla nakładających się paneli
+3. SORTOWANIE TEKSTÓW (O(n log n)):
+   - Przypisanie do paneli: max(intersection_area) lub min(distance)
+   - Sortowanie według ID panelu (panele już posortowane)
+   - W panelu: sortowanie według distance od prawego górnego rogu
+   - Odległość w Shapely: shortest distance między geometriami
+═══════════════════════════════════════════════════════════════════════════════
+ZALEŻNOŚCI ZEWNĘTRZNE
+═══════════════════════════════════════════════════════════════════════════════
+- torch: Tensory GPU/CPU, operacje na urządzeniach
+- numpy: Operacje na tablicach, NDArray typing
+- matplotlib: Wizualizacja (pyplot, patches)
+- shapely: Geometria 2D (Point, box, Polygon) - przecięcia, odległości
+- networkx: Grafy (DiGraph, topological_sort, simple_cycles)
+- typing: Type hints (Any, Dict, List, Tuple, Union, Optional)
+═══════════════════════════════════════════════════════════════════════════════
+TYPOWE UŻYCIE
+═══════════════════════════════════════════════════════════════════════════════
+# 1. Przeniesienie danych na GPU
+inputs = move_to_device({"images": np_array, "labels": [0, 1, 2]}, device)
+# 2. Klastrowanie postaci z macierzy podobieństwa
+uf = UnionFind.from_adj_matrix(similarity_matrix > threshold)
+cluster_labels = uf.get_labels_for_connected_components()
+# 3. Sortowanie paneli w kolejności czytania
+sorted_panel_indices = sort_panels(panel_bboxes)
+# 4. Sortowanie tekstów
+sorted_text_indices = sort_text_boxes_in_reading_order(
+    text_bboxes, sorted_panel_bboxes
+)
+# 5. Wizualizacja wyników
+image = visualise_single_image_prediction(
+    image_array, predictions, filename="output.png"
+)
+# 6. Konwersja formatów bbox
+coco_bbox = x1y1x2y2_to_xywh([10, 20, 30, 40])  # -> [10, 20, 20, 20]
+═══════════════════════════════════════════════════════════════════════════════
+"""
 import torch
 import numpy as np
 import random
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 from shapely.geometry import Point, box
+from shapely.geometry.polygon import Polygon
 import networkx as nx
 from copy import deepcopy
 from itertools import groupby
+from typing import Any, Dict, List, Tuple, Union, Optional
+from numpy.typing import NDArray
def move_to_device(inputs: Any, device: torch.device) -> Any:
    """Recursively move a nested structure onto ``device``.

    Handling per type:
    - mapping-like objects (anything with ``keys``): rebuilt as a plain dict
      with every value moved recursively;
    - lists and tuples: rebuilt with every element moved recursively;
    - ``numpy.ndarray``: converted with ``torch.from_numpy`` and moved;
    - objects exposing ``.to`` (e.g. ``torch.Tensor``): moved with ``.to(device)``;
    - anything else (ints, strings, ``None``, ...): returned unchanged, so
      mixed payloads such as ``{"labels": [0, 1, 2]}`` no longer raise
      ``AttributeError``.

    Args:
        inputs: Data to move.
        device: Target torch device.

    Returns:
        A structure mirroring ``inputs`` with tensor-like leaves on ``device``.
    """
    if hasattr(inputs, "keys"):
        return {k: move_to_device(v, device) for k, v in inputs.items()}
    if isinstance(inputs, list):
        return [move_to_device(v, device) for v in inputs]
    if isinstance(inputs, tuple):
        return tuple(move_to_device(v, device) for v in inputs)
    if isinstance(inputs, np.ndarray):
        return torch.from_numpy(inputs).to(device)
    if hasattr(inputs, "to"):
        return inputs.to(device)
    # Non-tensor leaf: nothing to move.
    return inputs
 class UnionFind:
     """Disjoint-set (union-find) structure with path compression and union by size.

     Attributes:
         parent: ``parent[i]`` is the parent of node ``i``; a root is its own parent.
         size: ``size[r]`` is the number of nodes in the set rooted at ``r``
             (only meaningful for roots).
         num_components: Current number of disjoint sets.
     """

     def __init__(self, n: int) -> None:
         """Create ``n`` singleton sets labelled ``0 .. n-1``."""
         self.parent: List[int] = list(range(n))
         self.size: List[int] = [1] * n
         self.num_components: int = n

     @classmethod
     def from_adj_matrix(cls, adj_matrix: torch.Tensor) -> 'UnionFind':
         """Build the structure from an adjacency matrix.

         Nodes ``i`` and ``j`` are merged whenever ``adj_matrix[i, j] > 0``.

         Args:
             adj_matrix: 2-D matrix indexed as ``adj_matrix[i, j]``; the number
                 of nodes is taken from ``adj_matrix.shape[0]``.

         Returns:
             A new ``UnionFind`` with the connected nodes merged.
         """
         ufds: UnionFind = cls(adj_matrix.shape[0])
         for i in range(adj_matrix.shape[0]):
             for j in range(adj_matrix.shape[1]):
                 if adj_matrix[i, j] > 0:
                     ufds.unite(i, j)
         return ufds

     @classmethod
     def from_adj_list(cls, adj_list: List[List[int]]) -> 'UnionFind':
         """Build the structure from an adjacency list.

         Args:
             adj_list: ``adj_list[i]`` holds the neighbours of node ``i``.

         Returns:
             A new ``UnionFind`` with every node merged with its neighbours.
         """
         ufds: UnionFind = cls(len(adj_list))
         for i in range(len(adj_list)):
             for j in adj_list[i]:
                 ufds.unite(i, j)
         return ufds

     @classmethod
     def from_edge_list(cls, edge_list: List[Tuple[int, int]], num_nodes: int) -> 'UnionFind':
         """Build the structure from a list of edges.

         Args:
             edge_list: Pairs ``(i, j)`` of connected nodes.
             num_nodes: Total number of nodes.

         Returns:
             A new ``UnionFind`` with the endpoints of every edge merged.
         """
         ufds: UnionFind = cls(num_nodes)
         for edge in edge_list:
             ufds.unite(edge[0], edge[1])
         return ufds

     def find(self, x: int) -> int:
         """Return the root of the set containing ``x``.

         Applies path compression: every node visited on the way up is
         re-parented directly to the root.
         """
         if self.parent[x] == x:
             return x
         # Path compression: point x straight at its root.
         self.parent[x] = self.find(self.parent[x])
         return self.parent[x]

     def unite(self, x: int, y: int) -> None:
         """Merge the sets containing ``x`` and ``y`` (no-op if already merged).

         Uses union by size: the smaller set is attached under the larger one.
         """
         x = self.find(x)
         y = self.find(y)
         if x != y:
             # Keep x as the root of the larger set.
             if self.size[x] < self.size[y]:
                 x, y = y, x
             self.parent[y] = x
             self.size[x] += self.size[y]
             self.num_components -= 1

     def get_components_of(self, x: int) -> List[int]:
         """Return, in ascending order, all nodes in the same set as ``x``."""
         x = self.find(x)
         return [i for i in range(len(self.parent)) if self.find(i) == x]

     def are_connected(self, x: int, y: int) -> bool:
         """Return True when ``x`` and ``y`` belong to the same set."""
         return self.find(x) == self.find(y)

     def get_size(self, x: int) -> int:
         """Return the number of nodes in the set containing ``x``."""
         return self.size[self.find(x)]

     def get_num_components(self) -> int:
         """Return the current number of disjoint sets."""
         return self.num_components

     def get_labels_for_connected_components(self) -> List[int]:
         """Return a cluster label for every node.

         Nodes in the same set share a label; labels are consecutive integers
         starting at 0, assigned in the order in which each set is first
         encountered while scanning nodes ``0 .. n-1``.
         """
         map_parent_to_label: Dict[int, int] = {}
         labels: List[int] = []
         for i in range(len(self.parent)):
             parent: int = self.find(i)
             if parent not in map_parent_to_label:
                 map_parent_to_label[parent] = len(map_parent_to_label)
             labels.append(map_parent_to_label[parent])
         return labels
def visualise_single_image_prediction(
    image_as_np_array: NDArray[np.uint8],
    predictions: Dict[str, Any],
    filename: Optional[str]
) -> NDArray[np.uint8]:
    """Draw the model's predictions on top of a page image.

    What gets drawn:
    - green boxes for panels, red boxes for texts (only those flagged in
      ``is_essential_text``), blue boxes for characters, purple boxes for tails;
    - each character name on a blue label just above its box;
    - for every character cluster, a dot at each member's centre and a line
      from the first member to each of the others, one colour per cluster;
    - red dashed lines for text-character associations (skipped when the
      text is not essential) and purple dashed lines for text-tail ones.

    Args:
        image_as_np_array: Page image passed to ``imshow``.
        predictions: Dict with the keys "panels", "texts", "characters",
            "tails", "character_names", "character_cluster_labels",
            "text_character_associations", "text_tail_associations" and
            "is_essential_text". Boxes are [x1, y1, x2, y2].
        filename: Where to save the figure, or None to skip saving.

    Returns:
        The rendered figure as a numpy array.
    """
    def centre_of(bbox):
        # Midpoint of an [x1, y1, x2, y2] box.
        return (bbox[0] + (bbox[2] - bbox[0]) / 2,
                bbox[1] + (bbox[3] - bbox[1]) / 2)

    figure, subplot = plt.subplots(1, 1, figsize=(10, 10))
    subplot.imshow(image_as_np_array)

    # Bounding boxes for every object type.
    plot_bboxes(subplot, predictions["panels"], color="green")
    plot_bboxes(subplot, predictions["texts"], color="red",
                visibility=predictions["is_essential_text"])
    plot_bboxes(subplot, predictions["characters"], color="blue")
    plot_bboxes(subplot, predictions["tails"], color="purple")

    # Character names just above the top-left corner of each box.
    for char_idx, name in enumerate(predictions["character_names"]):
        left, top, _, _ = predictions["characters"][char_idx]
        subplot.text(left, top - 2, name,
                     verticalalignment='bottom', horizontalalignment='left',
                     bbox=dict(facecolor='blue', alpha=1, edgecolor='none'),
                     color='white', fontsize=8)

    # Fixed palette for clusters; random colours are used once it runs out.
    palette: List[str] = [
        "#b7ff51",
        "#f50a8f",
        "#4b13b6",
        "#ddaa34",
        "#bea2a2",
    ]
    next_palette_slot = 0
    cluster_labels: List[int] = predictions["character_cluster_labels"]
    # Most frequent clusters first, so they get the palette colours.
    labels_by_frequency = sorted(
        list(set(cluster_labels)),
        key=lambda x: cluster_labels.count(x),
        reverse=True,
    )

    for label in labels_by_frequency:
        members = [i for i in range(len(predictions["characters"]))
                   if cluster_labels[i] == label]
        # First member is the hub of the star; the rest are its leaves.
        root = members[0] if members else None
        others = members[1:]

        if next_palette_slot < len(palette):
            cluster_colour = palette[next_palette_slot]
            next_palette_slot += 1
        else:
            # Draw random colours until one is not already in the palette.
            cluster_colour = palette[0]
            while cluster_colour in palette:
                cluster_colour = "#" + \
                    "".join([random.choice("0123456789ABCDEF")
                             for _ in range(6)])

        root_x, root_y = centre_of(predictions["characters"][root])
        subplot.plot([root_x], [root_y], color=cluster_colour,
                     marker="o", markersize=5)
        for other in others:
            other_x, other_y = centre_of(predictions["characters"][other])
            subplot.plot([root_x, other_x], [root_y, other_y],
                         color=cluster_colour, linewidth=2)
            subplot.plot([other_x], [other_y], color=cluster_colour,
                         marker="o", markersize=5)

    # Text -> character associations.
    for text_idx, char_idx in predictions["text_character_associations"]:
        text_bbox = predictions["texts"][text_idx]
        char_bbox = predictions["characters"][char_idx]
        if not predictions["is_essential_text"][text_idx]:
            continue
        text_x, text_y = centre_of(text_bbox)
        char_x, char_y = centre_of(char_bbox)
        subplot.plot([text_x, char_x], [text_y, char_y], color="red",
                     linewidth=2, linestyle="dashed")

    # Text -> tail associations.
    for text_idx, tail_idx in predictions["text_tail_associations"]:
        text_bbox = predictions["texts"][text_idx]
        tail_bbox = predictions["tails"][tail_idx]
        text_x, text_y = centre_of(text_bbox)
        tail_x, tail_y = centre_of(tail_bbox)
        subplot.plot([text_x, tail_x], [text_y, tail_y], color="purple",
                     linewidth=2, linestyle="dashed")

    subplot.axis("off")
    if filename is not None:
        plt.savefig(filename, bbox_inches="tight", pad_inches=0)

    # NOTE(review): reads a private renderer attribute; presumably this needs
    # an Agg-based matplotlib backend - confirm before changing backends.
    figure.canvas.draw()
    rendered: NDArray[np.uint8] = np.array(figure.canvas.renderer._renderer)
    plt.close()
    return rendered
def plot_bboxes(
    subplot: Any,
    bboxes: List[List[float]],
    color: str = "red",
    visibility: Optional[List[int]] = None
) -> None:
    """Add one unfilled rectangle per visible box to a matplotlib subplot.

    Args:
        subplot: Matplotlib axes to draw on.
        bboxes: Boxes as [x1, y1, x2, y2].
        color: Edge colour of the rectangles.
        visibility: Optional per-box flags; a box whose flag equals 0 is
            skipped. When None, every box is drawn.
    """
    flags = visibility if visibility is not None else [1] * len(bboxes)
    for position, bbox in enumerate(bboxes):
        if flags[position] != 0:
            width: float = bbox[2] - bbox[0]
            height: float = bbox[3] - bbox[1]
            subplot.add_patch(patches.Rectangle(
                bbox[:2], width, height, linewidth=1, edgecolor=color,
                facecolor="none", linestyle="solid"
            ))
def sort_panels(rects: Union[torch.Tensor, NDArray, List[List[float]]]) -> List[int]:
    """Return panel indices in reading order.

    Steps:
    1. Erode every panel by 5% to tolerate imperfect detections.
    2. Build a directed graph: for each ordered pair (i, j), add i -> j when
       ``is_there_a_directed_edge(i, j, ...)`` holds, otherwise j -> i. Edges
       are weighted by the distance between the two eroded panels.
    3. While the graph has a cycle of length > 1, take the first cycle (in
       sorted order) and remove its heaviest edge.
    4. Topologically sort the resulting acyclic graph.

    Args:
        rects: Panel boxes as [x1, y1, x2, y2].

    Returns:
        Panel indices in reading order.
    """
    panels: List[List[float]] = convert_to_list_of_lists(rects)
    eroded: List[List[float]] = [erode_rectangle(panel, 0.05) for panel in panels]

    graph: nx.DiGraph = nx.DiGraph()
    graph.add_nodes_from(range(len(eroded)))
    for i, rect_i in enumerate(eroded):
        for j, rect_j in enumerate(eroded):
            if i == j:
                continue
            # Orient the edge from the panel read first to the one read second.
            i_comes_first = is_there_a_directed_edge(i, j, eroded)
            source, target = (i, j) if i_comes_first else (j, i)
            graph.add_edge(source, target, weight=get_distance(rect_i, rect_j))

    # Break cycles one at a time by dropping the heaviest edge of the first cycle.
    while True:
        loops: List[List[int]] = [
            loop for loop in sorted(nx.simple_cycles(graph)) if len(loop) > 1]
        if not loops:
            break
        first_loop = loops[0]
        loop_edges: List[Tuple[int, int]] = list(
            zip(first_loop, first_loop[1:] + first_loop[:1]))
        heaviest: Tuple[int, int] = max(
            loop_edges, key=lambda edge: graph.edges[edge]["weight"])
        graph.remove_edge(*heaviest)

    return list(nx.topological_sort(graph))
def is_strictly_above(rectA: List[float], rectB: List[float]) -> bool:
    """Return True when rectA's y2 is strictly smaller than rectB's y1.

    Both boxes are [x1, y1, x2, y2].
    """
    (_, _, _, y2_of_a) = rectA
    (_, y1_of_b, _, _) = rectB
    return y2_of_a < y1_of_b
def is_strictly_below(rectA: List[float], rectB: List[float]) -> bool:
    """Return True when rectA lies entirely below rectB.

    Both boxes are [x1, y1, x2, y2]. rectA is below when rectB's bottom
    edge (y2) is strictly smaller than rectA's top edge (y1); touching
    edges do not count.
    """
    _, top_of_a, _, _ = rectA
    _, _, _, bottom_of_b = rectB
    return bottom_of_b < top_of_a
def is_strictly_left_of(rectA: List[float], rectB: List[float]) -> bool:
    """Return True when rectA lies entirely to the left of rectB.

    Both boxes are [x1, y1, x2, y2]. rectA is to the left when its right
    edge (x2) is strictly smaller than rectB's left edge (x1); touching
    edges do not count.
    """
    _, _, right_of_a, _ = rectA
    left_of_b, _, _, _ = rectB
    return right_of_a < left_of_b
def is_strictly_right_of(rectA: List[float], rectB: List[float]) -> bool:
    """Return True when rectA lies entirely to the right of rectB.

    Both boxes are [x1, y1, x2, y2]. rectA is to the right when rectB's
    right edge (x2) is strictly smaller than rectA's left edge (x1);
    touching edges do not count.
    """
    left_of_a, _, _, _ = rectA
    _, _, right_of_b, _ = rectB
    return right_of_b < left_of_a
def intersects(rectA: List[float], rectB: List[float]) -> bool:
    """Return whether two [x1, y1, x2, y2] boxes intersect, as decided by Shapely."""
    first_polygon = box(*rectA)
    second_polygon = box(*rectB)
    return first_polygon.intersects(second_polygon)
+def is_there_a_directed_edge(a: int, b: int, rects: List[List[float]]) -> bool:
+    """
+    Określa czy panel 'a' powinien być czytany przed panelem 'b'.
+    Używa reguł kolejności czytania mangi (prawo->lewo, góra->dół):
+    - Jeśli A jest na prawo i nie poniżej B -> A przed B
+    - Jeśli A jest nad i nie na lewo od B -> A przed B
+    - Dla nakładających się paneli używa erozji i heurystyk
+    Args:
+        a: Indeks pierwszego panelu
+        b: Indeks drugiego panelu
+        rects: Lista bounding boxów paneli
+    Returns:
+        True jeśli istnieje krawędź a->b (a przed b), False w przeciwnym razie
+    """
+    rectA: List[float] = rects[a]
+    rectB: List[float] = rects[b]
+    # Oblicz centra prostokątów
+    centre_of_A: List[float] = [rectA[0] + (rectA[2] - rectA[0]) / 2,
+                                rectA[1] + (rectA[3] - rectA[1]) / 2]
+    centre_of_B: List[float] = [rectB[0] + (rectB[2] - rectB[0]) / 2,
+                                rectB[1] + (rectB[3] - rectB[1]) / 2]
+    # Jeśli centra są w tym samym miejscu, większy panel jest pierwszy
     if np.allclose(np.array(centre_of_A), np.array(centre_of_B)):
         return box(*rectA).area > (box(*rectB)).area
     copy_A = [rectA[0], rectA[1], rectA[2], rectA[3]]
         copy_B = erode_rectangle(copy_B, 0.05)
def get_distance(rectA: List[float], rectB: List[float]) -> float:
    """
    Return the Shapely distance between two rectangles.

    Args:
        rectA: First rectangle [x1, y1, x2, y2]
        rectB: Second rectangle [x1, y1, x2, y2]

    Returns:
        Distance between the two boxes as computed by Shapely's
        ``distance`` (0 when they intersect).
    """
    first_polygon = box(rectA[0], rectA[1], rectA[2], rectA[3])
    second_polygon = box(rectB[0], rectB[1], rectB[2], rectB[3])
    return first_polygon.distance(second_polygon)
def use_cuts_to_determine_edge_from_a_to_b(a: int, b: int, rects: List[List[float]]) -> bool:
    """
    Decide the reading order of panels `a` and `b` by cutting the page into bands.

    Each round of the loop:
    1. Builds the smallest rectangle enclosing panels `a` and `b` and keeps
       every panel that intersects it.
    2. Horizontal cuts: merges the Y extents of those panels into disjoint
       rows. If `a` and `b` land in different rows, the panel in the row
       with the smaller index (smaller Y) comes first.
    3. Vertical cuts: does the same with X extents, but enumerates the
       columns in reverse so the right-most column gets index 0
       (right-to-left reading). If the columns differ, the smaller index
       comes first.
    4. If neither cut separates the two panels, every panel is eroded by 5%
       and the procedure repeats.

    Args:
        a: Index of the first panel
        b: Index of the second panel
        rects: All panel bounding boxes [x1, y1, x2, y2]

    Returns:
        True when panel `a` should be read before panel `b`, False otherwise.
    """
    # Work on a private copy; the caller's rectangles are never modified.
    boxes = deepcopy(rects)
    # (low coordinate index, high coordinate index, enumerate bands reversed?)
    cut_axes = ((1, 3, False), (0, 2, True))
    while True:
        hull = [
            min(boxes[a][0], boxes[b][0]),
            min(boxes[a][1], boxes[b][1]),
            max(boxes[a][2], boxes[b][2]),
            max(boxes[a][3], boxes[b][3]),
        ]
        # Panels touching the hull, paired with their original indices.
        nearby = [(idx, rect) for idx, rect in enumerate(boxes)
                  if intersects(rect, hull)]
        for lo, hi, reverse_bands in cut_axes:
            bands = merge_overlapping_ranges(
                [(rect[lo], rect[hi]) for _, rect in nearby])
            if reverse_bands:
                bands = bands[::-1]
            band_of_panel = {}
            for band_number, (start, stop) in enumerate(bands):
                for idx, rect in nearby:
                    # Panel extent fits completely inside this band.
                    if start <= rect[lo] <= rect[hi] <= stop:
                        band_of_panel[idx] = band_number
            if band_of_panel[a] != band_of_panel[b]:
                return band_of_panel[a] < band_of_panel[b]
        # Still tied: shrink every panel slightly and try again.
        boxes = [erode_rectangle(rect, 0.05) for rect in boxes]
def erode_rectangle(bbox: List[float], erosion_factor: float) -> List[float]:
    """
    Shrink a box around its centre.

    The longer side is reduced by `erosion_factor`; the shorter side is
    reduced by `erosion_factor` scaled by (shorter / longer), so thin boxes
    lose proportionally less of their short side.

    A box whose longer side has zero length (for example a single point)
    has no defined side ratio; it is handled without raising
    ZeroDivisionError.

    Args:
        bbox: Bounding box [x1, y1, x2, y2]
        erosion_factor: Fraction (0-1) removed from the longer side,
            e.g. 0.05 = 5%

    Returns:
        The shrunken bounding box [x1, y1, x2, y2]
    """
    x1, y1, x2, y2 = bbox
    w, h = x2 - x1, y2 - y1
    cx, cy = x1 + w / 2, y1 + h / 2
    width_is_shorter = w < h
    shorter, longer = (w, h) if width_is_shorter else (h, w)
    # Guard the division: a zero-length longer side means a degenerate box.
    aspect_ratio = shorter / longer if longer != 0 else 1.0
    if width_is_shorter:
        erosion_factor_width = erosion_factor * aspect_ratio
        erosion_factor_height = erosion_factor
    else:
        erosion_factor_width = erosion_factor
        erosion_factor_height = erosion_factor * aspect_ratio
    w = w - w * erosion_factor_width
    h = h - h * erosion_factor_height
    # Rebuild the corners symmetrically around the unchanged centre.
    return [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2]
def merge_overlapping_ranges(ranges: List[Tuple[float, float]]) -> List[Tuple[float, float]]:
    """
    Collapse overlapping or touching 1-D ranges into disjoint ranges.

    Args:
        ranges: (start, end) pairs

    Returns:
        Merged ranges ordered by start. Two ranges are merged whenever the
        next start is not strictly greater than the current end.
    """
    if len(ranges) == 0:
        return []
    ordered = sorted(ranges, key=lambda span: span[0])
    merged: List[Tuple[float, float]] = []
    run_start, run_end = ordered[0]
    for start, end in ordered[1:]:
        if start > run_end:
            # Gap found: close the current run and open a new one.
            merged.append((run_start, run_end))
            run_start, run_end = start, end
        else:
            run_end = max(run_end, end)
    merged.append((run_start, run_end))
    return merged
def sort_text_boxes_in_reading_order(
    text_bboxes: Union[torch.Tensor, NDArray, List[List[float]]],
    sorted_panel_bboxes: Union[torch.Tensor, NDArray, List[List[float]]]
) -> List[int]:
    """
    Order text boxes for reading, panel by panel.

    Each text is mapped to a panel with get_text_to_panel_mapping, texts are
    stably sorted by that panel index, and the texts sharing a panel are
    then ordered by sort_texts_within_panel.

    Args:
        text_bboxes: Text bounding boxes [x1, y1, x2, y2]
        sorted_panel_bboxes: Panel bounding boxes, already in reading order

    Returns:
        Indices into `text_bboxes` in reading order (empty when there are
        no texts).
    """
    text_boxes = convert_to_list_of_lists(text_bboxes)
    panel_boxes = convert_to_list_of_lists(sorted_panel_bboxes)
    if len(text_boxes) == 0:
        return []

    panel_of_text = get_text_to_panel_mapping(text_boxes, panel_boxes)

    def panel_key(text_index):
        return panel_of_text[text_index]

    # Stable sort keeps the original order among texts of the same panel.
    by_panel = sorted(range(len(text_boxes)), key=panel_key)

    reading_order: List[int] = []
    for _, members in groupby(by_panel, key=panel_key):
        members = list(members)
        local_order = sort_texts_within_panel(
            [text_boxes[i] for i in members])
        reading_order.extend(members[i] for i in local_order)
    return reading_order
def get_text_to_panel_mapping(
    text_bboxes: List[List[float]],
    sorted_panel_bboxes: List[List[float]]
) -> List[int]:
    """
    Assign every text box to one panel.

    Rules, in priority order:
    1. If the text intersects at least one panel, pick the panel with the
       largest intersection area (first one wins on ties).
    2. Otherwise pick the panel at the smallest distance (first one wins
       on ties).
    3. If there are no panels at all, the text is mapped to -1.

    Args:
        text_bboxes: Text bounding boxes [x1, y1, x2, y2]
        sorted_panel_bboxes: Panel bounding boxes [x1, y1, x2, y2]

    Returns:
        One panel index per text (same length as `text_bboxes`); -1 means
        no panel was available.
    """
    assignments: List[int] = []
    for text_bbox in text_bboxes:
        text_polygon = box(*text_bbox)
        if len(sorted_panel_bboxes) == 0:
            assignments.append(-1)
            continue

        overlaps = []  # (intersection area, panel index)
        gaps = []      # (distance, panel index)
        for panel_index, panel_bbox in enumerate(sorted_panel_bboxes):
            panel_polygon = box(*panel_bbox)
            if text_polygon.intersects(panel_polygon):
                overlaps.append(
                    (text_polygon.intersection(panel_polygon).area, panel_index))
            # Distance is always recorded as the fallback criterion.
            gaps.append((text_polygon.distance(panel_polygon), panel_index))

        if overlaps:
            chosen = max(overlaps, key=lambda pair: pair[0])[1]
        else:
            chosen = min(gaps, key=lambda pair: pair[0])[1]
        assignments.append(chosen)
    return assignments
def sort_texts_within_panel(rects: List[List[float]]) -> List[int]:
    """
    Order the texts of one panel by distance from the top-right corner.

    The reference point is (largest x2, smallest y1) over all given boxes,
    i.e. the top-right corner of their joint extent. Boxes are sorted by
    their Shapely distance to that point, nearest first; ties keep input
    order.

    Args:
        rects: Text bounding boxes [x1, y1, x2, y2] belonging to one panel

    Returns:
        Indices into `rects` in reading order.
    """
    top_edge = float("inf")
    right_edge = float("-inf")
    shapes = []
    for x1, y1, x2, y2 in rects:
        top_edge = min(top_edge, y1)
        right_edge = max(right_edge, x2)
        shapes.append(box(x1, y1, x2, y2))
    anchor = Point(right_edge, top_edge)
    return sorted(range(len(shapes)),
                  key=lambda idx: anchor.distance(shapes[idx]))
def x1y1wh_to_x1y1x2y2(bbox: List[float]) -> List[float]:
    """
    Convert a box from (x1, y1, width, height) to corner format.

    Args:
        bbox: Bounding box [x1, y1, width, height]

    Returns:
        Bounding box [x1, y1, x2, y2]
    """
    left, top, width, height = bbox
    right = left + width
    bottom = top + height
    return [left, top, right, bottom]
def x1y1x2y2_to_xywh(bbox: List[float]) -> List[float]:
    """
    Convert a box from corner format to (x, y, width, height).

    Args:
        bbox: Bounding box [x1, y1, x2, y2]

    Returns:
        Bounding box [x, y, width, height], anchored at the same (x1, y1).
    """
    left, top, right, bottom = bbox
    width = right - left
    height = bottom - top
    return [left, top, width, height]
+def convert_to_list_of_lists(rects: Union[torch.Tensor, NDArray, List]) -> List[List[float]]:
+    """
+    Konwertuje różne formaty bounding boxów na List[List[float]].
+    Obsługuje:
+    - torch.Tensor -> list
+    - numpy.ndarray -> list
+    - iterable -> list of lists
+    Args:
+        rects: Bounding boxy w dowolnym formacie
+    Returns:
+        Lista list [[x1, y1, x2, y2], ...]
+    """
     if isinstance(rects, torch.Tensor):
         return rects.tolist()
     if isinstance(rects, np.ndarray):

utils_PRE.py ADDED Viewed

	@@ -0,0 +1,456 @@

+import torch
+import numpy as np
+import random
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from shapely.geometry import Point, box
+import networkx as nx
+from copy import deepcopy
+from itertools import groupby
def move_to_device(inputs, device):
    """Recursively move the tensors in a nested structure to `device`.

    Handling per value:
    - objects exposing ``keys`` (mappings): rebuilt as a plain dict with
      every value processed recursively
    - lists and tuples: rebuilt as list / tuple with processed elements
    - numpy arrays: converted with ``torch.from_numpy`` and moved
    - anything with a ``to`` method (e.g. tensors): ``.to(device)``
    - any other leaf (str, int, None, ...): returned unchanged instead of
      failing with AttributeError on the missing ``to`` method

    Args:
        inputs: Arbitrarily nested structure of the values above.
        device: Target device, forwarded to ``.to``.

    Returns:
        A structure of the same shape with tensors on `device`.
    """
    if hasattr(inputs, "keys"):
        return {k: move_to_device(v, device) for k, v in inputs.items()}
    if isinstance(inputs, list):
        return [move_to_device(v, device) for v in inputs]
    if isinstance(inputs, tuple):
        return tuple(move_to_device(v, device) for v in inputs)
    if isinstance(inputs, np.ndarray):
        return torch.from_numpy(inputs).to(device)
    if hasattr(inputs, "to"):
        return inputs.to(device)
    # Non-tensor leaf: nothing to move.
    return inputs
class UnionFind:
    """Disjoint-set structure over the integers 0..n-1 (union by size, path compression)."""

    def __init__(self, n):
        """Start with `n` singleton components."""
        self.parent = list(range(n))
        self.size = [1] * n
        self.num_components = n

    @classmethod
    def from_adj_matrix(cls, adj_matrix):
        """Build from a 2-D matrix (needs ``.shape``); entries > 0 join row and column index."""
        instance = cls(adj_matrix.shape[0])
        for row in range(adj_matrix.shape[0]):
            for col in range(adj_matrix.shape[1]):
                if adj_matrix[row, col] > 0:
                    instance.unite(row, col)
        return instance

    @classmethod
    def from_adj_list(cls, adj_list):
        """Build from an adjacency list: adj_list[i] holds the neighbours of node i."""
        instance = cls(len(adj_list))
        for node, neighbours in enumerate(adj_list):
            for neighbour in neighbours:
                instance.unite(node, neighbour)
        return instance

    @classmethod
    def from_edge_list(cls, edge_list, num_nodes):
        """Build from edges; the first two items of each edge are the joined nodes."""
        instance = cls(num_nodes)
        for edge in edge_list:
            instance.unite(edge[0], edge[1])
        return instance

    def find(self, x):
        """Return the representative of x's component, compressing the visited path."""
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second pass: point every node on the path straight at the root.
        node = x
        while self.parent[node] != root:
            next_node = self.parent[node]
            self.parent[node] = root
            node = next_node
        return root

    def unite(self, x, y):
        """Merge the components of x and y; the smaller tree is attached to the larger."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return
        if self.size[root_x] < self.size[root_y]:
            keeper, absorbed = root_y, root_x
        else:
            keeper, absorbed = root_x, root_y
        self.parent[absorbed] = keeper
        self.size[keeper] += self.size[absorbed]
        self.num_components -= 1

    def get_components_of(self, x):
        """Return, in ascending order, all nodes in the same component as x."""
        root = self.find(x)
        return [node for node in range(len(self.parent)) if self.find(node) == root]

    def are_connected(self, x, y):
        """Return True when x and y share a component."""
        return self.find(x) == self.find(y)

    def get_size(self, x):
        """Return the number of nodes in x's component."""
        return self.size[self.find(x)]

    def get_num_components(self):
        """Return the current number of components."""
        return self.num_components

    def get_labels_for_connected_components(self):
        """Return one label per node; labels are 0, 1, ... in order of first appearance."""
        label_of_root = {}
        labels = []
        for node in range(len(self.parent)):
            root = self.find(node)
            # len() is evaluated before insertion, so new roots get the next free label.
            labels.append(label_of_root.setdefault(root, len(label_of_root)))
        return labels
def visualise_single_image_prediction(image_as_np_array, predictions, filename):
    """Draw the predictions on top of the image and return the rendered canvas.

    Drawn layers, in order: panel / text / character / tail boxes, character
    name tags, one coloured star per character cluster (the first member is
    linked to every other member), dashed red text-to-character links (only
    for texts flagged in ``is_essential_text``) and dashed purple
    text-to-tail links.

    Args:
        image_as_np_array: Image passed to ``imshow``.
        predictions: Dict read with the keys "panels", "texts", "characters",
            "tails", "is_essential_text", "character_names",
            "character_cluster_labels", "text_character_associations" and
            "text_tail_associations".
        filename: Output path for ``savefig``; nothing is saved when None.

    Returns:
        The rendered figure as a numpy array.
    """
    def centre_of(bbox):
        # Centre point of an [x1, y1, x2, y2] box.
        return (bbox[0] + (bbox[2] - bbox[0]) / 2,
                bbox[1] + (bbox[3] - bbox[1]) / 2)

    figure, subplot = plt.subplots(1, 1, figsize=(10, 10))
    subplot.imshow(image_as_np_array)
    plot_bboxes(subplot, predictions["panels"], color="green")
    plot_bboxes(subplot, predictions["texts"], color="red",
                visibility=predictions["is_essential_text"])
    plot_bboxes(subplot, predictions["characters"], color="blue")
    plot_bboxes(subplot, predictions["tails"], color="purple")

    # Name tag just above the top-left corner of each character box.
    for i, name in enumerate(predictions["character_names"]):
        x1, y1, x2, y2 = predictions["characters"][i]
        subplot.text(x1, y1 - 2, name,
                     verticalalignment='bottom', horizontalalignment='left',
                     bbox=dict(facecolor='blue', alpha=1, edgecolor='none'),
                     color='white', fontsize=8)

    palette = [
        "#b7ff51",  # green
        "#f50a8f",  # pink
        "#4b13b6",  # purple
        "#ddaa34",  # orange
        "#bea2a2",  # brown
    ]
    next_palette_slot = 0
    cluster_labels = predictions["character_cluster_labels"]
    labels_by_frequency = sorted(
        set(cluster_labels), key=lambda x: cluster_labels.count(x), reverse=True)
    for label in labels_by_frequency:
        members = [i for i in range(len(predictions["characters"]))
                   if cluster_labels[i] == label]
        root = members[0] if members else None
        others = members[1:]
        if next_palette_slot < len(palette):
            cluster_colour = palette[next_palette_slot]
            next_palette_slot += 1
        else:
            # Palette exhausted: draw random colours until one is not in the palette.
            cluster_colour = palette[0]
            while cluster_colour in palette:
                cluster_colour = "#" + \
                    "".join([random.choice("0123456789ABCDEF")
                            for j in range(6)])
        root_x, root_y = centre_of(predictions["characters"][root])
        subplot.plot([root_x], [root_y], color=cluster_colour,
                     marker="o", markersize=5)
        for j in others:
            # Line from the centre of the first member to the centre of member j.
            other_x, other_y = centre_of(predictions["characters"][j])
            subplot.plot([root_x, other_x], [root_y, other_y],
                         color=cluster_colour, linewidth=2)
            subplot.plot([other_x], [other_y], color=cluster_colour,
                         marker="o", markersize=5)

    for (i, j) in predictions["text_character_associations"]:
        text_bbox = predictions["texts"][i]
        character_bbox = predictions["characters"][j]
        if not predictions["is_essential_text"][i]:
            continue
        x1, y1 = centre_of(text_bbox)
        x2, y2 = centre_of(character_bbox)
        subplot.plot([x1, x2], [y1, y2], color="red",
                     linewidth=2, linestyle="dashed")

    for (i, j) in predictions["text_tail_associations"]:
        x1, y1 = centre_of(predictions["texts"][i])
        x2, y2 = centre_of(predictions["tails"][j])
        subplot.plot([x1, x2], [y1, y2], color="purple",
                     linewidth=2, linestyle="dashed")

    subplot.axis("off")
    if filename is not None:
        plt.savefig(filename, bbox_inches="tight", pad_inches=0)
    figure.canvas.draw()
    # NOTE(review): reads the renderer's private buffer; presumably requires an Agg canvas.
    image = np.array(figure.canvas.renderer._renderer)
    plt.close()
    return image
def plot_bboxes(subplot, bboxes, color="red", visibility=None):
    """Add an outlined rectangle to `subplot` for every visible box.

    Args:
        subplot: Axes-like object; rectangles are added with ``add_patch``.
        bboxes: Boxes as [x1, y1, x2, y2].
        color: Edge colour of the rectangles.
        visibility: Optional per-box flags; a box whose flag equals 0 is
            skipped. All boxes are drawn when None.
    """
    flags = [1] * len(bboxes) if visibility is None else visibility
    for position, bbox in enumerate(bboxes):
        if flags[position] == 0:
            continue
        outline = patches.Rectangle(
            bbox[:2],
            bbox[2] - bbox[0],
            bbox[3] - bbox[1],
            linewidth=1,
            edgecolor=color,
            facecolor="none",
            linestyle="solid",
        )
        subplot.add_patch(outline)
def sort_panels(rects):
    """Return panel indices in reading order.

    Every panel is first eroded by 5% to tolerate imperfect detections. For
    each ordered pair (i, j) a weighted edge is added to a directed graph:
    i -> j when is_there_a_directed_edge says so, otherwise j -> i; the
    weight is the distance between the two eroded boxes. While the graph
    still has cycles, the heaviest edge of the first cycle (in sorted
    order) is removed. The topological order of the resulting graph is
    returned.

    Args:
        rects: Panel boxes [x1, y1, x2, y2] in any format accepted by
            convert_to_list_of_lists.

    Returns:
        List of panel indices in reading order.
    """
    eroded = [erode_rectangle(rect, 0.05)
              for rect in convert_to_list_of_lists(rects)]
    panel_count = len(eroded)

    graph = nx.DiGraph()
    graph.add_nodes_from(range(panel_count))
    for src in range(panel_count):
        for dst in range(panel_count):
            if src == dst:
                continue
            if is_there_a_directed_edge(src, dst, eroded):
                tail, head = src, dst
            else:
                tail, head = dst, src
            graph.add_edge(tail, head,
                           weight=get_distance(eroded[src], eroded[dst]))

    while True:
        cycles = [cycle for cycle in sorted(nx.simple_cycles(graph))
                  if len(cycle) > 1]
        if len(cycles) == 0:
            break
        first_cycle = cycles[0]
        # Consecutive node pairs, wrapping around to close the cycle.
        cycle_edges = list(zip(first_cycle, first_cycle[1:] + first_cycle[:1]))
        heaviest = max(cycle_edges, key=lambda edge: graph.edges[edge]["weight"])
        graph.remove_edge(*heaviest)
    return list(nx.topological_sort(graph))
def is_strictly_above(rectA, rectB):
    """Return True when rectA's bottom edge (y2) is strictly less than rectB's top edge (y1)."""
    _, _, _, bottom_of_a = rectA
    _, top_of_b, _, _ = rectB
    return bottom_of_a < top_of_b
def is_strictly_below(rectA, rectB):
    """Return True when rectB's bottom edge (y2) is strictly less than rectA's top edge (y1)."""
    _, top_of_a, _, _ = rectA
    _, _, _, bottom_of_b = rectB
    return bottom_of_b < top_of_a
def is_strictly_left_of(rectA, rectB):
    """Return True when rectA's right edge (x2) is strictly less than rectB's left edge (x1)."""
    _, _, right_of_a, _ = rectA
    left_of_b, _, _, _ = rectB
    return right_of_a < left_of_b
def is_strictly_right_of(rectA, rectB):
    """Return True when rectB's right edge (x2) is strictly less than rectA's left edge (x1)."""
    left_of_a, _, _, _ = rectA
    _, _, right_of_b, _ = rectB
    return right_of_b < left_of_a
def intersects(rectA, rectB):
    """Return whether two [x1, y1, x2, y2] boxes intersect, as decided by Shapely."""
    first_polygon = box(*rectA)
    second_polygon = box(*rectB)
    return first_polygon.intersects(second_polygon)
def is_there_a_directed_edge(a, b, rects):
    """Decide whether panel `a` should be read before panel `b`.

    Rules, checked in this order on working copies of the two boxes:
    - centres (numerically) coincide: the panel with the larger area is first
    - a strictly above b and not strictly left of it: a first (returns 1)
    - b strictly above a and not strictly left of it: b first (returns 0)
    - a strictly right of b and not strictly below it: a first (returns 1)
    - b strictly right of a and not strictly below it: b first (returns 0)
    - one box is strictly below AND strictly right of the other: defer to
      use_cuts_to_determine_edge_from_a_to_b on the original `rects`
    - otherwise the boxes still overlap: erode both copies by 5% and retry

    Args:
        a: Index of the first panel in `rects`.
        b: Index of the second panel in `rects`.
        rects: Panel boxes [x1, y1, x2, y2].

    Returns:
        A truthy value when there is an edge a -> b, a falsy one otherwise.
    """
    def centre(rect):
        return [rect[0] + (rect[2] - rect[0]) / 2,
                rect[1] + (rect[3] - rect[1]) / 2]

    def stacked_over(upper, lower):
        return is_strictly_above(upper, lower) and not is_strictly_left_of(upper, lower)

    def beside_on_right(right, left):
        return is_strictly_right_of(right, left) and not is_strictly_below(right, left)

    def diagonal_below_right(first, second):
        return is_strictly_below(first, second) and is_strictly_right_of(first, second)

    original_a = rects[a]
    original_b = rects[b]
    if np.allclose(np.array(centre(original_a)), np.array(centre(original_b))):
        return box(*original_a).area > (box(*original_b)).area

    shrunk_a = [original_a[k] for k in range(4)]
    shrunk_b = [original_b[k] for k in range(4)]
    while True:
        if stacked_over(shrunk_a, shrunk_b):
            return 1
        if stacked_over(shrunk_b, shrunk_a):
            return 0
        if beside_on_right(shrunk_a, shrunk_b):
            return 1
        if beside_on_right(shrunk_b, shrunk_a):
            return 0
        if diagonal_below_right(shrunk_a, shrunk_b) or diagonal_below_right(shrunk_b, shrunk_a):
            return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
        # Still overlapping: shrink both copies and test again.
        shrunk_a = erode_rectangle(shrunk_a, 0.05)
        shrunk_b = erode_rectangle(shrunk_b, 0.05)
def get_distance(rectA, rectB):
    """Return the Shapely distance between two [x1, y1, x2, y2] boxes."""
    first_polygon = box(rectA[0], rectA[1], rectA[2], rectA[3])
    second_polygon = box(rectB[0], rectB[1], rectB[2], rectB[3])
    return first_polygon.distance(second_polygon)
def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
    """Decide whether panel `a` precedes panel `b` by cutting the page into bands.

    Each round keeps the panels intersecting the smallest rectangle around
    `a` and `b`, then tries to separate the two:
    - horizontal cuts: Y extents are merged into disjoint rows; if the rows
      differ, the panel in the row with the smaller index is first
    - vertical cuts: X extents are merged into disjoint columns, enumerated
      in reverse (right-most column is index 0); if the columns differ, the
      panel in the column with the smaller index is first
    If both fail, all panels are eroded by 5% and the round is repeated.

    Args:
        a: Index of the first panel in `rects`.
        b: Index of the second panel in `rects`.
        rects: Panel boxes [x1, y1, x2, y2]; not modified.

    Returns:
        True when `a` should be read before `b`, False otherwise.
    """
    def band_lookup(bands, candidates, lo, hi):
        # Map panel index -> number of the band that fully contains its extent.
        lookup = {}
        for band_number, (start, stop) in enumerate(bands):
            for idx, rect in candidates:
                if start <= rect[lo] <= rect[hi] <= stop:
                    lookup[idx] = band_number
        return lookup

    boxes = deepcopy(rects)
    while True:
        hull = [
            min(boxes[a][0], boxes[b][0]),
            min(boxes[a][1], boxes[b][1]),
            max(boxes[a][2], boxes[b][2]),
            max(boxes[a][3], boxes[b][3]),
        ]
        candidates = [(idx, rect) for idx, rect in enumerate(boxes)
                      if intersects(rect, hull)]

        # Try to split the panels with "horizontal" lines.
        rows = merge_overlapping_ranges(
            [(rect[1], rect[3]) for _, rect in candidates])
        row_of = band_lookup(rows, candidates, 1, 3)
        if row_of[a] != row_of[b]:
            return row_of[a] < row_of[b]

        # Try to split the panels with "vertical" lines, right-most first.
        columns = merge_overlapping_ranges(
            [(rect[0], rect[2]) for _, rect in candidates])
        column_of = band_lookup(columns[::-1], candidates, 0, 2)
        if column_of[a] != column_of[b]:
            return column_of[a] < column_of[b]

        # Otherwise, erode the rectangles and try again.
        boxes = [erode_rectangle(rect, 0.05) for rect in boxes]
+def erode_rectangle(bbox, erosion_factor):
+    x1, y1, x2, y2 = bbox
+    w, h = x2 - x1, y2 - y1
+    cx, cy = x1 + w / 2, y1 + h / 2
+    if w < h:
+        aspect_ratio = w / h
+        erosion_factor_width = erosion_factor * aspect_ratio
+        erosion_factor_height = erosion_factor
+    else:
+        aspect_ratio = h / w
+        erosion_factor_width = erosion_factor
+        erosion_factor_height = erosion_factor * aspect_ratio
+    w = w - w * erosion_factor_width
+    h = h - h * erosion_factor_height
+    x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
+    return [x1, y1, x2, y2]
+def merge_overlapping_ranges(ranges):
+    """
+    ranges: list of tuples (x1, x2)
+    """
+    if len(ranges) == 0:
+        return []
+    ranges = sorted(ranges, key=lambda x: x[0])
+    merged_ranges = []
+    for i, r in enumerate(ranges):
+        if i == 0:
+            prev_x1, prev_x2 = r
+            continue
+        x1, x2 = r
+        if x1 > prev_x2:
+            merged_ranges.append((prev_x1, prev_x2))
+            prev_x1, prev_x2 = x1, x2
+        else:
+            prev_x2 = max(prev_x2, x2)
+    merged_ranges.append((prev_x1, prev_x2))
+    return merged_ranges
+def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
+    text_bboxes = convert_to_list_of_lists(text_bboxes)
+    sorted_panel_bboxes = convert_to_list_of_lists(sorted_panel_bboxes)
+    if len(text_bboxes) == 0:
+        return []
+    def indices_of_same_elements(nums):
+        groups = groupby(range(len(nums)), key=lambda i: nums[i])
+        return [list(indices) for _, indices in groups]
+    panel_id_for_text = get_text_to_panel_mapping(
+        text_bboxes, sorted_panel_bboxes)
+    indices_of_texts = list(range(len(text_bboxes)))
+    indices_of_texts, panel_id_for_text = zip(
+        *sorted(zip(indices_of_texts, panel_id_for_text), key=lambda x: x[1]))
+    indices_of_texts = list(indices_of_texts)
+    grouped_indices = indices_of_same_elements(panel_id_for_text)
+    for group in grouped_indices:
+        subset_of_text_indices = [indices_of_texts[i] for i in group]
+        text_bboxes_of_subset = [text_bboxes[i]
+                                 for i in subset_of_text_indices]
+        sorted_subset_indices = sort_texts_within_panel(text_bboxes_of_subset)
+        indices_of_texts[group[0]: group[-1] + 1] = [subset_of_text_indices[i]
+                                                     for i in sorted_subset_indices]
+    return indices_of_texts
+def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
+    text_to_panel_mapping = []
+    for text_bbox in text_bboxes:
+        shapely_text_polygon = box(*text_bbox)
+        all_intersections = []
+        all_distances = []
+        if len(sorted_panel_bboxes) == 0:
+            text_to_panel_mapping.append(-1)
+            continue
+        for j, annotation in enumerate(sorted_panel_bboxes):
+            shapely_annotation_polygon = box(*annotation)
+            if shapely_text_polygon.intersects(shapely_annotation_polygon):
+                all_intersections.append(
+                    (shapely_text_polygon.intersection(shapely_annotation_polygon).area, j))
+            all_distances.append(
+                (shapely_text_polygon.distance(shapely_annotation_polygon), j))
+        if len(all_intersections) == 0:
+            text_to_panel_mapping.append(
+                min(all_distances, key=lambda x: x[0])[1])
+        else:
+            text_to_panel_mapping.append(
+                max(all_intersections, key=lambda x: x[0])[1])
+    return text_to_panel_mapping
+def sort_texts_within_panel(rects):
+    smallest_y = float("inf")
+    greatest_x = float("-inf")
+    for i, rect in enumerate(rects):
+        x1, y1, x2, y2 = rect
+        smallest_y = min(smallest_y, y1)
+        greatest_x = max(greatest_x, x2)
+    reference_point = Point(greatest_x, smallest_y)
+    polygons_and_index = []
+    for i, rect in enumerate(rects):
+        x1, y1, x2, y2 = rect
+        polygons_and_index.append((box(x1, y1, x2, y2), i))
+    # sort points by closest to reference point
+    polygons_and_index = sorted(
+        polygons_and_index, key=lambda x: reference_point.distance(x[0]))
+    indices = [x[1] for x in polygons_and_index]
+    return indices
+def x1y1wh_to_x1y1x2y2(bbox):
+    x1, y1, w, h = bbox
+    return [x1, y1, x1 + w, y1 + h]
+def x1y1x2y2_to_xywh(bbox):
+    x1, y1, x2, y2 = bbox
+    return [x1, y1, x2 - x1, y2 - y1]
+def convert_to_list_of_lists(rects):
+    if isinstance(rects, torch.Tensor):
+        return rects.tolist()
+    if isinstance(rects, np.ndarray):
+        return rects.tolist()
+    return [[a, b, c, d] for a, b, c, d in rects]