Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
"""
SmartEyeSsen Layout Sorter (v.LayoutDetect.2.4 - Tie-breaker in Post-processing)
=================================================================================
Question-layout sorting algorithm (hybrid, based on layout type detection).

The overall page layout type (1-column, 2-column, mixed, ...) is determined
first, and the split strategy (horizontal/vertical) matching that type is
then applied.
When splitting fails (base case), grouping logic specialised for the layout
type is called:
- Standard 1-/2-column: _base_case_standard_1_column
- Mixed: _base_case_mixed_layout
Global orphan-group handling is applied at the final merge.

Algorithm flow (same as v.LayoutDetect.2.1/2.2/2.3):
0. Pre-processing
1. Layout type detection
2. Recursive processing per layout type
3. Base-case processing (including post-processing)
4. Final merge and order assignment

v.LayoutDetect.2.4:
- _post_process_table_figure_assignment: when searching for the best group,
  add a tie-breaker that prefers the later group if the Y distances are equal.
- sort_layout_elements: assign temporary group IDs before calling
  post-processing to improve log readability.
- (v2.3 change kept) _post_process_table_figure_assignment: best-group search
  logic (lookahead).
- (v2.2 change kept) _post_process_table_figure_assignment: distance
  comparison used in the move condition.
- (v2.1 change kept) _post_process_table_figure_assignment: y_diff_threshold
  defaults to 150.
- (v2.1 change kept) _base_case_standard_1_column: logic separating top
  orphan elements.
"""
# Import required libraries
| from typing import List, Dict, Tuple, Optional, Any, Union, TYPE_CHECKING | |
| from dataclasses import dataclass, field | |
| import numpy as np | |
| from sklearn.cluster import KMeans | |
| from loguru import logger | |
| import math | |
| from enum import Enum, auto | |
| import os | |
# Mock model import (kept for compatibility, scheduled for removal)
| from .mock_models import MockElement | |
| if TYPE_CHECKING: | |
| from sqlalchemy.orm import Session | |
| from ..models import LayoutElement | |
# ============================================================================
# Data classes and Enum definitions (unchanged from the previous version)
# ============================================================================
class LayoutType(Enum):
    """Page-level layout categories used to pick a splitting strategy.

    Member values are the consecutive integers starting at 1.
    """

    # Anchors do not form two clearly separated X clusters.
    STANDARD_1_COLUMN = 1
    # Anchors form two X clusters in both the upper and lower page parts.
    STANDARD_2_COLUMN = 2
    # Upper part single-column, lower part multi-column.
    MIXED_TOP1_BOTTOM2 = 3
    # Upper part multi-column, lower part single-column.
    MIXED_TOP2_BOTTOM1 = 4
    # A wide "question_type" element was found near the top of the page.
    HORIZONTAL_SEP_PRESENT = 5
    # Plain (Y, X) reading order, selected via document_type="reading_order".
    READING_ORDER = 6
    # Detection was inconclusive.
    UNKNOWN = 7
@dataclass
class Zone:
    """Axis-aligned rectangular region of a page.

    The rest of the module builds zones both positionally and by keyword
    (``Zone(x_min=0, ...)``) and reads ``zone.width`` / ``zone.height`` as
    plain attributes, so the class must be a dataclass and the two sizes
    must be properties.
    """

    x_min: int
    y_min: int
    x_max: int
    y_max: int

    @property
    def width(self) -> int:
        """Horizontal extent, clamped to 0 for inverted bounds."""
        return max(0, self.x_max - self.x_min)

    @property
    def height(self) -> int:
        """Vertical extent, clamped to 0 for inverted bounds."""
        return max(0, self.y_max - self.y_min)

    def __repr__(self) -> str:
        # Half-open interval notation; an explicit __repr__ takes precedence
        # over the one @dataclass would generate.
        return f"Zone(x=[{self.x_min}, {self.x_max}), y=[{self.y_min}, {self.y_max}))"
@dataclass
class HorizontalSplit:
    """Result of a horizontal split around a separator element.

    Constructed positionally as ``HorizontalSplit(top_zone, bottom_zone,
    separator)``, which requires the dataclass-generated ``__init__``.
    Annotations are forward references so the class body does not depend on
    the order in which the referenced names are defined or imported.
    """

    # Region above the separator.
    top_zone: "Zone"
    # Region below the separator.
    bottom_zone: "Zone"
    # Element that acts as the dividing line between the two zones.
    separator_element: "MockElement"
@dataclass
class HorizontalSplitYGap:
    """Result of a horizontal split placed in a vertical gap between anchors.

    Constructed positionally as ``HorizontalSplitYGap(top_zone, bottom_zone,
    split_y)``, which requires the dataclass-generated ``__init__``.
    """

    # Region above the split line.
    top_zone: "Zone"
    # Region below the split line.
    bottom_zone: "Zone"
    # Y coordinate of the split line (not rounded).
    split_y: float
@dataclass
class VerticalSplit:
    """Result of a vertical (column) split.

    Constructed positionally as ``VerticalSplit(left_zone, right_zone,
    gutter_x)``, which requires the dataclass-generated ``__init__``.
    """

    # Region left of the gutter.
    left_zone: "Zone"
    # Region right of the gutter.
    right_zone: "Zone"
    # X coordinate separating the two columns (not rounded).
    gutter_x: float
@dataclass
class ElementGroup:
    """An optional anchor element together with the children attached to it.

    A group whose ``anchor`` is ``None`` is treated as an "orphan" group by
    the rest of the module. The class has to be a dataclass: callers build it
    with keywords (``ElementGroup(anchor=..., children=[...])``) and
    ``children`` relies on ``field(default_factory=list)`` to give every
    instance its own list.
    """

    # Leading element of the group, or None for an orphan group.
    anchor: Optional["MockElement"]
    # Elements attached to the anchor; a fresh list per instance.
    children: List["MockElement"] = field(default_factory=list)
    # Provisional ID before post-processing; final ID assigned when flattening.
    group_id: int = -1

    def add_child(self, child: "MockElement"):
        """Append *child* to this group's children."""
        self.children.append(child)

    def get_all_elements_sorted(self) -> List["MockElement"]:
        """Return the group's elements in output order.

        The anchor, when present, always comes first; the children follow,
        sorted by ``(y_position, x_position)``.
        """
        elements = [self.anchor] if self.anchor else []
        elements.extend(
            sorted(self.children, key=lambda e: (e.y_position, e.x_position))
        )
        return elements

    def is_empty(self) -> bool:
        """Return True when the group has neither an anchor nor children."""
        return self.anchor is None and not self.children

    def __repr__(self) -> str:
        anchor_id = self.anchor.element_id if self.anchor else "Orphan"
        child_ids = sorted(c.element_id for c in self.children)
        # group_id may still be the provisional value before flattening.
        return f"Group(ID:{self.group_id}, Anchor: {anchor_id}, Children: {child_ids})"
# ============================================================================
# Constant definitions (unchanged from the previous version)
# ============================================================================
# Class names that may act as the anchor of a group.
# NOTE(review): the separator finders in this module compare class_name
# against "question_type" (underscore) while this list spells it
# "question type" (space) - confirm which spelling the detector emits.
ALLOWED_ANCHORS = [
    "question type",
    "question number",
    "second_question_number",
]
# Class names that may be attached to an anchor as children.
ALLOWED_CHILDREN = [
    "question text",
    "list",
    "choices",
    "figure",
    "table",
    "flowchart",
]
ALLOWED_CLASSES = [*ALLOWED_ANCHORS, *ALLOWED_CHILDREN]

# Minimum width / zone-width ratio for an element to count as a separator.
HORIZONTAL_SEP_WIDTH_THRESHOLD = 0.8
# Fraction of the page height that defines the "top zone" for separators.
HORIZONTAL_SEP_Y_POS_THRESHOLD = 0.15
# Fewer anchors than this and no split is attempted.
MIN_ANCHORS_FOR_SPLIT = 2

# Y-gap split threshold: max(average anchor height * ratio, absolute value).
VERTICAL_GAP_THRESHOLD_RATIO = 1.5
VERTICAL_GAP_THRESHOLD_ABS = 100

# K-Means column detection on anchor X centres.
KMEANS_N_CLUSTERS = 2
KMEANS_CLUSTER_SEPARATION_MIN = 50

# Layout detection: page-height fraction separating "top" from "bottom"
# anchors, and the X std-dev (as a fraction of page width) above which a
# part counts as multi-column.
LAYOUT_DETECT_Y_SPLIT_POINT = 0.4
LAYOUT_DETECT_X_STD_THRESHOLD_RATIO = 0.1

HORIZONTAL_ADJACENCY_Y_CENTER_RATIO = 0.7
HORIZONTAL_ADJACENCY_X_PROXIMITY = 50
BASE_CASE_TOP_ORPHAN_THRESHOLD_RATIO = 0.15

# Table/figure post-processing: closeness ratio and number of following
# groups inspected.
POST_PROCESS_CLOSENESS_RATIO = 0.5
POST_PROCESS_LOOKAHEAD = 2

# Constants for 2D-distance based grouping
ANCHOR_VERTICAL_PROXIMITY_THRESHOLD = 250  # px - Y-distance threshold to the anchor
ANCHOR_2D_DISTANCE_WEIGHT_X = 0.2  # weight of the X distance (kept low)
ANCHOR_2D_DISTANCE_WEIGHT_Y = 1.0  # weight of the Y distance
# ============================================================================
# Main function: sort after detecting the layout type (modified)
# ============================================================================
def _v24_yx_singleton_groups(elements: List[MockElement]) -> List[ElementGroup]:
    """Wrap every element in its own anchor-less group, ordered by (Y, X)."""
    ordered = sorted(elements, key=lambda e: (e.y_position, e.x_position))
    return [ElementGroup(anchor=None, children=[elem]) for elem in ordered]


def _v24_group_top_y(group: ElementGroup) -> float:
    """Return the smallest child Y of *group*, or +inf if it has no children."""
    if group.children:
        return min(c.y_position for c in group.children)
    return float("inf")


def _sort_layout_elements_v24(
    elements: List[MockElement],
    document_type: str = "question_based",
    page_width: Optional[int] = None,
    page_height: Optional[int] = None,
) -> List[MockElement]:
    """Sort layout elements with the v.LayoutDetect.2.4 strategy.

    Steps:
      0. Pre-process the elements via ``preprocess_elements``.
      1. Determine the layout type (``"reading_order"`` documents skip
         detection and are ordered purely by (Y, X)).
      2. Group the elements with the routine matching the layout type.
      3. For ``"question_based"`` documents, assign provisional group IDs
         (used for logging) and run the table/figure post-processing on the
         anchored groups only.
      4. Put orphan groups (no anchor) first, ordered by their top Y, keep the
         anchored groups in the order produced by grouping, then flatten.

    Any exception raised during steps 1-3 is logged with its traceback and
    the function falls back to a plain (Y, X) ordering.

    Args:
        elements: Elements to sort.
        document_type: ``"question_based"`` (default) or ``"reading_order"``.
        page_width: Page width; computed from the elements when ``None``.
        page_height: Page height; computed from the elements when ``None``.

    Returns:
        The flattened, ordered list of elements; ``[]`` when nothing remains
        after pre-processing or grouping.
    """
    logger.info(
        f"๋ง์ถคํ ์ ๋ ฌ(v.LayoutDetect.2.4) ์์: {len(elements)}๊ฐ ์์, ํ์ ={document_type}"
    )
    filtered_elements = preprocess_elements(elements, document_type)
    if not filtered_elements:
        logger.warning("์ ์ฒ๋ฆฌ ํ ์ ๋ ฌํ ์์๊ฐ ์์ต๋๋ค.")
        return []

    if page_width is None:
        page_width = calculate_page_width(filtered_elements)
    if page_height is None:
        page_height = calculate_page_height(filtered_elements)
    logger.info(f"ํ์ด์ง ํฌ๊ธฐ: {page_width} x {page_height}")

    initial_zone = Zone(x_min=0, y_min=0, x_max=page_width, y_max=page_height)
    grouped_results: List[ElementGroup] = []
    try:
        if document_type == "reading_order":
            layout_type = LayoutType.READING_ORDER
            logger.info(f"ํ๋ณ๋ ๋ ์ด์์ ์ ํ: {layout_type.name} (๋ฌธ์ ํ์ ์ง์ )")
            grouped_results = _v24_yx_singleton_groups(filtered_elements)
        else:
            layout_type = detect_layout_type(filtered_elements, page_width, page_height)
            logger.info(f"ํ๋ณ๋ ๋ ์ด์์ ์ ํ: {layout_type.name}")
            if layout_type == LayoutType.STANDARD_1_COLUMN:
                logger.debug(
                    f"{layout_type.name}: ๋ถํ ์์ด ์ ์ฒด ๊ตฌ์ญ ํ์ค 1๋จ Base Case ์คํ"
                )
                grouped_results = _base_case_standard_1_column(
                    initial_zone, filtered_elements
                )
            elif layout_type == LayoutType.STANDARD_2_COLUMN:
                grouped_results = _sort_standard_2_column(
                    initial_zone, filtered_elements
                )
            elif layout_type in [
                LayoutType.HORIZONTAL_SEP_PRESENT,
                LayoutType.MIXED_TOP1_BOTTOM2,
                LayoutType.MIXED_TOP2_BOTTOM1,
                LayoutType.UNKNOWN,
            ]:
                grouped_results = _sort_recursive_by_layout(
                    initial_zone, filtered_elements, layout_type, depth=0
                )
            else:
                logger.error(
                    f"์ฒ๋ฆฌํ ์ ์๋ ๋ ์ด์์ ์ ํ: {layout_type.name}. (Y,X) ์ ๋ ฌ๋ก ๋์ฒดํฉ๋๋ค."
                )
                grouped_results = _v24_yx_singleton_groups(filtered_elements)

        # Provisional group IDs are assigned before post-processing purely so
        # that the post-processing log lines can name the groups involved.
        if grouped_results and document_type == "question_based":
            logger.debug("ํ์ฒ๋ฆฌ ์ ์์ ๊ทธ๋ฃน ID ํ ๋น...")
            temp_orphan_groups = [g for g in grouped_results if g.anchor is None]
            temp_non_orphan_groups = [
                g for g in grouped_results if g.anchor is not None
            ]
            # Orphans are numbered first, in top-to-bottom order.
            temp_orphan_groups.sort(key=_v24_group_top_y)
            next_temp_id = 0
            for group in temp_orphan_groups:
                group.group_id = next_temp_id
                next_temp_id += 1
            # Anchored groups continue the numbering in their current order.
            for group in temp_non_orphan_groups:
                group.group_id = next_temp_id
                next_temp_id += 1

            # Post-processing only operates on groups that have an anchor.
            logger.debug(
                f"{len(temp_non_orphan_groups)}๊ฐ ์ต์ปค ๊ทธ๋ฃน ๋์ ํ์ฒ๋ฆฌ ์คํ..."
            )
            processed_non_orphan_groups = _post_process_table_figure_assignment(
                temp_non_orphan_groups
            )
            grouped_results = temp_orphan_groups + processed_non_orphan_groups
            logger.debug("ํ์ฒ๋ฆฌ ๋ฐ ์์ ๊ทธ๋ฃน ID ํ ๋น ์๋ฃ.")
    except Exception as e:
        # loguru has no `exc_info` keyword: passing it attaches no traceback
        # and makes loguru run str.format() on the already-interpolated
        # message. `opt(exception=True)` is the supported way to log it.
        logger.opt(exception=True).error(
            f"๋ง์ถคํ ์ ๋ ฌ ์ค ์ฌ๊ฐํ ์ค๋ฅ ๋ฐ์: {e}. (Y,X) ์ขํ ์ ๋ ฌ๋ก ๋์ฒดํฉ๋๋ค."
        )
        grouped_results = _v24_yx_singleton_groups(filtered_elements)

    if not grouped_results:
        logger.warning("๊ทธ๋ฃนํ ๊ฒฐ๊ณผ๊ฐ ๋น์ด ์์ต๋๋ค.")
        return []

    # Final merge: orphan groups first, anchored groups afterwards.
    orphan_groups = [g for g in grouped_results if g.anchor is None]
    non_orphan_groups = [g for g in grouped_results if g.anchor is not None]
    final_ordered_groups: List[ElementGroup] = []
    if orphan_groups:
        orphan_groups.sort(key=_v24_group_top_y)
        logger.debug(
            f"์ ์ญ ๊ณ ์ ๊ทธ๋ฃน {len(orphan_groups)}๊ฐ (Y ์ขํ ์ ๋ ฌ๋จ) ๋ฆฌ์คํธ ๋งจ ์์ผ๋ก ์ด๋"
        )
        final_ordered_groups.extend(orphan_groups)
    else:
        logger.debug("์ ์ญ ๊ณ ์ ๊ทธ๋ฃน ์์")
    # Anchored groups keep the order decided by the base case / recursion.
    final_ordered_groups.extend(non_orphan_groups)

    # Assign the final order and group IDs.
    final_sorted_elements, _, _ = flatten_groups_and_assign_order(
        final_ordered_groups, start_global_order=0, start_group_id=0
    )
    logger.info(f"๋ง์ถคํ ์ ๋ ฌ ์๋ฃ: {len(final_sorted_elements)}๊ฐ ์์")
    return final_sorted_elements
def _use_adaptive_strategy() -> bool:
    """Tell whether the USE_ADAPTIVE_SORTER environment flag is switched on.

    The flag counts as enabled when its lower-cased value is "1", "true" or
    "yes"; an unset variable behaves like "false".
    """
    enabled_values = {"1", "true", "yes"}
    raw_flag = os.getenv("USE_ADAPTIVE_SORTER", "false")
    return raw_flag.lower() in enabled_values
def sort_layout_elements(
    elements: List[MockElement],
    document_type: str = "question_based",
    page_width: Optional[int] = None,
    page_height: Optional[int] = None,
    page_dpi: Optional[float] = None,
) -> List[MockElement]:
    """Public entry point for layout sorting.

    When the adaptive-strategy flag is enabled the call is delegated to the
    adaptive entry point in ``sorter_strategies`` (imported lazily);
    otherwise the v2.4 core implementation is used as is. ``page_dpi`` is
    only forwarded to the adaptive entry point.
    """
    if not _use_adaptive_strategy():
        return _sort_layout_elements_v24(
            elements=elements,
            document_type=document_type,
            page_width=page_width,
            page_height=page_height,
        )

    # Imported only when the adaptive path is actually taken.
    from .sorter_strategies import sort_layout_elements_adaptive

    return sort_layout_elements_adaptive(
        elements=elements,
        document_type=document_type,
        page_width=page_width,
        page_height=page_height,
        force_strategy=None,
        page_dpi=page_dpi,
    )
# ============================================================================
# Layout type detection function (unchanged from the previous version)
# ============================================================================
def detect_layout_type(
    elements: List[MockElement], page_width: int, page_height: int
) -> LayoutType:
    """Classify the page layout from the distribution of anchor elements.

    Decision order:
      1. Too few anchors -> STANDARD_1_COLUMN.
      2. A wide "question_type" element in the top zone -> HORIZONTAL_SEP_PRESENT.
      3. K-Means (2 clusters) on anchor X centres; if the cluster centres are
         not far enough apart -> STANDARD_1_COLUMN.
      4. Otherwise anchors are divided into an upper and a lower part and the
         X spread of each part decides between STANDARD_2_COLUMN, the two
         MIXED types and UNKNOWN.
    """
    anchors = [e for e in elements if e.class_name in ALLOWED_ANCHORS]
    if len(anchors) < MIN_ANCHORS_FOR_SPLIT:
        logger.debug(
            f"๋ ์ด์์ ํ๋ณ: ์ต์ปค ์({len(anchors)}) ๋ถ์กฑ -> STANDARD_1_COLUMN"
        )
        return LayoutType.STANDARD_1_COLUMN

    top_zone_height = page_height * HORIZONTAL_SEP_Y_POS_THRESHOLD
    wide_q_type = find_wide_question_type(elements, page_width, top_zone_height)
    if wide_q_type:
        logger.debug(
            f"๋ ์ด์์ ํ๋ณ: ๋์ question_type(ID:{wide_q_type.element_id}) ์กด์ฌ -> HORIZONTAL_SEP_PRESENT"
        )
        return LayoutType.HORIZONTAL_SEP_PRESENT

    # One row per anchor, single feature: the horizontal centre.
    anchor_x_centers = np.array([[a.bbox_x + a.bbox_width / 2] for a in anchors])
    is_clearly_2_column = False
    if len(np.unique(anchor_x_centers)) >= 2:
        try:
            kmeans = KMeans(
                n_clusters=KMEANS_N_CLUSTERS, random_state=42, n_init="auto"
            )
            kmeans.fit(anchor_x_centers)
            centers = sorted(kmeans.cluster_centers_.flatten())
            far_apart = (
                len(centers) == 2
                and centers[1] - centers[0] >= KMEANS_CLUSTER_SEPARATION_MIN
            )
            if far_apart:
                is_clearly_2_column = True
                logger.trace(
                    f"๋ ์ด์์ ํ๋ณ: ์ ์ฒด X ๋ถํฌ๋ 2๋จ ๊ตฌ์กฐ ๊ฐ๋ฅ์ฑ ๋์ (Centers: {centers})"
                )
            else:
                logger.trace(f"๋ ์ด์์ ํ๋ณ: ์ ์ฒด X ๋ถํฌ๋ 1๋จ ๊ตฌ์กฐ ๋๋ ๋ถ๋ถ๋ช ")
        except Exception as e:
            # A clustering failure is treated as "not two columns".
            logger.warning(f"๋ ์ด์์ ํ๋ณ ์ค K-Means ์ค๋ฅ ๋ฐ์: {e}")

    if not is_clearly_2_column:
        logger.debug("๋ ์ด์์ ํ๋ณ: ์ ์ฒด 1๋จ ๊ตฌ์กฐ -> STANDARD_1_COLUMN")
        return LayoutType.STANDARD_1_COLUMN

    # Partition anchors by the vertical centre relative to the split line.
    split_y = page_height * LAYOUT_DETECT_Y_SPLIT_POINT
    top_anchors = []
    bottom_anchors = []
    for a in anchors:
        center_y = a.y_position + a.bbox_height / 2
        if center_y < split_y:
            top_anchors.append(a)
        elif center_y >= split_y:
            bottom_anchors.append(a)

    if not top_anchors or not bottom_anchors:
        logger.debug("๋ ์ด์์ ํ๋ณ: ์/ํ๋จ ์ต์ปค ๊ทธ๋ฃน ๋ถ์์ -> STANDARD_2_COLUMN")
        return LayoutType.STANDARD_2_COLUMN

    # Both parts are non-empty here, so the centre arrays are never empty.
    top_x_centers = np.array([[a.bbox_x + a.bbox_width / 2] for a in top_anchors])
    bottom_x_centers = np.array(
        [[a.bbox_x + a.bbox_width / 2] for a in bottom_anchors]
    )
    x_std_threshold = page_width * LAYOUT_DETECT_X_STD_THRESHOLD_RATIO
    top_is_multi_column = (
        top_x_centers.size > 1 and np.std(top_x_centers) > x_std_threshold
    )
    bottom_is_multi_column = (
        bottom_x_centers.size > 1 and np.std(bottom_x_centers) > x_std_threshold
    )

    if top_is_multi_column:
        if bottom_is_multi_column:
            logger.debug(
                f"๋ ์ด์์ ํ๋ณ: ์๋จ({len(top_anchors)}๊ฐ) 2๋จ, ํ๋จ({len(bottom_anchors)}๊ฐ) 2๋จ -> STANDARD_2_COLUMN"
            )
            return LayoutType.STANDARD_2_COLUMN
        logger.debug(
            f"๋ ์ด์์ ํ๋ณ: ์๋จ({len(top_anchors)}๊ฐ) 2๋จ, ํ๋จ({len(bottom_anchors)}๊ฐ) 1๋จ -> MIXED_TOP2_BOTTOM1"
        )
        return LayoutType.MIXED_TOP2_BOTTOM1

    if bottom_is_multi_column:
        logger.debug(
            f"๋ ์ด์์ ํ๋ณ: ์๋จ({len(top_anchors)}๊ฐ) 1๋จ, ํ๋จ({len(bottom_anchors)}๊ฐ) 2๋จ -> MIXED_TOP1_BOTTOM2"
        )
        return LayoutType.MIXED_TOP1_BOTTOM2

    # Globally two clusters, yet neither part looks multi-column.
    logger.warning(
        f"๋ ์ด์์ ํ๋ณ: ์/ํ๋จ ๋ชจ๋ 1๋จ์ผ๋ก ๋ณด์ด๋ ์ ์ฒด๋ 2๋จ ๊ตฌ์กฐ? -> UNKNOWN"
    )
    return LayoutType.UNKNOWN
# ============================================================================
# Recursive sorting function (unchanged from the previous version)
# ============================================================================
def _sort_recursive_by_layout(
    current_zone: Zone,
    elements_in_zone: List[MockElement],
    layout_type: LayoutType,
    depth: int,
) -> List[ElementGroup]:
    """Recursively split a zone, trying split strategies in a layout-specific order.

    Strategy order per layout type:
      - HORIZONTAL_SEP_PRESENT / UNKNOWN: separator type -> vertical K-Means -> Y gap
      - MIXED_TOP1_BOTTOM2 / MIXED_TOP2_BOTTOM1: Y gap -> separator type -> vertical K-Means
      - STANDARD_2_COLUMN: handed directly to ``_sort_standard_2_column``
      - anything else: no split is attempted

    After a successful split each part gets its layout type re-detected and
    is processed recursively. When no split succeeds, the base-case grouping
    matching ``layout_type`` is run.
    """
    indent = " " * depth
    logger.debug(
        f"{indent}[Depth {depth}, Type: {layout_type.name}] ๊ตฌ์ญ ์ฒ๋ฆฌ ์์: {current_zone}, ์์ ์={len(elements_in_zone)}"
    )

    # --- trivial zones ---------------------------------------------------
    if not elements_in_zone:
        logger.trace(f"{indent} -> ๋น ๊ตฌ์ญ")
        return []
    if len(elements_in_zone) == 1:
        element = elements_in_zone[0]
        logger.trace(f"{indent} -> ์์ 1๊ฐ")
        if element.class_name in ALLOWED_ANCHORS:
            return [ElementGroup(anchor=element)]
        return [ElementGroup(anchor=None, children=[element])]

    if layout_type == LayoutType.STANDARD_2_COLUMN:
        logger.debug(f"{indent} -> {layout_type.name}: ํ์ค 2๋จ ์ฒ๋ฆฌ ํจ์ ์ง์ ํธ์ถ")
        return _sort_standard_2_column(current_zone, elements_in_zone)

    # --- split strategies --------------------------------------------------
    def _try_separator_type():
        return find_horizontal_split_by_type(current_zone, elements_in_zone)

    def _try_vertical():
        anchors = [e for e in elements_in_zone if e.class_name in ALLOWED_ANCHORS]
        return find_vertical_split_kmeans(current_zone, anchors)

    def _try_y_gap():
        return find_horizontal_split_by_y_gap(current_zone, elements_in_zone)

    if layout_type in (LayoutType.HORIZONTAL_SEP_PRESENT, LayoutType.UNKNOWN):
        attempts = (
            ("H_Type", _try_separator_type),
            ("Vertical", _try_vertical),
            ("H_YGap", _try_y_gap),
        )
    elif layout_type in (
        LayoutType.MIXED_TOP1_BOTTOM2,
        LayoutType.MIXED_TOP2_BOTTOM1,
    ):
        attempts = (
            ("H_YGap", _try_y_gap),
            ("H_Type", _try_separator_type),
            ("Vertical", _try_vertical),
        )
    else:
        attempts = ()

    split_result: Optional[
        Union[HorizontalSplit, HorizontalSplitYGap, VerticalSplit]
    ] = None
    split_type = "None"
    for label, attempt in attempts:
        split_result = attempt()
        if split_result:
            split_type = label
            break

    # --- base case: nothing could be split ----------------------------------
    if not split_result:
        logger.debug(
            f"{indent} -> ๋ชจ๋ ๋ถํ ์คํจ, ๋ ์ด์์ ์ ํ({layout_type.name})์ ๋ฐ๋ฅธ Base Case ์คํ"
        )
        result_groups: List[ElementGroup] = []
        if layout_type == LayoutType.STANDARD_1_COLUMN:
            result_groups = _base_case_standard_1_column(current_zone, elements_in_zone)
        elif layout_type in (
            LayoutType.MIXED_TOP1_BOTTOM2,
            LayoutType.MIXED_TOP2_BOTTOM1,
        ):
            result_groups = _base_case_mixed_layout(
                current_zone, elements_in_zone, layout_type
            )
        elif layout_type in (
            LayoutType.HORIZONTAL_SEP_PRESENT,
            LayoutType.UNKNOWN,
        ):
            logger.warning(
                f"{indent} -> {layout_type.name} ์ ํ ๋ถํ ์คํจ. 1๋จ Base Case๋ก ์ฒ๋ฆฌํฉ๋๋ค."
            )
            result_groups = _base_case_standard_1_column(current_zone, elements_in_zone)
        else:
            logger.error(
                f"{indent} -> ์ฒ๋ฆฌํ ์ ์๋ Base Case ์ ํ: {layout_type.name}. 1๋จ์ผ๋ก ์ฒ๋ฆฌ."
            )
            result_groups = _base_case_standard_1_column(current_zone, elements_in_zone)
        logger.debug(f"{indent} <- Base Case ์ฒ๋ฆฌ ์๋ฃ: {len(result_groups)} ๊ทธ๋ฃน ์์ฑ")
        return result_groups

    # --- horizontal split (separator element or Y gap) ----------------------
    if isinstance(split_result, (HorizontalSplit, HorizontalSplitYGap)):
        if isinstance(split_result, HorizontalSplitYGap):
            split_y = split_result.split_y
        else:
            # Split at the vertical centre of the separator element.
            split_y = (
                split_result.separator_element.y_position
                + split_result.separator_element.bbox_height / 2
            )

        # The separator itself belongs to neither side; a Y-gap split has no
        # separator, so the sentinel -2 never matches an element's ID.
        separator_id = getattr(
            getattr(split_result, "separator_element", None), "element_id", -2
        )
        top_elements = []
        bottom_elements = []
        for e in elements_in_zone:
            if getattr(e, "element_id", -1) == separator_id:
                continue
            center_y = e.bbox_y + e.bbox_height / 2
            if center_y < split_y:
                top_elements.append(e)
            elif center_y >= split_y:
                bottom_elements.append(e)
        logger.debug(
            f"{indent} -> {split_type} ์ํ ๋ถํ ์ฑ๊ณต! Top:{len(top_elements)}, Bottom:{len(bottom_elements)}"
        )

        top_layout_type = LayoutType.UNKNOWN
        if top_elements:
            top_layout_type = detect_layout_type(
                top_elements,
                split_result.top_zone.width,
                split_result.top_zone.height,
            )
        bottom_layout_type = LayoutType.UNKNOWN
        if bottom_elements:
            bottom_layout_type = detect_layout_type(
                bottom_elements,
                split_result.bottom_zone.width,
                split_result.bottom_zone.height,
            )

        sorted_top = _sort_recursive_by_layout(
            split_result.top_zone, top_elements, top_layout_type, depth + 1
        )
        # A separator element becomes its own group between the two halves.
        sep_group = []
        if isinstance(split_result, HorizontalSplit):
            sep_group = [ElementGroup(anchor=split_result.separator_element)]
        sorted_bottom = _sort_recursive_by_layout(
            split_result.bottom_zone, bottom_elements, bottom_layout_type, depth + 1
        )
        logger.debug(f"{indent} <- {split_type} ์ํ ๋ถํ ๊ฒฐ๊ณผ ๋ณํฉ")
        return sorted_top + sep_group + sorted_bottom

    # --- vertical split (columns) -------------------------------------------
    elif isinstance(split_result, VerticalSplit):
        left_elements = []
        right_elements = []
        for e in elements_in_zone:
            center_x = e.bbox_x + e.bbox_width / 2
            if center_x < split_result.gutter_x:
                left_elements.append(e)
            elif center_x >= split_result.gutter_x:
                right_elements.append(e)
        logger.debug(
            f"{indent} -> Vertical ์์ง ๋ถํ ์ฑ๊ณต! Left:{len(left_elements)}, Right:{len(right_elements)}"
        )

        left_layout_type = LayoutType.UNKNOWN
        if left_elements:
            left_layout_type = detect_layout_type(
                left_elements,
                split_result.left_zone.width,
                split_result.left_zone.height,
            )
        right_layout_type = LayoutType.UNKNOWN
        if right_elements:
            right_layout_type = detect_layout_type(
                right_elements,
                split_result.right_zone.width,
                split_result.right_zone.height,
            )

        sorted_left = _sort_recursive_by_layout(
            split_result.left_zone, left_elements, left_layout_type, depth + 1
        )
        sorted_right = _sort_recursive_by_layout(
            split_result.right_zone, right_elements, right_layout_type, depth + 1
        )
        logger.debug(f"{indent} <- Vertical ์์ง ๋ถํ ๊ฒฐ๊ณผ ๋ณํฉ")
        return sorted_left + sorted_right
# ============================================================================
# Standard 2-column layout handling (unchanged from the previous version)
# ============================================================================
def _sort_standard_2_column(
    zone: Zone, elements: List[MockElement]
) -> List[ElementGroup]:
    """Handle a standard 2-column layout.

    The zone is split vertically with K-Means on the anchors and each column
    is grouped with ``_base_case_standard_1_column`` (left column first).
    If no vertical split is found, the whole zone is grouped as one column.
    """
    logger.debug("ํ์ค 2๋จ ์ฒ๋ฆฌ: K-Means ๋ถํ ์๋")
    anchors = [e for e in elements if e.class_name in ALLOWED_ANCHORS]
    vertical_split = find_vertical_split_kmeans(zone, anchors)

    if not vertical_split:
        logger.warning(
            "ํ์ค 2๋จ ์ฒ๋ฆฌ ์คํจ: ์์ง ๋ถํ ๋ถ๊ฐ. ์ ์ฒด ๊ตฌ์ญ ํ์ค 1๋จ Base Case ์คํ"
        )
        return _base_case_standard_1_column(zone, elements)

    logger.debug(f" -> ์์ง ๋ถํ ์ฑ๊ณต! ๋ถ๋ฆฌ์ X={vertical_split.gutter_x:.1f}")
    # Assign every element to a column by its horizontal centre.
    left_elements = []
    right_elements = []
    for e in elements:
        center_x = e.bbox_x + e.bbox_width / 2
        if center_x < vertical_split.gutter_x:
            left_elements.append(e)
        elif center_x >= vertical_split.gutter_x:
            right_elements.append(e)
    logger.debug(
        f" Left ์์ ์: {len(left_elements)}, Right ์์ ์: {len(right_elements)}"
    )

    groups_left = _base_case_standard_1_column(
        vertical_split.left_zone, left_elements
    )
    groups_right = _base_case_standard_1_column(
        vertical_split.right_zone, right_elements
    )
    logger.debug(
        f" <- ์ปฌ๋ผ๋ณ ๊ทธ๋ฃนํ ์๋ฃ (Left: {len(groups_left)} ๊ทธ๋ฃน, Right: {len(groups_right)} ๊ทธ๋ฃน)"
    )
    return groups_left + groups_right
# ============================================================================
# Split function implementations (unchanged from the previous version)
# ============================================================================
def find_wide_question_type(
    elements: List[MockElement], page_width: int, top_y_limit: float
) -> Optional[MockElement]:
    """Find the topmost wide "question_type" element above ``top_y_limit``.

    An element qualifies when its class name is "question_type", its
    ``y_position`` is below the limit and its width covers at least
    ``HORIZONTAL_SEP_WIDTH_THRESHOLD`` of the page width (a non-positive page
    width yields a ratio of 0). Returns ``None`` when nothing qualifies.
    """
    topmost = None
    for e in elements:
        if e.class_name != "question_type":
            continue
        if not (e.y_position < top_y_limit):
            continue
        width_ratio = e.bbox_width / page_width if page_width > 0 else 0
        if not (width_ratio >= HORIZONTAL_SEP_WIDTH_THRESHOLD):
            continue
        # Strict comparison keeps the first candidate on ties.
        if topmost is None or e.y_position < topmost.y_position:
            topmost = e
    return topmost
def find_horizontal_split_by_type(
    zone: Zone, elements: List[MockElement]
) -> Optional[HorizontalSplit]:
    """Split a zone horizontally at the topmost wide "question_type" element.

    Returns ``None`` when there is no sufficiently wide candidate, when the
    candidate's ``y_position`` is not strictly inside the zone, or when
    either resulting zone would have no height.
    """

    def _is_wide_separator(element) -> bool:
        if element.class_name != "question_type":
            return False
        width_ratio = element.bbox_width / zone.width if zone.width > 0 else 0
        return width_ratio >= HORIZONTAL_SEP_WIDTH_THRESHOLD

    potential_separators = [e for e in elements if _is_wide_separator(e)]
    if not potential_separators:
        return None

    separator = min(potential_separators, key=lambda e: e.y_position)
    sep_top = separator.y_position
    if not (zone.y_min < sep_top < zone.y_max):
        return None

    # The separator's own vertical extent is excluded from both zones.
    sep_bottom = separator.y_position + separator.bbox_height
    top_zone = Zone(zone.x_min, zone.y_min, zone.x_max, sep_top)
    bottom_zone = Zone(zone.x_min, sep_bottom, zone.x_max, zone.y_max)
    if top_zone.height <= 0 or bottom_zone.height <= 0:
        return None
    return HorizontalSplit(top_zone, bottom_zone, separator)
def find_horizontal_split_by_y_gap(
    zone: Zone, elements: List[MockElement]
) -> Optional[HorizontalSplitYGap]:
    """Split a zone horizontally at the largest vertical gap between anchors.

    Anchors are ordered by ``y_position`` and the gap between the vertical
    centres of consecutive anchors is measured. The largest gap must reach
    ``max(avg anchor height * VERTICAL_GAP_THRESHOLD_RATIO,
    VERTICAL_GAP_THRESHOLD_ABS)``. The split line sits halfway between the
    bottom edge of the upper anchor and the top edge of the lower anchor and
    must lie strictly inside the zone; otherwise ``None`` is returned.
    """
    anchors = [e for e in elements if e.class_name in ALLOWED_ANCHORS]
    anchors.sort(key=lambda e: e.y_position)
    if len(anchors) < MIN_ANCHORS_FOR_SPLIT:
        return None

    # Average over positive heights only; 30 is the fallback when none exist.
    positive_heights = [a.bbox_height for a in anchors if a.bbox_height > 0]
    avg_anchor_height = np.mean(positive_heights) if positive_heights else 30

    # Locate the first occurrence of the largest centre-to-centre gap.
    max_gap = -1
    split_index = -1
    for idx, (upper, lower) in enumerate(zip(anchors, anchors[1:])):
        gap = (lower.y_position + lower.bbox_height / 2) - (
            upper.y_position + upper.bbox_height / 2
        )
        if gap > max_gap:
            max_gap = gap
            split_index = idx

    threshold = max(
        avg_anchor_height * VERTICAL_GAP_THRESHOLD_RATIO, VERTICAL_GAP_THRESHOLD_ABS
    )
    if not (max_gap >= threshold):
        logger.debug(
            f" Y Gap ๋ถ์: ์ต๋ ๊ฐ๊ฒฉ({max_gap:.1f}) ์๊ณ๊ฐ({threshold:.1f}) ๋ฏธ๋ง. ์ํ ๋ถํ ๋ถ๊ฐ."
        )
        return None

    above = anchors[split_index]
    below = anchors[split_index + 1]
    split_y = (above.y_position + above.bbox_height + below.y_position) / 2
    if not (zone.y_min < split_y < zone.y_max):
        logger.warning(
            f" Y Gap ๋ถ์: ๋ถํ ์ ({split_y:.1f})์ด ๊ตฌ์ญ({zone.y_min}-{zone.y_max}) ๋ฐ์ ์์น. ๋ถํ ์ทจ์."
        )
        return None

    # Zone bounds are truncated to int; the exact split_y is kept in the result.
    boundary = int(split_y)
    top_zone = Zone(zone.x_min, zone.y_min, zone.x_max, boundary)
    bottom_zone = Zone(zone.x_min, boundary, zone.x_max, zone.y_max)
    logger.debug(
        f" Y Gap ๋ถ์: ์ํ ๋ถํ ๊ฐ๋ฅ (Max Gap={max_gap:.1f} >= Threshold={threshold:.1f})"
    )
    return HorizontalSplitYGap(top_zone, bottom_zone, split_y)
def find_vertical_split_kmeans(
    zone: Zone, anchors: List[MockElement]
) -> Optional[VerticalSplit]:
    """Split a zone into two columns using K-Means on anchor X centres.

    The boundary is placed a fixed margin to the left of the right-hand
    cluster centre (rather than midway between the two centres). Returns
    ``None`` when there are too few anchors, fewer than two distinct
    centres, the cluster centres are too close together, the boundary falls
    outside the zone, or clustering raises.
    """
    if len(anchors) < MIN_ANCHORS_FOR_SPLIT:
        return None
    anchor_x_centers = np.array([[a.bbox_x + a.bbox_width / 2] for a in anchors])
    if len(np.unique(anchor_x_centers)) < 2:
        return None

    try:
        model = KMeans(n_clusters=KMEANS_N_CLUSTERS, random_state=42, n_init="auto")
        model.fit(anchor_x_centers)
        centers = sorted(model.cluster_centers_.flatten())

        well_separated = (
            len(centers) == 2
            and centers[1] - centers[0] >= KMEANS_CLUSTER_SEPARATION_MIN
        )
        if not well_separated:
            logger.debug(f" ์์ง ๋ถํ ์คํจ: ์ค์ฌ๊ฐ ๊ฑฐ๋ฆฌ ๋ถ์กฑ")
            return None

        # Boundary derived from the right-hand cluster centre, shifted left
        # by a small margin.
        COLUMN_BOUNDARY_MARGIN = 20  # px
        gutter_x = centers[1] - COLUMN_BOUNDARY_MARGIN
        if not (zone.x_min < gutter_x < zone.x_max):
            logger.warning(
                f" ์์ง ๋ถํ : ๊ฒฝ๊ณ์ ({gutter_x:.1f})์ด ๊ตฌ์ญ ๋ฐ. ๋ถํ ์ทจ์."
            )
            return None

        # Zone bounds are truncated to int; the exact gutter is kept in the result.
        gutter_px = int(gutter_x)
        left_zone = Zone(zone.x_min, zone.y_min, gutter_px, zone.y_max)
        right_zone = Zone(gutter_px, zone.y_min, zone.x_max, zone.y_max)
        logger.debug(
            f" ์์ง ๋ถํ ์ฑ๊ณต: ์ผ์ชฝ ์นผ๋ผ X=[{zone.x_min}, {gutter_px}), "
            f"์ค๋ฅธ์ชฝ ์นผ๋ผ X=[{gutter_px}, {zone.x_max})"
        )
        return VerticalSplit(left_zone, right_zone, gutter_x)
    except Exception as e:
        logger.error(f" ์์ง ๋ถํ K-Means ์ค๋ฅ: {e}")
        return None
# ============================================================================
# Post-processing function (modified)
# ============================================================================
def _post_process_table_figure_assignment(
    groups: List[ElementGroup], y_diff_threshold: int = 150
) -> List[ElementGroup]:
    """Re-home table/figure/flowchart children that sit nearer a later anchor.

    For each anchored group, every child whose class is ``table``, ``figure``
    or ``flowchart`` is compared with the anchors of up to
    ``POST_PROCESS_LOOKAHEAD`` following groups.  A following group qualifies
    when the child's signed Y offset from its current anchor exceeds
    ``y_diff_threshold / 2`` and its absolute Y distance to the following
    anchor is smaller than that offset times ``POST_PROCESS_CLOSENESS_RATIO``.
    Among qualifying groups the closest wins; on an exact tie the later group
    wins.  Moves are collected first and applied after the scan.

    Args:
        groups: Groups in reading order; mutated in place.
        y_diff_threshold: Pixel threshold; half of it is the minimum offset
            from the current anchor before a move is considered.

    Returns:
        The same ``groups`` list object, after the moves were applied.
    """
    logger.debug(
        f" ํ ์ด๋ธ/๊ทธ๋ฆผ ํ ๋น ํ์ฒ๋ฆฌ ์์: {len(groups)}๊ฐ ๊ทธ๋ฃน (Threshold={y_diff_threshold}px, Closeness Ratio={POST_PROCESS_CLOSENESS_RATIO}, Lookahead={POST_PROCESS_LOOKAHEAD})"
    )
    movable_classes = ("table", "figure", "flowchart")
    group_total = len(groups)
    # {element_id: (element, target_group_idx)}
    pending_moves: Dict[int, Tuple[MockElement, int]] = {}
    move_summaries = []  # human-readable entries for the final info log

    for src_idx, src_group in enumerate(groups):
        if not src_group.anchor:
            continue
        # Walk a snapshot of the children list.
        for child in list(src_group.children):
            # Skip elements already scheduled for a move.
            if child.element_id in pending_moves:
                continue
            if child.class_name not in movable_classes:
                continue

            offset_from_own_anchor = child.y_position - src_group.anchor.y_position
            target_idx = -1
            closest_offset = float("inf")

            # Inspect at most POST_PROCESS_LOOKAHEAD groups after the current one.
            last_candidate = min(src_idx + POST_PROCESS_LOOKAHEAD, group_total - 1)
            for cand_idx in range(src_idx + 1, last_candidate + 1):
                cand_anchor = groups[cand_idx].anchor
                if not cand_anchor:
                    continue
                offset_to_candidate = abs(child.y_position - cand_anchor.y_position)
                qualifies = offset_from_own_anchor > (
                    y_diff_threshold / 2
                ) and offset_to_candidate < (
                    offset_from_own_anchor * POST_PROCESS_CLOSENESS_RATIO
                )
                if not qualifies:
                    continue
                # Tie-breaker: strictly closer wins, equal distance prefers
                # the later group.
                is_closer = offset_to_candidate < closest_offset
                is_later_tie = (
                    offset_to_candidate == closest_offset and cand_idx > target_idx
                )
                if is_closer or is_later_tie:
                    closest_offset = offset_to_candidate
                    target_idx = cand_idx

            if target_idx == -1:
                continue
            pending_moves[child.element_id] = (child, target_idx)
            move_summaries.append(
                f"Elem {child.element_id} ({child.class_name}) from Grp {src_group.group_id} to Grp {groups[target_idx].group_id}"
            )
            logger.trace(
                f" ์ด๋ ํ๋ณด ํ์ : Elem {child.element_id} -> Group {groups[target_idx].group_id} (Min Y diff next={closest_offset:.0f})"
            )

    # Apply the collected moves only after the scan has finished.
    if pending_moves:
        # 1. Drop every scheduled element from whichever group holds it.
        removed_total = 0
        for grp in groups:
            kept_children = [
                member
                for member in grp.children
                if member.element_id not in pending_moves
            ]
            removed_total += len(grp.children) - len(kept_children)
            grp.children = kept_children
        # 2. Put each element at the front of its target group.
        added_total = 0
        for moved_id, (moved_element, dest_idx) in pending_moves.items():
            if 0 <= dest_idx < len(groups):
                groups[dest_idx].children.insert(0, moved_element)
                added_total += 1
            else:
                logger.error(
                    f"ํ์ฒ๋ฆฌ ์ด๋ ์ค ์ ํจํ์ง ์์ ๋์ ๊ทธ๋ฃน ์ธ๋ฑ์ค: {dest_idx} for Elem {moved_id}"
                )
        logger.debug(
            f" ํ์ฒ๋ฆฌ ์์ ์ด๋ ์๋ฃ: {removed_total}๊ฐ ์ ๊ฑฐ, {added_total}๊ฐ ์ถ๊ฐ"
        )

    if move_summaries:
        logger.info(
            f" ํ ์ด๋ธ/๊ทธ๋ฆผ ํ ๋น ํ์ฒ๋ฆฌ: {len(move_summaries)}๊ฐ ์์ ์ด๋๋จ - {', '.join(move_summaries)}"
        )
    else:
        logger.debug(" ํ ์ด๋ธ/๊ทธ๋ฆผ ํ ๋น ํ์ฒ๋ฆฌ: ์ด๋๋ ์์ ์์")
    return groups
# ============================================================================
# Base Case functions (unchanged from v2.1)
# ============================================================================
def _assign_children_to_anchors_with_2d_proximity(
    anchors: List[MockElement],
    children: List[MockElement],
    zone: Zone,
    preserve_top_orphans: bool = True,
) -> Tuple[List[ElementGroup], List[MockElement]]:
    """Attach each child to the nearest anchor at or above it (weighted 2D).

    Distances are measured between bounding-box centres, with the X and Y
    components scaled by ``ANCHOR_2D_DISTANCE_WEIGHT_X`` /
    ``ANCHOR_2D_DISTANCE_WEIGHT_Y``.  Anchors whose centre is below the
    child's centre are never candidates.  A child is attached only when its
    best distance is below ``ANCHOR_VERTICAL_PROXIMITY_THRESHOLD``; otherwise
    it is returned as an orphan.

    Args:
        anchors: Anchor elements; ``anchors[0]`` is used as the reference for
            the top-orphan check.
        children: Child elements to distribute.
        zone: Zone being processed (supplies ``y_min`` and ``height``).
        preserve_top_orphans: When true, a child that starts in the top band
            of the zone and whose centre lies well above the first anchor is
            kept as an orphan without attempting assignment.

    Returns:
        ``(groups, orphans)`` where ``groups[i]`` belongs to ``anchors[i]``.
    """

    def _center(el):
        # Bounding-box centre as an (x, y) pair.
        return el.bbox_x + el.bbox_width / 2, el.bbox_y + el.bbox_height / 2

    groups: List[ElementGroup] = [ElementGroup(anchor=a) for a in anchors]
    orphans: List[MockElement] = []

    # Lower edge of the "top orphan" band (collapses to the zone top when the
    # option is off).
    if preserve_top_orphans:
        top_band_limit = zone.y_min + zone.height * BASE_CASE_TOP_ORPHAN_THRESHOLD_RATIO
    else:
        top_band_limit = zone.y_min

    for child in children:
        child_x_center, child_y_center = _center(child)

        # Optional top-orphan check: only children far above the first anchor
        # are set aside here.
        if preserve_top_orphans and child.bbox_y < top_band_limit:
            far_above_first_anchor = not anchors or child_y_center < (
                anchors[0].bbox_y - ANCHOR_VERTICAL_PROXIMITY_THRESHOLD / 2
            )
            if far_above_first_anchor:
                orphans.append(child)
                logger.trace(
                    f" Elem {child.element_id} ์๋จ ๊ณ ์ ์ ์ง (Y={child.bbox_y})"
                )
                continue

        nearest_idx = None
        nearest_distance = float("inf")
        for idx, anchor in enumerate(anchors):
            anchor_x_center, anchor_y_center = _center(anchor)

            # A child may only belong to an anchor at or above it.
            if child_y_center < anchor_y_center:
                logger.trace(
                    f" Elem {child.element_id} โ Anchor {anchor.element_id} ์ ์ธ "
                    f"(์์ Y={child_y_center:.0f} < ์ต์ปค Y={anchor_y_center:.0f})"
                )
                continue

            # Weighted 2D distance between the two centres.
            x_diff = abs(child_x_center - anchor_x_center) * ANCHOR_2D_DISTANCE_WEIGHT_X
            y_diff = abs(child_y_center - anchor_y_center) * ANCHOR_2D_DISTANCE_WEIGHT_Y
            distance = (x_diff**2 + y_diff**2) ** 0.5
            if distance < nearest_distance:
                nearest_distance = distance
                nearest_idx = idx

        within_reach = (
            nearest_idx is not None
            and nearest_distance < ANCHOR_VERTICAL_PROXIMITY_THRESHOLD
        )
        if within_reach:
            groups[nearest_idx].children.append(child)
            logger.trace(
                f" Elem {child.element_id} โ Anchor {anchors[nearest_idx].element_id} "
                f"(2D ๊ฑฐ๋ฆฌ={nearest_distance:.1f})"
            )
            continue

        orphans.append(child)
        if nearest_idx is None:
            reason = "์์ชฝ ์ต์ปค๋ง ํ์ฉ (๋ชจ๋ ์ต์ปค๊ฐ ์์๋ณด๋ค ์๋์ชฝ)"
        else:
            reason = f"์ต์ ๊ฑฐ๋ฆฌ={nearest_distance:.1f} > {ANCHOR_VERTICAL_PROXIMITY_THRESHOLD}"
        logger.debug(f" Elem {child.element_id} ๊ณ ์ ({reason})")

    return groups, orphans
def _base_case_standard_1_column(
    zone: Zone, elements: List[MockElement]
) -> List[ElementGroup]:
    """Group the elements of a standard single-column zone around anchors.

    Processing stages:
      1. Horizontal adjacency - each anchor (top to bottom) claims at most one
         unassigned child that sits beside it on roughly the same line.
      2. 2D proximity - remaining children are offered to
         ``_assign_children_to_anchors_with_2d_proximity``.
      3. Sequential pass - anchors plus leftover children are sorted by
         ``(y_position, x_position)``; children seen before the first anchor
         become an independent top-orphan group or are prepended to the first
         anchor's group, later children join the most recent anchor's group.

    The resulting groups are sorted by Y, given temporary ``group_id`` values
    and passed through ``_post_process_table_figure_assignment``.

    Args:
        zone: Zone being processed (``y_min`` / ``height`` are read).
        elements: Elements inside the zone.

    Returns:
        The post-processed list of groups.
    """

    def _finalize_anchor_groups():
        # Shared early-exit path: order anchor groups by anchor Y, assign
        # temporary ids (for readable logs) and run the post-processing step.
        ordered = sorted(
            groups.values(),
            key=lambda g: g.anchor.y_position if g.anchor else float("inf"),
        )
        for temp_id, grp in enumerate(ordered):
            grp.group_id = temp_id
        return _post_process_table_figure_assignment(ordered)

    logger.debug(
        f" ํ์ค 1๋จ Base Case ์์ (์์ฐจ ์ฒ๋ฆฌ + ๊ณ ์ ๊ฐ์ ): {len(elements)}๊ฐ ์์ in {zone}"
    )
    anchors = sorted(
        [e for e in elements if e.class_name in ALLOWED_ANCHORS],
        key=lambda e: e.y_position,
    )
    children = [e for e in elements if e.class_name in ALLOWED_CHILDREN]
    groups: Dict[int, ElementGroup] = {
        anchor.element_id: ElementGroup(anchor=anchor) for anchor in anchors
    }
    assigned_children_ids = set()

    # ---- Stage 1: horizontal adjacency -----------------------------------
    logger.trace(" ์ํ ์ธ์ ์ฒ๋ฆฌ ์์...")
    if anchors and children:
        for anchor in anchors:
            anchor_cy = anchor.bbox_y + anchor.bbox_height / 2
            anchor_right_x = anchor.bbox_x + anchor.bbox_width
            anchor_left_x = anchor.bbox_x
            adjacent_child = None
            min_y_diff = float("inf")
            for child in children:
                if child.element_id in assigned_children_ids:
                    continue
                child_cy = child.bbox_y + child.bbox_height / 2
                child_right_x = child.bbox_x + child.bbox_width
                child_left_x = child.bbox_x
                y_diff = abs(anchor_cy - child_cy)
                combined_height = anchor.bbox_height + child.bbox_height
                y_threshold = (
                    combined_height / 2 * HORIZONTAL_ADJACENCY_Y_CENTER_RATIO
                    if combined_height > 0
                    else 0
                )
                # Centres must be vertically close enough to count as one line.
                if y_diff >= y_threshold:
                    continue
                gap_right = child_left_x - anchor_right_x
                gap_left = anchor_left_x - child_right_x
                is_adjacent = (abs(gap_right) < HORIZONTAL_ADJACENCY_X_PROXIMITY) or (
                    abs(gap_left) < HORIZONTAL_ADJACENCY_X_PROXIMITY
                )
                if is_adjacent and y_diff < min_y_diff:
                    min_y_diff = y_diff
                    adjacent_child = child
            if adjacent_child:
                logger.trace(
                    f" ์ํ ์ธ์ ๋ฐฐ์ : ์ต์ปค ID {anchor.element_id} <- ์์ ID {adjacent_child.element_id}"
                )
                groups[anchor.element_id].add_child(adjacent_child)
                assigned_children_ids.add(adjacent_child.element_id)
    logger.debug(
        f" ์ํ ์ธ์ ์ฒ๋ฆฌ ์๋ฃ: {len(assigned_children_ids)}๊ฐ ์์ ์ฐ์ ๋ฐฐ์ ๋จ"
    )

    remaining_elements = anchors + [
        c for c in children if c.element_id not in assigned_children_ids
    ]
    if not remaining_elements:
        logger.debug(" ๋ชจ๋ ์์๊ฐ ์ํ ์ธ์ ์ผ๋ก ๋ฐฐ์ ๋์ด ๊ทธ๋ฃนํ ์๋ฃ.")
        return _finalize_anchor_groups()

    # ---- Stage 2: weighted 2D proximity ------------------------------------
    remaining_children = [
        c for c in children if c.element_id not in assigned_children_ids
    ]
    if remaining_children and anchors:
        logger.trace(
            f" 2๋จ๊ณ: ๋๋จธ์ง {len(remaining_children)}๊ฐ ์์ 2D ๊ฑฐ๋ฆฌ ๊ทธ๋ฃนํ..."
        )
        proximity_groups, proximity_orphans = (
            _assign_children_to_anchors_with_2d_proximity(
                anchors,
                remaining_children,
                zone,
                preserve_top_orphans=True,  # keep top-band orphans separate
            )
        )
        # Merge the proximity assignments into the per-anchor groups.
        for idx, proximity_group in enumerate(proximity_groups):
            anchor_id = anchors[idx].element_id
            if anchor_id in groups:
                groups[anchor_id].children.extend(proximity_group.children)
        # Whatever is still unassigned goes to the sequential pass.
        # NOTE(review): assigned_children_ids only ever receives child ids in
        # this function, so this filter looks like a no-op unless an anchor
        # shares an id with an assigned child - kept as-is, confirm intent.
        remaining_elements = [
            a for a in anchors if a.element_id not in assigned_children_ids
        ] + proximity_orphans
        logger.debug(
            f" 2๋จ๊ณ ์๋ฃ: {len(remaining_children) - len(proximity_orphans)}๊ฐ ๋ฐฐ์ , {len(proximity_orphans)}๊ฐ ๊ณ ์๋ก ์์ฐจ ์ฒ๋ฆฌ ๋๊ธฐ"
        )
    else:
        remaining_elements = anchors + [
            c for c in children if c.element_id not in assigned_children_ids
        ]
    if not remaining_elements:
        logger.debug(" 2D ๊ฑฐ๋ฆฌ ๊ทธ๋ฃนํ ํ ๋๋จธ์ง ์์ ์์. ๊ทธ๋ฃนํ ์๋ฃ.")
        return _finalize_anchor_groups()

    # ---- Stage 3: sequential grouping --------------------------------------
    logger.trace(
        f" 3๋จ๊ณ: ๋๋จธ์ง ์์ {len(remaining_elements)}๊ฐ (Y, X) ์ ๋ ฌ ๋ฐ ์์ฐจ ๊ทธ๋ฃนํ ์์..."
    )
    remaining_elements.sort(key=lambda e: (e.y_position, e.x_position))
    final_groups: List[ElementGroup] = []
    # ``current_group`` stays None until the first anchor is seen and is never
    # reset afterwards, so it doubles as the "first anchor found" flag.
    current_group: Optional[ElementGroup] = None
    initial_top_orphan_children: List[MockElement] = []
    initial_bottom_orphan_children: List[MockElement] = []
    top_orphan_threshold_y = (
        zone.y_min + zone.height * BASE_CASE_TOP_ORPHAN_THRESHOLD_RATIO
    )
    for element in remaining_elements:
        if element.class_name in ALLOWED_ANCHORS:
            # Top orphans collected so far form their own anchor-less group.
            if initial_top_orphan_children:
                logger.trace(
                    f" ๋ ๋ฆฝ์ ์ธ ์๋จ ๊ณ ์ ๊ทธ๋ฃน ์์ฑ ({len(initial_top_orphan_children)}๊ฐ ์์)"
                )
                final_groups.append(
                    ElementGroup(anchor=None, children=initial_top_orphan_children)
                )
                initial_top_orphan_children = []
            # Close the previous anchor group before starting the next one.
            if (
                current_group is not None
                and current_group.anchor is not None
                and not current_group.is_empty()
            ):
                final_groups.append(current_group)
            if element.element_id in groups:
                current_group = groups[element.element_id]
                logger.trace(f" ์ต์ปค ๊ทธ๋ฃน ์ฌ์ฌ์ฉ (ID: {element.element_id})")
            else:
                current_group = ElementGroup(anchor=element, children=[])
                logger.trace(f" ์ ์ต์ปค ๊ทธ๋ฃน ์์ (ID: {element.element_id})")
            # Bottom orphans seen before this anchor are prepended to it.
            if initial_bottom_orphan_children:
                logger.trace(
                    f" ์ฒซ ์ต์ปค(ID: {element.element_id}) ๊ทธ๋ฃน์ ํ๋จ ๊ณ ์ ์์ {len(initial_bottom_orphan_children)}๊ฐ ์ถ๊ฐ"
                )
                current_group.children = (
                    initial_bottom_orphan_children + current_group.children
                )
                initial_bottom_orphan_children = []
        elif current_group is not None:
            current_group.add_child(element)
            logger.trace(
                f" ํ์ฌ ๊ทธ๋ฃน(์ต์ปค: {current_group.anchor.element_id if current_group.anchor else 'Orphan'})์ ์์ ์ถ๊ฐ (ID: {element.element_id})"
            )
        elif element.y_position < top_orphan_threshold_y:
            initial_top_orphan_children.append(element)
            logger.trace(
                f" ์๋จ ๊ณ ์ ์์ ์์(ID: {element.element_id}) ์์ ์ ์ฅ (Y < {top_orphan_threshold_y:.0f})"
            )
        else:
            initial_bottom_orphan_children.append(element)
            logger.trace(
                f" ํ๋จ ๊ณ ์ ์์ ์์(ID: {element.element_id}) ์์ ์ ์ฅ (Y >= {top_orphan_threshold_y:.0f})"
            )

    if initial_top_orphan_children:
        logger.trace(
            f" ๋ง์ง๋ง ๋ ๋ฆฝ ์๋จ ๊ณ ์ ๊ทธ๋ฃน ์์ฑ ({len(initial_top_orphan_children)}๊ฐ ์์)"
        )
        final_groups.append(
            ElementGroup(anchor=None, children=initial_top_orphan_children)
        )
    if current_group is not None and not current_group.is_empty():
        final_groups.append(current_group)
    elif initial_bottom_orphan_children:
        logger.warning(" ๋ชจ๋ ์์๊ฐ ํ๋จ ์์ ์์์. ๋จ์ผ ๊ณ ์ ๊ทธ๋ฃน ์์ฑ.")
        final_groups.append(
            ElementGroup(anchor=None, children=initial_bottom_orphan_children)
        )

    # Add anchor groups that the sequential pass did not emit.
    processed_anchor_ids = set(g.anchor.element_id for g in final_groups if g.anchor)
    for anchor_id, group in groups.items():
        if anchor_id not in processed_anchor_ids and group.anchor:
            final_groups.append(group)
            logger.trace(f" ๋ฏธํฌํจ ์ต์ปค ๊ทธ๋ฃน ์ถ๊ฐ (์ํ ์ธ์ ๋ง): ID {anchor_id}")

    # Order by anchor Y, or by the top-most child for anchor-less groups.
    final_groups.sort(
        key=lambda g: (
            g.anchor.y_position
            if g.anchor
            else (min(c.y_position for c in g.children) if g.children else float("inf"))
        )
    )
    # Temporary ids so the post-processing logs are readable.
    for idx, group in enumerate(final_groups):
        group.group_id = idx
    final_groups = _post_process_table_figure_assignment(final_groups)
    logger.debug(
        f" ์์ฐจ ์ฒ๋ฆฌ ๊ธฐ๋ฐ ๊ทธ๋ฃนํ (+ํ์ฒ๋ฆฌ) ์๋ฃ: {len(final_groups)} ๊ทธ๋ฃน ์์ฑ"
    )
    return final_groups
def _base_case_mixed_layout(
    zone: Zone, elements: List[MockElement], layout_type: LayoutType
) -> List[ElementGroup]:
    """Group the elements of a mixed-layout zone by a sequential (Y, X) scan.

    Elements are sorted by ``(y_position, x_position)``.  Every anchor opens a
    new group and following non-anchor elements join it.  Non-anchor elements
    seen before the first anchor are split at
    ``zone.y_min + zone.height * LAYOUT_DETECT_Y_SPLIT_POINT`` (compared with
    the element's vertical centre): those above form an independent
    anchor-less group, those below are prepended to the first anchor's group.
    The groups get temporary ids and are passed through
    ``_post_process_table_figure_assignment``.

    Args:
        zone: Zone being processed (``y_min`` / ``height`` are read).
        elements: Elements inside the zone.
        layout_type: Detected layout type; used only in the debug log.

    Returns:
        The post-processed list of groups.
    """
    logger.debug(
        f" ํผํฉํ Base Case ์์ ({layout_type.name}): {len(elements)}๊ฐ ์์ in {zone}"
    )
    sorted_elements = sorted(elements, key=lambda e: (e.y_position, e.x_position))
    final_groups: List[ElementGroup] = []
    # ``current_group`` stays None until the first anchor is seen and is never
    # reset afterwards, so it doubles as the "first anchor found" flag.
    current_group: Optional[ElementGroup] = None
    initial_top_orphan_children: List[MockElement] = []
    initial_bottom_orphan_children: List[MockElement] = []
    split_y = zone.y_min + zone.height * LAYOUT_DETECT_Y_SPLIT_POINT
    logger.trace(f" ํผํฉํ Base Case Y ๋ถํ ์ : {split_y:.1f}")

    for element in sorted_elements:
        element_y_center = element.y_position + element.bbox_height / 2
        if element.class_name in ALLOWED_ANCHORS:
            # Top orphans collected so far form their own anchor-less group.
            if initial_top_orphan_children:
                logger.trace(
                    f" ๋ ๋ฆฝ์ ์ธ ์๋จ ๊ณ ์ ๊ทธ๋ฃน ์์ฑ ({len(initial_top_orphan_children)}๊ฐ ์์)"
                )
                final_groups.append(
                    ElementGroup(anchor=None, children=initial_top_orphan_children)
                )
                initial_top_orphan_children = []
            # Close the previous group before opening the next one.
            if current_group is not None and not current_group.is_empty():
                final_groups.append(current_group)
            current_group = ElementGroup(anchor=element, children=[])
            logger.trace(f" ์ ์ต์ปค ๊ทธ๋ฃน ์์ (ID: {element.element_id})")
            # Bottom orphans seen before this anchor are prepended to it.
            if initial_bottom_orphan_children:
                logger.trace(
                    f" ์ฒซ ์ต์ปค(ID: {element.element_id}) ๊ทธ๋ฃน์ ํ๋จ ๊ณ ์ ์์ {len(initial_bottom_orphan_children)}๊ฐ ์ถ๊ฐ"
                )
                current_group.children = (
                    initial_bottom_orphan_children + current_group.children
                )
                initial_bottom_orphan_children = []
        elif current_group is not None:
            current_group.add_child(element)
            logger.trace(
                f" ํ์ฌ ๊ทธ๋ฃน(์ต์ปค: {current_group.anchor.element_id if current_group.anchor else 'Orphan'})์ ์์ ์ถ๊ฐ (ID: {element.element_id})"
            )
        elif element_y_center < split_y:
            initial_top_orphan_children.append(element)
            logger.trace(
                f" ์๋จ ๊ณ ์ ์์ ์์(ID: {element.element_id}) ์์ ์ ์ฅ"
            )
        else:
            initial_bottom_orphan_children.append(element)
            logger.trace(
                f" ํ๋จ ๊ณ ์ ์์ ์์(ID: {element.element_id}) ์์ ์ ์ฅ"
            )

    if initial_top_orphan_children:
        logger.trace(
            f" ๋ง์ง๋ง ๋ ๋ฆฝ ์๋จ ๊ณ ์ ๊ทธ๋ฃน ์์ฑ ({len(initial_top_orphan_children)}๊ฐ ์์)"
        )
        final_groups.append(
            ElementGroup(anchor=None, children=initial_top_orphan_children)
        )
    if current_group is not None and not current_group.is_empty():
        final_groups.append(current_group)
    elif initial_bottom_orphan_children:
        logger.warning(" ๋ชจ๋ ์์๊ฐ ํ๋จ ์์ ์์์. ๋จ์ผ ๊ณ ์ ๊ทธ๋ฃน ์์ฑ.")
        final_groups.append(
            ElementGroup(anchor=None, children=initial_bottom_orphan_children)
        )

    # Temporary ids so the post-processing logs are readable.
    for idx, group in enumerate(final_groups):
        group.group_id = idx
    final_groups = _post_process_table_figure_assignment(final_groups)
    return final_groups
# ============================================================================
# Final merge and order-assignment functions (unchanged)
# ============================================================================
def flatten_groups_and_assign_order(
    groups: List[ElementGroup], start_global_order: int, start_group_id: int
) -> Tuple[List[MockElement], int, int]:
    """Flatten groups into one element list and stamp ordering attributes.

    Groups are visited in the given order.  Each group receives the next
    group id, and each of its elements (as returned by
    ``get_all_elements_sorted()``) receives ``order_in_question``,
    ``group_id`` and ``order_in_group``.

    Args:
        groups: Groups in their final order.
        start_global_order: First ``order_in_question`` value to hand out.
        start_group_id: First group id to hand out.

    Returns:
        ``(flattened_elements, next_global_order, next_group_id)``.
    """
    flattened = []
    next_order = start_global_order
    next_group_id = start_group_id
    logger.debug(
        f" ํํํ ์์: {len(groups)}๊ฐ ๊ทธ๋ฃน (์์ order={next_order}, group_id={next_group_id})"
    )

    for group in groups:
        # Ids stored on the group so far may be temporary; overwrite them with
        # the final id here.
        assigned_id = next_group_id
        group.group_id = assigned_id
        members = group.get_all_elements_sorted()
        logger.trace(
            f" ๊ทธ๋ฃน {assigned_id} ํํํ (Anchor: {group.anchor.element_id if group.anchor else 'Orphan'}, ์์ ์: {len(members)})"
        )
        for position, member in enumerate(members):
            try:
                member.order_in_question = next_order
                member.group_id = assigned_id
                member.order_in_group = position
            except AttributeError as exc:
                # Elements that reject attribute assignment are logged and
                # left out of the result.
                logger.error(
                    f"์์ (ID: {getattr(member, 'element_id', 'N/A')})์ ์ ๋ ฌ ์์ฑ ์ถ๊ฐ ์คํจ: {exc}"
                )
            else:
                flattened.append(member)
                next_order += 1
        next_group_id += 1

    logger.debug(
        f" ํํํ ์๋ฃ: {len(flattened)}๊ฐ ์์ ์์ฑ (๋ค์ order={next_order}, group_id={next_group_id})"
    )
    return flattened, next_order, next_group_id
# ============================================================================
# Helper functions (unchanged)
# ============================================================================
def preprocess_elements(
    elements: List[MockElement], document_type: str
) -> List[MockElement]:
    """Step 0 preprocessing: class filtering followed by an area check.

    Args:
        elements: Raw layout elements.
        document_type: ``"question_based"`` keeps only classes listed in
            ``ALLOWED_CLASSES``; ``"reading_order"`` keeps every class; any
            other value logs a warning and keeps every class.

    Returns:
        The elements that passed the class filter and expose an ``area``
        attribute greater than zero.
    """
    total_in = len(elements)

    if document_type == "question_based":
        candidates = [e for e in elements if e.class_name in ALLOWED_CLASSES]
        logger.info(
            f"์ ์ฒ๋ฆฌ (question_based): {total_in}๊ฐ โ {len(candidates)}๊ฐ (ํ์ฉ ํด๋์ค ํํฐ๋ง)"
        )
    else:
        candidates = elements
        if document_type == "reading_order":
            logger.info(f"์ ์ฒ๋ฆฌ (reading_order): {total_in}๊ฐ (๋ชจ๋ ํด๋์ค ํ์ฉ)")
        else:
            logger.warning(f"์ ์ ์๋ ๋ฌธ์ ํ์ '{document_type}', ๋ชจ๋ ์์ ๋ฐํ")

    # Drop elements without a positive area.
    usable = [e for e in candidates if hasattr(e, "area") and e.area > 0]
    dropped = len(candidates) - len(usable)
    if dropped > 0:
        logger.warning(
            f"์ ์ฒ๋ฆฌ: ๋ฉด์ ์ด 0 ์ดํ์ธ ์์ {dropped}๊ฐ ์ ๊ฑฐ"
        )
    return usable
def calculate_page_width(elements: List[MockElement]) -> int:
    """Estimate the page width as the right-most bounding-box edge.

    Args:
        elements: Elements exposing ``bbox_x`` and ``bbox_width``.  ``None``
            and any empty iterable (including an exhausted iterator) are
            accepted.

    Returns:
        The largest ``bbox_x + bbox_width`` over all elements, or 0 when there
        are no elements.
    """
    # ``default=0`` handles the empty case in one place; ``or ()`` keeps the
    # previous tolerance for ``None`` / falsy inputs.
    return max((e.bbox_x + e.bbox_width for e in elements or ()), default=0)
def calculate_page_height(elements: List[MockElement]) -> int:
    """Estimate the page height as the bottom-most bounding-box edge.

    Args:
        elements: Elements exposing ``bbox_y`` and ``bbox_height``.  ``None``
            and any empty iterable (including an exhausted iterator) are
            accepted.

    Returns:
        The largest ``bbox_y + bbox_height`` over all elements, or 0 when
        there are no elements.
    """
    # ``default=0`` handles the empty case in one place; ``or ()`` keeps the
    # previous tolerance for ``None`` / falsy inputs.
    return max((e.bbox_y + e.bbox_height for e in elements or ()), default=0)
# ============================================================================
# DB persistence functions (ORM integration)
# ============================================================================
def save_sorting_results_to_db(
    db: "Session", page_id: int, sorted_elements: List["LayoutElement"]
) -> Tuple[int, int]:
    """Persist sorted elements as question groups and question elements.

    Elements are bucketed by their ``group_id``.  For each bucket (in
    ascending ``group_id`` order) one question group is created through
    ``crud.create_question_group``, followed by one question element per
    member through ``crud.create_question_element``.

    Args:
        db: SQLAlchemy session.
        page_id: Id of the page the elements belong to.
        sorted_elements: Elements carrying ``order_in_question`` and
            ``group_id`` attributes.

    Returns:
        ``(number_of_groups_created, number_of_elements_created)``.

    Raises:
        ValueError: If an element lacks ``order_in_question`` or ``group_id``.
    """
    from .. import crud
    from ..schemas import QuestionGroupCreate, QuestionElementCreate

    if not sorted_elements:
        logger.warning(f"page_id={page_id}: ์ ๋ ฌ๋ ์์๊ฐ ์์ด DB ์ ์ฅ์ ๊ฑด๋๋๋๋ค.")
        return 0, 0

    # 1. Bucket the elements by group id, validating each one on the way.
    buckets: Dict[int, List["LayoutElement"]] = {}
    for elem in sorted_elements:
        has_sort_attrs = hasattr(elem, "order_in_question") and hasattr(
            elem, "group_id"
        )
        if not has_sort_attrs:
            raise ValueError(
                f"element_id={elem.element_id}: order_in_question ๋๋ group_id ์์ฑ์ด ์์ต๋๋ค. "
                "sorter.py์ flatten_groups_and_assign_order() ์คํ ํ ํธ์ถํ์ธ์."
            )
        buckets.setdefault(elem.group_id, []).append(elem)

    logger.info(
        f"page_id={page_id}: {len(buckets)}๊ฐ ๊ทธ๋ฃน, {len(sorted_elements)}๊ฐ ์์๋ฅผ DB์ ์ ์ฅ ์์"
    )

    # 2. One QuestionGroup per bucket.
    groups_created = 0
    elements_created = 0
    for group_id, members in sorted(buckets.items()):
        # The member with the lowest order_in_question is recorded as anchor.
        anchor_elem = min(members, key=lambda e: e.order_in_question)

        # Vertical extent covered by the group.
        top_y = min(e.y_position for e in members)
        bottom_y = max(
            e.y_position + (e.bbox_height if hasattr(e, "bbox_height") else 0)
            for e in members
        )

        db_group = crud.create_question_group(
            db,
            QuestionGroupCreate(
                page_id=page_id,
                anchor_element_id=anchor_elem.element_id,
                start_y=top_y,
                end_y=bottom_y,
                element_count=len(members),
            ),
        )
        groups_created += 1
        logger.debug(
            f" ๊ทธ๋ฃน {group_id} โ question_group_id={db_group.question_group_id} (์ต์ปค: {anchor_elem.element_id}, ์์ ์: {len(members)})"
        )

        # 3. One QuestionElement per member; the stored order is shifted by +1.
        for member in members:
            crud.create_question_element(
                db,
                QuestionElementCreate(
                    question_group_id=db_group.question_group_id,
                    element_id=member.element_id,
                    order_in_question=member.order_in_question + 1,
                ),
            )
            elements_created += 1

    logger.info(
        f"page_id={page_id}: DB ์ ์ฅ ์๋ฃ ({groups_created}๊ฐ ๊ทธ๋ฃน, {elements_created}๊ฐ ์์)"
    )
    return groups_created, elements_created