Spaces:
Sleeping
Sleeping
"""
Formatter utility functions.

Gathers the core post-processing logic in one place: applying formatting
rules, normalizing answer choices, merging visual-material descriptions, etc.
"""
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
| from dataclasses import dataclass | |
| from typing import Dict, Iterable, List, Optional, Tuple | |
| from .mock_models import MockElement | |
| from .formatter_rules import RuleConfig | |
# Element classes checked by RenderContext.format_element: for these the AI
# description is merged by the transform (or used as a fallback) rather than
# being a plain substitute for missing OCR text.
AI_PRIORITY_CLASSES = {
    "figure",
    "table",
    "flowchart",
}
# ---------------------------------------------------------------------------
# Data conversion helpers
# ---------------------------------------------------------------------------
def ocr_inputs_to_dict(ocr_texts) -> Dict[int, str]:
    """
    Convert OCR input into an ``element_id -> text`` dictionary.

    Args:
        ocr_texts: Either a mapping of element id to text, or an iterable of
            objects exposing ``element_id`` and (optionally) ``ocr_text``
            attributes. ``None`` is treated as an empty iterable.

    Returns:
        A dictionary keyed by integer element id with stripped text values.
        For mapping input every key is kept (``None`` values become ``""``).
        For iterable input, entries whose text is empty after stripping are
        dropped, as are items whose ``element_id`` is missing or cannot be
        converted to ``int``.

    Raises:
        TypeError, ValueError: For mapping input only, when a key cannot be
            converted to ``int``.
    """
    if isinstance(ocr_texts, dict):
        return {int(k): (v or "").strip() for k, v in ocr_texts.items()}

    ocr_dict: Dict[int, str] = {}
    for item in ocr_texts or []:
        try:
            element_id = int(item.element_id)
        except (AttributeError, TypeError, ValueError):
            # Best effort: an item without a usable id (attribute missing,
            # None, or non-numeric) is skipped instead of aborting the
            # conversion of all remaining items.
            continue
        # getattr with a default cannot raise, so it lives outside the try.
        cleaned = (getattr(item, "ocr_text", "") or "").strip()
        if cleaned:
            ocr_dict[element_id] = cleaned
    return ocr_dict
def normalize_ai_descriptions(
    ai_descriptions: Optional[Dict[int, str]],
) -> Dict[int, str]:
    """
    Tidy up a dictionary of AI descriptions.

    Keys are converted to ``int`` and values are stripped; entries whose
    value is ``None``, empty, or whitespace-only are left out. A falsy
    argument yields an empty dictionary.
    """
    cleaned: Dict[int, str] = {}
    for raw_id, raw_text in (ai_descriptions or {}).items():
        stripped = (raw_text or "").strip()
        if stripped:
            # The key is only converted for entries that are actually kept.
            cleaned[int(raw_id)] = stripped
    return cleaned
def split_first_line(text: str) -> Tuple[str, str]:
    """
    Split a string into its first line and the rest.

    Returns ``(first, remainder)``. The first line is returned untouched;
    the remaining lines are re-joined with ``"\\n"`` and stripped. Falsy
    input gives ``("", "")``.
    """
    if not text:
        return "", ""
    head, *tail = text.splitlines()
    return head, "\n".join(tail).strip()
# ---------------------------------------------------------------------------
# Content post-processing
# ---------------------------------------------------------------------------
# Recognized choice labels, followed by optional whitespace and a non-empty
# body. Non-ASCII characters are written as \u escapes so the pattern cannot
# be corrupted by a wrong file encoding.
CHOICE_PATTERN = re.compile(
    r"^(\(?\d{1,2}[\).]"        # 1.  1)  (1)  12.
    r"|[\u2460-\u2473]"         # circled digits 1-20
    r"|[A-Z][\).]"              # A.  A)
    r"|[\uac00-\ud558]\."       # Hangul syllable + "."
    r"|[\uac00-\ud558]\))"      # Hangul syllable + ")"
    r"\s*(.+)$"
)
# NOTE(review): the Hangul range above was restored from mis-encoded text and
# is presumed to be U+AC00-U+D558 -- confirm the upper bound.


def normalize_choices(text: str) -> str:
    """
    Standardize answer-choice text.

    Blank lines are dropped and every remaining line is stripped. A line
    that starts with a label matched by ``CHOICE_PATTERN`` is rewritten as
    ``"<label> <body>"`` (exactly one space after the label); any other line
    is prefixed with a bullet (U+2022) and a space.

    Returns:
        The normalized lines joined with ``"\\n"``.
    """
    normalized: List[str] = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        match = CHOICE_PATTERN.match(line)
        if match:
            label, body = match.groups()
            normalized.append(f"{label} {body.strip()}")
        else:
            normalized.append(f"\u2022 {line}")
    return "\n".join(normalized)
# A list marker ("-", a bullet U+2022, or "<digits>.") followed by optional
# whitespace and a non-empty body. The bullet is written as a \u escape so
# the pattern cannot be corrupted by a wrong file encoding.
LIST_PATTERN = re.compile(r"^([\-\u2022]|\d+\.)\s*(.+)$")


def normalize_list(text: str) -> str:
    """
    Normalize generic list text to ``"- item"`` lines.

    Blank lines are dropped and every remaining line is stripped. An
    existing marker matched by ``LIST_PATTERN`` is removed; each line is
    then emitted with a ``"- "`` prefix.

    Returns:
        The normalized lines joined with ``"\\n"``.
    """
    normalized: List[str] = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        match = LIST_PATTERN.match(line)
        item = match.group(2).strip() if match else line
        normalized.append(f"- {item}")
    return "\n".join(normalized)
def normalize_reading_list(text: str) -> str:
    """
    Normalize list text for general documents, using bullet markers.

    Blank lines are dropped and every remaining line is stripped. An
    existing marker matched by ``LIST_PATTERN`` is removed; each line is
    then emitted with a bullet (U+2022) and a space in front.

    Returns:
        The normalized lines joined with ``"\\n"``.
    """
    normalized: List[str] = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        match = LIST_PATTERN.match(line)
        item = match.group(2).strip() if match else line
        # The bullet is spelled as an escape so the emitted character does
        # not depend on the encoding this file is stored or served in.
        normalized.append(f"\u2022 {item}")
    return "\n".join(normalized)
def merge_visual_description(text: str, ai_text: Optional[str]) -> str:
    """
    Combine the description of a figure / table / flowchart.

    When both values are present the AI description comes first and the OCR
    text follows on the next line. When only one is present it is returned
    on its own.
    """
    if not ai_text:
        return text
    if not text:
        return ai_text
    return f"{ai_text}\n{text}"
def isolate_formula(text: str) -> str:
    """
    Return the formula text as given, with leading and trailing whitespace
    removed.
    """
    trimmed = text.strip()
    return trimmed
def uppercase_title(text: str) -> str:
    """
    Return the title with leading and trailing whitespace removed.

    Despite the name, letter case is left untouched.
    """
    trimmed = text.strip()
    return trimmed
def normalize_question_type(text: str) -> str:
    """
    Flatten a question-type OCR result onto a single line.

    The text is NFKC-normalized and every line break (``\\r\\n``, ``\\r`` or
    ``\\n``) is replaced by one space. Nothing is stripped or collapsed
    beyond that. ``None`` or empty input gives ``""``.
    """
    flattened = unicodedata.normalize("NFKC", text or "")
    # "\r\n" has to be handled first so that it becomes one space, not two.
    for line_break in ("\r\n", "\r", "\n"):
        flattened = flattened.replace(line_break, " ")
    return flattened
# Maps a transform name (looked up from a rule's ``transform`` value in
# RenderContext.apply_transform) to the function that implements it.
# ``merge_visual_description`` takes two arguments; the others take one.
TRANSFORM_DISPATCH = {
    # Choice / list normalizers
    "normalize_choices": normalize_choices,
    "normalize_list": normalize_list,
    "normalize_reading_list": normalize_reading_list,
    # Visual elements
    "merge_visual_description": merge_visual_description,
    # Whitespace-only cleanups
    "isolate_formula": isolate_formula,
    "uppercase_title": uppercase_title,
    "normalize_question_type": normalize_question_type,
}
# ---------------------------------------------------------------------------
# Rule application and output cleanup
# ---------------------------------------------------------------------------
def apply_rule(rule: RuleConfig, content: str) -> str:
    """
    Apply a rule's prefix, indentation and suffix to the content.

    Args:
        rule: Object providing ``allow_empty``, ``indent``, ``prefix``,
            ``suffix`` and ``keep_suffix_on_empty``.
        content: Text to decorate.

    Returns:
        ``""`` when the content is empty and the rule does not allow empty
        content. Otherwise ``prefix + body + suffix``, where every non-blank
        line of the body is indented by ``rule.indent`` spaces and blank
        lines become empty. If the body is empty and
        ``keep_suffix_on_empty`` is false, only the prefix (or ``""``) is
        returned.
    """
    if not (content or rule.allow_empty):
        return ""

    body = content
    if rule.indent > 0:
        pad = " " * rule.indent
        body = "\n".join(
            f"{pad}{row}" if row.strip() else ""
            for row in body.splitlines()
        )

    if body or rule.keep_suffix_on_empty:
        return f"{rule.prefix}{body}{rule.suffix}"
    return rule.prefix or ""
def clean_output(text: str) -> str:
    """
    Tidy consecutive blank lines and trailing whitespace in the final output.

    Each line is right-stripped, runs of blank lines are capped at two, and
    the joined result is stripped at both ends.
    """
    kept: List[str] = []
    blank_run = 0
    for raw_line in text.splitlines():
        trimmed = raw_line.rstrip()
        blank_run = 0 if trimmed else blank_run + 1
        if blank_run <= 2:
            kept.append(trimmed)
    return "\n".join(kept).strip()
# ---------------------------------------------------------------------------
# Rendering context
# ---------------------------------------------------------------------------
@dataclass
class RenderContext:
    """
    Context needed while rendering elements.

    Declared as a dataclass so the three annotated fields below get a
    generated constructor:
    ``RenderContext(ocr_texts=..., ai_texts=..., rules=...)``.
    """

    # OCR text keyed by element id.
    ocr_texts: Dict[int, str]
    # AI-generated description keyed by element id.
    ai_texts: Dict[int, str]
    # Formatting rule keyed by element class name.
    rules: Dict[str, RuleConfig]

    def get_texts(self, element: MockElement) -> Tuple[str, str]:
        """
        Return ``(ocr_text, ai_text)`` for the element, both stripped.

        A missing entry (or an element without ``element_id``) yields ``""``.
        """
        element_id = getattr(element, "element_id", None)
        base_text = self.ocr_texts.get(element_id, "").strip()
        ai_text = self.ai_texts.get(element_id, "").strip()
        return base_text, ai_text

    def apply_transform(
        self,
        element: MockElement,
        text: str,
        *,
        base_text: str,
        ai_text: str,
    ) -> str:
        """
        Run the transform configured for the element's class on ``text``.

        When there is no rule for ``element.class_name``, the rule names no
        transform, or the name is not in ``TRANSFORM_DISPATCH``, the stripped
        text is returned unchanged. ``merge_visual_description`` is called
        with ``base_text`` and ``ai_text`` instead of ``text``.
        """
        rule = self.rules.get(element.class_name)
        if not rule or not rule.transform:
            return text.strip()

        transform = TRANSFORM_DISPATCH.get(rule.transform)
        if not transform:
            return text.strip()

        if rule.transform == "merge_visual_description":
            # The only two-argument transform: it needs both sources.
            return transform(base_text.strip(), ai_text.strip())
        return transform(text.strip())

    def format_element(
        self, element: MockElement, content_override: Optional[str] = None
    ) -> str:
        """
        Convert a single element to a string according to its rule.

        Args:
            element: Object exposing ``class_name`` and, optionally,
                ``element_id``.
            content_override: When not ``None``, used in place of the stored
                OCR text for this element.

        Returns:
            The result of ``apply_rule`` when a rule exists for the element's
            class; otherwise the text followed by a newline, or ``""`` when
            there is no text.
        """
        element_id = getattr(element, "element_id", None)
        raw_text = (
            content_override
            if content_override is not None
            else self.ocr_texts.get(element_id, "")
        )
        base_text = raw_text.strip()
        ai_text = self.ai_texts.get(element_id, "").strip()

        is_visual = element.class_name in AI_PRIORITY_CLASSES
        if is_visual:
            # Merging with the AI description is left to the transform.
            working = base_text
        else:
            working = base_text or ai_text

        working = self.apply_transform(
            element,
            working,
            base_text=base_text,
            ai_text=ai_text,
        )

        # Visual element still empty after the transform: fall back to the
        # AI description when there is one.
        if not working and ai_text and is_visual:
            working = ai_text

        # Apply the rule (prefix, suffix, indent).
        rule = self.rules.get(element.class_name)
        if rule:
            return apply_rule(rule, working)

        # No rule: emit the text as a single line by default.
        return f"{working}\n" if working else ""