Spaces:
Sleeping
Sleeping
"""
Anchor-based text formatter rule definitions
============================================
The production service is meant to consult the formatting_rules DB table,
but the current implementation ships code-level default rules and keeps the
same structure so that DB overrides can be applied later.
"""
| from __future__ import annotations | |
| from dataclasses import dataclass, replace | |
| from typing import Dict, Optional, TYPE_CHECKING | |
| if TYPE_CHECKING: | |
| from sqlalchemy.orm import Session | |
@dataclass
class RuleConfig:
    """Formatting rule for a single layout class.

    The class must be a dataclass: the rule tables construct it with keyword
    arguments and the module copies instances with ``dataclasses.replace``.

    Attributes:
        prefix: String placed in front of the content.
        suffix: String placed after the content.
        indent: Number of indentation spaces (applied to every line).
        transform: Name of the post-processing function to use from
            formatter_utils, or None for no post-processing.
        allow_empty: If True, the rule is applied even to empty content.
        keep_suffix_on_empty: Whether the suffix is kept when the content
            is empty.
    """

    prefix: str = ""
    suffix: str = "\n"
    indent: int = 0
    transform: Optional[str] = None
    allow_empty: bool = False
    keep_suffix_on_empty: bool = False
# ---------------------------------------------------------------------------
# Default rules: question-sheet (question_based) documents
# ---------------------------------------------------------------------------
# The Korean labels below were restored from mis-decoded text (UTF-8 bytes
# that had been read with a Thai single-byte code page).
# NOTE(review): spaces inside the prefix literals are kept exactly as found;
# confirm that runs of spaces were not collapsed upstream.
QUESTION_BASED_RULES: Dict[str, RuleConfig] = {
    # Anchors
    "question type": RuleConfig(prefix="\n\n[", suffix="]\n", indent=0, transform="normalize_question_type"),
    "question number": RuleConfig(prefix="\n\n", suffix=". ", indent=0, allow_empty=False),
    "second_question_number": RuleConfig(prefix="\n ", suffix="", indent=3, allow_empty=False),
    # Body text
    "question text": RuleConfig(prefix="", suffix="\n", indent=3),
    "plain text": RuleConfig(prefix="", suffix="\n", indent=0),
    "unit": RuleConfig(prefix="", suffix="\n", indent=3),
    "list": RuleConfig(prefix=" - ", suffix="\n", indent=0, transform="normalize_list"),
    "choices": RuleConfig(prefix="", suffix="\n", indent=3, transform="normalize_choices"),
    # Visual material
    "figure": RuleConfig(prefix="\n [그림 설명]\n", suffix="\n\n", indent=3, transform="merge_visual_description", allow_empty=True),
    "table": RuleConfig(prefix="\n [표 설명]\n", suffix="\n\n", indent=3, transform="merge_visual_description", allow_empty=True),
    "flowchart": RuleConfig(prefix="\n [순서도 설명]\n", suffix="\n\n", indent=3, transform="merge_visual_description", allow_empty=True),
    # Captions and metadata
    "figure_caption": RuleConfig(prefix=" (그림 캡션) ", suffix="\n\n", indent=0),
    "table caption": RuleConfig(prefix=" (표 캡션) ", suffix="\n\n", indent=0),
    "table footnote": RuleConfig(prefix=" * ", suffix="\n", indent=0),
    "formula_caption": RuleConfig(prefix=" (수식 설명) ", suffix="\n", indent=0),
    "isolated_formula": RuleConfig(prefix="\n [수식]\n", suffix="\n", indent=3, transform="isolate_formula"),
}
# ---------------------------------------------------------------------------
# Default rules: general (reading_order) documents
# ---------------------------------------------------------------------------
# The Korean labels below were restored from mis-decoded text (UTF-8 bytes
# that had been read with a Thai single-byte code page).
READING_ORDER_RULES: Dict[str, RuleConfig] = {
    "title": RuleConfig(prefix="", suffix="\n\n", indent=0, transform="uppercase_title"),
    "heading": RuleConfig(prefix="\n", suffix="\n\n", indent=0),
    "plain text": RuleConfig(prefix="", suffix="\n\n", indent=0),
    "list": RuleConfig(prefix="", suffix="\n", indent=0, transform="normalize_reading_list"),
    "figure": RuleConfig(prefix="\n[그림] ", suffix="\n\n", indent=0, transform="merge_visual_description"),
    "table": RuleConfig(prefix="\n[표] ", suffix="\n\n", indent=0, transform="merge_visual_description"),
    "figure_caption": RuleConfig(prefix="(그림 캡션) ", suffix="\n", indent=0),
    "table caption": RuleConfig(prefix="(표 캡션) ", suffix="\n", indent=0),
    "table footnote": RuleConfig(prefix="* ", suffix="\n", indent=0),
}
# Lookup table: document-type name -> default rule table for that type.
RULE_MAP_BY_DOC_TYPE: Dict[str, Dict[str, RuleConfig]] = dict(
    question_based=QUESTION_BASED_RULES,
    reading_order=READING_ORDER_RULES,
)
def get_rules_for_document_type(document_type: str) -> Dict[str, RuleConfig]:
    """Return a copy of the default rule table for a document type.

    Each RuleConfig is duplicated with ``dataclasses.replace`` so that callers
    can modify the result without touching the module-level tables.

    Args:
        document_type: A key of RULE_MAP_BY_DOC_TYPE, i.e. "question_based"
            or "reading_order".

    Returns:
        A new class_name -> RuleConfig mapping.

    Raises:
        ValueError: If ``document_type`` is not a known document type.
    """
    base_rules = RULE_MAP_BY_DOC_TYPE.get(document_type)
    if base_rules is None:
        # Message (Korean): "Unsupported document type: <document_type>".
        # Restored from mis-decoded text so the error is readable again.
        raise ValueError(f"지원하지 않는 문서 타입입니다: {document_type}")
    return {class_name: replace(rule) for class_name, rule in base_rules.items()}
def fetch_db_rules(db: "Session", doc_type_id: int) -> Dict[str, Dict[str, str]]:
    """Load formatting_rules rows from the DB and return them as overrides.

    Only rows whose ``doc_type_id`` equals the requested one, or is NULL
    (a rule common to all document types), are used. When a class has both
    a common row and a type-specific row, the type-specific row always wins,
    regardless of the order in which the query returned them.

    Args:
        db: SQLAlchemy session.
        doc_type_id: document_types.doc_type_id to select rules for.

    Returns:
        class_name -> {"prefix", "suffix", "indent"} override mapping, with
        every value as a string. Empty when the DB holds no rules.
    """
    # Imported here to avoid a circular dependency.
    from .. import crud

    db_rules = crud.get_all_formatting_rules(db)
    if not db_rules:
        return {}

    common: Dict[str, Dict[str, str]] = {}
    specific: Dict[str, Dict[str, str]] = {}
    for rule in db_rules:
        if rule.doc_type_id is None:
            bucket = common
        elif rule.doc_type_id == doc_type_id:
            bucket = specific
        else:
            continue
        # NOTE(review): the `or` defaults mean a NULL or empty column replaces
        # the code-level value (an empty suffix becomes "\n"); confirm this is
        # the intended meaning of an unset column.
        bucket[rule.class_name] = {
            "prefix": rule.prefix or "",
            "suffix": rule.suffix or "\n",
            "indent": str(rule.indent_level or 0),
        }

    # Common rules first, then type-specific rules on top of them.
    return {**common, **specific}
def override_rules_with_db(
    base_rules: Dict[str, RuleConfig],
    db_records: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, RuleConfig]:
    """Apply DB override records on top of the code-level rules.

    Only prefix, suffix and indent can be overridden; transform, allow_empty
    and keep_suffix_on_empty are carried over from the existing rule. Records
    for class names that have no existing rule are skipped.

    Args:
        base_rules: Code-level default rule mapping.
        db_records: class_name -> {prefix, suffix, indent} override mapping.

    Returns:
        ``base_rules`` itself when there is nothing to apply, otherwise a new
        mapping with the overrides applied.
    """
    if not db_records:
        return base_rules

    merged = dict(base_rules)
    for name, fields in db_records.items():
        current = merged.get(name)
        if current:
            # Fields missing from the record fall back to the current rule.
            new_prefix = fields.get("prefix", current.prefix)
            new_suffix = fields.get("suffix", current.suffix)
            new_indent = int(fields.get("indent", current.indent))
            merged[name] = RuleConfig(
                prefix=new_prefix,
                suffix=new_suffix,
                indent=new_indent,
                transform=current.transform,
                allow_empty=current.allow_empty,
                keep_suffix_on_empty=current.keep_suffix_on_empty,
            )
    return merged
def get_rule_for_class(
    class_name: str,
    document_type: str,
    db: Optional["Session"] = None,
    doc_type_id: Optional[int] = None
) -> RuleConfig:
    """Return the formatting rule for a layout class name.

    DB overrides are applied only when both ``db`` and ``doc_type_id`` are
    truthy; otherwise the code-level defaults are used as they are.

    Args:
        class_name: Class name of the layout element.
        document_type: "question_based" or "reading_order".
        db: SQLAlchemy session (optional).
        doc_type_id: Document type ID (optional; needed for DB overrides).

    Returns:
        The RuleConfig for the class, or a default-constructed RuleConfig
        when no rule exists for it.
    """
    candidates = get_rules_for_document_type(document_type)
    if db and doc_type_id:
        candidates = override_rules_with_db(
            candidates, fetch_db_rules(db, doc_type_id)
        )

    # Fall back to a default rule when the class has no rule.
    fallback = RuleConfig()
    return candidates.get(class_name, fallback)