# Source: smarteye-backend/app/services/formatter_rules.py
# Last commit by KwanHak (82c1146):
#   sync: merge the Backend code from the Smart_Demo branch & implement the
#   MultiFileLoader component for image loading
"""
Anchor-based text formatter rule definitions
============================================
The production service is meant to consult the formatting_rules DB table, but
the current implementation provides code-level default rules and keeps the
same structure so that DB overrides can be applied in the future.
"""
from __future__ import annotations
from dataclasses import dataclass, replace
from typing import Dict, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from sqlalchemy.orm import Session
@dataclass(frozen=True)
class RuleConfig:
    """Formatting rule for a single layout class.

    Instances are immutable because the dataclass is declared frozen.

    Attributes:
        prefix: String placed in front of the content.
        suffix: String placed after the content.
        indent: Number of indentation spaces (applied to each line).
        transform: Name of the post-processing function to be used by
            formatter_utils, or None when no post-processing is requested.
        allow_empty: When True, the rule is applied even to empty content.
        keep_suffix_on_empty: Whether the suffix is kept when the content
            is empty.
    """

    # Text decoration around the content.
    prefix: str = ""
    suffix: str = "\n"

    # Per-line indentation width, in spaces.
    indent: int = 0

    # Optional post-processing step, referenced by name.
    transform: Optional[str] = None

    # Handling of empty content.
    allow_empty: bool = False
    keep_suffix_on_empty: bool = False
# ---------------------------------------------------------------------------
# Default rules: question-sheet (question_based) documents
# ---------------------------------------------------------------------------
# Maps a layout class name to the RuleConfig used when formatting it.
# The Korean labels inside the prefixes are runtime output text and must stay
# valid UTF-8 (they had previously been stored as mojibake).
# NOTE(review): keys mix spaces and underscores ("table caption" vs
# "figure_caption"); presumably they mirror the detector's class labels --
# confirm before normalizing.
QUESTION_BASED_RULES: Dict[str, RuleConfig] = {
    # Anchors
    "question type": RuleConfig(
        prefix="\n\n[", suffix="]\n", indent=0, transform="normalize_question_type"
    ),
    "question number": RuleConfig(
        prefix="\n\n", suffix=". ", indent=0, allow_empty=False
    ),
    "second_question_number": RuleConfig(
        prefix="\n ", suffix="", indent=3, allow_empty=False
    ),
    # Body text
    "question text": RuleConfig(prefix="", suffix="\n", indent=3),
    "plain text": RuleConfig(prefix="", suffix="\n", indent=0),
    "unit": RuleConfig(prefix="", suffix="\n", indent=3),
    "list": RuleConfig(
        prefix=" - ", suffix="\n", indent=0, transform="normalize_list"
    ),
    "choices": RuleConfig(
        prefix="", suffix="\n", indent=3, transform="normalize_choices"
    ),
    # Visual materials
    "figure": RuleConfig(
        prefix="\n [그림 설명]\n",
        suffix="\n\n",
        indent=3,
        transform="merge_visual_description",
        allow_empty=True,
    ),
    "table": RuleConfig(
        prefix="\n [표 설명]\n",
        suffix="\n\n",
        indent=3,
        transform="merge_visual_description",
        allow_empty=True,
    ),
    "flowchart": RuleConfig(
        prefix="\n [순서도 설명]\n",
        suffix="\n\n",
        indent=3,
        transform="merge_visual_description",
        allow_empty=True,
    ),
    # Captions and metadata
    "figure_caption": RuleConfig(prefix=" (그림 캡션) ", suffix="\n\n", indent=0),
    "table caption": RuleConfig(prefix=" (표 캡션) ", suffix="\n\n", indent=0),
    "table footnote": RuleConfig(prefix=" * ", suffix="\n", indent=0),
    "formula_caption": RuleConfig(prefix=" (수식 설명) ", suffix="\n", indent=0),
    "isolated_formula": RuleConfig(
        prefix="\n [수식]\n", suffix="\n", indent=3, transform="isolate_formula"
    ),
}
# ---------------------------------------------------------------------------
# Default rules: general (reading_order) documents
# ---------------------------------------------------------------------------
# Maps a layout class name to the RuleConfig used when formatting it.
# The Korean labels inside the prefixes are runtime output text and must stay
# valid UTF-8 (they had previously been stored as mojibake).
READING_ORDER_RULES: Dict[str, RuleConfig] = {
    "title": RuleConfig(
        prefix="", suffix="\n\n", indent=0, transform="uppercase_title"
    ),
    "heading": RuleConfig(prefix="\n", suffix="\n\n", indent=0),
    "plain text": RuleConfig(prefix="", suffix="\n\n", indent=0),
    "list": RuleConfig(
        prefix="", suffix="\n", indent=0, transform="normalize_reading_list"
    ),
    "figure": RuleConfig(
        prefix="\n[그림] ",
        suffix="\n\n",
        indent=0,
        transform="merge_visual_description",
    ),
    "table": RuleConfig(
        prefix="\n[표] ",
        suffix="\n\n",
        indent=0,
        transform="merge_visual_description",
    ),
    "figure_caption": RuleConfig(prefix="(그림 캡션) ", suffix="\n", indent=0),
    "table caption": RuleConfig(prefix="(표 캡션) ", suffix="\n", indent=0),
    "table footnote": RuleConfig(prefix="* ", suffix="\n", indent=0),
}
# Lookup table from document-type name to its default rule dictionary.
# get_rules_for_document_type() raises ValueError for any key not listed here.
RULE_MAP_BY_DOC_TYPE: Dict[str, Dict[str, RuleConfig]] = {
    "question_based": QUESTION_BASED_RULES,
    "reading_order": READING_ORDER_RULES,
}
def get_rules_for_document_type(document_type: str) -> Dict[str, RuleConfig]:
    """Return a copy of the default rule dictionary for a document type.

    Args:
        document_type: "question_based" or "reading_order".

    Returns:
        A new class_name -> RuleConfig mapping; each RuleConfig is a fresh
        copy, so the module-level defaults are never handed out directly.

    Raises:
        ValueError: If ``document_type`` is not a key of RULE_MAP_BY_DOC_TYPE.
    """
    base_rules = RULE_MAP_BY_DOC_TYPE.get(document_type)
    if base_rules is None:
        # Message text means "Unsupported document type"; it is kept in
        # Korean (valid UTF-8) because it is runtime output.
        raise ValueError(f"지원하지 않는 문서 타입입니다: {document_type}")
    return {class_name: replace(rule) for class_name, rule in base_rules.items()}
def fetch_db_rules(db: "Session", doc_type_id: int) -> Dict[str, Dict[str, str]]:
    """Load formatting_rules rows from the DB and return them as overrides.

    Rows whose ``doc_type_id`` is NULL are treated as common rules; rows whose
    ``doc_type_id`` equals the requested one are type-specific. When both
    exist for the same class name, the type-specific row wins regardless of
    the order in which the rows are returned.

    Args:
        db: SQLAlchemy session.
        doc_type_id: document_types.doc_type_id (1 = question sheet,
            2 = general document).

    Returns:
        class_name -> {"prefix", "suffix", "indent"} override information.
        All values are strings; ``indent`` is the stringified indent level.
        An empty dict is returned when the DB has no rules.
    """
    # Imported here to avoid a circular dependency.
    from .. import crud

    db_rules = crud.get_all_formatting_rules(db)
    if not db_rules:
        return {}

    common: Dict[str, Dict[str, str]] = {}
    specific: Dict[str, Dict[str, str]] = {}
    for rule in db_rules:
        if rule.doc_type_id is None:
            target = common
        elif rule.doc_type_id == doc_type_id:
            target = specific
        else:
            # Rule belongs to a different document type.
            continue
        target[rule.class_name] = {
            "prefix": rule.prefix or "",
            # NOTE(review): an empty-string suffix stored in the DB is
            # replaced by "\n" here, so an explicit "no suffix" override
            # cannot be expressed -- confirm whether that is intended.
            "suffix": rule.suffix or "\n",
            "indent": str(rule.indent_level or 0),
        }

    # Type-specific rules are merged last so they override common rules.
    return {**common, **specific}
def override_rules_with_db(
    base_rules: Dict[str, RuleConfig],
    db_records: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, RuleConfig]:
    """Overlay DB-provided overrides on top of the code-level rules.

    Only ``prefix``, ``suffix`` and ``indent`` can be overridden; every other
    field of the existing rule is carried over unchanged. Overrides for class
    names that are absent from ``base_rules`` are ignored.

    Args:
        base_rules: Code-level default rule dictionary. It is never mutated.
        db_records: class_name -> {"prefix", "suffix", "indent"} override
            information. Missing keys fall back to the base rule's value.

    Returns:
        A new rule dictionary with the overrides applied. A fresh dict is
        returned even when there is nothing to override, so callers can
        mutate the result without touching ``base_rules``.

    Raises:
        ValueError: If an ``indent`` override cannot be converted by ``int``.
    """
    # Always work on a shallow copy so the input mapping is never aliased.
    merged = dict(base_rules)
    if not db_records:
        return merged

    for class_name, override in db_records.items():
        current = merged.get(class_name)
        if current is None:
            # No code-level rule to override for this class name.
            continue
        # replace() copies every field not named here, so fields added to
        # the rule type later are preserved automatically.
        merged[class_name] = replace(
            current,
            prefix=override.get("prefix", current.prefix),
            suffix=override.get("suffix", current.suffix),
            indent=int(override.get("indent", current.indent)),
        )
    return merged
def get_rule_for_class(
    class_name: str,
    document_type: str,
    db: Optional["Session"] = None,
    doc_type_id: Optional[int] = None
) -> RuleConfig:
    """Return the formatting rule for the given layout class name.

    The code-level defaults for ``document_type`` are used as the starting
    point. DB overrides are applied on top only when both ``db`` and
    ``doc_type_id`` are truthy.

    Args:
        class_name: Layout element class name.
        document_type: "question_based" or "reading_order".
        db: SQLAlchemy session (optional).
        doc_type_id: Document type ID (optional; needed when ``db`` is given).

    Returns:
        The RuleConfig for the class, or a default-constructed RuleConfig
        when no rule exists for ``class_name``.
    """
    rules = get_rules_for_document_type(document_type)

    use_db_overrides = bool(db and doc_type_id)
    if use_db_overrides:
        rules = override_rules_with_db(rules, fetch_db_rules(db, doc_type_id))

    if class_name in rules:
        return rules[class_name]
    # No rule registered for this class: fall back to the defaults.
    return RuleConfig()