Spaces:

riacho
/

Solar-News-Translator-Final

Sleeping

File size: 9,175 Bytes

f4e0387

"""Rule-based preprocessing: Korean number→English unit conversion + article date injection."""

import re
from datetime import datetime

# Korean magnitude suffixes paired with their multipliers, listed from the
# largest (조) down to the smallest (만). A compound number such as
# "1조4000억" is the sum of each "<digits><suffix>" group it contains.
KO_MAGNITUDE = [
    ("조", 1_000_000_000_000),
    ("억", 100_000_000),
    ("만", 10_000),
]

# Korean unit → English rendering, split by kind and merged into UNIT_MAP.
# Won, yen and yuan are qualified with their country ("Korean won",
# "Japanese yen", "Chinese yuan"); counters are rendered as plain plurals.
_CURRENCY_UNITS = {
    "원": "Korean won",
    "달러": "dollars",
    "유로": "euros",
    "엔": "Japanese yen",
    "위안": "Chinese yuan",
}
_COUNTER_UNITS = {
    "명": "people",
    "개": "units",
    "건": "cases",
    "대": "vehicles",
    "톤": "tons",
}
UNIT_MAP = {**_CURRENCY_UNITS, **_COUNTER_UNITS}

# Compound Korean number followed by a unit. The number portion may combine
# 조 / 억 / 만 groups plus a trailing digit run. An approximation marker
# (여 / 가량 / 쯤 / 약) is accepted between the digits and 억 or 만
# (e.g. "7800여억원"); it is NOT accepted directly before the unit, so
# "300여명" is deliberately left untouched.
#
# Every part of the number is optional on its own, so the leading (?=\d)
# lookahead is what forces a match to start on a digit; without it the
# pattern would also match every bare unit character in the text.
#
# 개 is rejected when it is really the first syllable of 개월 (months),
# 개년 (years) or 개국 (countries); otherwise "10개월" would be rewritten as
# "10 units월".
#
# The "\s+" alternatives in APPROX are redundant with the "\s*" that follows
# every use below; the value is kept as-is in case other code reads APPROX.
APPROX = r"(?:여|여\s+|가량|가량\s+|쯤|쯤\s+|약\s+|약)"
NUM_UNIT_PATTERN = re.compile(
    r"(?=\d)"
    r"(?P<num>"
    r"(?:\d[\d,]*(?:\.\d+)?\s*조\s*)?"
    r"(?:\d[\d,]*(?:\.\d+)?(?:\s*" + APPROX + r")?\s*억\s*)?"
    r"(?:\d[\d,]*(?:\.\d+)?(?:\s*" + APPROX + r")?\s*만\s*)?"
    r"(?:\d[\d,]*(?:\.\d+)?)?"
    r")\s*(?P<unit>원|달러|유로|엔|위안|명|개(?![월년국])|건|대|톤)"
)

# Percentage points: "X%p", "X%pt", "X% point(s)", "X%포인트", "X퍼센트 포인트",
# with either the ASCII "%" or the fullwidth "％" sign. The inline replacer
# applies this pattern before the number+unit pattern so these spans are
# claimed first and rendered as "X percentage points".
#
# The Latin abbreviation must not be followed by another Latin letter;
# otherwise "10% PER" or "5% profit" would be mistaken for "%p" and rewritten
# with the rest of the word left dangling.
PERCENT_POINT_PATTERN = re.compile(
    r"(?P<num>\d[\d,]*(?:\.\d+)?)\s*"
    r"(?:[%％]\s*(?:p(?:oints?|t)?(?![A-Za-z])|포인트)|퍼센트\s*포인트)",
    re.IGNORECASE,
)


def parse_korean_number(text: str) -> float | None:
    """Parse a compound Korean magnitude string into a number.

    Examples of accepted input: '1조4000억', '120억', '5000만', '7800여억'.
    Thousands separators, whitespace and approximation markers
    (여 / 가량 / 쯤 / 약) are removed before parsing.

    Args:
        text: The number portion only (no trailing unit).

    Returns:
        The numeric value as a float, or None when nothing numeric could be
        parsed. Parsing is best-effort: if a non-numeric remainder is left
        after the magnitude groups, the groups parsed so far are still
        returned.
    """
    # Remove commas and *all* whitespace, not just the ASCII space: the
    # detection regex accepts any "\s" between groups (tab, newline, NBSP),
    # and a leftover whitespace character would stop the anchored matches
    # below, silently dropping every group after it.
    s = re.sub(r"[,\s]+", "", text)
    # Strip approximation markers anywhere they appear within the number string
    s = re.sub(r"(여|가량|쯤|약)", "", s)
    total = 0.0
    matched = False
    # Consume "<digits><marker>" groups from the front, largest magnitude first.
    for marker, mult in KO_MAGNITUDE:
        m = re.match(r"(\d+(?:\.\d+)?)" + marker, s)
        if m:
            total += float(m.group(1)) * mult
            s = s[m.end():]
            matched = True
    # Whatever remains is the plain tail (e.g. the "500" in "3만500").
    if s:
        try:
            total += float(s)
        except ValueError:
            # Unparseable remainder: keep what the magnitude groups produced.
            pass
        else:
            matched = True
    return total if matched else None


def format_english_amount(amount: float, unit_kor: str) -> str:
    """Render ``amount`` followed by the English name of ``unit_kor``.

    Amounts of at least a million are scaled and suffixed with
    'million' / 'billion' / 'trillion'; smaller amounts are written out in
    full. Units missing from UNIT_MAP are emitted unchanged.
    """
    english_unit = UNIT_MAP.get(unit_kor, unit_kor)

    def _fmt(n: float) -> str:
        # Whole numbers print without decimals; anything else keeps at most
        # 4 decimals with trailing zeros trimmed. The integer part always
        # gets thousands separators.
        nearest = round(n)
        if abs(n - nearest) < 1e-9:
            return f"{int(nearest):,}"
        trimmed = f"{n:.4f}".rstrip("0").rstrip(".")
        whole, dot, frac = trimmed.partition(".")
        grouped = f"{int(whole):,}"
        return f"{grouped}.{frac}" if dot else grouped

    # Largest scale first so the first threshold reached wins.
    scales = ((10**12, "trillion"), (10**9, "billion"), (10**6, "million"))
    for threshold, scale_word in scales:
        if amount >= threshold:
            return f"{_fmt(amount / threshold)} {scale_word} {english_unit}"
    # Below one million: plain thousands-separated number (no "thousand" word).
    return f"{_fmt(amount)} {english_unit}"


def detect_korean_numbers(text: str) -> list[dict]:
    """List the distinct Korean number+unit expressions found in ``text``.

    Each entry is a dict with the keys ``span``, ``start``, ``end``,
    ``amount``, ``unit_ko`` and ``english``. ``span`` is the stripped number
    text joined directly to the unit, so it can differ from
    ``text[start:end]`` when whitespace separates the two. Only the first
    occurrence of each distinct ``span`` is reported.
    """
    entries: list[dict] = []
    already_reported: set[str] = set()
    for hit in NUM_UNIT_PATTERN.finditer(text):
        number_text = hit.group("num").strip()
        # A match without any digit is just a bare unit character.
        if re.search(r"\d", number_text) is None:
            continue
        value = parse_korean_number(number_text)
        if value is None or value == 0:
            continue
        unit_ko = hit.group("unit")
        key = f"{number_text}{unit_ko}"
        # Report a repeated expression (e.g. many "100억원") only once.
        if key in already_reported:
            continue
        already_reported.add(key)
        entry = {
            "span": key,
            "start": hit.start(),
            "end": hit.end(),
            "amount": value,
            "unit_ko": unit_ko,
            "english": format_english_amount(value, unit_ko),
        }
        entries.append(entry)
    return entries


def parse_article_date(date_str: str) -> str | None:
    """Normalize an article date to a YYYY-MM-DD string, return None if unparseable."""
    if not date_str:
        return None
    s = date_str.strip()
    # Try a few common formats
    for fmt in ("%Y-%m-%d", "%Y.%m.%d", "%Y/%m/%d", "%Y%m%d"):
        try:
            return datetime.strptime(s[:10], fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None


def _format_simple_number(num_str: str) -> str:
    """Format a plain digit string (with possible commas/decimals) keeping the value as-is."""
    s = num_str.replace(",", "")
    try:
        n = float(s)
    except ValueError:
        return num_str
    if abs(n - round(n)) < 1e-9:
        return f"{int(round(n)):,}"
    return f"{n:,g}"


def replace_numbers_inline(text: str) -> tuple[str, list[dict]]:
    """Rewrite Korean number expressions in `text` as English.

    The text is scanned twice, higher-priority pattern first; a span claimed
    by an earlier scan is never replaced again by a later one:
      1. Percentage points  ("10%포인트" → "10 percentage points")
      2. Korean magnitude+unit ("120억원" → "12 billion Korean won")

    Returns (rewritten_text, replacements). Each replacement is a dict with
    the keys start, end, span, amount, unit_ko and english; start/end are
    offsets into the ORIGINAL text, and the list is ordered by start.
    """
    found: list[dict] = []
    claimed: list[tuple[int, int]] = []

    def is_free(lo: int, hi: int) -> bool:
        # True when [lo, hi) intersects none of the already claimed ranges.
        return all(hi <= c_lo or lo >= c_hi for c_lo, c_hi in claimed)

    def record(lo: int, hi: int, amount: float | None, unit_ko: str, english: str) -> None:
        # Store one replacement and reserve its range in the original text.
        found.append({
            "start": lo,
            "end": hi,
            "span": text[lo:hi],
            "amount": amount,
            "unit_ko": unit_ko,
            "english": english,
        })
        claimed.append((lo, hi))

    # Scan 1: percentage points (highest priority).
    for hit in PERCENT_POINT_PATTERN.finditer(text):
        lo, hi = hit.span()
        if not is_free(lo, hi):
            continue
        pretty = _format_simple_number(hit.group("num"))
        record(lo, hi, None, "%포인트", f"{pretty} percentage points")

    # Scan 2: magnitude number followed by a currency/counter unit.
    for hit in NUM_UNIT_PATTERN.finditer(text):
        lo, hi = hit.span()
        if not is_free(lo, hi):
            continue
        number_text = hit.group("num").strip()
        # A match without any digit is just a bare unit character.
        if re.search(r"\d", number_text) is None:
            continue
        value = parse_korean_number(number_text)
        if value is None or value == 0:
            continue
        unit_ko = hit.group("unit")
        record(lo, hi, value, unit_ko, format_english_amount(value, unit_ko))

    # Stitch the output left to right: untouched text between replacements,
    # then each replacement's English rendering.
    found.sort(key=lambda item: item["start"])
    pieces: list[str] = []
    cursor = 0
    for item in found:
        pieces.append(text[cursor:item["start"]])
        pieces.append(item["english"])
        cursor = item["end"]
    pieces.append(text[cursor:])

    return "".join(pieces), found


def preprocess(text: str, article_date: str | None) -> tuple[str, dict]:
    """Rewrite Korean number expressions in the user text as English.

    The article date is only normalized and reported in the debug info; it is
    NOT inserted into the text. Callers that want date anchoring should add
    `system_prompt_date_suffix()` to the system prompt.

    Returns (rewritten_text, debug_info), where debug_info has the keys
    "date" (normalized date or None) and "conversions" (the replacements).
    """
    if article_date:
        resolved_date = parse_article_date(article_date)
    else:
        resolved_date = None
    converted_text, conversion_log = replace_numbers_inline(text)
    debug_info = {"date": resolved_date, "conversions": conversion_log}
    return converted_text, debug_info


def system_prompt_date_suffix(article_date: str | None) -> str:
    """Build the date-anchoring line to append to the system prompt.

    Returns an empty string when no date was given or it cannot be parsed.
    """
    if not article_date:
        return ""
    iso_date = parse_article_date(article_date)
    if iso_date:
        return f"\n\n[Article published date: {iso_date}]"
    return ""


if __name__ == "__main__":
    # Smoke test: run a handful of (text, article date) pairs through
    # preprocess() and print what was detected and how the text was rewritten.
    smoke_cases = (
        ("1023억원과 5조4000억원, 그리고 300만원", "2025-03-15"),
        ("897억달러 매출", "2025-04-02"),
        ("지난 19일 회담이 열렸다", "2025-01-23"),
        ("100억원대 분쟁이 발생", "2024-11-01"),
        ("5000만원의 보너스를 받았다", "2025-02-10"),
    )
    for sample_text, sample_date in smoke_cases:
        rewritten, debug = preprocess(sample_text, sample_date)
        print(
            f"\n--- date={sample_date} ---",
            f"input:  {sample_text}",
            f"detected: {debug['conversions']}",
            f"resolved date: {debug['date']}",
            f"augmented:\n{rewritten}",
            sep="\n",
        )