Spaces:
Sleeping
Sleeping
| """ | |
| Korean News Translator — Final (Rule-augmented) Qualitative Eval | |
| Single-version demo for production confirmation: | |
| - Solar Pro2 (reasoning_effort=high, temperature=0.0) | |
| - prompts/rule_aug.txt — baseline prompt + RULE-PRECOMPUTED METADATA section | |
| - rules.py — Korean number→English unit inline replacement + article-date anchor | |
| Flow: | |
| 1. User provides article URL or pastes Korean body | |
| 2. User MUST provide article published date (YYYY-MM-DD) — manual in this demo; | |
| in production this comes from CMS publish timestamp automatically. | |
| 3. App preprocesses: inline-replaces Korean numbers, appends date to system prompt | |
| 4. App calls Solar Pro2 once → shows English translation | |
| 5. User leaves a qualitative comment (no rating/vote) → persisted | |
| Persistence: | |
| - If HF_TOKEN + FEEDBACK_DATASET_REPO set → push/pull JSONL on HF Datasets | |
| - Otherwise → local ./feedback.jsonl (dev mode) | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import time | |
| import uuid | |
| from datetime import datetime | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| from dotenv import load_dotenv | |
| from openai import OpenAI | |
| from rules import preprocess, system_prompt_date_suffix | |
| load_dotenv() | |
| # ── Config ──────────────────────────────────────────────────────────────── | |
# Service endpoint and credentials. Every secret is read from the process
# environment; a missing value is left as None and handled at the point of use.
UPSTAGE_BASE_URL = "https://api.upstage.ai/v1"
UPSTAGE_API_KEY = os.environ.get("UPSTAGE_API_KEY") or os.environ.get("OPENAI_API_KEY")
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")
FEEDBACK_DATASET_REPO = os.environ.get("FEEDBACK_DATASET_REPO")

# Paths are resolved relative to this file, not the current working directory.
SCRIPT_DIR = Path(__file__).parent
LOCAL_FEEDBACK_FILE = SCRIPT_DIR / "feedback.jsonl"
SYSTEM_PROMPT = (SCRIPT_DIR / "prompts" / "rule_aug.txt").read_text(encoding="utf-8")

# Fixed generation settings for the single model version under evaluation.
MODEL = "solar-pro2"
REASONING_EFFORT = "high"
TEMPERATURE = 0.0

# Shared API client, created on first use by _get_client().
_client: OpenAI | None = None
def _get_client() -> OpenAI:
    """Return the shared Upstage client, building it on the first call.

    Construction is deferred so the Space can boot even when secrets are
    missing; the friendly error surfaces only at translate-time.

    Raises:
        RuntimeError: if no API key is configured.
    """
    global _client
    if _client is None:
        if not UPSTAGE_API_KEY:
            raise RuntimeError(
                "UPSTAGE_API_KEY가 설정되지 않았습니다. "
                "Space → Settings → Variables and secrets에서 추가해주세요."
            )
        _client = OpenAI(api_key=UPSTAGE_API_KEY, base_url=UPSTAGE_BASE_URL)
    return _client
| # ── Translation ─────────────────────────────────────────────────────────── | |
def call_upstage(system_prompt: str, user_text: str) -> str:
    """Send one chat-completion request and return the model's text.

    Args:
        system_prompt: Content of the system message.
        user_text: Content of the user message.

    Returns:
        The first choice's message content with surrounding whitespace removed.

    Raises:
        RuntimeError: if the API key is missing (raised by ``_get_client``),
            the response has no choices, or the message content is ``None``.
    """
    resp = _get_client().chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ],
        temperature=TEMPERATURE,
        reasoning_effort=REASONING_EFFORT,
        max_tokens=8000,
    )
    if not resp.choices:
        raise RuntimeError("모델 응답에 choices가 없습니다.")

    choice = resp.choices[0]
    content = choice.message.content
    if content is None:
        # The SDK types message.content as optional; calling .strip() on None
        # would raise an unhelpful AttributeError, so report it explicitly.
        reason = getattr(choice, "finish_reason", None)
        raise RuntimeError(
            f"모델이 본문 없이 응답했습니다 (finish_reason={reason})."
        )
    return content.strip()
| _META_MARKERS = ( | |
| "verification checklist", "checklist confirmation", | |
| "before submitting", "### verification", "**verification", | |
| ) | |
| def _strip_meta_trailer(text: str) -> str: | |
| if not text: | |
| return text | |
| lowered = text.lower() | |
| earliest = len(text) | |
| for marker in _META_MARKERS: | |
| idx = lowered.find(marker) | |
| if 0 <= idx < earliest: | |
| earliest = idx | |
| if earliest < len(text): | |
| return text[:earliest].rstrip() | |
| return text | |
| # ── Crawling (same as reference) ────────────────────────────────────────── | |
def crawl_article(url: str) -> tuple[str, str]:
    """Extract the body text of a news article through the Firecrawl scrape API.

    Args:
        url: Article URL; surrounding whitespace is ignored.

    Returns:
        ``(body, error)``. On success ``body`` is the extracted text and
        ``error`` is ``""``; on any failure ``body`` is ``""`` and ``error``
        is a user-facing message. This function does not raise.
    """
    if not FIRECRAWL_API_KEY:
        return "", "Firecrawl API 키가 설정되지 않았습니다. 본문을 직접 붙여넣어 주세요."

    # Send the same normalised value that is validated, so stray whitespace
    # from a copy-paste never reaches the API.
    target_url = (url or "").strip()
    if not target_url:
        return "", "URL을 입력해주세요."

    payload = {
        "url": target_url,
        "formats": [{
            "type": "json",
            "prompt": (
                "뉴스 기사의 본문 텍스트만 추출하세요.\n\n"
                "반드시 제외할 것:\n"
                "- 기사 제목, 기자/저자 이름, 입력일/업데이트일\n"
                "- **이미지/사진/그래픽/일러스트/도표/인포그래픽 캡션**\n"
                "- 광고, 편집자 주, 관련기사 링크\n"
                "- 추천기사, 댓글, 구독 안내, 사이트 내비게이션, 인기기사/핫뉴스 위젯\n"
                "- '저작권자 ⓒ ...', 'ⓒ' 표시 단락\n\n"
                "오직 기자가 작성한 본문 단락만 순서대로 이어서 반환하세요. "
                '형식: {"body": "본문 텍스트 전체 (단락은 \\n\\n 으로 구분)"}'
            ),
        }],
        "onlyMainContent": True, "blockAds": True, "removeBase64Images": True,
    }
    try:
        r = requests.post(
            "https://api.firecrawl.dev/v2/scrape",
            headers={"Authorization": f"Bearer {FIRECRAWL_API_KEY}",
                     "Content-Type": "application/json"},
            json=payload, timeout=(10, 180),
        )
        if r.status_code != 200:
            return "", f"스크래핑 실패 ({r.status_code}): {r.text[:200]}"
        extracted = (r.json().get("data") or {}).get("json") or {}
        # "body" may be present but null; treat that like a missing body
        # instead of failing on None.strip().
        body = (extracted.get("body") or "").strip()
        if not body:
            return "", "본문을 찾을 수 없습니다."
        return body, ""
    except Exception as e:
        # Deliberately broad: network errors, invalid JSON and unexpected
        # payload shapes are all reported to the UI rather than raised.
        return "", f"크롤링 오류: {e}"
| # ── Persistence ─────────────────────────────────────────────────────────── | |
| def _dataset_file_path() -> str: | |
| return "feedback.jsonl" | |
def append_feedback(row: dict) -> None:
    """Append one feedback row to the JSONL store.

    When ``HF_TOKEN`` and ``FEEDBACK_DATASET_REPO`` are both set, the current
    remote file is downloaded into ``LOCAL_FEEDBACK_FILE``, the row is
    appended, and the file is uploaded back. Otherwise the row is only
    appended to the local file.

    The local file is reset only when the Hub confirms that the remote file
    does not exist yet. Any other download failure (network error, auth
    problem, missing package) keeps the local file and skips the upload, so a
    transient error can never replace the remote history with a single row.
    A row saved during such a failure exists only in the local file until it
    is reconciled manually.

    Args:
        row: JSON-serialisable feedback record.
    """
    use_remote = bool(HF_TOKEN and FEEDBACK_DATASET_REPO)
    can_upload = use_remote

    if use_remote:
        try:
            from huggingface_hub import hf_hub_download
            from huggingface_hub.utils import (
                EntryNotFoundError,
                LocalEntryNotFoundError,
            )
        except ImportError as e:
            print(f"[WARN] huggingface_hub unavailable — saving locally only: {e}")
            can_upload = False
        else:
            try:
                remote_path = hf_hub_download(
                    repo_id=FEEDBACK_DATASET_REPO,
                    filename=_dataset_file_path(),
                    repo_type="dataset", token=HF_TOKEN, force_download=True,
                )
                # Read fully before writing so a failed read cannot truncate
                # the local copy.
                LOCAL_FEEDBACK_FILE.write_bytes(Path(remote_path).read_bytes())
            except Exception as e:
                # LocalEntryNotFoundError subclasses EntryNotFoundError but
                # signals a cache/network miss, not a confirmed missing file.
                confirmed_missing = (
                    isinstance(e, EntryNotFoundError)
                    and not isinstance(e, LocalEntryNotFoundError)
                )
                if confirmed_missing:
                    print(f"[INFO] No existing dataset file — resetting local: {e}")
                    if LOCAL_FEEDBACK_FILE.exists():
                        LOCAL_FEEDBACK_FILE.unlink()
                else:
                    print(
                        "[WARN] HF dataset download failed — keeping local "
                        f"copy and skipping upload: {e}"
                    )
                    can_upload = False

    with LOCAL_FEEDBACK_FILE.open("a", encoding="utf-8") as f:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

    if can_upload:
        try:
            from huggingface_hub import HfApi
            HfApi(token=HF_TOKEN).upload_file(
                path_or_fileobj=str(LOCAL_FEEDBACK_FILE),
                path_in_repo=_dataset_file_path(),
                repo_id=FEEDBACK_DATASET_REPO,
                repo_type="dataset",
                commit_message=f"feedback {row.get('id','')}",
            )
        except Exception as e:
            # Best effort: the row is already persisted locally.
            print(f"[WARN] HF dataset upload failed: {e}")
def _read_jsonl(path) -> list:
    """Parse one JSON value per non-blank line of *path*, skipping bad lines."""
    rows = []
    with open(path, encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as e:
                # One corrupt line should not hide every other comment.
                print(f"[WARN] Skipping malformed feedback line {lineno}: {e}")
    return rows


def load_feedback() -> list[dict]:
    """Load all stored feedback rows.

    When ``HF_TOKEN`` and ``FEEDBACK_DATASET_REPO`` are both set, the JSONL
    file is downloaded from the dataset repo; if that fails an empty list is
    returned (the local file is not consulted). Otherwise the local file is
    read when it exists.

    Returns:
        The parsed rows in file order; malformed lines are skipped with a
        warning. An empty list when nothing is stored or the fetch failed.
    """
    if HF_TOKEN and FEEDBACK_DATASET_REPO:
        try:
            from huggingface_hub import hf_hub_download
            p = hf_hub_download(
                repo_id=FEEDBACK_DATASET_REPO,
                filename=_dataset_file_path(),
                repo_type="dataset", token=HF_TOKEN, force_download=True,
            )
            return _read_jsonl(p)
        except Exception as e:
            print(f"[INFO] HF dataset fetch failed ({e}). Returning empty.")
            return []
    if LOCAL_FEEDBACK_FILE.exists():
        return _read_jsonl(LOCAL_FEEDBACK_FILE)
    return []
| # ── Validation ──────────────────────────────────────────────────────────── | |
| DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$") | |
| def _valid_date(s: str) -> bool: | |
| if not s or not DATE_RE.match(s.strip()): | |
| return False | |
| try: | |
| datetime.strptime(s.strip(), "%Y-%m-%d") | |
| return True | |
| except ValueError: | |
| return False | |
| # ── Translation pipeline ────────────────────────────────────────────────── | |
def run_translation(url: str, direct_text: str, article_date: str):
    """Generator: streams progressive UI updates for one translation run.

    Every yielded value is a 5-tuple
    ``(source_text, translation, status_markdown, feedback_group_update,
    state_dict)``. Intermediate and error frames carry an empty translation,
    a hidden feedback group and an empty state; only the final success frame
    reveals the feedback group and fills the state.

    Args:
        url: Article URL, used only when ``direct_text`` is blank.
        direct_text: Korean body pasted by the user; takes precedence over
            ``url``.
        article_date: Publication date as ``YYYY-MM-DD`` (required).
    """
    def _status_only(message: str, source: str = ""):
        # Frame with no translation, feedback card hidden and empty state.
        return (source, "", message, gr.update(visible=False), {})

    if not _valid_date(article_date):
        yield _status_only(
            "❌ 기사 발행일을 `YYYY-MM-DD` 형식으로 입력해주세요. "
            "(실제 서비스에서는 CMS의 발행 시각이 자동 입력됩니다.)"
        )
        return

    # _valid_date ignores surrounding whitespace, so normalise once here and
    # hand the exact validated value to the rule helpers.
    article_date = article_date.strip()
    url = (url or "").strip()

    src = (direct_text or "").strip()
    crawl_note = ""
    if not src:
        if not url:
            yield _status_only("❌ URL 또는 한국어 본문 중 하나를 입력해주세요.")
            return
        # crawl_article uses a (10, 180) second timeout, so the wait can be
        # on the order of minutes rather than seconds.
        yield _status_only("🌐 URL 크롤링 중... (최대 3분)")
        crawled, err = crawl_article(url)
        if err:
            yield _status_only(f"❌ {err}")
            return
        src = crawled
        crawl_note = f"(크롤링 완료, {len(src)} chars) "

    # Apply rule preprocessing (still runs internally; just not shown in UI).
    try:
        augmented_user_text, info = preprocess(src, article_date)
        system_prompt = SYSTEM_PROMPT + system_prompt_date_suffix(article_date)
    except Exception as e:
        # Report rule failures in the status area instead of letting the
        # generator die mid-stream.
        yield _status_only(f"❌ 전처리 오류: {e}", src)
        return

    rule_summary_lines = [f"📅 발행일 anchor: `{info['date']}`"]
    if info["conversions"]:
        rule_summary_lines.append(f"🔢 인라인 치환 {len(info['conversions'])}건")
    else:
        rule_summary_lines.append("🔢 인라인 치환: 없음")
    rule_summary = " · ".join(rule_summary_lines)

    yield _status_only(
        f"✅ 원문 추출 완료 {crawl_note}— 룰 적용 후 Solar Pro2 호출 중...",
        src,
    )

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments.
    t0 = time.perf_counter()
    try:
        translation = _strip_meta_trailer(call_upstage(system_prompt, augmented_user_text))
    except Exception as e:
        yield _status_only(f"❌ 번역 오류: {e}", src)
        return
    elapsed = time.perf_counter() - t0

    state_val = {
        "source": src,
        "source_url": url,
        "article_date": article_date,
        "augmented_user_text": augmented_user_text,
        "translation": translation,
        "rule_info": {
            "date": info["date"],
            "conversions": info["conversions"],
        },
        "elapsed_sec": round(elapsed, 1),
    }
    status = (
        f"✅ 번역 완료 ({elapsed:.1f}s) — {rule_summary}\n\n"
        "정성평가 의견을 남겨주세요."
    )
    yield (src, translation, status, gr.update(visible=True), state_val)
def submit_comment(comment: str, rater_name: str, state: dict):
    """Persist one qualitative comment for the most recent translation.

    Args:
        comment: Free-text evaluation; must contain non-whitespace text.
        rater_name: Optional nickname, truncated to 40 characters. Blank or
            whitespace-only values are stored as ``"anonymous"``.
        state: Dict produced by the last successful ``run_translation``.

    Returns:
        ``(ack_markdown, comment_box_update)``. On a validation failure the
        update is a no-op; on success it clears the comment box.
    """
    if not state:
        return "먼저 번역을 생성해주세요.", gr.update()
    if not (comment or "").strip():
        return "코멘트를 입력해주세요.", gr.update()

    # Strip before applying the fallback so a nickname made only of spaces
    # does not end up stored as an empty string.
    rater = (rater_name or "").strip()[:40] or "anonymous"

    row = {
        "id": uuid.uuid4().hex[:12],
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "rater": rater,
        "comment": comment.strip(),
        "source_url": state.get("source_url", ""),
        "article_date": state.get("article_date", ""),
        "source_full": state.get("source", ""),
        "augmented_user_text": state.get("augmented_user_text", ""),
        "translation": state.get("translation", ""),
        "rule_info": state.get("rule_info", {}),
        # Single-line preview shown in the results table.
        "source_excerpt": (state.get("source", "")[:200]).replace("\n", " "),
        "elapsed_sec": state.get("elapsed_sec", 0),
        "model": MODEL,
        "reasoning_effort": REASONING_EFFORT,
        "temperature": TEMPERATURE,
    }
    append_feedback(row)
    return (
        f"### ✅ 제출 완료\n\nID: `{row['id']}` — 감사합니다!",
        gr.update(value=""),
    )
def refresh_results():
    """Build the table and summary line for the accumulated-comments tab.

    Returns:
        ``(dataframe, summary_markdown)``. The frame holds the display columns
        that exist in the stored rows, newest first when a ``timestamp``
        column is available; it is empty (with all display columns) when no
        feedback has been stored.
    """
    rows = load_feedback()
    cols = ["timestamp", "rater", "comment", "article_date",
            "source_url", "source_excerpt"]
    if not rows:
        return pd.DataFrame(columns=cols), "아직 제출된 코멘트가 없습니다."

    df = pd.DataFrame(rows)
    df = df[[c for c in cols if c in df.columns]]
    # Rows written by another tool may lack "timestamp"; sorting on a missing
    # column would raise KeyError and break the whole tab.
    if "timestamp" in df.columns:
        df = df.sort_values("timestamp", ascending=False)
    return df, f"**총 {len(df)}건**의 정성 코멘트가 누적되어 있습니다."
| # ── UI ──────────────────────────────────────────────────────────────────── | |
# Custom stylesheet handed to gr.Blocks(css=CSS). The selectors target the
# elem_classes used by the UI: "trans-box" (source/translation text areas),
# "feedback-card" (comment group) and "submit-btn" (comment submit button).
| CSS = """ | |
| .gradio-container { font-family: system-ui, -apple-system, sans-serif !important; } | |
| .trans-box textarea { font-size: 0.95em !important; line-height: 1.55 !important; } | |
| .feedback-card { | |
| background: linear-gradient(180deg, #f8fafc 0%, #eef2ff 100%) !important; | |
| border: 2px solid #6366f1 !important; | |
| border-radius: 12px !important; | |
| padding: 20px !important; | |
| margin-top: 16px !important; | |
| } | |
| .submit-btn { font-size: 1.1em !important; padding: 12px !important; | |
| margin-top: 8px !important; } | |
| """ | |
# Build the Gradio UI: one tab for translating an article and leaving a
# comment, and one tab listing the comments stored so far.
| with gr.Blocks(title="Korean News Translator — Final", css=CSS, | |
| theme=gr.themes.Default()) as demo: | |
| with gr.Tabs(): | |
| # ── Tab 1: translation + comment ────────────────────────────── | |
| with gr.TabItem("번역 & 정성평가"): | |
| with gr.Row(): | |
| url_in = gr.Textbox(label="기사 URL (선택)", | |
| placeholder="https://...", scale=3) | |
| date_in = gr.Textbox( | |
| label="기사 발행일 (필수, YYYY-MM-DD)", | |
| placeholder="2026-04-30", | |
| scale=2, | |
| info="실제 서비스에서는 자동 입력. 데모에서만 수동.", | |
| ) | |
| gen_btn = gr.Button("번역 생성", variant="primary", scale=1) | |
| with gr.Accordion("직접 한국어 본문 입력 (URL 대신)", open=False): | |
| text_in = gr.Textbox(label="한국어 본문", lines=8, | |
| placeholder="본문을 붙여넣기 하세요") | |
# Status line that run_translation updates on every yield.
| status_md = gr.Markdown() | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("### 원문 (Korean)") | |
| src_box = gr.Textbox(label="", lines=20, interactive=False, | |
| elem_classes=["trans-box"]) | |
| with gr.Column(): | |
| gr.Markdown("### 영문 번역 결과") | |
| out_box = gr.Textbox(label="", lines=20, interactive=False, | |
| elem_classes=["trans-box"]) | |
# Feedback card: created hidden (visible=False); run_translation's final
# yield sends gr.update(visible=True) to fb_group to reveal it.
| with gr.Group(visible=False, elem_classes=["feedback-card"]) as fb_group: | |
| gr.Markdown("## 📝 정성 코멘트") | |
| with gr.Row(): | |
| rater_in = gr.Textbox( | |
| label="닉네임 (선택)", placeholder="anonymous", | |
| max_lines=1, scale=1, | |
| ) | |
| comment_in = gr.Textbox( | |
| label="평가 의견 (자유 기술)", | |
| lines=5, | |
| placeholder=( | |
| "자연스러움 / 뉴스체 / 직역투 / 오역 / 누락 / " | |
| "숫자·단위 / 날짜·시점 / 인용 처리 / 고유명사 등 " | |
| "자유롭게 남겨주세요." | |
| ), | |
| ) | |
| submit_btn = gr.Button( | |
| "✅ 코멘트 제출", variant="primary", | |
| elem_classes=["submit-btn"], | |
| ) | |
| ack_md = gr.Markdown() | |
# Holds the dict from the last successful run_translation so that
# submit_comment can attach the comment to that translation.
| state_box = gr.State({}) | |
# Outputs are positional and must match the 5-tuple yielded by run_translation.
| gen_btn.click( | |
| fn=run_translation, | |
| inputs=[url_in, text_in, date_in], | |
| outputs=[src_box, out_box, status_md, fb_group, state_box], | |
| ) | |
# submit_comment returns (acknowledgement markdown, comment-box update).
| submit_btn.click( | |
| fn=submit_comment, | |
| inputs=[comment_in, rater_in, state_box], | |
| outputs=[ack_md, comment_in], | |
| ) | |
| # ── Tab 2: accumulated comments ─────────────────────────────── | |
| with gr.TabItem("누적 코멘트"): | |
| refresh_btn = gr.Button("🔄 새로고침") | |
| summary_md = gr.Markdown() | |
| results_df = gr.Dataframe( | |
| label="제출된 정성 코멘트", | |
| headers=["timestamp", "rater", "comment", "article_date", | |
| "source_url", "source_excerpt"], | |
| wrap=True, interactive=False, | |
| ) | |
| refresh_btn.click(fn=refresh_results, outputs=[results_df, summary_md]) | |
# Also fill the results table through the app's load event, not only on a
# manual refresh.
| demo.load(fn=refresh_results, outputs=[results_df, summary_md]) | |
# Start the server only when this file is executed as a script.
| if __name__ == "__main__": | |
| demo.launch() | |