Spaces:

riacho
/

Solar-News-Translator-Final

Sleeping

App Files Files Community

Solar-News-Translator-Final / app.py

riacho

Upload app.py with huggingface_hub

a27e0a4 verified about 1 month ago

raw

history blame contribute delete

17.6 kB

	"""
	Korean News Translator — Final (Rule-augmented) Qualitative Eval

	Single-version demo for production confirmation:
	- Solar Pro2 (reasoning_effort=high, temperature=0.0)
	- prompts/rule_aug.txt — baseline prompt + RULE-PRECOMPUTED METADATA section
	- rules.py — Korean number→English unit inline replacement + article-date anchor

	Flow:
	1. User provides article URL or pastes Korean body
	2. User MUST provide article published date (YYYY-MM-DD) — manual in this demo;
	in production this comes from CMS publish timestamp automatically.
	3. App preprocesses: inline-replaces Korean numbers, appends date to system prompt
	4. App calls Solar Pro2 once → shows English translation
	5. User leaves a qualitative comment (no rating/vote) → persisted

	Persistence:
	- If HF_TOKEN + FEEDBACK_DATASET_REPO set → push/pull JSONL on HF Datasets
	- Otherwise → local ./feedback.jsonl (dev mode)
	"""

	from __future__ import annotations

	import json
	import os
	import re
	import time
	import uuid
	from datetime import datetime
	from pathlib import Path

	import gradio as gr
	import pandas as pd
	import requests
	from dotenv import load_dotenv
	from openai import OpenAI

	from rules import preprocess, system_prompt_date_suffix

	load_dotenv()

	# ── Config ────────────────────────────────────────────────────────────────
	# Base URL handed to the OpenAI client when talking to Upstage.
	UPSTAGE_BASE_URL = "https://api.upstage.ai/v1"
	# OPENAI_API_KEY is accepted as a fallback name for the same secret.
	UPSTAGE_API_KEY = os.getenv("UPSTAGE_API_KEY") or os.getenv("OPENAI_API_KEY")
	FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
	# Both HF_TOKEN and FEEDBACK_DATASET_REPO must be set for feedback to be
	# synced with a Hugging Face dataset; otherwise only the local file is used.
	HF_TOKEN = os.getenv("HF_TOKEN")
	FEEDBACK_DATASET_REPO = os.getenv("FEEDBACK_DATASET_REPO")

	SCRIPT_DIR = Path(__file__).parent
	LOCAL_FEEDBACK_FILE = SCRIPT_DIR / "feedback.jsonl"
	# Read once at import time; a missing prompt file fails the import.
	SYSTEM_PROMPT = (SCRIPT_DIR / "prompts" / "rule_aug.txt").read_text(encoding="utf-8")

	MODEL = "solar-pro2"
	REASONING_EFFORT = "high"
	TEMPERATURE = 0.0

	# Lazily created singleton; see _get_client().
	_client: OpenAI | None = None


	def _get_client() -> OpenAI:
	    """Return the shared OpenAI client, building it on first use.

	    Construction is deferred so that importing this module succeeds even
	    when the API key is absent; the error only appears once a translation
	    is actually requested.

	    Raises:
	        RuntimeError: if no API key is configured.
	    """
	    global _client
	    if _client is None:
	        if not UPSTAGE_API_KEY:
	            raise RuntimeError(
	                "UPSTAGE_API_KEY가 설정되지 않았습니다. "
	                "Space → Settings → Variables and secrets에서 추가해주세요."
	            )
	        _client = OpenAI(api_key=UPSTAGE_API_KEY, base_url=UPSTAGE_BASE_URL)
	    return _client


	# ── Translation ───────────────────────────────────────────────────────────
	def call_upstage(system_prompt: str, user_text: str) -> str:
	    """Run one chat completion and return the stripped assistant text.

	    Args:
	        system_prompt: Content of the system message.
	        user_text: Content of the user message.

	    Returns:
	        The first choice's message content with surrounding whitespace removed.

	    Raises:
	        RuntimeError: if the API key is missing (from ``_get_client``) or the
	            response carries no choices / no text content.
	    """
	    resp = _get_client().chat.completions.create(
	        model=MODEL,
	        messages=[
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_text},
	        ],
	        temperature=TEMPERATURE,
	        reasoning_effort=REASONING_EFFORT,
	        max_tokens=8000,
	    )
	    if not resp.choices:
	        raise RuntimeError("모델이 빈 응답을 반환했습니다 (choices 없음).")
	    choice = resp.choices[0]
	    content = choice.message.content
	    if content is None:
	        # Without this guard a null content surfaces as an opaque
	        # "'NoneType' object has no attribute 'strip'" in the UI.
	        reason = getattr(choice, "finish_reason", None)
	        raise RuntimeError(
	            f"모델이 빈 응답을 반환했습니다 (finish_reason={reason})."
	        )
	    return content.strip()


	# Substrings (matched case-insensitively) that mark the start of trailing
	# non-translation text; everything from the earliest hit onward is dropped.
	_META_MARKERS = (
	    "verification checklist", "checklist confirmation",
	    "before submitting", "### verification", "**verification",
	)

	# Searching the ORIGINAL text keeps match offsets valid. Lower-casing first
	# can change the string length (e.g. "İ" becomes two code points), which
	# would shift the cut position.
	_META_MARKER_RE = re.compile(
	    "|".join(re.escape(marker) for marker in _META_MARKERS),
	    re.IGNORECASE,
	)


	def _strip_meta_trailer(text: str) -> str:
	    """Cut *text* at the earliest meta marker and trim trailing whitespace.

	    Args:
	        text: Raw model output; may be empty or ``None``.

	    Returns:
	        *text* unchanged when it is empty or contains no marker, otherwise
	        the part before the first marker, right-stripped.
	    """
	    if not text:
	        return text
	    hit = _META_MARKER_RE.search(text)
	    if hit is None:
	        return text
	    return text[:hit.start()].rstrip()


	# ── Crawling (same as reference) ──────────────────────────────────────────
	def crawl_article(url: str) -> tuple[str, str]:
	    """Fetch an article through Firecrawl and return its extracted body.

	    Args:
	        url: Article URL; surrounding whitespace is ignored.

	    Returns:
	        ``(body, "")`` on success, or ``("", error_message)`` on any failure.
	        This function never raises; errors are reported through the message.
	    """
	    if not FIRECRAWL_API_KEY:
	        return "", "Firecrawl API 키가 설정되지 않았습니다. 본문을 직접 붙여넣어 주세요."
	    # Send the same stripped value that is validated, not the raw input.
	    target = (url or "").strip()
	    if not target:
	        return "", "URL을 입력해주세요."

	    payload = {
	        "url": target,
	        "formats": [{
	            "type": "json",
	            "prompt": (
	                "뉴스 기사의 본문 텍스트만 추출하세요.\n\n"
	                "반드시 제외할 것:\n"
	                "- 기사 제목, 기자/저자 이름, 입력일/업데이트일\n"
	                "- 이미지/사진/그래픽/일러스트/도표/인포그래픽 캡션\n"
	                "- 광고, 편집자 주, 관련기사 링크\n"
	                "- 추천기사, 댓글, 구독 안내, 사이트 내비게이션, 인기기사/핫뉴스 위젯\n"
	                "- '저작권자 ⓒ ...', 'ⓒ' 표시 단락\n\n"
	                "오직 기자가 작성한 본문 단락만 순서대로 이어서 반환하세요. "
	                '형식: {"body": "본문 텍스트 전체 (단락은 \\n\\n 으로 구분)"}'
	            ),
	        }],
	        "onlyMainContent": True, "blockAds": True, "removeBase64Images": True,
	    }
	    try:
	        r = requests.post(
	            "https://api.firecrawl.dev/v2/scrape",
	            headers={"Authorization": f"Bearer {FIRECRAWL_API_KEY}",
	                     "Content-Type": "application/json"},
	            json=payload, timeout=(10, 180),
	        )
	        if r.status_code != 200:
	            return "", f"스크래핑 실패 ({r.status_code}): {r.text[:200]}"
	        data = r.json().get("data") or {}
	        body = (data.get("json") or {}).get("body")
	        # A JSON null or non-string body counts as "no body found" instead of
	        # crashing on .strip() and being reported as a generic crawl error.
	        body = body.strip() if isinstance(body, str) else ""
	        if not body:
	            return "", "본문을 찾을 수 없습니다."
	        return body, ""
	    except Exception as e:
	        # UI boundary: network, JSON and shape errors all become a message.
	        return "", f"크롤링 오류: {e}"


	# ── Persistence ───────────────────────────────────────────────────────────
	def _dataset_file_path() -> str:
	    """Return the feedback file's path inside the dataset repository."""
	    filename = "feedback.jsonl"
	    return filename


	def append_feedback(row: dict) -> None:
	    """Append one feedback row to the JSONL store.

	    When HF_TOKEN and FEEDBACK_DATASET_REPO are both set, the remote file is
	    downloaded first, the row is appended to the local copy, and the result
	    is uploaded back. Otherwise only the local file is written.

	    The upload happens only when the local file is known to mirror the remote
	    one: either the download succeeded, or the remote file verifiably does
	    not exist yet. A transient download failure therefore keeps the existing
	    local copy and skips the upload, so the remote history is never replaced
	    by a file holding only the new row.

	    Args:
	        row: JSON-serialisable feedback record; ``row["id"]`` is used in the
	            commit message when present.
	    """
	    use_hub = bool(HF_TOKEN and FEEDBACK_DATASET_REPO)
	    safe_to_upload = False

	    if use_hub:
	        try:
	            from huggingface_hub import hf_hub_download
	            from huggingface_hub.utils import (
	                EntryNotFoundError,
	                LocalEntryNotFoundError,
	            )
	        except ImportError as e:
	            print(f"[WARN] huggingface_hub unavailable — local only: {e}")
	        else:
	            try:
	                remote_path = hf_hub_download(
	                    repo_id=FEEDBACK_DATASET_REPO,
	                    filename=_dataset_file_path(),
	                    repo_type="dataset", token=HF_TOKEN, force_download=True,
	                )
	            except LocalEntryNotFoundError as e:
	                # Subclass of EntryNotFoundError raised when the Hub cannot be
	                # reached, so it must be caught first and treated as transient.
	                print(f"[WARN] HF dataset download failed — skipping upload: {e}")
	            except EntryNotFoundError as e:
	                # The file really does not exist remotely: start fresh.
	                print(f"[INFO] No existing dataset file — resetting local: {e}")
	                if LOCAL_FEEDBACK_FILE.exists():
	                    LOCAL_FEEDBACK_FILE.unlink()
	                safe_to_upload = True
	            except Exception as e:
	                print(f"[WARN] HF dataset download failed — skipping upload: {e}")
	            else:
	                LOCAL_FEEDBACK_FILE.write_bytes(Path(remote_path).read_bytes())
	                safe_to_upload = True

	    with LOCAL_FEEDBACK_FILE.open("a", encoding="utf-8") as f:
	        f.write(json.dumps(row, ensure_ascii=False) + "\n")

	    if use_hub and safe_to_upload:
	        try:
	            from huggingface_hub import HfApi
	            HfApi(token=HF_TOKEN).upload_file(
	                path_or_fileobj=str(LOCAL_FEEDBACK_FILE),
	                path_in_repo=_dataset_file_path(),
	                repo_id=FEEDBACK_DATASET_REPO,
	                repo_type="dataset",
	                commit_message=f"feedback {row.get('id','')}",
	            )
	        except Exception as e:
	            # Best effort: the row is already persisted locally.
	            print(f"[WARN] HF dataset upload failed: {e}")


	def _read_jsonl(path) -> list[dict]:
	    """Parse a JSONL file into dicts, skipping blank or malformed lines."""
	    rows: list[dict] = []
	    with open(path, encoding="utf-8") as fh:
	        for lineno, line in enumerate(fh, 1):
	            if not line.strip():
	                continue
	            try:
	                record = json.loads(line)
	            except json.JSONDecodeError as e:
	                print(f"[WARN] Skipping malformed feedback line {lineno}: {e}")
	                continue
	            # Only JSON objects are feedback rows.
	            if isinstance(record, dict):
	                rows.append(record)
	    return rows


	def load_feedback() -> list[dict]:
	    """Load all stored feedback rows.

	    Reads from the Hugging Face dataset when HF_TOKEN and
	    FEEDBACK_DATASET_REPO are both set, otherwise from the local JSONL file.

	    Returns:
	        Parsed rows in file order. An empty list when the Hub fetch fails or
	        no local file exists. Individual malformed lines are skipped rather
	        than discarding the whole file.
	    """
	    if HF_TOKEN and FEEDBACK_DATASET_REPO:
	        try:
	            from huggingface_hub import hf_hub_download
	            p = hf_hub_download(
	                repo_id=FEEDBACK_DATASET_REPO,
	                filename=_dataset_file_path(),
	                repo_type="dataset", token=HF_TOKEN, force_download=True,
	            )
	            return _read_jsonl(p)
	        except Exception as e:
	            print(f"[INFO] HF dataset fetch failed ({e}). Returning empty.")
	            return []
	    if LOCAL_FEEDBACK_FILE.exists():
	        return _read_jsonl(LOCAL_FEEDBACK_FILE)
	    return []


	# ── Validation ────────────────────────────────────────────────────────────
	DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")


	def _valid_date(s: str) -> bool:
	    """Tell whether *s* is a real calendar date written as YYYY-MM-DD.

	    Surrounding whitespace is ignored. The regex enforces the zero-padded
	    shape; ``strptime`` then rejects impossible dates such as 2026-02-30.
	    """
	    candidate = s.strip() if s else ""
	    if DATE_RE.match(candidate) is None:
	        return False
	    try:
	        datetime.strptime(candidate, "%Y-%m-%d")
	    except ValueError:
	        return False
	    return True


	# ── Translation pipeline ──────────────────────────────────────────────────
	def run_translation(url: str, direct_text: str, article_date: str):
	    """Generator driving the translate button; streams progressive UI updates.

	    Every yielded value is a 5-tuple:
	    ``(source_text, translation, status_markdown, feedback_group_update, state)``.
	    The feedback group stays hidden and the state stays empty until a
	    translation has been produced.

	    Args:
	        url: Article URL; used only when *direct_text* is empty.
	        direct_text: Korean body pasted by the user; takes precedence over *url*.
	        article_date: Publication date, ``YYYY-MM-DD``.
	    """
	    if not _valid_date(article_date):
	        yield ("", "", "❌ 기사 발행일을 `YYYY-MM-DD` 형식으로 입력해주세요. "
	               "(실제 서비스에서는 CMS의 발행 시각이 자동 입력됩니다.)",
	               gr.update(visible=False), {})
	        return

	    # Validation tolerates surrounding whitespace, so normalise once here and
	    # hand the same cleaned value to the rules and to the stored state.
	    article_date = article_date.strip()
	    url = (url or "").strip()
	    src = (direct_text or "").strip()
	    crawl_note = ""

	    if not src:
	        if not url:
	            yield ("", "", "❌ URL 또는 한국어 본문 중 하나를 입력해주세요.",
	                   gr.update(visible=False), {})
	            return
	        # crawl_article allows up to 180 s for the response, so the hint
	        # states that bound rather than 30 s.
	        yield ("", "", "🌐 URL 크롤링 중... (최대 3분)",
	               gr.update(visible=False), {})
	        crawled, err = crawl_article(url)
	        if err:
	            yield ("", "", f"❌ {err}", gr.update(visible=False), {})
	            return
	        src = crawled
	        crawl_note = f"(크롤링 완료, {len(src)} chars) "

	    # Apply rule preprocessing (still runs internally; just not shown in UI)
	    augmented_user_text, info = preprocess(src, article_date)
	    system_prompt = SYSTEM_PROMPT + system_prompt_date_suffix(article_date)

	    rule_summary_lines = [f"📅 발행일 anchor: `{info['date']}`"]
	    if info["conversions"]:
	        rule_summary_lines.append(f"🔢 인라인 치환 {len(info['conversions'])}건")
	    else:
	        rule_summary_lines.append("🔢 인라인 치환: 없음")
	    rule_summary = " · ".join(rule_summary_lines)

	    yield (src, "",
	           f"✅ 원문 추출 완료 {crawl_note}— 룰 적용 후 Solar Pro2 호출 중...",
	           gr.update(visible=False), {})

	    t0 = time.time()
	    try:
	        translation = _strip_meta_trailer(call_upstage(system_prompt, augmented_user_text))
	    except Exception as e:
	        # UI boundary: show the failure instead of crashing the event handler.
	        yield (src, "", f"❌ 번역 오류: {e}", gr.update(visible=False), {})
	        return
	    elapsed = time.time() - t0

	    # Everything submit_comment needs to persist alongside the comment.
	    state_val = {
	        "source": src,
	        "source_url": url,
	        "article_date": article_date,
	        "augmented_user_text": augmented_user_text,
	        "translation": translation,
	        "rule_info": {
	            "date": info["date"],
	            "conversions": info["conversions"],
	        },
	        "elapsed_sec": round(elapsed, 1),
	    }
	    status = (
	        f"✅ 번역 완료 ({elapsed:.1f}s) — {rule_summary}\n\n"
	        "정성평가 의견을 남겨주세요."
	    )
	    yield (src, translation, status, gr.update(visible=True), state_val)


	def submit_comment(comment: str, rater_name: str, state: dict):
	    """Persist a qualitative comment together with the translation context.

	    Args:
	        comment: Free-text evaluation; must contain non-whitespace text.
	        rater_name: Optional nickname; blank or whitespace-only values are
	            stored as ``"anonymous"``. Truncated to 40 characters.
	        state: State dict produced by ``run_translation``; empty until a
	            translation exists.

	    Returns:
	        ``(acknowledgement_markdown, comment_box_update)``. The comment box
	        is cleared only after a successful submission.
	    """
	    if not state:
	        return "먼저 번역을 생성해주세요.", gr.update()
	    if not (comment or "").strip():
	        return "코멘트를 입력해주세요.", gr.update()

	    # Fall back AFTER stripping so a whitespace-only nickname does not end up
	    # stored as an empty rater.
	    rater = (rater_name or "").strip()[:40] or "anonymous"

	    row = {
	        "id": uuid.uuid4().hex[:12],
	        "timestamp": datetime.now().isoformat(timespec="seconds"),
	        "rater": rater,
	        "comment": comment.strip(),
	        "source_url": state.get("source_url", ""),
	        "article_date": state.get("article_date", ""),
	        "source_full": state.get("source", ""),
	        "augmented_user_text": state.get("augmented_user_text", ""),
	        "translation": state.get("translation", ""),
	        "rule_info": state.get("rule_info", {}),
	        # Short single-line preview of the source text.
	        "source_excerpt": (state.get("source", "")[:200]).replace("\n", " "),
	        "elapsed_sec": state.get("elapsed_sec", 0),
	        "model": MODEL,
	        "reasoning_effort": REASONING_EFFORT,
	        "temperature": TEMPERATURE,
	    }
	    append_feedback(row)
	    return (
	        f"### ✅ 제출 완료\n\nID: `{row['id']}` — 감사합니다!",
	        gr.update(value=""),
	    )


	def refresh_results():
	    """Build the data for the accumulated-comments tab.

	    Returns:
	        ``(dataframe, summary_markdown)``: stored feedback restricted to the
	        display columns that are present, newest timestamp first, plus a
	        one-line count. With no feedback, an empty frame with the display
	        columns and a placeholder message.
	    """
	    wanted = ["timestamp", "rater", "comment", "article_date",
	              "source_url", "source_excerpt"]
	    records = load_feedback()
	    if records:
	        table = pd.DataFrame(records)
	        present = [name for name in wanted if name in table.columns]
	        table = table[present].sort_values("timestamp", ascending=False)
	        return table, f"총 {len(table)}건의 정성 코멘트가 누적되어 있습니다."
	    return pd.DataFrame(columns=wanted), "아직 제출된 코멘트가 없습니다."


	# ── UI ────────────────────────────────────────────────────────────────────
	# Custom stylesheet passed to gr.Blocks(css=CSS). The class selectors below
	# (.trans-box, .feedback-card, .submit-btn) are attached to components via
	# their elem_classes arguments in the layout.
	CSS = """
	.gradio-container { font-family: system-ui, -apple-system, sans-serif !important; }
	.trans-box textarea { font-size: 0.95em !important; line-height: 1.55 !important; }
	.feedback-card {
	background: linear-gradient(180deg, #f8fafc 0%, #eef2ff 100%) !important;
	border: 2px solid #6366f1 !important;
	border-radius: 12px !important;
	padding: 20px !important;
	margin-top: 16px !important;
	}
	.submit-btn { font-size: 1.1em !important; padding: 12px !important;
	margin-top: 8px !important; }
	"""

	# Gradio layout and event wiring. Components are laid out in creation order,
	# so the statement order below is significant.
	# NOTE(review): the indentation that expresses the `with` nesting is not
	# present in this copy of the file — confirm the container structure against
	# the original before editing the layout.
	with gr.Blocks(title="Korean News Translator — Final", css=CSS,
	theme=gr.themes.Default()) as demo:
	with gr.Tabs():
	# ── Tab 1: translation + comment ──────────────────────────────
	with gr.TabItem("번역 & 정성평가"):
	with gr.Row():
	url_in = gr.Textbox(label="기사 URL (선택)",
	placeholder="https://...", scale=3)
	date_in = gr.Textbox(
	label="기사 발행일 (필수, YYYY-MM-DD)",
	placeholder="2026-04-30",
	scale=2,
	info="실제 서비스에서는 자동 입력. 데모에서만 수동.",
	)
	gen_btn = gr.Button("번역 생성", variant="primary", scale=1)

	# Optional pasted body; run_translation prefers it over the URL.
	with gr.Accordion("직접 한국어 본문 입력 (URL 대신)", open=False):
	text_in = gr.Textbox(label="한국어 본문", lines=8,
	placeholder="본문을 붙여넣기 하세요")

	# Progress / error messages yielded by run_translation.
	status_md = gr.Markdown()

	with gr.Row():
	with gr.Column():
	gr.Markdown("### 원문 (Korean)")
	src_box = gr.Textbox(label="", lines=20, interactive=False,
	elem_classes=["trans-box"])
	with gr.Column():
	gr.Markdown("### 영문 번역 결과")
	out_box = gr.Textbox(label="", lines=20, interactive=False,
	elem_classes=["trans-box"])

	# Hidden at start; run_translation makes it visible with its final yield.
	with gr.Group(visible=False, elem_classes=["feedback-card"]) as fb_group:
	gr.Markdown("## 📝 정성 코멘트")
	with gr.Row():
	rater_in = gr.Textbox(
	label="닉네임 (선택)", placeholder="anonymous",
	max_lines=1, scale=1,
	)
	comment_in = gr.Textbox(
	label="평가 의견 (자유 기술)",
	lines=5,
	placeholder=(
	"자연스러움 / 뉴스체 / 직역투 / 오역 / 누락 / "
	"숫자·단위 / 날짜·시점 / 인용 처리 / 고유명사 등 "
	"자유롭게 남겨주세요."
	),
	)
	submit_btn = gr.Button(
	"✅ 코멘트 제출", variant="primary",
	elem_classes=["submit-btn"],
	)
	ack_md = gr.Markdown()

	# Carries the translation context from run_translation to submit_comment.
	state_box = gr.State({})

	# The outputs list matches, in order, the 5-tuples yielded by run_translation.
	gen_btn.click(
	fn=run_translation,
	inputs=[url_in, text_in, date_in],
	outputs=[src_box, out_box, status_md, fb_group, state_box],
	)
	# submit_comment returns (acknowledgement, comment-box update).
	submit_btn.click(
	fn=submit_comment,
	inputs=[comment_in, rater_in, state_box],
	outputs=[ack_md, comment_in],
	)

	# ── Tab 2: accumulated comments ───────────────────────────────
	with gr.TabItem("누적 코멘트"):
	refresh_btn = gr.Button("🔄 새로고침")
	summary_md = gr.Markdown()
	results_df = gr.Dataframe(
	label="제출된 정성 코멘트",
	headers=["timestamp", "rater", "comment", "article_date",
	"source_url", "source_excerpt"],
	wrap=True, interactive=False,
	)
	refresh_btn.click(fn=refresh_results, outputs=[results_df, summary_md])
	# Also fill the table through the app's load event, without a button click.
	demo.load(fn=refresh_results, outputs=[results_df, summary_md])


	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()