Spaces:
Sleeping
Sleeping
Commit ·
c421c84
1
Parent(s): ed17f2a
Single-tab feedback UI (Good/NotBad/Critical); FT dedup tighter
Browse files- .gitignore +1 -0
- app.py +14 -155
- blindtest/__init__.py +3 -3
- blindtest/__pycache__/__init__.cpython-312.pyc +0 -0
- blindtest/__pycache__/db.cpython-312.pyc +0 -0
- blindtest/__pycache__/ui.cpython-312.pyc +0 -0
- blindtest/db.py +33 -0
- blindtest/schema.sql +14 -0
- blindtest/ui.py +94 -158
- pipelines.py +21 -11
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
app.py
CHANGED
|
@@ -1,22 +1,17 @@
|
|
| 1 |
-
"""Chosun proofreading
|
| 2 |
|
| 3 |
-
|
| 4 |
-
1. 비교 (Comparator): 2-column A (Baseline 251231) vs B (🏆 v16)
|
| 5 |
-
2. 블라인드 테스트: label-hidden pairwise preference voting, Supabase-backed
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
| 9 |
-
from collections.abc import Iterator
|
| 10 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
-
from diff_utils import highlight_diff
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
from openai import OpenAI
|
| 16 |
-
from pipelines import
|
| 17 |
from postprocess import load_vocabulary
|
| 18 |
|
| 19 |
-
from blindtest import
|
| 20 |
|
| 21 |
load_dotenv()
|
| 22 |
|
|
@@ -27,163 +22,27 @@ _vocab_path = os.path.join(os.path.dirname(__file__), "data", "vocabulary.csv")
|
|
| 27 |
vocabulary = load_vocabulary(_vocab_path)
|
| 28 |
|
| 29 |
|
| 30 |
-
|
| 31 |
-
_SLOTS = ("A", "B")
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
def _build_state(
|
| 35 |
-
input_text: str,
|
| 36 |
-
results: dict[str, dict | None],
|
| 37 |
-
) -> tuple:
|
| 38 |
-
"""Build 2-pipeline output tuple: 2 outputs + 2 diffs + meta."""
|
| 39 |
-
outs = [
|
| 40 |
-
results[s]["output"] if results[s] else _PENDING_TEXT for s in _SLOTS
|
| 41 |
-
]
|
| 42 |
-
diffs = [
|
| 43 |
-
highlight_diff(input_text, results[s]["output"]) if results[s] else ""
|
| 44 |
-
for s in _SLOTS
|
| 45 |
-
]
|
| 46 |
-
|
| 47 |
-
def _label(result: dict | None, default: str) -> str:
|
| 48 |
-
if not result:
|
| 49 |
-
return default
|
| 50 |
-
t = f"{result['processing_time']:.1f}s"
|
| 51 |
-
errs = result.get("step_errors") or []
|
| 52 |
-
if errs:
|
| 53 |
-
t += f" ({len(errs)} err: {', '.join(errs[:2])}{'...' if len(errs) > 2 else ''})"
|
| 54 |
-
return t
|
| 55 |
-
|
| 56 |
-
times = [_label(results[s], "실행 중...") for s in _SLOTS]
|
| 57 |
-
meta = " | ".join(f"Pipeline {s}: {t}" for s, t in zip(_SLOTS, times))
|
| 58 |
-
|
| 59 |
-
return (*outs, *diffs, meta)
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
def compare(
|
| 63 |
-
input_text: str,
|
| 64 |
-
pipe_a: str, model_a: str, prompt_a: str,
|
| 65 |
-
pipe_b: str, model_b: str, prompt_b: str,
|
| 66 |
-
) -> Iterator[tuple]:
|
| 67 |
-
"""Run 2 pipelines concurrently and yield partial results as each finishes."""
|
| 68 |
-
empty = tuple([""] * 4) + ("입력 텍스트를 입력해주세요.",)
|
| 69 |
-
|
| 70 |
-
if not input_text or not input_text.strip():
|
| 71 |
-
yield empty
|
| 72 |
-
return
|
| 73 |
-
|
| 74 |
-
if not client:
|
| 75 |
-
yield tuple([""] * 4) + (
|
| 76 |
-
"UPSTAGE_API_KEY 환경변수가 설정되지 않았습니다. .env 파일을 확인해주세요.",
|
| 77 |
-
)
|
| 78 |
-
return
|
| 79 |
-
|
| 80 |
-
results: dict[str, dict | None] = {s: None for s in _SLOTS}
|
| 81 |
-
yield _build_state(input_text, results)
|
| 82 |
-
|
| 83 |
-
configs = {
|
| 84 |
-
"A": (pipe_a, model_a, prompt_a),
|
| 85 |
-
"B": (pipe_b, model_b, prompt_b),
|
| 86 |
-
}
|
| 87 |
-
|
| 88 |
-
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 89 |
-
futures = {
|
| 90 |
-
executor.submit(
|
| 91 |
-
run_pipeline, input_text, pipe, model, prompt, client, vocabulary
|
| 92 |
-
): slot
|
| 93 |
-
for slot, (pipe, model, prompt) in configs.items()
|
| 94 |
-
}
|
| 95 |
-
|
| 96 |
-
for fut in as_completed(futures):
|
| 97 |
-
slot = futures[fut]
|
| 98 |
-
try:
|
| 99 |
-
result = fut.result()
|
| 100 |
-
except Exception as exc:
|
| 101 |
-
result = {"output": f"에러: {exc}", "processing_time": 0.0}
|
| 102 |
-
results[slot] = result
|
| 103 |
-
yield _build_state(input_text, results)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
# --- UI ---
|
| 107 |
-
|
| 108 |
-
pipeline_choices = list(PIPELINES.keys())
|
| 109 |
-
model_choices = ["solar-pro2", "solar-pro3"]
|
| 110 |
-
prompt_choices = list_prompts() or ["prod_251231"]
|
| 111 |
|
| 112 |
|
| 113 |
-
def _default_prompt(preferred_prefix: str,
|
| 114 |
matches = [p for p in prompt_choices if p.startswith(preferred_prefix)]
|
| 115 |
if matches:
|
| 116 |
return matches[-1]
|
| 117 |
-
return
|
| 118 |
|
| 119 |
|
| 120 |
-
|
| 121 |
-
_default_prompt_b = _default_prompt("dev_260408_v16", -1)
|
| 122 |
|
| 123 |
with gr.Blocks(title="Chosun 교정교열 데모") as demo:
|
| 124 |
gr.Markdown("# Chosun 교정교열 데모")
|
|
|
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
baseline_config=("251231_default", "solar-pro2", _default_prompt_a),
|
| 132 |
-
candidate_config=("260408_v16", "solar-pro3", _default_prompt_b),
|
| 133 |
-
)
|
| 134 |
-
|
| 135 |
-
with gr.Tab("파이프라인 비교"):
|
| 136 |
-
gr.Markdown("Baseline(251231 Pro2×3)과 신규 v16(Pro3×1)을 나란히 비교합니다.")
|
| 137 |
-
|
| 138 |
-
input_text = gr.Textbox(
|
| 139 |
-
label="원문 입력",
|
| 140 |
-
lines=8,
|
| 141 |
-
placeholder="교정할 텍스트를 입력하세요.",
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
with gr.Row():
|
| 145 |
-
with gr.Column():
|
| 146 |
-
gr.Markdown("### A (Baseline 251231)")
|
| 147 |
-
pipe_a = gr.Dropdown(pipeline_choices, value="251231_default", label="파이프라인")
|
| 148 |
-
model_a = gr.Dropdown(model_choices, value="solar-pro2", label="모델")
|
| 149 |
-
prompt_a = gr.Dropdown(prompt_choices, value=_default_prompt_a, label="프롬프트")
|
| 150 |
-
with gr.Column():
|
| 151 |
-
gr.Markdown("### B (🏆 v16)")
|
| 152 |
-
pipe_b = gr.Dropdown(pipeline_choices, value="260408_v16", label="파이프라인")
|
| 153 |
-
model_b = gr.Dropdown(model_choices, value="solar-pro3", label="모델")
|
| 154 |
-
prompt_b = gr.Dropdown(prompt_choices, value=_default_prompt_b, label="프롬프트")
|
| 155 |
-
|
| 156 |
-
btn = gr.Button(
|
| 157 |
-
"비교 실행 (⌘+Enter / Ctrl+Enter)",
|
| 158 |
-
variant="primary",
|
| 159 |
-
elem_id="compare-run-btn",
|
| 160 |
-
)
|
| 161 |
-
|
| 162 |
-
with gr.Row():
|
| 163 |
-
output_a = gr.Textbox(label="A 결과", lines=10)
|
| 164 |
-
output_b = gr.Textbox(label="B 결과", lines=10)
|
| 165 |
-
|
| 166 |
-
with gr.Accordion("Diff 상세 비교", open=True):
|
| 167 |
-
gr.Markdown("#### 원문 vs A")
|
| 168 |
-
diff_orig_a_html = gr.HTML()
|
| 169 |
-
gr.Markdown("#### 원문 vs B")
|
| 170 |
-
diff_orig_b_html = gr.HTML()
|
| 171 |
-
|
| 172 |
-
meta_info = gr.Textbox(label="실행 정보", interactive=False)
|
| 173 |
-
|
| 174 |
-
btn.click(
|
| 175 |
-
fn=compare,
|
| 176 |
-
inputs=[
|
| 177 |
-
input_text,
|
| 178 |
-
pipe_a, model_a, prompt_a,
|
| 179 |
-
pipe_b, model_b, prompt_b,
|
| 180 |
-
],
|
| 181 |
-
outputs=[
|
| 182 |
-
output_a, output_b,
|
| 183 |
-
diff_orig_a_html, diff_orig_b_html,
|
| 184 |
-
meta_info,
|
| 185 |
-
],
|
| 186 |
-
)
|
| 187 |
|
| 188 |
_SHORTCUT_JS = """
|
| 189 |
() => {
|
|
|
|
| 1 |
+
"""Chosun proofreading demo — Gradio UI.
|
| 2 |
|
| 3 |
+
Single tab: solar-pro3 v16 pipeline + Good/Not Bad/Critical feedback.
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
|
|
|
|
|
|
| 7 |
|
| 8 |
import gradio as gr
|
|
|
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
from openai import OpenAI
|
| 11 |
+
from pipelines import list_prompts
|
| 12 |
from postprocess import load_vocabulary
|
| 13 |
|
| 14 |
+
from blindtest import build_feedback_tab
|
| 15 |
|
| 16 |
load_dotenv()
|
| 17 |
|
|
|
|
| 22 |
vocabulary = load_vocabulary(_vocab_path)
|
| 23 |
|
| 24 |
|
| 25 |
+
prompt_choices = list_prompts() or ["dev_260408_v16"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
+
def _default_prompt(preferred_prefix: str, fallback: str) -> str:
|
| 29 |
matches = [p for p in prompt_choices if p.startswith(preferred_prefix)]
|
| 30 |
if matches:
|
| 31 |
return matches[-1]
|
| 32 |
+
return fallback
|
| 33 |
|
| 34 |
|
| 35 |
+
_v16_prompt = _default_prompt("dev_260408_v16", prompt_choices[-1])
|
|
|
|
| 36 |
|
| 37 |
with gr.Blocks(title="Chosun 교정교열 데모") as demo:
|
| 38 |
gr.Markdown("# Chosun 교정교열 데모")
|
| 39 |
+
gr.Markdown("solar-pro3 기반 교정 결과를 확인하고 피드백을 남겨주세요.")
|
| 40 |
|
| 41 |
+
build_feedback_tab(
|
| 42 |
+
client=client,
|
| 43 |
+
vocabulary=vocabulary,
|
| 44 |
+
pipeline_config=("260408_v16", "solar-pro3", _v16_prompt),
|
| 45 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
_SHORTCUT_JS = """
|
| 48 |
() => {
|
blindtest/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
-
from .ui import
|
| 4 |
|
| 5 |
-
__all__ = ["
|
|
|
|
| 1 |
+
"""Feedback module — single-output Good/Not Bad/Critical rating, Supabase-backed."""
|
| 2 |
|
| 3 |
+
from .ui import build_feedback_tab
|
| 4 |
|
| 5 |
+
__all__ = ["build_feedback_tab"]
|
blindtest/__pycache__/__init__.cpython-312.pyc
DELETED
|
Binary file (407 Bytes)
|
|
|
blindtest/__pycache__/db.cpython-312.pyc
DELETED
|
Binary file (6.09 kB)
|
|
|
blindtest/__pycache__/ui.cpython-312.pyc
DELETED
|
Binary file (12 kB)
|
|
|
blindtest/db.py
CHANGED
|
@@ -147,6 +147,39 @@ def save_task(
|
|
| 147 |
return None
|
| 148 |
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
def save_vote(task_id: int | None, choice: str, comment: str) -> bool:
|
| 151 |
if not is_configured() or task_id is None:
|
| 152 |
return False
|
|
|
|
| 147 |
return None
|
| 148 |
|
| 149 |
|
| 150 |
+
def save_rating(pipeline_run_id: int | None, rating: str, comment: str) -> bool:
|
| 151 |
+
"""Save Good / Not Bad / Critical rating for a single pipeline run."""
|
| 152 |
+
if not is_configured() or pipeline_run_id is None:
|
| 153 |
+
return False
|
| 154 |
+
try:
|
| 155 |
+
_post(
|
| 156 |
+
"ratings",
|
| 157 |
+
{
|
| 158 |
+
"pipeline_run_id": pipeline_run_id,
|
| 159 |
+
"rating": rating,
|
| 160 |
+
"comment": comment or None,
|
| 161 |
+
},
|
| 162 |
+
)
|
| 163 |
+
return True
|
| 164 |
+
except Exception as exc:
|
| 165 |
+
_record_error("save_rating", exc)
|
| 166 |
+
return False
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def fetch_rating_counts() -> dict[str, int]:
|
| 170 |
+
if not is_configured():
|
| 171 |
+
return {}
|
| 172 |
+
try:
|
| 173 |
+
rows = _get("ratings", {"select": "rating"})
|
| 174 |
+
counts: dict[str, int] = {}
|
| 175 |
+
for row in rows:
|
| 176 |
+
counts[row["rating"]] = counts.get(row["rating"], 0) + 1
|
| 177 |
+
return counts
|
| 178 |
+
except Exception as exc:
|
| 179 |
+
_record_error("fetch_rating_counts", exc)
|
| 180 |
+
return {}
|
| 181 |
+
|
| 182 |
+
|
| 183 |
def save_vote(task_id: int | None, choice: str, comment: str) -> bool:
|
| 184 |
if not is_configured() or task_id is None:
|
| 185 |
return False
|
blindtest/schema.sql
CHANGED
|
@@ -38,12 +38,24 @@ create table if not exists votes (
|
|
| 38 |
|
| 39 |
create index if not exists votes_task_idx on votes (task_id);
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
-- Allow anon key (used by the Gradio app) to insert/select.
|
| 42 |
-- RLS is enabled by default on public tables; without policies, anon writes silently fail.
|
| 43 |
alter table articles enable row level security;
|
| 44 |
alter table pipeline_runs enable row level security;
|
| 45 |
alter table tasks enable row level security;
|
| 46 |
alter table votes enable row level security;
|
|
|
|
| 47 |
|
| 48 |
create policy "anon insert articles" on articles for insert to anon with check (true);
|
| 49 |
create policy "anon select articles" on articles for select to anon using (true);
|
|
@@ -53,6 +65,8 @@ create policy "anon insert tasks" on tasks for insert to anon wi
|
|
| 53 |
create policy "anon select tasks" on tasks for select to anon using (true);
|
| 54 |
create policy "anon insert votes" on votes for insert to anon with check (true);
|
| 55 |
create policy "anon select votes" on votes for select to anon using (true);
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-- Aggregation view: winner is identified by pipeline_key (un-blinded).
|
| 58 |
create or replace view vote_summary as
|
|
|
|
| 38 |
|
| 39 |
create index if not exists votes_task_idx on votes (task_id);
|
| 40 |
|
| 41 |
+
-- Single-output rating table (current UI: Good / Not Bad / Critical).
|
| 42 |
+
create table if not exists ratings (
|
| 43 |
+
id bigserial primary key,
|
| 44 |
+
pipeline_run_id bigint references pipeline_runs(id) on delete cascade,
|
| 45 |
+
rating text not null check (rating in ('good','not_bad','critical')),
|
| 46 |
+
comment text,
|
| 47 |
+
rated_at timestamptz default now()
|
| 48 |
+
);
|
| 49 |
+
|
| 50 |
+
create index if not exists ratings_run_idx on ratings (pipeline_run_id);
|
| 51 |
+
|
| 52 |
-- Allow anon key (used by the Gradio app) to insert/select.
|
| 53 |
-- RLS is enabled by default on public tables; without policies, anon writes silently fail.
|
| 54 |
alter table articles enable row level security;
|
| 55 |
alter table pipeline_runs enable row level security;
|
| 56 |
alter table tasks enable row level security;
|
| 57 |
alter table votes enable row level security;
|
| 58 |
+
alter table ratings enable row level security;
|
| 59 |
|
| 60 |
create policy "anon insert articles" on articles for insert to anon with check (true);
|
| 61 |
create policy "anon select articles" on articles for select to anon using (true);
|
|
|
|
| 65 |
create policy "anon select tasks" on tasks for select to anon using (true);
|
| 66 |
create policy "anon insert votes" on votes for insert to anon with check (true);
|
| 67 |
create policy "anon select votes" on votes for select to anon using (true);
|
| 68 |
+
create policy "anon insert ratings" on ratings for insert to anon with check (true);
|
| 69 |
+
create policy "anon select ratings" on ratings for select to anon using (true);
|
| 70 |
|
| 71 |
-- Aggregation view: winner is identified by pipeline_key (un-blinded).
|
| 72 |
create or replace view vote_summary as
|
blindtest/ui.py
CHANGED
|
@@ -1,20 +1,16 @@
|
|
| 1 |
-
"""Gradio UI for
|
| 2 |
|
| 3 |
Flow:
|
| 4 |
-
1. User enters source text and clicks "
|
| 5 |
-
2.
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
5. On submit: vote is saved, the real pipeline identity is revealed, and
|
| 10 |
-
the running tally is refreshed.
|
| 11 |
"""
|
| 12 |
|
| 13 |
from __future__ import annotations
|
| 14 |
|
| 15 |
-
import random
|
| 16 |
import time
|
| 17 |
-
from concurrent.futures import ThreadPoolExecutor
|
| 18 |
from typing import Any
|
| 19 |
|
| 20 |
import gradio as gr
|
|
@@ -26,202 +22,142 @@ from . import db
|
|
| 26 |
PipelineConfig = tuple[str, str, str] # (pipeline_key, model, prompt_key)
|
| 27 |
|
| 28 |
|
| 29 |
-
def
|
| 30 |
-
client: Any,
|
| 31 |
-
vocabulary: list[dict],
|
| 32 |
-
text: str,
|
| 33 |
-
config: PipelineConfig,
|
| 34 |
-
) -> dict:
|
| 35 |
-
pipeline_key, model, prompt_key = config
|
| 36 |
-
start = time.time()
|
| 37 |
-
try:
|
| 38 |
-
result = run_pipeline(text, pipeline_key, model, prompt_key, client, vocabulary)
|
| 39 |
-
result.setdefault("processing_time", time.time() - start)
|
| 40 |
-
return result
|
| 41 |
-
except Exception as exc: # pragma: no cover
|
| 42 |
-
return {"output": f"에러: {exc}", "processing_time": time.time() - start}
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
def _format_summary(summary: list[dict], counts: dict[str, int]) -> str:
|
| 46 |
total = sum(counts.values())
|
| 47 |
if total == 0:
|
| 48 |
-
return "아직
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
def build_blindtest_tab(
|
| 62 |
client: Any,
|
| 63 |
vocabulary: list[dict],
|
| 64 |
-
|
| 65 |
-
candidate_config: PipelineConfig,
|
| 66 |
) -> None:
|
| 67 |
-
"""Build the
|
| 68 |
-
|
| 69 |
-
configured = db.is_configured()
|
| 70 |
|
| 71 |
-
|
| 72 |
-
"두 교정 결과 중 어느 쪽이 더 낫다고 느끼는지 익명으로 투표합니다. "
|
| 73 |
-
"라벨은 제출 후 공개됩니다."
|
| 74 |
-
)
|
| 75 |
|
| 76 |
-
|
| 77 |
-
slot_a_is_baseline = gr.State(True)
|
| 78 |
-
task_id_state = gr.State(None)
|
| 79 |
-
run_a_id_state = gr.State(None)
|
| 80 |
-
run_b_id_state = gr.State(None)
|
| 81 |
|
| 82 |
input_text = gr.Textbox(
|
| 83 |
label="원문 입력",
|
| 84 |
-
lines=
|
| 85 |
placeholder="교정할 텍스트를 입력하세요.",
|
| 86 |
)
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
status = gr.Markdown("")
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
with gr.Row():
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
diff_a = gr.HTML(label="원문 대비 diff A")
|
| 96 |
-
with gr.Column():
|
| 97 |
-
gr.Markdown("### 결과 B")
|
| 98 |
-
output_b = gr.Textbox(label="교정 결과 B", lines=10, interactive=False)
|
| 99 |
-
diff_b = gr.HTML(label="원문 대비 diff B")
|
| 100 |
-
|
| 101 |
-
gr.Markdown("### 어느 쪽이 더 낫나요?")
|
| 102 |
-
with gr.Row():
|
| 103 |
-
vote_a = gr.Button("A 가 낫다", variant="primary")
|
| 104 |
-
vote_b = gr.Button("B 가 낫다", variant="primary")
|
| 105 |
-
vote_tie = gr.Button("비슷하다")
|
| 106 |
-
vote_bad = gr.Button("둘 다 나쁘다")
|
| 107 |
|
| 108 |
comment = gr.Textbox(label="코멘트 (선택)", lines=2)
|
| 109 |
-
|
| 110 |
-
reveal = gr.Markdown("")
|
| 111 |
summary_md = gr.Markdown("")
|
| 112 |
|
| 113 |
-
def
|
| 114 |
if not text or not text.strip():
|
| 115 |
return (
|
| 116 |
gr.update(value="입력 텍스트가 비어있습니다."),
|
| 117 |
-
gr.update(),
|
| 118 |
-
|
|
|
|
| 119 |
gr.update(value=""),
|
| 120 |
)
|
| 121 |
if client is None:
|
| 122 |
return (
|
| 123 |
gr.update(value="UPSTAGE_API_KEY 미설정."),
|
| 124 |
-
gr.update(),
|
| 125 |
-
|
|
|
|
| 126 |
gr.update(value=""),
|
| 127 |
)
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
fut_b = ex.submit(_run_one, client, vocabulary, text, cfg_slot_b)
|
| 137 |
-
res_a = fut_a.result()
|
| 138 |
-
res_b = fut_b.result()
|
| 139 |
|
| 140 |
-
article_id = db.save_article(text)
|
| 141 |
-
|
| 142 |
-
run_slot_a_id = db.save_pipeline_run(
|
| 143 |
-
article_id,
|
| 144 |
-
pipeline_key=cfg_slot_a[0],
|
| 145 |
-
prompt_key=cfg_slot_a[2],
|
| 146 |
-
model=cfg_slot_a[1],
|
| 147 |
-
output=res_a["output"],
|
| 148 |
-
processing_time_s=float(res_a.get("processing_time", 0.0)),
|
| 149 |
-
) if configured else None
|
| 150 |
-
run_slot_b_id = db.save_pipeline_run(
|
| 151 |
article_id,
|
| 152 |
-
pipeline_key=
|
| 153 |
-
prompt_key=
|
| 154 |
-
model=
|
| 155 |
-
output=
|
| 156 |
-
processing_time_s=float(
|
| 157 |
-
)
|
| 158 |
-
task_id = db.save_task(article_id, run_slot_a_id, run_slot_b_id) if configured else None
|
| 159 |
-
|
| 160 |
-
diff_a_html = highlight_diff(text, res_a["output"])
|
| 161 |
-
diff_b_html = highlight_diff(text, res_b["output"])
|
| 162 |
|
| 163 |
return (
|
| 164 |
-
gr.update(value=f"
|
| 165 |
-
gr.update(value=
|
| 166 |
-
gr.update(value=
|
| 167 |
-
|
| 168 |
-
gr.update(value=
|
| 169 |
-
a_is_baseline, task_id, run_slot_a_id, run_slot_b_id,
|
| 170 |
-
gr.update(value=""), # clear reveal
|
| 171 |
)
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
inputs=[input_text],
|
| 176 |
-
outputs=[
|
| 177 |
-
status,
|
| 178 |
-
output_a, output_b,
|
| 179 |
-
diff_a, diff_b,
|
| 180 |
-
slot_a_is_baseline, task_id_state, run_a_id_state, run_b_id_state,
|
| 181 |
-
reveal,
|
| 182 |
-
],
|
| 183 |
)
|
| 184 |
|
| 185 |
-
def
|
| 186 |
-
def handler(
|
| 187 |
-
saved = db.
|
| 188 |
-
if
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
else:
|
| 192 |
-
label_a = candidate_config[0]
|
| 193 |
-
label_b = baseline_config[0]
|
| 194 |
-
reveal_md = (
|
| 195 |
-
f"**공개** — A: `{label_a}` · B: `{label_b}`\n\n"
|
| 196 |
-
f"선택: **{choice}**"
|
| 197 |
-
)
|
| 198 |
-
summary = _format_summary(db.fetch_summary(), db.fetch_vote_counts())
|
| 199 |
-
return gr.update(value=reveal_md), gr.update(value=summary)
|
| 200 |
return handler
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
inputs=[
|
| 205 |
-
outputs=[
|
| 206 |
-
)
|
| 207 |
-
vote_b.click(
|
| 208 |
-
_make_vote_handler("B"),
|
| 209 |
-
inputs=[task_id_state, slot_a_is_baseline, comment],
|
| 210 |
-
outputs=[reveal, summary_md],
|
| 211 |
)
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
inputs=[
|
| 215 |
-
outputs=[
|
| 216 |
)
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
inputs=[
|
| 220 |
-
outputs=[
|
| 221 |
)
|
| 222 |
|
| 223 |
refresh_btn = gr.Button("집계 새로고침", size="sm")
|
| 224 |
refresh_btn.click(
|
| 225 |
-
lambda: gr.update(value=_format_summary(db.
|
| 226 |
outputs=[summary_md],
|
| 227 |
)
|
|
|
|
| 1 |
+
"""Gradio UI for single-output proofreading feedback.
|
| 2 |
|
| 3 |
Flow:
|
| 4 |
+
1. User enters source text and clicks "교정 실행".
|
| 5 |
+
2. The configured pipeline runs.
|
| 6 |
+
3. Output + diff are shown.
|
| 7 |
+
4. User picks Good / Not Bad / Critical (+ optional comment).
|
| 8 |
+
5. On submit: rating is saved to Supabase ratings table.
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
from __future__ import annotations
|
| 12 |
|
|
|
|
| 13 |
import time
|
|
|
|
| 14 |
from typing import Any
|
| 15 |
|
| 16 |
import gradio as gr
|
|
|
|
| 22 |
PipelineConfig = tuple[str, str, str] # (pipeline_key, model, prompt_key)
|
| 23 |
|
| 24 |
|
| 25 |
+
def _format_summary(counts: dict[str, int]) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
total = sum(counts.values())
|
| 27 |
if total == 0:
|
| 28 |
+
return "아직 피드백이 없습니다."
|
| 29 |
+
g = counts.get("good", 0)
|
| 30 |
+
n = counts.get("not_bad", 0)
|
| 31 |
+
c = counts.get("critical", 0)
|
| 32 |
+
return (
|
| 33 |
+
f"**총 피드백**: {total}\n\n"
|
| 34 |
+
f"- Good: **{g}**\n"
|
| 35 |
+
f"- Not Bad: **{n}**\n"
|
| 36 |
+
f"- Critical: **{c}**"
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def build_feedback_tab(
|
|
|
|
| 41 |
client: Any,
|
| 42 |
vocabulary: list[dict],
|
| 43 |
+
pipeline_config: PipelineConfig,
|
|
|
|
| 44 |
) -> None:
|
| 45 |
+
"""Build the single-pipeline feedback UI. Call inside a gr.Blocks/Tab."""
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
pipeline_key, model, prompt_key = pipeline_config
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
+
pipeline_run_id_state = gr.State(None)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
input_text = gr.Textbox(
|
| 52 |
label="원문 입력",
|
| 53 |
+
lines=8,
|
| 54 |
placeholder="교정할 텍스트를 입력하세요.",
|
| 55 |
)
|
| 56 |
+
|
| 57 |
+
run_btn = gr.Button(
|
| 58 |
+
"교정 실행 (⌘+Enter / Ctrl+Enter)",
|
| 59 |
+
variant="primary",
|
| 60 |
+
elem_id="compare-run-btn",
|
| 61 |
+
)
|
| 62 |
|
| 63 |
status = gr.Markdown("")
|
| 64 |
|
| 65 |
+
output = gr.Textbox(label="교정 결과", lines=12, interactive=False)
|
| 66 |
+
diff_html = gr.HTML(label="원문 대비 diff")
|
| 67 |
+
|
| 68 |
+
gr.Markdown("### 피드백")
|
| 69 |
with gr.Row():
|
| 70 |
+
rate_good = gr.Button("👍 Good", variant="primary")
|
| 71 |
+
rate_notbad = gr.Button("🆗 Not Bad")
|
| 72 |
+
rate_critical = gr.Button("🚨 Critical", variant="stop")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
comment = gr.Textbox(label="코멘트 (선택)", lines=2)
|
| 75 |
+
rating_status = gr.Markdown("")
|
|
|
|
| 76 |
summary_md = gr.Markdown("")
|
| 77 |
|
| 78 |
+
def _on_run(text: str):
|
| 79 |
if not text or not text.strip():
|
| 80 |
return (
|
| 81 |
gr.update(value="입력 텍스트가 비어있습니다."),
|
| 82 |
+
gr.update(value=""),
|
| 83 |
+
gr.update(value=""),
|
| 84 |
+
None,
|
| 85 |
gr.update(value=""),
|
| 86 |
)
|
| 87 |
if client is None:
|
| 88 |
return (
|
| 89 |
gr.update(value="UPSTAGE_API_KEY 미설정."),
|
| 90 |
+
gr.update(value=""),
|
| 91 |
+
gr.update(value=""),
|
| 92 |
+
None,
|
| 93 |
gr.update(value=""),
|
| 94 |
)
|
| 95 |
|
| 96 |
+
start = time.time()
|
| 97 |
+
try:
|
| 98 |
+
result = run_pipeline(text, pipeline_key, model, prompt_key, client, vocabulary)
|
| 99 |
+
except Exception as exc:
|
| 100 |
+
return (
|
| 101 |
+
gr.update(value=f"에러: {exc}"),
|
| 102 |
+
gr.update(value=""),
|
| 103 |
+
gr.update(value=""),
|
| 104 |
+
None,
|
| 105 |
+
gr.update(value=""),
|
| 106 |
+
)
|
| 107 |
|
| 108 |
+
elapsed = result.get("processing_time", time.time() - start)
|
| 109 |
+
out_text = result.get("output", "")
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
+
article_id = db.save_article(text)
|
| 112 |
+
run_id = db.save_pipeline_run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
article_id,
|
| 114 |
+
pipeline_key=pipeline_key,
|
| 115 |
+
prompt_key=prompt_key,
|
| 116 |
+
model=model,
|
| 117 |
+
output=out_text,
|
| 118 |
+
processing_time_s=float(elapsed),
|
| 119 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
return (
|
| 122 |
+
gr.update(value=f"완료 · {elapsed:.1f}s"),
|
| 123 |
+
gr.update(value=out_text),
|
| 124 |
+
gr.update(value=highlight_diff(text, out_text)),
|
| 125 |
+
run_id,
|
| 126 |
+
gr.update(value=""),
|
|
|
|
|
|
|
| 127 |
)
|
| 128 |
|
| 129 |
+
run_btn.click(
|
| 130 |
+
_on_run,
|
| 131 |
inputs=[input_text],
|
| 132 |
+
outputs=[status, output, diff_html, pipeline_run_id_state, rating_status],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
)
|
| 134 |
|
| 135 |
+
def _make_rating_handler(rating: str):
|
| 136 |
+
def handler(run_id, comment_text):
|
| 137 |
+
saved = db.save_rating(run_id, rating, comment_text)
|
| 138 |
+
note = "✅ 피드백 저장됨" if saved else "⚠️ 저장되지 않았습니다 (먼저 교정 실행 후 피드백을 남겨주세요)"
|
| 139 |
+
summary = _format_summary(db.fetch_rating_counts())
|
| 140 |
+
return gr.update(value=note), gr.update(value=summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
return handler
|
| 142 |
|
| 143 |
+
rate_good.click(
|
| 144 |
+
_make_rating_handler("good"),
|
| 145 |
+
inputs=[pipeline_run_id_state, comment],
|
| 146 |
+
outputs=[rating_status, summary_md],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
)
|
| 148 |
+
rate_notbad.click(
|
| 149 |
+
_make_rating_handler("not_bad"),
|
| 150 |
+
inputs=[pipeline_run_id_state, comment],
|
| 151 |
+
outputs=[rating_status, summary_md],
|
| 152 |
)
|
| 153 |
+
rate_critical.click(
|
| 154 |
+
_make_rating_handler("critical"),
|
| 155 |
+
inputs=[pipeline_run_id_state, comment],
|
| 156 |
+
outputs=[rating_status, summary_md],
|
| 157 |
)
|
| 158 |
|
| 159 |
refresh_btn = gr.Button("집계 새로고침", size="sm")
|
| 160 |
refresh_btn.click(
|
| 161 |
+
lambda: gr.update(value=_format_summary(db.fetch_rating_counts())),
|
| 162 |
outputs=[summary_md],
|
| 163 |
)
|
pipelines.py
CHANGED
|
@@ -857,23 +857,33 @@ def _process_single_bulk(
|
|
| 857 |
)
|
| 858 |
# FT model returns raw text (no JSON). Other steps return JSON with "output" field.
|
| 859 |
if is_ft_model:
|
| 860 |
-
# FT duplication guard
|
| 861 |
-
#
|
| 862 |
-
#
|
| 863 |
-
#
|
| 864 |
-
#
|
| 865 |
-
|
| 866 |
-
|
|
|
|
| 867 |
stripped = response.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
first_end = len(stripped) // 2
|
| 869 |
head = stripped[:first_end].rstrip()
|
| 870 |
tail = stripped[first_end:].strip()
|
| 871 |
-
# If the two halves are ~identical (>=0.8 char-level equality),
|
| 872 |
-
# or one is a substring of the other, keep only the first half.
|
| 873 |
if head and (head == tail or head in tail or tail in head):
|
| 874 |
return head
|
| 875 |
-
|
| 876 |
-
#
|
| 877 |
return bulk
|
| 878 |
return response
|
| 879 |
# Fallback to original_text when LLM returns non-JSON hallucination — matches
|
|
|
|
| 857 |
)
|
| 858 |
# FT model returns raw text (no JSON). Other steps return JSON with "output" field.
|
| 859 |
if is_ft_model:
|
| 860 |
+
# FT duplication guard. Symptoms seen in the wild:
|
| 861 |
+
# (a) Half-half echo: model returns input twice (length ~2×).
|
| 862 |
+
# (b) Prefix echo: model returns full input verbatim then appends a
|
| 863 |
+
# partial re-correction tail (length ~1.2–1.5×).
|
| 864 |
+
# Both yield a downstream "duplicated paragraph" bug, so we trigger the
|
| 865 |
+
# guard at 1.25× and try several recovery patterns before falling back
|
| 866 |
+
# to the original input.
|
| 867 |
+
if response and len(response) >= len(bulk) * 1.25:
|
| 868 |
stripped = response.strip()
|
| 869 |
+
bulk_stripped = bulk.strip()
|
| 870 |
+
|
| 871 |
+
# (b) Output starts with full input verbatim → strip the appended tail.
|
| 872 |
+
if (
|
| 873 |
+
bulk_stripped
|
| 874 |
+
and stripped.startswith(bulk_stripped)
|
| 875 |
+
and len(stripped) > len(bulk_stripped) * 1.05
|
| 876 |
+
):
|
| 877 |
+
return bulk_stripped
|
| 878 |
+
|
| 879 |
+
# (a) Half-half echo.
|
| 880 |
first_end = len(stripped) // 2
|
| 881 |
head = stripped[:first_end].rstrip()
|
| 882 |
tail = stripped[first_end:].strip()
|
|
|
|
|
|
|
| 883 |
if head and (head == tail or head in tail or tail in head):
|
| 884 |
return head
|
| 885 |
+
|
| 886 |
+
# Unknown duplication pattern → safer to return the input.
|
| 887 |
return bulk
|
| 888 |
return response
|
| 889 |
# Fallback to original_text when LLM returns non-JSON hallucination — matches
|