Spaces:

dev-strender
/

proofread-20261h-demo

Sleeping

App Files Community

dev-strender committed on Apr 16

Commit

c421c84

1 Parent(s): ed17f2a

Single-tab feedback UI (Good/NotBad/Critical); FT dedup tighter

Browse files

Files changed (10) hide show

.gitignore +1 -0
app.py +14 -155
blindtest/__init__.py +3 -3
blindtest/__pycache__/__init__.cpython-312.pyc +0 -0
blindtest/__pycache__/db.cpython-312.pyc +0 -0
blindtest/__pycache__/ui.cpython-312.pyc +0 -0
blindtest/db.py +33 -0
blindtest/schema.sql +14 -0
blindtest/ui.py +94 -158
pipelines.py +21 -11

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

app.py CHANGED Viewed

@@ -1,22 +1,17 @@
-"""Chosun proofreading pipeline comparator — Gradio UI.
-Tabs:
-  1. 비교 (Comparator): 2-column A (Baseline 251231) vs B (🏆 v16)
-  2. 블라인드 테스트: label-hidden pairwise preference voting, Supabase-backed
 """
 import os
-from collections.abc import Iterator
-from concurrent.futures import ThreadPoolExecutor, as_completed
 import gradio as gr
-from diff_utils import highlight_diff
 from dotenv import load_dotenv
 from openai import OpenAI
-from pipelines import PIPELINES, list_prompts, run_pipeline
 from postprocess import load_vocabulary
-from blindtest import build_blindtest_tab
 load_dotenv()
@@ -27,163 +22,27 @@ _vocab_path = os.path.join(os.path.dirname(__file__), "data", "vocabulary.csv")
 vocabulary = load_vocabulary(_vocab_path)
-_PENDING_TEXT = "⏳ 실행 중..."
-_SLOTS = ("A", "B")
-def _build_state(
-    input_text: str,
-    results: dict[str, dict | None],
-) -> tuple:
-    """Build 2-pipeline output tuple: 2 outputs + 2 diffs + meta."""
-    outs = [
-        results[s]["output"] if results[s] else _PENDING_TEXT for s in _SLOTS
-    ]
-    diffs = [
-        highlight_diff(input_text, results[s]["output"]) if results[s] else ""
-        for s in _SLOTS
-    ]
-    def _label(result: dict | None, default: str) -> str:
-        if not result:
-            return default
-        t = f"{result['processing_time']:.1f}s"
-        errs = result.get("step_errors") or []
-        if errs:
-            t += f" ({len(errs)} err: {', '.join(errs[:2])}{'...' if len(errs) > 2 else ''})"
-        return t
-    times = [_label(results[s], "실행 중...") for s in _SLOTS]
-    meta = " | ".join(f"Pipeline {s}: {t}" for s, t in zip(_SLOTS, times))
-    return (*outs, *diffs, meta)
-def compare(
-    input_text: str,
-    pipe_a: str, model_a: str, prompt_a: str,
-    pipe_b: str, model_b: str, prompt_b: str,
-) -> Iterator[tuple]:
-    """Run 2 pipelines concurrently and yield partial results as each finishes."""
-    empty = tuple([""] * 4) + ("입력 텍스트를 입력해주세요.",)
-    if not input_text or not input_text.strip():
-        yield empty
-        return
-    if not client:
-        yield tuple([""] * 4) + (
-            "UPSTAGE_API_KEY 환경변수가 설정되지 않았습니다. .env 파일을 확인해주세요.",
-        )
-        return
-    results: dict[str, dict | None] = {s: None for s in _SLOTS}
-    yield _build_state(input_text, results)
-    configs = {
-        "A": (pipe_a, model_a, prompt_a),
-        "B": (pipe_b, model_b, prompt_b),
-    }
-    with ThreadPoolExecutor(max_workers=2) as executor:
-        futures = {
-            executor.submit(
-                run_pipeline, input_text, pipe, model, prompt, client, vocabulary
-            ): slot
-            for slot, (pipe, model, prompt) in configs.items()
-        }
-        for fut in as_completed(futures):
-            slot = futures[fut]
-            try:
-                result = fut.result()
-            except Exception as exc:
-                result = {"output": f"에러: {exc}", "processing_time": 0.0}
-            results[slot] = result
-            yield _build_state(input_text, results)
-# --- UI ---
-pipeline_choices = list(PIPELINES.keys())
-model_choices = ["solar-pro2", "solar-pro3"]
-prompt_choices = list_prompts() or ["prod_251231"]
-def _default_prompt(preferred_prefix: str, fallback_index: int) -> str:
     matches = [p for p in prompt_choices if p.startswith(preferred_prefix)]
     if matches:
         return matches[-1]
-    return prompt_choices[fallback_index]
-_default_prompt_a = _default_prompt("prod_251231", 0)
-_default_prompt_b = _default_prompt("dev_260408_v16", -1)
 with gr.Blocks(title="Chosun 교정교열 데모") as demo:
     gr.Markdown("# Chosun 교정교열 데모")
-    with gr.Tabs():
-        with gr.Tab("블라인드 테스트"):
-            build_blindtest_tab(
-                client=client,
-                vocabulary=vocabulary,
-                baseline_config=("251231_default", "solar-pro2", _default_prompt_a),
-                candidate_config=("260408_v16", "solar-pro3", _default_prompt_b),
-            )
-        with gr.Tab("파이프라인 비교"):
-            gr.Markdown("Baseline(251231 Pro2×3)과 신규 v16(Pro3×1)을 나란히 비교합니다.")
-            input_text = gr.Textbox(
-                label="원문 입력",
-                lines=8,
-                placeholder="교정할 텍스트를 입력하세요.",
-            )
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("### A (Baseline 251231)")
-                    pipe_a = gr.Dropdown(pipeline_choices, value="251231_default", label="파이프라인")
-                    model_a = gr.Dropdown(model_choices, value="solar-pro2", label="모델")
-                    prompt_a = gr.Dropdown(prompt_choices, value=_default_prompt_a, label="프롬프트")
-                with gr.Column():
-                    gr.Markdown("### B (🏆 v16)")
-                    pipe_b = gr.Dropdown(pipeline_choices, value="260408_v16", label="파이프라인")
-                    model_b = gr.Dropdown(model_choices, value="solar-pro3", label="모델")
-                    prompt_b = gr.Dropdown(prompt_choices, value=_default_prompt_b, label="프롬프트")
-            btn = gr.Button(
-                "비교 실행 (⌘+Enter / Ctrl+Enter)",
-                variant="primary",
-                elem_id="compare-run-btn",
-            )
-            with gr.Row():
-                output_a = gr.Textbox(label="A 결과", lines=10)
-                output_b = gr.Textbox(label="B 결과", lines=10)
-            with gr.Accordion("Diff 상세 비교", open=True):
-                gr.Markdown("#### 원문 vs A")
-                diff_orig_a_html = gr.HTML()
-                gr.Markdown("#### 원문 vs B")
-                diff_orig_b_html = gr.HTML()
-            meta_info = gr.Textbox(label="실행 정보", interactive=False)
-            btn.click(
-                fn=compare,
-                inputs=[
-                    input_text,
-                    pipe_a, model_a, prompt_a,
-                    pipe_b, model_b, prompt_b,
-                ],
-                outputs=[
-                    output_a, output_b,
-                    diff_orig_a_html, diff_orig_b_html,
-                    meta_info,
-                ],
-            )
     _SHORTCUT_JS = """
 () => {

+"""Chosun proofreading demo — Gradio UI.
+Single tab: solar-pro3 v16 pipeline + Good/Not Bad/Critical feedback.
 """
 import os
 import gradio as gr
 from dotenv import load_dotenv
 from openai import OpenAI
+from pipelines import list_prompts
 from postprocess import load_vocabulary
+from blindtest import build_feedback_tab
 load_dotenv()
 vocabulary = load_vocabulary(_vocab_path)
+prompt_choices = list_prompts() or ["dev_260408_v16"]
+def _default_prompt(preferred_prefix: str, fallback: str) -> str:
     matches = [p for p in prompt_choices if p.startswith(preferred_prefix)]
     if matches:
         return matches[-1]
+    return fallback
+_v16_prompt = _default_prompt("dev_260408_v16", prompt_choices[-1])
 with gr.Blocks(title="Chosun 교정교열 데모") as demo:
     gr.Markdown("# Chosun 교정교열 데모")
+    gr.Markdown("solar-pro3 기반 교정 결과를 확인하고 피드백을 남겨주세요.")
+    build_feedback_tab(
+        client=client,
+        vocabulary=vocabulary,
+        pipeline_config=("260408_v16", "solar-pro3", _v16_prompt),
+    )
     _SHORTCUT_JS = """
 () => {

blindtest/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
-"""Blind test module — pairwise preference voting, Supabase-backed."""
-from .ui import build_blindtest_tab
-__all__ = ["build_blindtest_tab"]

+"""Feedback module — single-output Good/Not Bad/Critical rating, Supabase-backed."""
+from .ui import build_feedback_tab
+__all__ = ["build_feedback_tab"]

blindtest/__pycache__/__init__.cpython-312.pyc DELETED Viewed

Binary file (407 Bytes)

blindtest/__pycache__/db.cpython-312.pyc DELETED Viewed

Binary file (6.09 kB)

blindtest/__pycache__/ui.cpython-312.pyc DELETED Viewed

Binary file (12 kB)

blindtest/db.py CHANGED Viewed

@@ -147,6 +147,39 @@ def save_task(
         return None
 def save_vote(task_id: int | None, choice: str, comment: str) -> bool:
     if not is_configured() or task_id is None:
         return False

         return None
+def save_rating(pipeline_run_id: int | None, rating: str, comment: str) -> bool:
+    """Save Good / Not Bad / Critical rating for a single pipeline run."""
+    if not is_configured() or pipeline_run_id is None:
+        return False
+    try:
+        _post(
+            "ratings",
+            {
+                "pipeline_run_id": pipeline_run_id,
+                "rating": rating,
+                "comment": comment or None,
+            },
+        )
+        return True
+    except Exception as exc:
+        _record_error("save_rating", exc)
+        return False
+def fetch_rating_counts() -> dict[str, int]:
+    if not is_configured():
+        return {}
+    try:
+        rows = _get("ratings", {"select": "rating"})
+        counts: dict[str, int] = {}
+        for row in rows:
+            counts[row["rating"]] = counts.get(row["rating"], 0) + 1
+        return counts
+    except Exception as exc:
+        _record_error("fetch_rating_counts", exc)
+        return {}
 def save_vote(task_id: int | None, choice: str, comment: str) -> bool:
     if not is_configured() or task_id is None:
         return False

blindtest/schema.sql CHANGED Viewed

@@ -38,12 +38,24 @@ create table if not exists votes (
 create index if not exists votes_task_idx on votes (task_id);
 -- Allow anon key (used by the Gradio app) to insert/select.
 -- RLS is enabled by default on public tables; without policies, anon writes silently fail.
 alter table articles       enable row level security;
 alter table pipeline_runs  enable row level security;
 alter table tasks          enable row level security;
 alter table votes          enable row level security;
 create policy "anon insert articles"      on articles      for insert to anon with check (true);
 create policy "anon select articles"      on articles      for select to anon using (true);
@@ -53,6 +65,8 @@ create policy "anon insert tasks"         on tasks         for insert to anon wi
 create policy "anon select tasks"         on tasks         for select to anon using (true);
 create policy "anon insert votes"         on votes         for insert to anon with check (true);
 create policy "anon select votes"         on votes         for select to anon using (true);
 -- Aggregation view: winner is identified by pipeline_key (un-blinded).
 create or replace view vote_summary as

 create index if not exists votes_task_idx on votes (task_id);
+-- Single-output rating table (current UI: Good / Not Bad / Critical).
+create table if not exists ratings (
+    id bigserial primary key,
+    pipeline_run_id bigint references pipeline_runs(id) on delete cascade,
+    rating text not null check (rating in ('good','not_bad','critical')),
+    comment text,
+    rated_at timestamptz default now()
+);
+create index if not exists ratings_run_idx on ratings (pipeline_run_id);
 -- Allow anon key (used by the Gradio app) to insert/select.
 -- RLS is enabled by default on public tables; without policies, anon writes silently fail.
 alter table articles       enable row level security;
 alter table pipeline_runs  enable row level security;
 alter table tasks          enable row level security;
 alter table votes          enable row level security;
+alter table ratings        enable row level security;
 create policy "anon insert articles"      on articles      for insert to anon with check (true);
 create policy "anon select articles"      on articles      for select to anon using (true);
 create policy "anon select tasks"         on tasks         for select to anon using (true);
 create policy "anon insert votes"         on votes         for insert to anon with check (true);
 create policy "anon select votes"         on votes         for select to anon using (true);
+create policy "anon insert ratings"       on ratings       for insert to anon with check (true);
+create policy "anon select ratings"       on ratings       for select to anon using (true);
 -- Aggregation view: winner is identified by pipeline_key (un-blinded).
 create or replace view vote_summary as

blindtest/ui.py CHANGED Viewed

@@ -1,20 +1,16 @@
-"""Gradio UI for pairwise blind-test voting.
 Flow:
-  1. User enters source text and clicks "생성".
-  2. Both pipelines run in parallel; randomized assignment decides which
-     becomes slot A vs slot B in the UI.
-  3. Outputs are shown with neutral labels and a diff against the source.
-  4. User picks A / B / 비슷 / 둘 다 나쁨 (+ optional comment) and submits.
-  5. On submit: vote is saved, the real pipeline identity is revealed, and
-     the running tally is refreshed.
 """
 from __future__ import annotations
-import random
 import time
-from concurrent.futures import ThreadPoolExecutor
 from typing import Any
 import gradio as gr
@@ -26,202 +22,142 @@ from . import db
 PipelineConfig = tuple[str, str, str]  # (pipeline_key, model, prompt_key)
-def _run_one(
-    client: Any,
-    vocabulary: list[dict],
-    text: str,
-    config: PipelineConfig,
-) -> dict:
-    pipeline_key, model, prompt_key = config
-    start = time.time()
-    try:
-        result = run_pipeline(text, pipeline_key, model, prompt_key, client, vocabulary)
-        result.setdefault("processing_time", time.time() - start)
-        return result
-    except Exception as exc:  # pragma: no cover
-        return {"output": f"에러: {exc}", "processing_time": time.time() - start}
-def _format_summary(summary: list[dict], counts: dict[str, int]) -> str:
     total = sum(counts.values())
     if total == 0:
-        return "아직 투표가 없습니다."
-    lines = [f"**총 투표**: {total}"]
-    if summary:
-        lines.append("\n**파이프라인별 승수 (A/B 선택만 집계)**")
-        for row in sorted(summary, key=lambda r: -r.get("wins", 0)):
-            lines.append(f"- `{row['winner_pipeline']}`: {row['wins']}")
-    tie = counts.get("tie", 0)
-    bad = counts.get("both_bad", 0)
-    if tie or bad:
-        lines.append(f"\n비슷: {tie} · 둘 다 나쁨: {bad}")
-    return "\n".join(lines)
-def build_blindtest_tab(
     client: Any,
     vocabulary: list[dict],
-    baseline_config: PipelineConfig,
-    candidate_config: PipelineConfig,
 ) -> None:
-    """Build the blind-test Gradio tab. Must be called inside a gr.Blocks/Tab."""
-    configured = db.is_configured()
-    gr.Markdown(
-        "두 교정 결과 중 어느 쪽이 더 낫다고 느끼는지 익명으로 투표합니다. "
-        "라벨은 제출 후 공개됩니다."
-    )
-    # Hidden state: which config is in slot A (True = baseline, False = candidate)
-    slot_a_is_baseline = gr.State(True)
-    task_id_state = gr.State(None)
-    run_a_id_state = gr.State(None)
-    run_b_id_state = gr.State(None)
     input_text = gr.Textbox(
         label="원문 입력",
-        lines=6,
         placeholder="교정할 텍스트를 입력하세요.",
     )
-    generate_btn = gr.Button("A / B 생성", variant="primary")
     status = gr.Markdown("")
     with gr.Row():
-        with gr.Column():
-            gr.Markdown("### 결과 A")
-            output_a = gr.Textbox(label="교정 결과 A", lines=10, interactive=False)
-            diff_a = gr.HTML(label="원문 대비 diff A")
-        with gr.Column():
-            gr.Markdown("### 결과 B")
-            output_b = gr.Textbox(label="교정 결과 B", lines=10, interactive=False)
-            diff_b = gr.HTML(label="원문 대비 diff B")
-    gr.Markdown("### 어느 쪽이 더 낫나요?")
-    with gr.Row():
-        vote_a = gr.Button("A 가 낫다", variant="primary")
-        vote_b = gr.Button("B 가 낫다", variant="primary")
-        vote_tie = gr.Button("비슷하다")
-        vote_bad = gr.Button("둘 다 나쁘다")
     comment = gr.Textbox(label="코멘트 (선택)", lines=2)
-    reveal = gr.Markdown("")
     summary_md = gr.Markdown("")
-    def _on_generate(text: str):
         if not text or not text.strip():
             return (
                 gr.update(value="입력 텍스트가 비어있습니다."),
-                gr.update(), gr.update(), gr.update(), gr.update(),
-                True, None, None, None,
                 gr.update(value=""),
             )
         if client is None:
             return (
                 gr.update(value="UPSTAGE_API_KEY 미설정."),
-                gr.update(), gr.update(), gr.update(), gr.update(),
-                True, None, None, None,
                 gr.update(value=""),
             )
-        # Randomize which config gets slot A
-        a_is_baseline = random.random() < 0.5
-        cfg_slot_a = baseline_config if a_is_baseline else candidate_config
-        cfg_slot_b = candidate_config if a_is_baseline else baseline_config
-        with ThreadPoolExecutor(max_workers=2) as ex:
-            fut_a = ex.submit(_run_one, client, vocabulary, text, cfg_slot_a)
-            fut_b = ex.submit(_run_one, client, vocabulary, text, cfg_slot_b)
-            res_a = fut_a.result()
-            res_b = fut_b.result()
-        article_id = db.save_article(text) if configured else None
-        # Persist runs using the TRUE pipeline key (not slot label)
-        run_slot_a_id = db.save_pipeline_run(
-            article_id,
-            pipeline_key=cfg_slot_a[0],
-            prompt_key=cfg_slot_a[2],
-            model=cfg_slot_a[1],
-            output=res_a["output"],
-            processing_time_s=float(res_a.get("processing_time", 0.0)),
-        ) if configured else None
-        run_slot_b_id = db.save_pipeline_run(
             article_id,
-            pipeline_key=cfg_slot_b[0],
-            prompt_key=cfg_slot_b[2],
-            model=cfg_slot_b[1],
-            output=res_b["output"],
-            processing_time_s=float(res_b.get("processing_time", 0.0)),
-        ) if configured else None
-        task_id = db.save_task(article_id, run_slot_a_id, run_slot_b_id) if configured else None
-        diff_a_html = highlight_diff(text, res_a["output"])
-        diff_b_html = highlight_diff(text, res_b["output"])
         return (
-            gr.update(value=f"생성 완료 · A {res_a.get('processing_time', 0):.1f}s · B {res_b.get('processing_time', 0):.1f}s"),
-            gr.update(value=res_a["output"]),
-            gr.update(value=res_b["output"]),
-            gr.update(value=diff_a_html),
-            gr.update(value=diff_b_html),
-            a_is_baseline, task_id, run_slot_a_id, run_slot_b_id,
-            gr.update(value=""),  # clear reveal
         )
-    generate_btn.click(
-        _on_generate,
         inputs=[input_text],
-        outputs=[
-            status,
-            output_a, output_b,
-            diff_a, diff_b,
-            slot_a_is_baseline, task_id_state, run_a_id_state, run_b_id_state,
-            reveal,
-        ],
     )
-    def _make_vote_handler(choice: str):
-        def handler(task_id, a_is_baseline, comment_text):
-            saved = db.save_vote(task_id, choice, comment_text) if configured else False
-            if a_is_baseline:
-                label_a = baseline_config[0]
-                label_b = candidate_config[0]
-            else:
-                label_a = candidate_config[0]
-                label_b = baseline_config[0]
-            reveal_md = (
-                f"**공개** — A: `{label_a}` · B: `{label_b}`\n\n"
-                f"선택: **{choice}**"
-            )
-            summary = _format_summary(db.fetch_summary(), db.fetch_vote_counts())
-            return gr.update(value=reveal_md), gr.update(value=summary)
         return handler
-    vote_a.click(
-        _make_vote_handler("A"),
-        inputs=[task_id_state, slot_a_is_baseline, comment],
-        outputs=[reveal, summary_md],
-    )
-    vote_b.click(
-        _make_vote_handler("B"),
-        inputs=[task_id_state, slot_a_is_baseline, comment],
-        outputs=[reveal, summary_md],
     )
-    vote_tie.click(
-        _make_vote_handler("tie"),
-        inputs=[task_id_state, slot_a_is_baseline, comment],
-        outputs=[reveal, summary_md],
     )
-    vote_bad.click(
-        _make_vote_handler("both_bad"),
-        inputs=[task_id_state, slot_a_is_baseline, comment],
-        outputs=[reveal, summary_md],
     )
     refresh_btn = gr.Button("집계 새로고침", size="sm")
     refresh_btn.click(
-        lambda: gr.update(value=_format_summary(db.fetch_summary(), db.fetch_vote_counts())),
         outputs=[summary_md],
     )

+"""Gradio UI for single-output proofreading feedback.
 Flow:
+  1. User enters source text and clicks "교정 실행".
+  2. The configured pipeline runs.
+  3. Output + diff are shown.
+  4. User picks Good / Not Bad / Critical (+ optional comment).
+  5. On submit: rating is saved to Supabase ratings table.
 """
 from __future__ import annotations
 import time
 from typing import Any
 import gradio as gr
 PipelineConfig = tuple[str, str, str]  # (pipeline_key, model, prompt_key)
+def _format_summary(counts: dict[str, int]) -> str:
     total = sum(counts.values())
     if total == 0:
+        return "아직 피드백이 없습니다."
+    g = counts.get("good", 0)
+    n = counts.get("not_bad", 0)
+    c = counts.get("critical", 0)
+    return (
+        f"**총 피드백**: {total}\n\n"
+        f"- Good: **{g}**\n"
+        f"- Not Bad: **{n}**\n"
+        f"- Critical: **{c}**"
+    )
+def build_feedback_tab(
     client: Any,
     vocabulary: list[dict],
+    pipeline_config: PipelineConfig,
 ) -> None:
+    """Build the single-pipeline feedback UI. Call inside a gr.Blocks/Tab."""
+    pipeline_key, model, prompt_key = pipeline_config
+    pipeline_run_id_state = gr.State(None)
     input_text = gr.Textbox(
         label="원문 입력",
+        lines=8,
         placeholder="교정할 텍스트를 입력하세요.",
     )
+    run_btn = gr.Button(
+        "교정 실행 (⌘+Enter / Ctrl+Enter)",
+        variant="primary",
+        elem_id="compare-run-btn",
+    )
     status = gr.Markdown("")
+    output = gr.Textbox(label="교정 결과", lines=12, interactive=False)
+    diff_html = gr.HTML(label="원문 대비 diff")
+    gr.Markdown("### 피드백")
     with gr.Row():
+        rate_good = gr.Button("👍 Good", variant="primary")
+        rate_notbad = gr.Button("🆗 Not Bad")
+        rate_critical = gr.Button("🚨 Critical", variant="stop")
     comment = gr.Textbox(label="코멘트 (선택)", lines=2)
+    rating_status = gr.Markdown("")
     summary_md = gr.Markdown("")
+    def _on_run(text: str):
         if not text or not text.strip():
             return (
                 gr.update(value="입력 텍스트가 비어있습니다."),
+                gr.update(value=""),
+                gr.update(value=""),
+                None,
                 gr.update(value=""),
             )
         if client is None:
             return (
                 gr.update(value="UPSTAGE_API_KEY 미설정."),
+                gr.update(value=""),
+                gr.update(value=""),
+                None,
                 gr.update(value=""),
             )
+        start = time.time()
+        try:
+            result = run_pipeline(text, pipeline_key, model, prompt_key, client, vocabulary)
+        except Exception as exc:
+            return (
+                gr.update(value=f"에러: {exc}"),
+                gr.update(value=""),
+                gr.update(value=""),
+                None,
+                gr.update(value=""),
+            )
+        elapsed = result.get("processing_time", time.time() - start)
+        out_text = result.get("output", "")
+        article_id = db.save_article(text)
+        run_id = db.save_pipeline_run(
             article_id,
+            pipeline_key=pipeline_key,
+            prompt_key=prompt_key,
+            model=model,
+            output=out_text,
+            processing_time_s=float(elapsed),
+        )
         return (
+            gr.update(value=f"완료 · {elapsed:.1f}s"),
+            gr.update(value=out_text),
+            gr.update(value=highlight_diff(text, out_text)),
+            run_id,
+            gr.update(value=""),
         )
+    run_btn.click(
+        _on_run,
         inputs=[input_text],
+        outputs=[status, output, diff_html, pipeline_run_id_state, rating_status],
     )
+    def _make_rating_handler(rating: str):
+        def handler(run_id, comment_text):
+            saved = db.save_rating(run_id, rating, comment_text)
+            note = "✅ 피드백 저장됨" if saved else "⚠️ 저장되지 않았습니다 (먼저 교정 실행 후 피드백을 남겨주세요)"
+            summary = _format_summary(db.fetch_rating_counts())
+            return gr.update(value=note), gr.update(value=summary)
         return handler
+    rate_good.click(
+        _make_rating_handler("good"),
+        inputs=[pipeline_run_id_state, comment],
+        outputs=[rating_status, summary_md],
     )
+    rate_notbad.click(
+        _make_rating_handler("not_bad"),
+        inputs=[pipeline_run_id_state, comment],
+        outputs=[rating_status, summary_md],
     )
+    rate_critical.click(
+        _make_rating_handler("critical"),
+        inputs=[pipeline_run_id_state, comment],
+        outputs=[rating_status, summary_md],
     )
     refresh_btn = gr.Button("집계 새로고침", size="sm")
     refresh_btn.click(
+        lambda: gr.update(value=_format_summary(db.fetch_rating_counts())),
         outputs=[summary_md],
     )

pipelines.py CHANGED Viewed

@@ -857,23 +857,33 @@ def _process_single_bulk(
     )
     # FT model returns raw text (no JSON). Other steps return JSON with "output" field.
     if is_ft_model:
-        # FT duplication guard: certain short inputs (e.g. those starting with
-        # a dash `-` or bullet) trigger the FT model to echo its input twice
-        # in sequence. If the response is ≥1.5× the input length and the input
-        # appears as the leading prefix, strip everything after the first
-        # occurrence to recover the intended single-pass output.
-        if response and len(response) >= len(bulk) * 1.5:
-            # Look for the input (or a light variant) appearing twice
             stripped = response.strip()
             first_end = len(stripped) // 2
             head = stripped[:first_end].rstrip()
             tail = stripped[first_end:].strip()
-            # If the two halves are ~identical (>=0.8 char-level equality),
-            # or one is a substring of the other, keep only the first half.
             if head and (head == tail or head in tail or tail in head):
                 return head
-            # Fallback: if length-duplicated but halves differ, return input
-            # unchanged rather than propagating a duplicated mess downstream.
             return bulk
         return response
     # Fallback to original_text when LLM returns non-JSON hallucination — matches

     )
     # FT model returns raw text (no JSON). Other steps return JSON with "output" field.
     if is_ft_model:
+        # FT duplication guard. Symptoms seen in the wild:
+        #   (a) Half-half echo: model returns input twice (length ~2×).
+        #   (b) Prefix echo: model returns full input verbatim then appends a
+        #       partial re-correction tail (length ~1.2–1.5×).
+        # Both yield a downstream "duplicated paragraph" bug, so we trigger the
+        # guard at 1.25× and try several recovery patterns before falling back
+        # to the original input.
+        if response and len(response) >= len(bulk) * 1.25:
             stripped = response.strip()
+            bulk_stripped = bulk.strip()
+            # (b) Output starts with full input verbatim → strip the appended tail.
+            if (
+                bulk_stripped
+                and stripped.startswith(bulk_stripped)
+                and len(stripped) > len(bulk_stripped) * 1.05
+            ):
+                return bulk_stripped
+            # (a) Half-half echo.
             first_end = len(stripped) // 2
             head = stripped[:first_end].rstrip()
             tail = stripped[first_end:].strip()
             if head and (head == tail or head in tail or tail in head):
                 return head
+            # Unknown duplication pattern → safer to return the input.
             return bulk
         return response
     # Fallback to original_text when LLM returns non-JSON hallucination — matches