Spaces:

dev-strender
/

proofread-20261h-demo

Sleeping

dev-strender Claude Opus 4.7 (1M context) committed on Apr 20

Commit

8b6780d

1 Parent(s): 20a14d9

fix: restore PARAGRAPH_SEP + per-bulk <원문> to suppress step1 echo

The step1 prompt (dev_260408_v16) relies on a <paragraph_separator>-based
structure-preservation rule, but the new pipeline sent plain \n instead.
Combined with passing the full chunk as <원문> to every bulk, this caused
solar-pro3 to echo adjacent paragraphs in ~15% of runs on 260408_v16
(measured via Supabase; 251231_default was at 0%).

- Add PARAGRAPH_SEP conversion at step1+ LLM boundary (matches
reference run.py:219-250).
- Narrow <원문> to per-bulk pre-FT snapshot, split with same bulker
(matches reference: original=step0_sep).
- Safe fallback to chunk-wide pre-FT text if bulk counts diverge.

Verified on 5 known-bad articles × 10 runs each: 4/5 articles fully
recover to 0% dup rate. Remaining case is a truncated-input data
quality issue unrelated to this fix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

pipelines.py +72 -17

pipelines.py CHANGED Viewed

@@ -250,6 +250,11 @@ _CORRECTIONS_SCHEMA: dict = {
 # FT model uses a fixed system prompt (behavior baked into fine-tuning)
 FT_SYSTEM_PROMPT = "입력된 문서에 대한 교열 결과를 생성해 주세요."
 # Default pronoun replacements for post_process rule
 DEFAULT_PRONOUN_REPLACEMENTS: dict[str, str] = {
     "이재명 대표": "이재명 대통령",
@@ -806,7 +811,7 @@ def _process_single_bulk(
     model: str,
     prompts: dict[str, str],
     client: OpenAI,
-    original_text: str,
 ) -> str:
     """Process a single text bulk through an LLM step.
@@ -816,7 +821,12 @@ def _process_single_bulk(
         model: Model name to use.
         prompts: Loaded prompt texts.
         client: OpenAI client.
-        original_text: Original input text (for include_original format).
     Returns:
         Processed bulk text.
@@ -829,15 +839,17 @@ def _process_single_bulk(
     if prompt_key:
         system_prompt = prompts.get(prompt_key, "")
-        # For basic_correction (step1), include original text
         if prompt_key == "step1":
-            # Match reference pipeline format (pipeline_prod_251231.yaml user_template)
             user_content = (
-                f"<원문>\n{original_text}\n</원문>\n\n"
-                f"<교열_모델_수정결과>\n{bulk}\n</교열_모델_수정결과>"
             )
         else:
-            user_content = bulk
     elif is_ft_model:
         # FT models have instructions baked in — send user content only (no system prompt).
         # Sending a system prompt causes the Upstage API to return a 500 error for this model.
@@ -886,15 +898,21 @@ def _process_single_bulk(
             # Unknown duplication pattern → safer to return the input.
             return bulk
         return response
-    # Fallback to original_text when LLM returns non-JSON hallucination — matches
     # reference pipeline semantics (parse_step_output in refer/.../inference/run.py).
     extracted = extract_json_output(response, fallback=bulk)
     # step1 (basic_correction) hallucination guard: the prompt feeds the LLM
     # both <원문> and <교열_모델_수정결과>, and occasionally the model concatenates
     # corrected + original (or two slight variants) into one "output" string.
     # Symptom: result length ≈ 2× input. Detect by locating a second occurrence
-    # of the input's leading sentence (or original_text's leading sentence)
     # inside the extracted output — strong signal of duplication.
     if extracted and len(extracted) >= len(bulk) * 1.4:
         stripped = extracted.strip()
@@ -908,7 +926,7 @@ def _process_single_bulk(
                 return stripped[:second].rstrip()
             return None
-        recovered = _leading_lookup(bulk) or _leading_lookup(original_text)
         if recovered:
             return recovered
         # No clean recovery — keep the FT-pass output rather than the bloated mess.
@@ -919,34 +937,45 @@ def _process_single_bulk(
 def process_bulks_parallel(
     bulks: list[str],
     step: dict,
     model: str,
     prompts: dict[str, str],
     client: OpenAI,
-    original_text: str,
     max_workers: int = 10,
 ) -> list[str]:
     """Process multiple text bulks in parallel.
     Args:
-        bulks: List of text bulks.
         step: Step definition.
         model: Model name.
         prompts: Loaded prompt texts.
         client: OpenAI client.
-        original_text: Original input text.
         max_workers: Maximum thread pool workers.
     Returns:
         List of processed bulks in original order.
     """
     if len(bulks) <= 1:
-        return [_process_single_bulk(bulks[0], step, model, prompts, client, original_text)]
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = [
-            executor.submit(_process_single_bulk, bulk, step, model, prompts, client, original_text)
-            for bulk in bulks
         ]
         return [f.result() for f in futures]
@@ -984,6 +1013,12 @@ def run_pipeline(
     start_time = time.time()
     # Per-step fallback: if any step raises, keep the previous `text` and
     # move on. A single failing step (transient API issue, template
     # mismatch, empty response in mid-pipeline) shouldn't nuke the whole
@@ -996,10 +1031,30 @@ def run_pipeline(
             if step["type"] == "rule":
                 text = apply_rule(step, text, original_text, vocabulary)
             elif step["type"] == "llm":
                 bulks = split_into_bulks(text)
                 if bulks:
                     processed = process_bulks_parallel(
-                        bulks, step, model, prompts, client, original_text
                     )
                     # Defensive: coerce any non-string bulk result before joining
                     # so the next step (which may call re.sub) never sees a

 # FT model uses a fixed system prompt (behavior baked into fine-tuning)
 FT_SYSTEM_PROMPT = "입력된 문서에 대한 교열 결과를 생성해 주세요."
+# Matches reference pipeline (refer/chosun-projects/proofread/inference/run.py).
+# \n는 step1+ 호출 직전에 이 토큰으로 치환되고, 프롬프트의 "개수/위치 100% 보존"
+# 규칙이 문단 복제/에코를 구조적으로 억제한다. 응답 파싱 후 다시 \n으로 복원.
+PARAGRAPH_SEP = "<paragraph_separator>"
 # Default pronoun replacements for post_process rule
 DEFAULT_PRONOUN_REPLACEMENTS: dict[str, str] = {
     "이재명 대표": "이재명 대통령",
     model: str,
     prompts: dict[str, str],
     client: OpenAI,
+    original_bulk: str,
 ) -> str:
     """Process a single text bulk through an LLM step.
         model: Model name to use.
         prompts: Loaded prompt texts.
         client: OpenAI client.
+        original_bulk: Pre-correction text **for this bulk** (not the full
+            chunk). Used as <원문> in step1 and as the leading-lookup fallback
+            in the hallucination guard. Per-bulk scoping mirrors the reference
+            pipeline (run.py: `original=step0_sep`) and prevents the model
+            from echoing adjacent paragraphs when a short bulk meets a large
+            <원문> block.
     Returns:
         Processed bulk text.
     if prompt_key:
         system_prompt = prompts.get(prompt_key, "")
+        # step1+ 프롬프트는 <paragraph_separator> 기반 구조 보존 규칙을 가정한다.
+        # LLM 호출 직전에 \n을 토큰으로 치환하고 파싱 후 되돌린다.
         if prompt_key == "step1":
+            original_sep = original_bulk.replace("\n", PARAGRAPH_SEP)
+            bulk_sep = bulk.replace("\n", PARAGRAPH_SEP)
             user_content = (
+                f"<원문>\n{original_sep}\n</원문>\n\n"
+                f"<교열_모델_수정결과>\n{bulk_sep}\n</교열_모델_수정결과>"
             )
         else:
+            user_content = bulk.replace("\n", PARAGRAPH_SEP)
     elif is_ft_model:
         # FT models have instructions baked in — send user content only (no system prompt).
         # Sending a system prompt causes the Upstage API to return a 500 error for this model.
             # Unknown duplication pattern → safer to return the input.
             return bulk
         return response
+    # Fallback to bulk when LLM returns non-JSON hallucination — matches
     # reference pipeline semantics (parse_step_output in refer/.../inference/run.py).
     extracted = extract_json_output(response, fallback=bulk)
+    # PARAGRAPH_SEP을 다시 \n으로 복원. 이후의 중복 가드/다운스트림 step은 모두
+    # 평문 \n을 기대한다 (rule step들이 <paragraph_separator> 토큰을 LCS/정규식
+    # 경로에서 처리하지 못할 수 있어 LLM boundary에서만 존재하도록 유지).
+    if prompt_key:
+        extracted = extracted.replace(PARAGRAPH_SEP, "\n")
     # step1 (basic_correction) hallucination guard: the prompt feeds the LLM
     # both <원문> and <교열_모델_수정결과>, and occasionally the model concatenates
     # corrected + original (or two slight variants) into one "output" string.
     # Symptom: result length ≈ 2× input. Detect by locating a second occurrence
+    # of the input's leading sentence (or original_bulk's leading sentence)
     # inside the extracted output — strong signal of duplication.
     if extracted and len(extracted) >= len(bulk) * 1.4:
         stripped = extracted.strip()
                 return stripped[:second].rstrip()
             return None
+        recovered = _leading_lookup(bulk) or _leading_lookup(original_bulk)
         if recovered:
             return recovered
         # No clean recovery — keep the FT-pass output rather than the bloated mess.
 def process_bulks_parallel(
     bulks: list[str],
+    original_bulks: list[str],
     step: dict,
     model: str,
     prompts: dict[str, str],
     client: OpenAI,
     max_workers: int = 10,
 ) -> list[str]:
     """Process multiple text bulks in parallel.
     Args:
+        bulks: List of text bulks (current pipeline state).
+        original_bulks: Per-bulk pre-correction text, paired 1:1 with
+            ``bulks``. Used as <원문> in step1 and as fallback anchor in
+            the hallucination guard.
         step: Step definition.
         model: Model name.
         prompts: Loaded prompt texts.
         client: OpenAI client.
         max_workers: Maximum thread pool workers.
     Returns:
         List of processed bulks in original order.
     """
+    if len(bulks) != len(original_bulks):
+        raise ValueError(
+            f"bulks/original_bulks length mismatch: {len(bulks)} vs {len(original_bulks)}"
+        )
     if len(bulks) <= 1:
+        return [
+            _process_single_bulk(bulks[0], step, model, prompts, client, original_bulks[0])
+        ]
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = [
+            executor.submit(
+                _process_single_bulk, bulk, step, model, prompts, client, orig_bulk
+            )
+            for bulk, orig_bulk in zip(bulks, original_bulks)
         ]
         return [f.result() for f in futures]
     start_time = time.time()
+    # step1(basic_correction)의 <원문> 소스. 기존 파이프라인(run.py)에서 step1은
+    # 같은 bulk의 post-vocab / pre-FT 텍스트(= step0)를 <원문>으로 받았다.
+    # 여기서도 FT 직전 text를 스냅샷으로 잡아두고, step1 실행 시 현재 bulk들과
+    # 같은 splitter로 쪼개서 index-align된 per-bulk 원문을 넘긴다.
+    pre_ft_snapshot = text
     # Per-step fallback: if any step raises, keep the previous `text` and
     # move on. A single failing step (transient API issue, template
     # mismatch, empty response in mid-pipeline) shouldn't nuke the whole
             if step["type"] == "rule":
                 text = apply_rule(step, text, original_text, vocabulary)
             elif step["type"] == "llm":
+                step_model_name = step.get("model", model)
+                # FT 직전 text를 스냅샷 — 다음 step1이 이를 <원문>으로 소비한다.
+                if step_model_name.startswith("ft:"):
+                    pre_ft_snapshot = text
                 bulks = split_into_bulks(text)
                 if bulks:
+                    # step1은 pre-FT 스냅샷을 per-bulk 원문으로 사용.
+                    # 그 외 LLM step은 <원문>을 쓰지 않으므로 현재 bulk을 그대로
+                    # 원문으로 넘겨도 무해(hallucination 가드의 fallback 앵커로만 쓰임).
+                    if step.get("prompt_key") == "step1":
+                        orig_candidate = split_into_bulks(pre_ft_snapshot)
+                        if len(orig_candidate) == len(bulks):
+                            original_bulks = orig_candidate
+                        else:
+                            # Splitter 출력이 FT 전후로 어긋난 경우(문단 수 변동 등):
+                            # chunk-wide pre-FT 텍스트를 모든 bulk에 공통으로 사용 →
+                            # 기존 동작과 동일. 적어도 retreat, never regress.
+                            original_bulks = [pre_ft_snapshot] * len(bulks)
+                    else:
+                        original_bulks = list(bulks)
                     processed = process_bulks_parallel(
+                        bulks, original_bulks, step, model, prompts, client
                     )
                     # Defensive: coerce any non-string bulk result before joining
                     # so the next step (which may call re.sub) never sees a