Spaces:

dev-strender
/

proofread-20261h-demo

Sleeping

dev-strender Claude Opus 4.7 (1M context) committed on Apr 21

Commit

725b08e

1 Parent(s): c540d93

fix: preserve bulk boundary \n + substring-count dedupe + 30-char prefix

- bulk join "" → "\n" to keep paragraph boundaries across bulks
- apply_paragraph_dedupe: count input occurrences by substring match against the whole normalized original (instead of a per-paragraph Counter) to tolerate LLM paragraph restructuring
- prefix_len default 80 → 30 to catch corrected+original echo pairs
- temperature 0 → 0.0001 to loosen greedy decoding bias

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

pipelines.py +21 -11
postprocess.py +14 -10

pipelines.py CHANGED Viewed

@@ -416,7 +416,11 @@ def call_llm(
     system_prompt: str,
     user_content: str,
     model: str = "solar-pro2",
-    temperature: float = 0.0,
     reasoning_effort: str | None = None,
     max_tokens: int | None = None,
     response_format: dict | None = None,
@@ -804,12 +808,14 @@ def apply_rule(
         )
     elif rule == "paragraph_dedupe":
         config = step.get("config", {})
-        return apply_paragraph_dedupe(
-            text,
-            original_text,
-            min_len=config.get("min_len", 40),
-            prefix_len=config.get("prefix_len", 80),
-        )
     return text
@@ -1068,10 +1074,14 @@ def run_pipeline(
                     processed = process_bulks_parallel(
                         bulks, original_bulks, step, model, prompts, client
                     )
-                    # Defensive: coerce any non-string bulk result before joining
-                    # so the next step (which may call re.sub) never sees a
-                    # non-string input.
-                    text = "".join(str(p) if not isinstance(p, str) else p for p in processed)
             elif step["type"] == "specialist":
                 text = run_specialist(step, text, original_text, prompts, model, client)
             # If a step produced an unusably short output (< 10% of input), the

     system_prompt: str,
     user_content: str,
     model: str = "solar-pro2",
+    # Upstage 서빙 스택이 temperature=0 에서 greedy 디코딩 경로로 들어가는데,
+    # 이 경로가 특정 입력(truncated article 등)에서 학습 데이터의 "재시작" 패턴을
+    # 재현성 높게 재현하는 바이어스가 관찰됨. 0.0001 로 sampling 경로로 살짝
+    # 밀어 넣어 bias 를 흔들되, argmax 확률 비중은 거의 그대로 유지.
+    temperature: float = 0.0001,
     reasoning_effort: str | None = None,
     max_tokens: int | None = None,
     response_format: dict | None = None,
         )
     elif rule == "paragraph_dedupe":
         config = step.get("config", {})
+        # prefix_len 기본값은 apply_paragraph_dedupe 의 기본값(30)을 따른다.
+        # 명시적으로 config 에 지정된 경우만 override.
+        kwargs: dict = {}
+        if "min_len" in config:
+            kwargs["min_len"] = config["min_len"]
+        if "prefix_len" in config:
+            kwargs["prefix_len"] = config["prefix_len"]
+        return apply_paragraph_dedupe(text, original_text, **kwargs)
     return text
                     processed = process_bulks_parallel(
                         bulks, original_bulks, step, model, prompts, client
                     )
+                    # Bulk 경계는 원본에서 \n 로 분리돼 있었으므로 합칠 때도 \n 로
+                    # 연결한다. "".join 으로 붙이면 인접 bulk 의 문단이 한 줄로
+                    # 뭉개지고, 이후 split_into_bulks 재실행 시 문단 수가 줄어
+                    # step1 의 <원문>↔<교열_모델_수정결과> 정렬이 어긋나며 구조
+                    # 보존 규칙까지 깨진다 (참조: run.py:326 "\n".join).
+                    text = "\n".join(
+                        str(p) if not isinstance(p, str) else p for p in processed
+                    )
             elif step["type"] == "specialist":
                 text = run_specialist(step, text, original_text, prompts, model, client)
             # If a step produced an unusably short output (< 10% of input), the

postprocess.py CHANGED Viewed

@@ -380,7 +380,7 @@ def apply_paragraph_dedupe(
     text: str,
     original: str,
     min_len: int = 40,
-    prefix_len: int = 80,
 ) -> str:
     """LLM이 뱉은 중복 문단을 제거한다.
@@ -411,13 +411,12 @@ def apply_paragraph_dedupe(
     if not text:
         return text
-    in_paras_norm = [_normalize_paragraph(p) for p in _split_output_paragraphs(original or "")]
-    in_exact: Counter[str] = Counter()
-    in_prefix: Counter[str] = Counter()
-    for norm in in_paras_norm:
-        if len(norm) >= min_len:
-            in_exact[norm] += 1
-            in_prefix[norm[:prefix_len]] += 1
     out_paras = _split_output_paragraphs(text)
     out_exact_seen: Counter[str] = Counter()
@@ -435,12 +434,17 @@ def apply_paragraph_dedupe(
         out_exact_seen[norm] += 1
         out_prefix_seen[prefix] += 1
         exact_dup = (
-            out_exact_seen[norm] > in_exact.get(norm, 0)
             and out_exact_seen[norm] >= 2
         )
         near_dup = (
-            out_prefix_seen[prefix] > in_prefix.get(prefix, 0)
             and out_prefix_seen[prefix] >= 2
         )

     text: str,
     original: str,
     min_len: int = 40,
+    prefix_len: int = 30,
 ) -> str:
     """LLM이 뱉은 중복 문단을 제거한다.
     if not text:
         return text
+    # Input 은 "normalized 전체 문자열" 로 두고 substring count 를 쓴다.
+    # 이전 구현은 input 을 문단 Counter 로 셌는데, LLM 이 문단 경계를 재구조화
+    # (예: 줄바꿈 없이 중복 문장이 들어있던 한 문단을 여러 문단으로 쪼갬) 하면
+    # output 문단이 input 문단과 exact match 가 안 되어 input_count=0 으로 잡혀,
+    # 정당한 중복 (저자/원본이 의도한 반복) 까지 drop 되는 버그가 있었다.
+    in_norm = _normalize_paragraph(original or "")
     out_paras = _split_output_paragraphs(text)
     out_exact_seen: Counter[str] = Counter()
         out_exact_seen[norm] += 1
         out_prefix_seen[prefix] += 1
+        # input 문자열 전체에서 해당 문단(또는 prefix)이 몇 번 substring 으로
+        # 등장하는지 집계. output_count 가 input_count 를 초과할 때만 drop.
+        in_exact_count = in_norm.count(norm) if in_norm else 0
+        in_prefix_count = in_norm.count(prefix) if in_norm else 0
         exact_dup = (
+            out_exact_seen[norm] > in_exact_count
             and out_exact_seen[norm] >= 2
         )
         near_dup = (
+            out_prefix_seen[prefix] > in_prefix_count
             and out_prefix_seen[prefix] >= 2
         )