dev-strender Claude Opus 4.7 (1M context) commited on
Commit
8b6780d
·
1 Parent(s): 20a14d9

fix: restore PARAGRAPH_SEP + per-bulk <원문> to suppress step1 echo

Browse files

Step1 prompt (dev_260408_v16) expects <paragraph_separator>
structure-preservation rule, but the new pipeline sent plain \n.
Combined with passing full chunk as <원문> to every bulk, solar-pro3
echoed adjacent paragraphs ~15% of runs on 260408_v16 (measured via
Supabase; 251231_default at 0%).

- Add PARAGRAPH_SEP conversion at step1+ LLM boundary (matches
reference run.py:219-250).
- Narrow <원문> to per-bulk pre-FT snapshot, split with same bulker
(matches reference: original=step0_sep).
- Safe fallback to chunk-wide pre-FT text if bulk counts diverge.

Verified on 5 known-bad articles × 10 runs each: 4/5 articles fully
recover to 0% dup rate. Remaining case is a truncated-input data
quality issue unrelated to this fix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. pipelines.py +72 -17
pipelines.py CHANGED
@@ -250,6 +250,11 @@ _CORRECTIONS_SCHEMA: dict = {
250
  # FT model uses a fixed system prompt (behavior baked into fine-tuning)
251
  FT_SYSTEM_PROMPT = "입력된 문서에 대한 교열 결과를 생성해 주세요."
252
 
 
 
 
 
 
253
  # Default pronoun replacements for post_process rule
254
  DEFAULT_PRONOUN_REPLACEMENTS: dict[str, str] = {
255
  "이재명 대표": "이재명 대통령",
@@ -806,7 +811,7 @@ def _process_single_bulk(
806
  model: str,
807
  prompts: dict[str, str],
808
  client: OpenAI,
809
- original_text: str,
810
  ) -> str:
811
  """Process a single text bulk through an LLM step.
812
 
@@ -816,7 +821,12 @@ def _process_single_bulk(
816
  model: Model name to use.
817
  prompts: Loaded prompt texts.
818
  client: OpenAI client.
819
- original_text: Original input text (for include_original format).
 
 
 
 
 
820
 
821
  Returns:
822
  Processed bulk text.
@@ -829,15 +839,17 @@ def _process_single_bulk(
829
 
830
  if prompt_key:
831
  system_prompt = prompts.get(prompt_key, "")
832
- # For basic_correction (step1), include original text
 
833
  if prompt_key == "step1":
834
- # Match reference pipeline format (pipeline_prod_251231.yaml user_template)
 
835
  user_content = (
836
- f"<원문>\n{original_text}\n</원문>\n\n"
837
- f"<교열_모델_수정결과>\n{bulk}\n</교열_모델_수정결과>"
838
  )
839
  else:
840
- user_content = bulk
841
  elif is_ft_model:
842
  # FT models have instructions baked in — send user content only (no system prompt).
843
  # Sending a system prompt causes the Upstage API to return a 500 error for this model.
@@ -886,15 +898,21 @@ def _process_single_bulk(
886
  # Unknown duplication pattern → safer to return the input.
887
  return bulk
888
  return response
889
- # Fallback to original_text when LLM returns non-JSON hallucination — matches
890
  # reference pipeline semantics (parse_step_output in refer/.../inference/run.py).
891
  extracted = extract_json_output(response, fallback=bulk)
892
 
 
 
 
 
 
 
893
  # step1 (basic_correction) hallucination guard: the prompt feeds the LLM
894
  # both <원문> and <교열_모델_수정결과>, and occasionally the model concatenates
895
  # corrected + original (or two slight variants) into one "output" string.
896
  # Symptom: result length ≈ 2× input. Detect by locating a second occurrence
897
- # of the input's leading sentence (or original_text's leading sentence)
898
  # inside the extracted output — strong signal of duplication.
899
  if extracted and len(extracted) >= len(bulk) * 1.4:
900
  stripped = extracted.strip()
@@ -908,7 +926,7 @@ def _process_single_bulk(
908
  return stripped[:second].rstrip()
909
  return None
910
 
911
- recovered = _leading_lookup(bulk) or _leading_lookup(original_text)
912
  if recovered:
913
  return recovered
914
  # No clean recovery — keep the FT-pass output rather than the bloated mess.
@@ -919,34 +937,45 @@ def _process_single_bulk(
919
 
920
  def process_bulks_parallel(
921
  bulks: list[str],
 
922
  step: dict,
923
  model: str,
924
  prompts: dict[str, str],
925
  client: OpenAI,
926
- original_text: str,
927
  max_workers: int = 10,
928
  ) -> list[str]:
929
  """Process multiple text bulks in parallel.
930
 
931
  Args:
932
- bulks: List of text bulks.
 
 
 
933
  step: Step definition.
934
  model: Model name.
935
  prompts: Loaded prompt texts.
936
  client: OpenAI client.
937
- original_text: Original input text.
938
  max_workers: Maximum thread pool workers.
939
 
940
  Returns:
941
  List of processed bulks in original order.
942
  """
 
 
 
 
 
943
  if len(bulks) <= 1:
944
- return [_process_single_bulk(bulks[0], step, model, prompts, client, original_text)]
 
 
945
 
946
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
947
  futures = [
948
- executor.submit(_process_single_bulk, bulk, step, model, prompts, client, original_text)
949
- for bulk in bulks
 
 
950
  ]
951
  return [f.result() for f in futures]
952
 
@@ -984,6 +1013,12 @@ def run_pipeline(
984
 
985
  start_time = time.time()
986
 
 
 
 
 
 
 
987
  # Per-step fallback: if any step raises, keep the previous `text` and
988
  # move on. A single failing step (transient API issue, template
989
  # mismatch, empty response in mid-pipeline) shouldn't nuke the whole
@@ -996,10 +1031,30 @@ def run_pipeline(
996
  if step["type"] == "rule":
997
  text = apply_rule(step, text, original_text, vocabulary)
998
  elif step["type"] == "llm":
 
 
 
 
 
999
  bulks = split_into_bulks(text)
1000
  if bulks:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1001
  processed = process_bulks_parallel(
1002
- bulks, step, model, prompts, client, original_text
1003
  )
1004
  # Defensive: coerce any non-string bulk result before joining
1005
  # so the next step (which may call re.sub) never sees a
 
250
  # FT model uses a fixed system prompt (behavior baked into fine-tuning)
251
  FT_SYSTEM_PROMPT = "입력된 문서에 대한 교열 결과를 생성해 주세요."
252
 
253
+ # Matches reference pipeline (refer/chosun-projects/proofread/inference/run.py).
254
+ # \n는 step1+ 호출 직전에 이 토큰으로 치환되고, 프롬프트의 "개수/위치 100% 보존"
255
+ # 규칙이 문단 복제/에코를 구조적으로 억제한다. 응답 파싱 후 다시 \n으로 복원.
256
+ PARAGRAPH_SEP = "<paragraph_separator>"
257
+
258
  # Default pronoun replacements for post_process rule
259
  DEFAULT_PRONOUN_REPLACEMENTS: dict[str, str] = {
260
  "이재명 대표": "이재명 대통령",
 
811
  model: str,
812
  prompts: dict[str, str],
813
  client: OpenAI,
814
+ original_bulk: str,
815
  ) -> str:
816
  """Process a single text bulk through an LLM step.
817
 
 
821
  model: Model name to use.
822
  prompts: Loaded prompt texts.
823
  client: OpenAI client.
824
+ original_bulk: Pre-correction text **for this bulk** (not the full
825
+ chunk). Used as <원문> in step1 and as the leading-lookup fallback
826
+ in the hallucination guard. Per-bulk scoping mirrors the reference
827
+ pipeline (run.py: `original=step0_sep`) and prevents the model
828
+ from echoing adjacent paragraphs when a short bulk meets a large
829
+ <원문> block.
830
 
831
  Returns:
832
  Processed bulk text.
 
839
 
840
  if prompt_key:
841
  system_prompt = prompts.get(prompt_key, "")
842
+ # step1+ 프롬프트는 <paragraph_separator> 기반 구조 보존 규칙을 가정한다.
843
+ # LLM 호출 직전에 \n을 토큰으로 치환하고 파싱 후 되돌린다.
844
  if prompt_key == "step1":
845
+ original_sep = original_bulk.replace("\n", PARAGRAPH_SEP)
846
+ bulk_sep = bulk.replace("\n", PARAGRAPH_SEP)
847
  user_content = (
848
+ f"<원문>\n{original_sep}\n</원문>\n\n"
849
+ f"<교열_모델_수정결과>\n{bulk_sep}\n</교열_모델_수정결과>"
850
  )
851
  else:
852
+ user_content = bulk.replace("\n", PARAGRAPH_SEP)
853
  elif is_ft_model:
854
  # FT models have instructions baked in — send user content only (no system prompt).
855
  # Sending a system prompt causes the Upstage API to return a 500 error for this model.
 
898
  # Unknown duplication pattern → safer to return the input.
899
  return bulk
900
  return response
901
+ # Fallback to bulk when LLM returns non-JSON hallucination — matches
902
  # reference pipeline semantics (parse_step_output in refer/.../inference/run.py).
903
  extracted = extract_json_output(response, fallback=bulk)
904
 
905
+ # PARAGRAPH_SEP을 다시 \n으로 복원. 이후의 중복 가드/다운스트림 step은 모두
906
+ # 평문 \n을 기대한다 (rule step들이 <paragraph_separator> 토큰을 LCS/정규식
907
+ # 경로에서 처리하지 못할 수 있어 LLM boundary에서만 존재하도록 유지).
908
+ if prompt_key:
909
+ extracted = extracted.replace(PARAGRAPH_SEP, "\n")
910
+
911
  # step1 (basic_correction) hallucination guard: the prompt feeds the LLM
912
  # both <원문> and <교열_모델_수정결과>, and occasionally the model concatenates
913
  # corrected + original (or two slight variants) into one "output" string.
914
  # Symptom: result length ≈ 2× input. Detect by locating a second occurrence
915
+ # of the input's leading sentence (or original_bulk's leading sentence)
916
  # inside the extracted output — strong signal of duplication.
917
  if extracted and len(extracted) >= len(bulk) * 1.4:
918
  stripped = extracted.strip()
 
926
  return stripped[:second].rstrip()
927
  return None
928
 
929
+ recovered = _leading_lookup(bulk) or _leading_lookup(original_bulk)
930
  if recovered:
931
  return recovered
932
  # No clean recovery — keep the FT-pass output rather than the bloated mess.
 
937
 
938
  def process_bulks_parallel(
939
  bulks: list[str],
940
+ original_bulks: list[str],
941
  step: dict,
942
  model: str,
943
  prompts: dict[str, str],
944
  client: OpenAI,
 
945
  max_workers: int = 10,
946
  ) -> list[str]:
947
  """Process multiple text bulks in parallel.
948
 
949
  Args:
950
+ bulks: List of text bulks (current pipeline state).
951
+ original_bulks: Per-bulk pre-correction text, paired 1:1 with
952
+ ``bulks``. Used as <원문> in step1 and as fallback anchor in
953
+ the hallucination guard.
954
  step: Step definition.
955
  model: Model name.
956
  prompts: Loaded prompt texts.
957
  client: OpenAI client.
 
958
  max_workers: Maximum thread pool workers.
959
 
960
  Returns:
961
  List of processed bulks in original order.
962
  """
963
+ if len(bulks) != len(original_bulks):
964
+ raise ValueError(
965
+ f"bulks/original_bulks length mismatch: {len(bulks)} vs {len(original_bulks)}"
966
+ )
967
+
968
  if len(bulks) <= 1:
969
+ return [
970
+ _process_single_bulk(bulks[0], step, model, prompts, client, original_bulks[0])
971
+ ]
972
 
973
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
974
  futures = [
975
+ executor.submit(
976
+ _process_single_bulk, bulk, step, model, prompts, client, orig_bulk
977
+ )
978
+ for bulk, orig_bulk in zip(bulks, original_bulks)
979
  ]
980
  return [f.result() for f in futures]
981
 
 
1013
 
1014
  start_time = time.time()
1015
 
1016
+ # step1(basic_correction)의 <원문> 소스. 기존 파이프라인(run.py)에서 step1은
1017
+ # 같은 bulk의 post-vocab / pre-FT 텍스트(= step0)를 <원문>으로 받았다.
1018
+ # 여기서도 FT 직전 text를 스냅샷으로 잡아두고, step1 실행 시 현재 bulk들과
1019
+ # 같은 splitter로 쪼개서 index-align된 per-bulk 원문을 넘긴다.
1020
+ pre_ft_snapshot = text
1021
+
1022
  # Per-step fallback: if any step raises, keep the previous `text` and
1023
  # move on. A single failing step (transient API issue, template
1024
  # mismatch, empty response in mid-pipeline) shouldn't nuke the whole
 
1031
  if step["type"] == "rule":
1032
  text = apply_rule(step, text, original_text, vocabulary)
1033
  elif step["type"] == "llm":
1034
+ step_model_name = step.get("model", model)
1035
+ # FT 직전 text를 스냅샷 — 다음 step1이 이를 <원문>으로 소비한다.
1036
+ if step_model_name.startswith("ft:"):
1037
+ pre_ft_snapshot = text
1038
+
1039
  bulks = split_into_bulks(text)
1040
  if bulks:
1041
+ # step1은 pre-FT 스냅샷을 per-bulk 원문으로 사용.
1042
+ # 그 외 LLM step은 <원문>을 쓰지 않으므로 현재 bulk을 그대로
1043
+ # 원문으로 넘겨도 무해(hallucination 가드의 fallback 앵커로만 쓰임).
1044
+ if step.get("prompt_key") == "step1":
1045
+ orig_candidate = split_into_bulks(pre_ft_snapshot)
1046
+ if len(orig_candidate) == len(bulks):
1047
+ original_bulks = orig_candidate
1048
+ else:
1049
+ # Splitter 출력이 FT 전후로 어긋난 경우(문단 수 변동 등):
1050
+ # chunk-wide pre-FT 텍스트를 모든 bulk에 공통으로 사용 →
1051
+ # 기존 동작과 동일. 적어도 retreat, never regress.
1052
+ original_bulks = [pre_ft_snapshot] * len(bulks)
1053
+ else:
1054
+ original_bulks = list(bulks)
1055
+
1056
  processed = process_bulks_parallel(
1057
+ bulks, original_bulks, step, model, prompts, client
1058
  )
1059
  # Defensive: coerce any non-string bulk result before joining
1060
  # so the next step (which may call re.sub) never sees a