Spaces:
Sleeping
fix: restore PARAGRAPH_SEP + per-bulk <원문> to suppress step1 echo
Browse filesStep1 prompt (dev_260408_v16) expects <paragraph_separator>
structure-preservation rule, but the new pipeline sent plain \n.
Combined with passing full chunk as <원문> to every bulk, solar-pro3
echoed adjacent paragraphs ~15% of runs on 260408_v16 (measured via
Supabase; 251231_default at 0%).
- Add PARAGRAPH_SEP conversion at step1+ LLM boundary (matches
reference run.py:219-250).
- Narrow <원문> to per-bulk pre-FT snapshot, split with same bulker
(matches reference: original=step0_sep).
- Safe fallback to chunk-wide pre-FT text if bulk counts diverge.
Verified on 5 known-bad articles × 10 runs each: 4/5 articles fully
recover to 0% dup rate. Remaining case is a truncated-input data
quality issue unrelated to this fix.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- pipelines.py +72 -17
|
@@ -250,6 +250,11 @@ _CORRECTIONS_SCHEMA: dict = {
|
|
| 250 |
# FT model uses a fixed system prompt (behavior baked into fine-tuning)
|
| 251 |
FT_SYSTEM_PROMPT = "입력된 문서에 대한 교열 결과를 생성해 주세요."
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
# Default pronoun replacements for post_process rule
|
| 254 |
DEFAULT_PRONOUN_REPLACEMENTS: dict[str, str] = {
|
| 255 |
"이재명 대표": "이재명 대통령",
|
|
@@ -806,7 +811,7 @@ def _process_single_bulk(
|
|
| 806 |
model: str,
|
| 807 |
prompts: dict[str, str],
|
| 808 |
client: OpenAI,
|
| 809 |
-
|
| 810 |
) -> str:
|
| 811 |
"""Process a single text bulk through an LLM step.
|
| 812 |
|
|
@@ -816,7 +821,12 @@ def _process_single_bulk(
|
|
| 816 |
model: Model name to use.
|
| 817 |
prompts: Loaded prompt texts.
|
| 818 |
client: OpenAI client.
|
| 819 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
|
| 821 |
Returns:
|
| 822 |
Processed bulk text.
|
|
@@ -829,15 +839,17 @@ def _process_single_bulk(
|
|
| 829 |
|
| 830 |
if prompt_key:
|
| 831 |
system_prompt = prompts.get(prompt_key, "")
|
| 832 |
-
#
|
|
|
|
| 833 |
if prompt_key == "step1":
|
| 834 |
-
|
|
|
|
| 835 |
user_content = (
|
| 836 |
-
f"<원문>\n{
|
| 837 |
-
f"<교열_모델_수정결과>\n{
|
| 838 |
)
|
| 839 |
else:
|
| 840 |
-
user_content = bulk
|
| 841 |
elif is_ft_model:
|
| 842 |
# FT models have instructions baked in — send user content only (no system prompt).
|
| 843 |
# Sending a system prompt causes the Upstage API to return a 500 error for this model.
|
|
@@ -886,15 +898,21 @@ def _process_single_bulk(
|
|
| 886 |
# Unknown duplication pattern → safer to return the input.
|
| 887 |
return bulk
|
| 888 |
return response
|
| 889 |
-
# Fallback to
|
| 890 |
# reference pipeline semantics (parse_step_output in refer/.../inference/run.py).
|
| 891 |
extracted = extract_json_output(response, fallback=bulk)
|
| 892 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 893 |
# step1 (basic_correction) hallucination guard: the prompt feeds the LLM
|
| 894 |
# both <원문> and <교열_모델_수정결과>, and occasionally the model concatenates
|
| 895 |
# corrected + original (or two slight variants) into one "output" string.
|
| 896 |
# Symptom: result length ≈ 2× input. Detect by locating a second occurrence
|
| 897 |
-
# of the input's leading sentence (or
|
| 898 |
# inside the extracted output — strong signal of duplication.
|
| 899 |
if extracted and len(extracted) >= len(bulk) * 1.4:
|
| 900 |
stripped = extracted.strip()
|
|
@@ -908,7 +926,7 @@ def _process_single_bulk(
|
|
| 908 |
return stripped[:second].rstrip()
|
| 909 |
return None
|
| 910 |
|
| 911 |
-
recovered = _leading_lookup(bulk) or _leading_lookup(
|
| 912 |
if recovered:
|
| 913 |
return recovered
|
| 914 |
# No clean recovery — keep the FT-pass output rather than the bloated mess.
|
|
@@ -919,34 +937,45 @@ def _process_single_bulk(
|
|
| 919 |
|
| 920 |
def process_bulks_parallel(
|
| 921 |
bulks: list[str],
|
|
|
|
| 922 |
step: dict,
|
| 923 |
model: str,
|
| 924 |
prompts: dict[str, str],
|
| 925 |
client: OpenAI,
|
| 926 |
-
original_text: str,
|
| 927 |
max_workers: int = 10,
|
| 928 |
) -> list[str]:
|
| 929 |
"""Process multiple text bulks in parallel.
|
| 930 |
|
| 931 |
Args:
|
| 932 |
-
bulks: List of text bulks.
|
|
|
|
|
|
|
|
|
|
| 933 |
step: Step definition.
|
| 934 |
model: Model name.
|
| 935 |
prompts: Loaded prompt texts.
|
| 936 |
client: OpenAI client.
|
| 937 |
-
original_text: Original input text.
|
| 938 |
max_workers: Maximum thread pool workers.
|
| 939 |
|
| 940 |
Returns:
|
| 941 |
List of processed bulks in original order.
|
| 942 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 943 |
if len(bulks) <= 1:
|
| 944 |
-
return [
|
|
|
|
|
|
|
| 945 |
|
| 946 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 947 |
futures = [
|
| 948 |
-
executor.submit(
|
| 949 |
-
|
|
|
|
|
|
|
| 950 |
]
|
| 951 |
return [f.result() for f in futures]
|
| 952 |
|
|
@@ -984,6 +1013,12 @@ def run_pipeline(
|
|
| 984 |
|
| 985 |
start_time = time.time()
|
| 986 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 987 |
# Per-step fallback: if any step raises, keep the previous `text` and
|
| 988 |
# move on. A single failing step (transient API issue, template
|
| 989 |
# mismatch, empty response in mid-pipeline) shouldn't nuke the whole
|
|
@@ -996,10 +1031,30 @@ def run_pipeline(
|
|
| 996 |
if step["type"] == "rule":
|
| 997 |
text = apply_rule(step, text, original_text, vocabulary)
|
| 998 |
elif step["type"] == "llm":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 999 |
bulks = split_into_bulks(text)
|
| 1000 |
if bulks:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1001 |
processed = process_bulks_parallel(
|
| 1002 |
-
bulks, step, model, prompts, client
|
| 1003 |
)
|
| 1004 |
# Defensive: coerce any non-string bulk result before joining
|
| 1005 |
# so the next step (which may call re.sub) never sees a
|
|
|
|
| 250 |
# FT model uses a fixed system prompt (behavior baked into fine-tuning)
|
| 251 |
FT_SYSTEM_PROMPT = "입력된 문서에 대한 교열 결과를 생성해 주세요."
|
| 252 |
|
| 253 |
+
# Matches reference pipeline (refer/chosun-projects/proofread/inference/run.py).
|
| 254 |
+
# \n는 step1+ 호출 직전에 이 토큰으로 치환되고, 프롬프트의 "개수/위치 100% 보존"
|
| 255 |
+
# 규칙이 문단 복제/에코를 구조적으로 억제한다. 응답 파싱 후 다시 \n으로 복원.
|
| 256 |
+
PARAGRAPH_SEP = "<paragraph_separator>"
|
| 257 |
+
|
| 258 |
# Default pronoun replacements for post_process rule
|
| 259 |
DEFAULT_PRONOUN_REPLACEMENTS: dict[str, str] = {
|
| 260 |
"이재명 대표": "이재명 대통령",
|
|
|
|
| 811 |
model: str,
|
| 812 |
prompts: dict[str, str],
|
| 813 |
client: OpenAI,
|
| 814 |
+
original_bulk: str,
|
| 815 |
) -> str:
|
| 816 |
"""Process a single text bulk through an LLM step.
|
| 817 |
|
|
|
|
| 821 |
model: Model name to use.
|
| 822 |
prompts: Loaded prompt texts.
|
| 823 |
client: OpenAI client.
|
| 824 |
+
original_bulk: Pre-correction text **for this bulk** (not the full
|
| 825 |
+
chunk). Used as <원문> in step1 and as the leading-lookup fallback
|
| 826 |
+
in the hallucination guard. Per-bulk scoping mirrors the reference
|
| 827 |
+
pipeline (run.py: `original=step0_sep`) and prevents the model
|
| 828 |
+
from echoing adjacent paragraphs when a short bulk meets a large
|
| 829 |
+
<원문> block.
|
| 830 |
|
| 831 |
Returns:
|
| 832 |
Processed bulk text.
|
|
|
|
| 839 |
|
| 840 |
if prompt_key:
|
| 841 |
system_prompt = prompts.get(prompt_key, "")
|
| 842 |
+
# step1+ 프롬프트는 <paragraph_separator> 기반 구조 보존 규칙을 가정한다.
|
| 843 |
+
# LLM 호출 직전에 \n을 토큰으로 치환하고 파싱 후 되돌린다.
|
| 844 |
if prompt_key == "step1":
|
| 845 |
+
original_sep = original_bulk.replace("\n", PARAGRAPH_SEP)
|
| 846 |
+
bulk_sep = bulk.replace("\n", PARAGRAPH_SEP)
|
| 847 |
user_content = (
|
| 848 |
+
f"<원문>\n{original_sep}\n</원문>\n\n"
|
| 849 |
+
f"<교열_모델_수정결과>\n{bulk_sep}\n</교열_모델_수정결과>"
|
| 850 |
)
|
| 851 |
else:
|
| 852 |
+
user_content = bulk.replace("\n", PARAGRAPH_SEP)
|
| 853 |
elif is_ft_model:
|
| 854 |
# FT models have instructions baked in — send user content only (no system prompt).
|
| 855 |
# Sending a system prompt causes the Upstage API to return a 500 error for this model.
|
|
|
|
| 898 |
# Unknown duplication pattern → safer to return the input.
|
| 899 |
return bulk
|
| 900 |
return response
|
| 901 |
+
# Fallback to bulk when LLM returns non-JSON hallucination — matches
|
| 902 |
# reference pipeline semantics (parse_step_output in refer/.../inference/run.py).
|
| 903 |
extracted = extract_json_output(response, fallback=bulk)
|
| 904 |
|
| 905 |
+
# PARAGRAPH_SEP을 다시 \n으로 복원. 이후의 중복 가드/다운스트림 step은 모두
|
| 906 |
+
# 평문 \n을 기대한다 (rule step들이 <paragraph_separator> 토큰을 LCS/정규식
|
| 907 |
+
# 경로에서 처리하지 못할 수 있어 LLM boundary에서만 존재하도록 유지).
|
| 908 |
+
if prompt_key:
|
| 909 |
+
extracted = extracted.replace(PARAGRAPH_SEP, "\n")
|
| 910 |
+
|
| 911 |
# step1 (basic_correction) hallucination guard: the prompt feeds the LLM
|
| 912 |
# both <원문> and <교열_모델_수정결과>, and occasionally the model concatenates
|
| 913 |
# corrected + original (or two slight variants) into one "output" string.
|
| 914 |
# Symptom: result length ≈ 2× input. Detect by locating a second occurrence
|
| 915 |
+
# of the input's leading sentence (or original_bulk's leading sentence)
|
| 916 |
# inside the extracted output — strong signal of duplication.
|
| 917 |
if extracted and len(extracted) >= len(bulk) * 1.4:
|
| 918 |
stripped = extracted.strip()
|
|
|
|
| 926 |
return stripped[:second].rstrip()
|
| 927 |
return None
|
| 928 |
|
| 929 |
+
recovered = _leading_lookup(bulk) or _leading_lookup(original_bulk)
|
| 930 |
if recovered:
|
| 931 |
return recovered
|
| 932 |
# No clean recovery — keep the FT-pass output rather than the bloated mess.
|
|
|
|
| 937 |
|
| 938 |
def process_bulks_parallel(
|
| 939 |
bulks: list[str],
|
| 940 |
+
original_bulks: list[str],
|
| 941 |
step: dict,
|
| 942 |
model: str,
|
| 943 |
prompts: dict[str, str],
|
| 944 |
client: OpenAI,
|
|
|
|
| 945 |
max_workers: int = 10,
|
| 946 |
) -> list[str]:
|
| 947 |
"""Process multiple text bulks in parallel.
|
| 948 |
|
| 949 |
Args:
|
| 950 |
+
bulks: List of text bulks (current pipeline state).
|
| 951 |
+
original_bulks: Per-bulk pre-correction text, paired 1:1 with
|
| 952 |
+
``bulks``. Used as <원문> in step1 and as fallback anchor in
|
| 953 |
+
the hallucination guard.
|
| 954 |
step: Step definition.
|
| 955 |
model: Model name.
|
| 956 |
prompts: Loaded prompt texts.
|
| 957 |
client: OpenAI client.
|
|
|
|
| 958 |
max_workers: Maximum thread pool workers.
|
| 959 |
|
| 960 |
Returns:
|
| 961 |
List of processed bulks in original order.
|
| 962 |
"""
|
| 963 |
+
if len(bulks) != len(original_bulks):
|
| 964 |
+
raise ValueError(
|
| 965 |
+
f"bulks/original_bulks length mismatch: {len(bulks)} vs {len(original_bulks)}"
|
| 966 |
+
)
|
| 967 |
+
|
| 968 |
if len(bulks) <= 1:
|
| 969 |
+
return [
|
| 970 |
+
_process_single_bulk(bulks[0], step, model, prompts, client, original_bulks[0])
|
| 971 |
+
]
|
| 972 |
|
| 973 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 974 |
futures = [
|
| 975 |
+
executor.submit(
|
| 976 |
+
_process_single_bulk, bulk, step, model, prompts, client, orig_bulk
|
| 977 |
+
)
|
| 978 |
+
for bulk, orig_bulk in zip(bulks, original_bulks)
|
| 979 |
]
|
| 980 |
return [f.result() for f in futures]
|
| 981 |
|
|
|
|
| 1013 |
|
| 1014 |
start_time = time.time()
|
| 1015 |
|
| 1016 |
+
# step1(basic_correction)의 <원문> 소스. 기존 파이프라인(run.py)에서 step1은
|
| 1017 |
+
# 같은 bulk의 post-vocab / pre-FT 텍스트(= step0)를 <원문>으로 받았다.
|
| 1018 |
+
# 여기서도 FT 직전 text를 스냅샷으로 잡아두고, step1 실행 시 현재 bulk들과
|
| 1019 |
+
# 같은 splitter로 쪼개서 index-align된 per-bulk 원문을 넘긴다.
|
| 1020 |
+
pre_ft_snapshot = text
|
| 1021 |
+
|
| 1022 |
# Per-step fallback: if any step raises, keep the previous `text` and
|
| 1023 |
# move on. A single failing step (transient API issue, template
|
| 1024 |
# mismatch, empty response in mid-pipeline) shouldn't nuke the whole
|
|
|
|
| 1031 |
if step["type"] == "rule":
|
| 1032 |
text = apply_rule(step, text, original_text, vocabulary)
|
| 1033 |
elif step["type"] == "llm":
|
| 1034 |
+
step_model_name = step.get("model", model)
|
| 1035 |
+
# FT 직전 text를 스냅샷 — 다음 step1이 이를 <원문>으로 소비한다.
|
| 1036 |
+
if step_model_name.startswith("ft:"):
|
| 1037 |
+
pre_ft_snapshot = text
|
| 1038 |
+
|
| 1039 |
bulks = split_into_bulks(text)
|
| 1040 |
if bulks:
|
| 1041 |
+
# step1은 pre-FT 스냅샷을 per-bulk 원문으로 사용.
|
| 1042 |
+
# 그 외 LLM step은 <원문>을 쓰지 않으므로 현재 bulk을 그대로
|
| 1043 |
+
# 원문으로 넘겨도 무해(hallucination 가드의 fallback 앵커로만 쓰임).
|
| 1044 |
+
if step.get("prompt_key") == "step1":
|
| 1045 |
+
orig_candidate = split_into_bulks(pre_ft_snapshot)
|
| 1046 |
+
if len(orig_candidate) == len(bulks):
|
| 1047 |
+
original_bulks = orig_candidate
|
| 1048 |
+
else:
|
| 1049 |
+
# Splitter 출력이 FT 전후로 어긋난 경우(문단 수 변동 등):
|
| 1050 |
+
# chunk-wide pre-FT 텍스트를 모든 bulk에 공통으로 사용 →
|
| 1051 |
+
# 기존 동작과 동일. 적어도 retreat, never regress.
|
| 1052 |
+
original_bulks = [pre_ft_snapshot] * len(bulks)
|
| 1053 |
+
else:
|
| 1054 |
+
original_bulks = list(bulks)
|
| 1055 |
+
|
| 1056 |
processed = process_bulks_parallel(
|
| 1057 |
+
bulks, original_bulks, step, model, prompts, client
|
| 1058 |
)
|
| 1059 |
# Defensive: coerce any non-string bulk result before joining
|
| 1060 |
# so the next step (which may call re.sub) never sees a
|