Spaces:
Paused
Paused
fix: improve bilingual parsing to split English/Chinese by paragraph
Browse files
The LLM often ignores the --- separator and outputs an English paragraph
followed by a Chinese paragraph. The new fallback detects Chinese characters
to find the split point, so the chatlog EN/ZH toggle works correctly.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- scripts/conversation-loop.py +26 -0
scripts/conversation-loop.py
CHANGED
|
@@ -657,6 +657,10 @@ def call_llm(system_prompt, user_prompt):
|
|
| 657 |
return ""
|
| 658 |
|
| 659 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
def parse_bilingual(text):
|
| 661 |
"""Parse bilingual response into (en, zh). Handle action tags gracefully."""
|
| 662 |
# Remove action tags and content blocks for display
|
|
@@ -664,6 +668,7 @@ def parse_bilingual(text):
|
|
| 664 |
display = re.sub(r'\[CONTENT\].*?\[/CONTENT\]', '', display, flags=re.DOTALL)
|
| 665 |
display = display.strip()
|
| 666 |
|
|
|
|
| 667 |
if '\n---\n' in display:
|
| 668 |
parts = display.split('\n---\n', 1)
|
| 669 |
return parts[0].strip(), parts[1].strip()
|
|
@@ -672,6 +677,27 @@ def parse_bilingual(text):
|
|
| 672 |
en, zh = parts[0].strip(), parts[1].strip()
|
| 673 |
if en and zh:
|
| 674 |
return en, zh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
return display, display
|
| 676 |
|
| 677 |
|
|
|
|
| 657 |
return ""
|
| 658 |
|
| 659 |
|
| 660 |
+
def _has_chinese(s):
|
| 661 |
+
"""Check if string contains Chinese characters."""
|
| 662 |
+
return bool(re.search(r'[\u4e00-\u9fff]', s))
|
| 663 |
+
|
| 664 |
def parse_bilingual(text):
|
| 665 |
"""Parse bilingual response into (en, zh). Handle action tags gracefully."""
|
| 666 |
# Remove action tags and content blocks for display
|
|
|
|
| 668 |
display = re.sub(r'\[CONTENT\].*?\[/CONTENT\]', '', display, flags=re.DOTALL)
|
| 669 |
display = display.strip()
|
| 670 |
|
| 671 |
+
# 1. Explicit --- separator
|
| 672 |
if '\n---\n' in display:
|
| 673 |
parts = display.split('\n---\n', 1)
|
| 674 |
return parts[0].strip(), parts[1].strip()
|
|
|
|
| 677 |
en, zh = parts[0].strip(), parts[1].strip()
|
| 678 |
if en and zh:
|
| 679 |
return en, zh
|
| 680 |
+
|
| 681 |
+
# 2. Fallback: split on double-newline between English and Chinese paragraphs
|
| 682 |
+
paragraphs = re.split(r'\n{2,}', display)
|
| 683 |
+
if len(paragraphs) >= 2:
|
| 684 |
+
# Find the split point: first paragraph with Chinese is the start of zh
|
| 685 |
+
en_parts = []
|
| 686 |
+
zh_parts = []
|
| 687 |
+
found_zh = False
|
| 688 |
+
for p in paragraphs:
|
| 689 |
+
p = p.strip()
|
| 690 |
+
if not p:
|
| 691 |
+
continue
|
| 692 |
+
if not found_zh and _has_chinese(p):
|
| 693 |
+
found_zh = True
|
| 694 |
+
if found_zh:
|
| 695 |
+
zh_parts.append(p)
|
| 696 |
+
else:
|
| 697 |
+
en_parts.append(p)
|
| 698 |
+
if en_parts and zh_parts:
|
| 699 |
+
return '\n\n'.join(en_parts), '\n\n'.join(zh_parts)
|
| 700 |
+
|
| 701 |
return display, display
|
| 702 |
|
| 703 |
|