Peter Yang committed on
Commit
d3816fa
·
1 Parent(s): 9124732

Improve translation completeness: better paragraph splitting for headings, improved prompts, lower min length for titles

Browse files
Files changed (2) hide show
  1. app.py +18 -0
  2. document_processing_agent.py +5 -1
app.py CHANGED
@@ -71,6 +71,7 @@ async def translate_document(docx_path: str, output_path: str = None):
71
  return None
72
 
73
  # Split content into paragraphs and find Chinese paragraphs
 
74
  paragraphs = content.split('\n\n')
75
  chinese_paragraphs = []
76
 
@@ -79,6 +80,23 @@ async def translate_document(docx_path: str, output_path: str = None):
79
  if para:
80
  chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
81
  if chinese_chars:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  chinese_paragraphs.append(para)
83
 
84
  # Translate each paragraph
 
71
  return None
72
 
73
  # Split content into paragraphs and find Chinese paragraphs
74
+ # First split by double newlines
75
  paragraphs = content.split('\n\n')
76
  chinese_paragraphs = []
77
 
 
80
  if para:
81
  chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
82
  if chinese_chars:
83
+ # Check if paragraph contains a heading followed by content (e.g., "标题:内容")
84
+ # Split headings from content to ensure both are translated
85
+ if ':' in para or ':' in para:
86
+ # Check if it's a heading pattern (short line ending with colon)
87
+ lines = para.split('\n')
88
+ if len(lines) > 1:
89
+ first_line = lines[0].strip()
90
+ # If first line is short and ends with colon, treat as heading
91
+ if len(first_line) < 30 and (first_line.endswith(':') or first_line.endswith(':')):
92
+ # Add heading as separate paragraph
93
+ chinese_paragraphs.append(first_line)
94
+ # Add remaining content as separate paragraph
95
+ remaining = '\n'.join(lines[1:]).strip()
96
+ if remaining:
97
+ chinese_paragraphs.append(remaining)
98
+ continue
99
+
100
  chinese_paragraphs.append(para)
101
 
102
  # Translate each paragraph
document_processing_agent.py CHANGED
@@ -648,7 +648,11 @@ English:"""
648
  elif translation.startswith("'") and translation.endswith("'"):
649
  translation = translation[1:-1].strip()
650
 
651
- return translation if translation and len(translation) > 5 else None
 
 
 
 
652
 
653
  except Exception as e:
654
  print(f"Qwen2.5 translation error: {e}")
 
648
  elif translation.startswith("'") and translation.endswith("'"):
649
  translation = translation[1:-1].strip()
650
 
651
+ # For very short translations (like titles), lower the minimum length requirement
652
+ # Translated titles can be very short, as few as 3 characters (longer ones, e.g., "As Children of Light", pass either threshold)
653
+ min_length = 3 if len(text) < 10 else 5 # Lower threshold for short inputs (likely titles)
654
+
655
+ return translation if translation and len(translation) >= min_length else None
656
 
657
  except Exception as e:
658
  print(f"Qwen2.5 translation error: {e}")