Spaces:
Sleeping
Sleeping
Peter Yang committed on
Commit ·
d3816fa
1
Parent(s): 9124732
Improve translation completeness: better paragraph splitting for headings, improved prompts, lower min length for titles
Browse files
- app.py +18 -0
- document_processing_agent.py +5 -1
app.py
CHANGED
|
@@ -71,6 +71,7 @@ async def translate_document(docx_path: str, output_path: str = None):
|
|
| 71 |
return None
|
| 72 |
|
| 73 |
# Split content into paragraphs and find Chinese paragraphs
|
|
|
|
| 74 |
paragraphs = content.split('\n\n')
|
| 75 |
chinese_paragraphs = []
|
| 76 |
|
|
@@ -79,6 +80,23 @@ async def translate_document(docx_path: str, output_path: str = None):
|
|
| 79 |
if para:
|
| 80 |
chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
|
| 81 |
if chinese_chars:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
chinese_paragraphs.append(para)
|
| 83 |
|
| 84 |
# Translate each paragraph
|
|
|
|
| 71 |
return None
|
| 72 |
|
| 73 |
# Split content into paragraphs and find Chinese paragraphs
|
| 74 |
+
# First split by double newlines
|
| 75 |
paragraphs = content.split('\n\n')
|
| 76 |
chinese_paragraphs = []
|
| 77 |
|
|
|
|
| 80 |
if para:
|
| 81 |
chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
|
| 82 |
if chinese_chars:
|
| 83 |
+
# Check if paragraph contains a heading followed by content (e.g., "标题:内容")
|
| 84 |
+
# Split headings from content to ensure both are translated
|
| 85 |
+
if ':' in para or ':' in para:
|
| 86 |
+
# Check if it's a heading pattern (short line ending with colon)
|
| 87 |
+
lines = para.split('\n')
|
| 88 |
+
if len(lines) > 1:
|
| 89 |
+
first_line = lines[0].strip()
|
| 90 |
+
# If first line is short and ends with colon, treat as heading
|
| 91 |
+
if len(first_line) < 30 and (first_line.endswith(':') or first_line.endswith(':')):
|
| 92 |
+
# Add heading as separate paragraph
|
| 93 |
+
chinese_paragraphs.append(first_line)
|
| 94 |
+
# Add remaining content as separate paragraph
|
| 95 |
+
remaining = '\n'.join(lines[1:]).strip()
|
| 96 |
+
if remaining:
|
| 97 |
+
chinese_paragraphs.append(remaining)
|
| 98 |
+
continue
|
| 99 |
+
|
| 100 |
chinese_paragraphs.append(para)
|
| 101 |
|
| 102 |
# Translate each paragraph
|
document_processing_agent.py
CHANGED
|
@@ -648,7 +648,11 @@ English:"""
|
|
| 648 |
elif translation.startswith("'") and translation.endswith("'"):
|
| 649 |
translation = translation[1:-1].strip()
|
| 650 |
|
| 651 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 652 |
|
| 653 |
except Exception as e:
|
| 654 |
print(f"Qwen2.5 translation error: {e}")
|
|
|
|
| 648 |
elif translation.startswith("'") and translation.endswith("'"):
|
| 649 |
translation = translation[1:-1].strip()
|
| 650 |
|
| 651 |
+
# For very short translations (like titles), lower the minimum length requirement
|
| 652 |
+
# Titles can be as short as 3 characters (e.g., "As Children of Light")
|
| 653 |
+
min_length = 3 if len(text) < 10 else 5 # Lower threshold for short inputs (likely titles)
|
| 654 |
+
|
| 655 |
+
return translation if translation and len(translation) >= min_length else None
|
| 656 |
|
| 657 |
except Exception as e:
|
| 658 |
print(f"Qwen2.5 translation error: {e}")
|