Spaces:
Sleeping
Sleeping
Peter Yang committed on
Commit ·
f04ca50
1
Parent(s): 44c353a
Revert to OPUS-MT translation by default - better name handling
Browse files
- Change default use_qwen_translation to False in DocumentProcessingAgent
- Change default use_qwen_translation to False in WorshipProgramGenerator
- Update app.py to explicitly use OPUS-MT (False)
- OPUS-MT correctly translates names like 章沙雁 → Zhang Shaian
- Qwen was incorrectly translating names (e.g., 'sand gee' instead of the proper name)
- app.py +42 -24
- document_processing_agent.py +3 -3
app.py
CHANGED
|
@@ -40,8 +40,8 @@ async def translate_document(docx_path: str, output_path: str = None):
|
|
| 40 |
print("Please upload the original sermon/transcript DOCX file, not a generated worship program.")
|
| 41 |
return None
|
| 42 |
|
| 43 |
-
# Initialize processor with
|
| 44 |
-
processor = DocumentProcessingAgent(GEMMA_BACKEND_URL, use_qwen_translation=
|
| 45 |
|
| 46 |
# Extract content from DOCX
|
| 47 |
try:
|
|
@@ -71,33 +71,51 @@ async def translate_document(docx_path: str, output_path: str = None):
|
|
| 71 |
return None
|
| 72 |
|
| 73 |
# Split content into paragraphs and find Chinese paragraphs
|
| 74 |
-
#
|
|
|
|
| 75 |
paragraphs = content.split('\n\n')
|
| 76 |
chinese_paragraphs = []
|
|
|
|
| 77 |
|
| 78 |
for para in paragraphs:
|
| 79 |
para = para.strip()
|
| 80 |
-
if para:
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
# Translate each paragraph
|
| 103 |
bilingual_content = []
|
|
@@ -297,7 +315,7 @@ async def process_worship_program(docx_file, pdf_file, progress=gr.Progress()):
|
|
| 297 |
|
| 298 |
# Generate worship program
|
| 299 |
# Only pass bilingual file and PDF - don't process PDF as document (it's just for date extraction)
|
| 300 |
-
generator = WorshipProgramGenerator(GEMMA_BACKEND_URL, use_qwen_translation=
|
| 301 |
# Pass bilingual file (for Message section) and PDF path (for date extraction only)
|
| 302 |
sources = [bilingual_path, pdf_path]
|
| 303 |
program_content = await generator.generate_program(sources)
|
|
|
|
| 40 |
print("Please upload the original sermon/transcript DOCX file, not a generated worship program.")
|
| 41 |
return None
|
| 42 |
|
| 43 |
+
# Initialize processor with OPUS-MT translation (Qwen disabled due to name translation issues)
|
| 44 |
+
processor = DocumentProcessingAgent(GEMMA_BACKEND_URL, use_qwen_translation=False)
|
| 45 |
|
| 46 |
# Extract content from DOCX
|
| 47 |
try:
|
|
|
|
| 71 |
return None
|
| 72 |
|
| 73 |
# Split content into paragraphs and find Chinese paragraphs
|
| 74 |
+
# RUN EVERYTHING MODE: Translate ALL paragraphs containing Chinese characters
|
| 75 |
+
# Process paragraphs intelligently to avoid duplicates
|
| 76 |
paragraphs = content.split('\n\n')
|
| 77 |
chinese_paragraphs = []
|
| 78 |
+
seen_paragraphs = set() # Track to avoid duplicates
|
| 79 |
|
| 80 |
for para in paragraphs:
|
| 81 |
para = para.strip()
|
| 82 |
+
if not para:
|
| 83 |
+
continue
|
| 84 |
+
|
| 85 |
+
# Check if paragraph contains Chinese
|
| 86 |
+
chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
|
| 87 |
+
if not chinese_chars:
|
| 88 |
+
continue
|
| 89 |
+
|
| 90 |
+
# Split by single newlines to handle titles on separate lines
|
| 91 |
+
lines = [line.strip() for line in para.split('\n') if line.strip()]
|
| 92 |
+
|
| 93 |
+
# Strategy: If paragraph has multiple lines, check if first line is a title
|
| 94 |
+
# If so, process title separately, then process remaining content
|
| 95 |
+
if len(lines) > 1:
|
| 96 |
+
first_line = lines[0]
|
| 97 |
+
first_line_has_chinese = bool(re.findall(r'[\u4e00-\u9fff]+', first_line))
|
| 98 |
+
|
| 99 |
+
# Check if first line is a title (ends with colon and is relatively short)
|
| 100 |
+
if first_line_has_chinese and (first_line.endswith(':') or first_line.endswith(':')) and len(first_line) < 50:
|
| 101 |
+
# Add title separately if not seen
|
| 102 |
+
if first_line not in seen_paragraphs:
|
| 103 |
+
chinese_paragraphs.append(first_line)
|
| 104 |
+
seen_paragraphs.add(first_line)
|
| 105 |
|
| 106 |
+
# Process remaining content
|
| 107 |
+
remaining_content = '\n'.join(lines[1:]).strip()
|
| 108 |
+
if remaining_content and remaining_content not in seen_paragraphs:
|
| 109 |
+
remaining_chinese = re.findall(r'[\u4e00-\u9fff]+', remaining_content)
|
| 110 |
+
if remaining_chinese:
|
| 111 |
+
chinese_paragraphs.append(remaining_content)
|
| 112 |
+
seen_paragraphs.add(remaining_content)
|
| 113 |
+
continue
|
| 114 |
+
|
| 115 |
+
# For single-line paragraphs or multi-line without title pattern, add whole paragraph
|
| 116 |
+
if para not in seen_paragraphs:
|
| 117 |
+
chinese_paragraphs.append(para)
|
| 118 |
+
seen_paragraphs.add(para)
|
| 119 |
|
| 120 |
# Translate each paragraph
|
| 121 |
bilingual_content = []
|
|
|
|
| 315 |
|
| 316 |
# Generate worship program
|
| 317 |
# Only pass bilingual file and PDF - don't process PDF as document (it's just for date extraction)
|
| 318 |
+
generator = WorshipProgramGenerator(GEMMA_BACKEND_URL, use_qwen_translation=False)
|
| 319 |
# Pass bilingual file (for Message section) and PDF path (for date extraction only)
|
| 320 |
sources = [bilingual_path, pdf_path]
|
| 321 |
program_content = await generator.generate_program(sources)
|
document_processing_agent.py
CHANGED
|
@@ -61,10 +61,10 @@ class DocumentContent:
|
|
| 61 |
class DocumentProcessingAgent:
|
| 62 |
"""Agent for processing various document types and extracting structured content"""
|
| 63 |
|
| 64 |
-
def __init__(self, gemma_backend_url: str, use_qwen_translation: bool =
|
| 65 |
self.gemma_backend_url = gemma_backend_url
|
| 66 |
self.supported_types = ['email', 'ppt', 'transcript', 'pdf', 'docx', 'doc', 'url']
|
| 67 |
-
# Translation settings
|
| 68 |
self.use_qwen_translation = use_qwen_translation and QWEN_TRANSLATION_AVAILABLE
|
| 69 |
# Initialize translation models lazily
|
| 70 |
self._translation_model = None # OPUS-MT
|
|
@@ -1021,7 +1021,7 @@ English:"""
|
|
| 1021 |
class WorshipProgramGenerator:
|
| 1022 |
"""Main agent for generating worship programs from multiple sources"""
|
| 1023 |
|
| 1024 |
-
def __init__(self, gemma_backend_url: str, use_qwen_translation: bool =
|
| 1025 |
self.doc_processor = DocumentProcessingAgent(gemma_backend_url, use_qwen_translation=use_qwen_translation)
|
| 1026 |
self.template_path = "WORSHIP_PROGRAM_TEMPLATE.md"
|
| 1027 |
|
|
|
|
| 61 |
class DocumentProcessingAgent:
|
| 62 |
"""Agent for processing various document types and extracting structured content"""
|
| 63 |
|
| 64 |
+
def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = False):
|
| 65 |
self.gemma_backend_url = gemma_backend_url
|
| 66 |
self.supported_types = ['email', 'ppt', 'transcript', 'pdf', 'docx', 'doc', 'url']
|
| 67 |
+
# Translation settings - Default to OPUS-MT (False) due to better name handling
|
| 68 |
self.use_qwen_translation = use_qwen_translation and QWEN_TRANSLATION_AVAILABLE
|
| 69 |
# Initialize translation models lazily
|
| 70 |
self._translation_model = None # OPUS-MT
|
|
|
|
| 1021 |
class WorshipProgramGenerator:
|
| 1022 |
"""Main agent for generating worship programs from multiple sources"""
|
| 1023 |
|
| 1024 |
+
def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = False):
|
| 1025 |
self.doc_processor = DocumentProcessingAgent(gemma_backend_url, use_qwen_translation=use_qwen_translation)
|
| 1026 |
self.template_path = "WORSHIP_PROGRAM_TEMPLATE.md"
|
| 1027 |
|