Spaces:

NextDrought
/

worship

Sleeping

Peter Yang committed on Nov 14, 2025

Commit

f04ca50

1 Parent(s): 44c353a

Revert to OPUS-MT translation by default - better name handling

- Change default use_qwen_translation to False in DocumentProcessingAgent
- Change default use_qwen_translation to False in WorshipProgramGenerator
- Update app.py to explicitly use OPUS-MT (False)
- OPUS-MT correctly translates names like 章沙雁 → Zhang Shaian
- Qwen was incorrectly translating names (e.g., 'sand gee' instead of the proper name)

Files changed (2) hide show

app.py +42 -24
document_processing_agent.py +3 -3

app.py CHANGED Viewed

@@ -40,8 +40,8 @@ async def translate_document(docx_path: str, output_path: str = None):
         print("Please upload the original sermon/transcript DOCX file, not a generated worship program.")
         return None
-    # Initialize processor with Qwen2.5 translation enabled
-    processor = DocumentProcessingAgent(GEMMA_BACKEND_URL, use_qwen_translation=True)
     # Extract content from DOCX
     try:
@@ -71,33 +71,51 @@ async def translate_document(docx_path: str, output_path: str = None):
         return None
     # Split content into paragraphs and find Chinese paragraphs
-    # First split by double newlines
     paragraphs = content.split('\n\n')
     chinese_paragraphs = []
     for para in paragraphs:
         para = para.strip()
-        if para:
-            chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
-            if chinese_chars:
-                # Check if paragraph contains a heading followed by content (e.g., "标题：内容")
-                # Split headings from content to ensure both are translated
-                if '：' in para or ':' in para:
-                    # Check if it's a heading pattern (short line ending with colon)
-                    lines = para.split('\n')
-                    if len(lines) > 1:
-                        first_line = lines[0].strip()
-                        # If first line is short and ends with colon, treat as heading
-                        if len(first_line) < 30 and (first_line.endswith('：') or first_line.endswith(':')):
-                            # Add heading as separate paragraph
-                            chinese_paragraphs.append(first_line)
-                            # Add remaining content as separate paragraph
-                            remaining = '\n'.join(lines[1:]).strip()
-                            if remaining:
-                                chinese_paragraphs.append(remaining)
-                            continue
-                chinese_paragraphs.append(para)
     # Translate each paragraph
     bilingual_content = []
@@ -297,7 +315,7 @@ async def process_worship_program(docx_file, pdf_file, progress=gr.Progress()):
             # Generate worship program
             # Only pass bilingual file and PDF - don't process PDF as document (it's just for date extraction)
-            generator = WorshipProgramGenerator(GEMMA_BACKEND_URL, use_qwen_translation=True)
             # Pass bilingual file (for Message section) and PDF path (for date extraction only)
             sources = [bilingual_path, pdf_path]
             program_content = await generator.generate_program(sources)

         print("Please upload the original sermon/transcript DOCX file, not a generated worship program.")
         return None
+    # Initialize processor with OPUS-MT translation (Qwen disabled due to name translation issues)
+    processor = DocumentProcessingAgent(GEMMA_BACKEND_URL, use_qwen_translation=False)
     # Extract content from DOCX
     try:
         return None
     # Split content into paragraphs and find Chinese paragraphs
+    # RUN EVERYTHING MODE: Translate ALL paragraphs containing Chinese characters
+    # Process paragraphs intelligently to avoid duplicates
     paragraphs = content.split('\n\n')
     chinese_paragraphs = []
+    seen_paragraphs = set()  # Track to avoid duplicates
     for para in paragraphs:
         para = para.strip()
+        if not para:
+            continue
+        # Check if paragraph contains Chinese
+        chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
+        if not chinese_chars:
+            continue
+        # Split by single newlines to handle titles on separate lines
+        lines = [line.strip() for line in para.split('\n') if line.strip()]
+        # Strategy: If paragraph has multiple lines, check if first line is a title
+        # If so, process title separately, then process remaining content
+        if len(lines) > 1:
+            first_line = lines[0]
+            first_line_has_chinese = bool(re.findall(r'[\u4e00-\u9fff]+', first_line))
+            # Check if first line is a title (ends with colon and is relatively short)
+            if first_line_has_chinese and (first_line.endswith('：') or first_line.endswith(':')) and len(first_line) < 50:
+                # Add title separately if not seen
+                if first_line not in seen_paragraphs:
+                    chinese_paragraphs.append(first_line)
+                    seen_paragraphs.add(first_line)
+                # Process remaining content
+                remaining_content = '\n'.join(lines[1:]).strip()
+                if remaining_content and remaining_content not in seen_paragraphs:
+                    remaining_chinese = re.findall(r'[\u4e00-\u9fff]+', remaining_content)
+                    if remaining_chinese:
+                        chinese_paragraphs.append(remaining_content)
+                        seen_paragraphs.add(remaining_content)
+                continue
+        # For single-line paragraphs or multi-line without title pattern, add whole paragraph
+        if para not in seen_paragraphs:
+            chinese_paragraphs.append(para)
+            seen_paragraphs.add(para)
     # Translate each paragraph
     bilingual_content = []
             # Generate worship program
             # Only pass bilingual file and PDF - don't process PDF as document (it's just for date extraction)
+            generator = WorshipProgramGenerator(GEMMA_BACKEND_URL, use_qwen_translation=False)
             # Pass bilingual file (for Message section) and PDF path (for date extraction only)
             sources = [bilingual_path, pdf_path]
             program_content = await generator.generate_program(sources)

document_processing_agent.py CHANGED Viewed

@@ -61,10 +61,10 @@ class DocumentContent:
 class DocumentProcessingAgent:
     """Agent for processing various document types and extracting structured content"""
-    def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = True):
         self.gemma_backend_url = gemma_backend_url
         self.supported_types = ['email', 'ppt', 'transcript', 'pdf', 'docx', 'doc', 'url']
-        # Translation settings
         self.use_qwen_translation = use_qwen_translation and QWEN_TRANSLATION_AVAILABLE
         # Initialize translation models lazily
         self._translation_model = None  # OPUS-MT
@@ -1021,7 +1021,7 @@ English:"""
 class WorshipProgramGenerator:
     """Main agent for generating worship programs from multiple sources"""
-    def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = True):
         self.doc_processor = DocumentProcessingAgent(gemma_backend_url, use_qwen_translation=use_qwen_translation)
         self.template_path = "WORSHIP_PROGRAM_TEMPLATE.md"

 class DocumentProcessingAgent:
     """Agent for processing various document types and extracting structured content"""
+    def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = False):
         self.gemma_backend_url = gemma_backend_url
         self.supported_types = ['email', 'ppt', 'transcript', 'pdf', 'docx', 'doc', 'url']
+        # Translation settings - Default to OPUS-MT (False) due to better name handling
         self.use_qwen_translation = use_qwen_translation and QWEN_TRANSLATION_AVAILABLE
         # Initialize translation models lazily
         self._translation_model = None  # OPUS-MT
 class WorshipProgramGenerator:
     """Main agent for generating worship programs from multiple sources"""
+    def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = False):
         self.doc_processor = DocumentProcessingAgent(gemma_backend_url, use_qwen_translation=use_qwen_translation)
         self.template_path = "WORSHIP_PROGRAM_TEMPLATE.md"