Peter Yang committed on
Commit
f04ca50
·
1 Parent(s): 44c353a

Revert to OPUS-MT translation by default - better name handling

Browse files

- Change default use_qwen_translation to False in DocumentProcessingAgent
- Change default use_qwen_translation to False in WorshipProgramGenerator
- Update app.py to explicitly use OPUS-MT (False)
- OPUS-MT correctly translates names like 章沙雁 → Zhang Shaian
- Qwen was incorrectly translating names (e.g., 'sand gee' instead of proper name)

Files changed (2) hide show
  1. app.py +42 -24
  2. document_processing_agent.py +3 -3
app.py CHANGED
@@ -40,8 +40,8 @@ async def translate_document(docx_path: str, output_path: str = None):
40
  print("Please upload the original sermon/transcript DOCX file, not a generated worship program.")
41
  return None
42
 
43
- # Initialize processor with Qwen2.5 translation enabled
44
- processor = DocumentProcessingAgent(GEMMA_BACKEND_URL, use_qwen_translation=True)
45
 
46
  # Extract content from DOCX
47
  try:
@@ -71,33 +71,51 @@ async def translate_document(docx_path: str, output_path: str = None):
71
  return None
72
 
73
  # Split content into paragraphs and find Chinese paragraphs
74
- # First split by double newlines
 
75
  paragraphs = content.split('\n\n')
76
  chinese_paragraphs = []
 
77
 
78
  for para in paragraphs:
79
  para = para.strip()
80
- if para:
81
- chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
82
- if chinese_chars:
83
- # Check if paragraph contains a heading followed by content (e.g., "标题:内容")
84
- # Split headings from content to ensure both are translated
85
- if ':' in para or ':' in para:
86
- # Check if it's a heading pattern (short line ending with colon)
87
- lines = para.split('\n')
88
- if len(lines) > 1:
89
- first_line = lines[0].strip()
90
- # If first line is short and ends with colon, treat as heading
91
- if len(first_line) < 30 and (first_line.endswith(':') or first_line.endswith(':')):
92
- # Add heading as separate paragraph
93
- chinese_paragraphs.append(first_line)
94
- # Add remaining content as separate paragraph
95
- remaining = '\n'.join(lines[1:]).strip()
96
- if remaining:
97
- chinese_paragraphs.append(remaining)
98
- continue
 
 
 
 
99
 
100
- chinese_paragraphs.append(para)
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  # Translate each paragraph
103
  bilingual_content = []
@@ -297,7 +315,7 @@ async def process_worship_program(docx_file, pdf_file, progress=gr.Progress()):
297
 
298
  # Generate worship program
299
  # Only pass bilingual file and PDF - don't process PDF as document (it's just for date extraction)
300
- generator = WorshipProgramGenerator(GEMMA_BACKEND_URL, use_qwen_translation=True)
301
  # Pass bilingual file (for Message section) and PDF path (for date extraction only)
302
  sources = [bilingual_path, pdf_path]
303
  program_content = await generator.generate_program(sources)
 
40
  print("Please upload the original sermon/transcript DOCX file, not a generated worship program.")
41
  return None
42
 
43
+ # Initialize processor with OPUS-MT translation (Qwen disabled due to name translation issues)
44
+ processor = DocumentProcessingAgent(GEMMA_BACKEND_URL, use_qwen_translation=False)
45
 
46
  # Extract content from DOCX
47
  try:
 
71
  return None
72
 
73
  # Split content into paragraphs and find Chinese paragraphs
74
+ # RUN EVERYTHING MODE: Translate ALL paragraphs containing Chinese characters
75
+ # Process paragraphs intelligently to avoid duplicates
76
  paragraphs = content.split('\n\n')
77
  chinese_paragraphs = []
78
+ seen_paragraphs = set() # Track to avoid duplicates
79
 
80
  for para in paragraphs:
81
  para = para.strip()
82
+ if not para:
83
+ continue
84
+
85
+ # Check if paragraph contains Chinese
86
+ chinese_chars = re.findall(r'[\u4e00-\u9fff]+', para)
87
+ if not chinese_chars:
88
+ continue
89
+
90
+ # Split by single newlines to handle titles on separate lines
91
+ lines = [line.strip() for line in para.split('\n') if line.strip()]
92
+
93
+ # Strategy: If paragraph has multiple lines, check if first line is a title
94
+ # If so, process title separately, then process remaining content
95
+ if len(lines) > 1:
96
+ first_line = lines[0]
97
+ first_line_has_chinese = bool(re.findall(r'[\u4e00-\u9fff]+', first_line))
98
+
99
+ # Check if first line is a title (ends with colon and is relatively short)
100
+ if first_line_has_chinese and (first_line.endswith(':') or first_line.endswith(':')) and len(first_line) < 50:
101
+ # Add title separately if not seen
102
+ if first_line not in seen_paragraphs:
103
+ chinese_paragraphs.append(first_line)
104
+ seen_paragraphs.add(first_line)
105
 
106
+ # Process remaining content
107
+ remaining_content = '\n'.join(lines[1:]).strip()
108
+ if remaining_content and remaining_content not in seen_paragraphs:
109
+ remaining_chinese = re.findall(r'[\u4e00-\u9fff]+', remaining_content)
110
+ if remaining_chinese:
111
+ chinese_paragraphs.append(remaining_content)
112
+ seen_paragraphs.add(remaining_content)
113
+ continue
114
+
115
+ # For single-line paragraphs or multi-line without title pattern, add whole paragraph
116
+ if para not in seen_paragraphs:
117
+ chinese_paragraphs.append(para)
118
+ seen_paragraphs.add(para)
119
 
120
  # Translate each paragraph
121
  bilingual_content = []
 
315
 
316
  # Generate worship program
317
  # Only pass bilingual file and PDF - don't process PDF as document (it's just for date extraction)
318
+ generator = WorshipProgramGenerator(GEMMA_BACKEND_URL, use_qwen_translation=False)
319
  # Pass bilingual file (for Message section) and PDF path (for date extraction only)
320
  sources = [bilingual_path, pdf_path]
321
  program_content = await generator.generate_program(sources)
document_processing_agent.py CHANGED
@@ -61,10 +61,10 @@ class DocumentContent:
61
  class DocumentProcessingAgent:
62
  """Agent for processing various document types and extracting structured content"""
63
 
64
- def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = True):
65
  self.gemma_backend_url = gemma_backend_url
66
  self.supported_types = ['email', 'ppt', 'transcript', 'pdf', 'docx', 'doc', 'url']
67
- # Translation settings
68
  self.use_qwen_translation = use_qwen_translation and QWEN_TRANSLATION_AVAILABLE
69
  # Initialize translation models lazily
70
  self._translation_model = None # OPUS-MT
@@ -1021,7 +1021,7 @@ English:"""
1021
  class WorshipProgramGenerator:
1022
  """Main agent for generating worship programs from multiple sources"""
1023
 
1024
- def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = True):
1025
  self.doc_processor = DocumentProcessingAgent(gemma_backend_url, use_qwen_translation=use_qwen_translation)
1026
  self.template_path = "WORSHIP_PROGRAM_TEMPLATE.md"
1027
 
 
61
  class DocumentProcessingAgent:
62
  """Agent for processing various document types and extracting structured content"""
63
 
64
+ def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = False):
65
  self.gemma_backend_url = gemma_backend_url
66
  self.supported_types = ['email', 'ppt', 'transcript', 'pdf', 'docx', 'doc', 'url']
67
+ # Translation settings - Default to OPUS-MT (False) due to better name handling
68
  self.use_qwen_translation = use_qwen_translation and QWEN_TRANSLATION_AVAILABLE
69
  # Initialize translation models lazily
70
  self._translation_model = None # OPUS-MT
 
1021
  class WorshipProgramGenerator:
1022
  """Main agent for generating worship programs from multiple sources"""
1023
 
1024
+ def __init__(self, gemma_backend_url: str, use_qwen_translation: bool = False):
1025
  self.doc_processor = DocumentProcessingAgent(gemma_backend_url, use_qwen_translation=use_qwen_translation)
1026
  self.template_path = "WORSHIP_PROGRAM_TEMPLATE.md"
1027