#!/usr/bin/env python3
"""
Gradio Interface for Worship Program Generation
Upload DOCX sermon and PDF bulletin to generate bilingual worship program
"""

import gradio as gr
import asyncio
import os
import sys
import tempfile
import re
import shutil
from pathlib import Path
from datetime import datetime

# Add current directory to path so the sibling processing module is importable
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import document processing (only essential module)
from document_processing_agent import DocumentProcessingAgent, WorshipProgramGenerator

# Backend URL handed to the processing classes
# (GEMMA backend not required for Hugging Face deployment)
GEMMA_BACKEND_URL = os.getenv("GEMMA_BACKEND_URL", "http://localhost:8080")

# Matches a run of CJK Unified Ideographs; used to decide whether text is Chinese.
_CJK_RE = re.compile(r'[\u4e00-\u9fff]+')

# A short first line ending in one of these is treated as a section title.
# Both the ASCII colon and the full-width colon (U+FF1A) used in Chinese text.
_TITLE_COLONS = (':', '\uff1a')

# Phrases whose presence suggests the DOCX is an already generated worship program.
_WORSHIP_PROGRAM_INDICATORS = (
    '## call to worship',
    '## songs',
    '## prayer',
    '## message',
    '## announcements',
    'worship program',
    'scripture reference',
    'today\'s bible reading',
)

# Inline markdown emphasis: **bold**, *italic*, __bold__, _italic_.
_INLINE_MARKUP_RE = re.compile(r'(\*\*.*?\*\*|\*.*?\*|__.*?__|_.*?_)')

# Leading "1. " / "1) " marker of a numbered list item.
_NUMBERED_ITEM_RE = re.compile(r'^\d+[\.\)]\s+')


# ============================================================================
# Translation Functions (embedded from translate_document.py)
# ============================================================================

def _collect_chinese_paragraphs(content):
    """Return the de-duplicated Chinese paragraphs of *content*, in order.

    Paragraphs are separated by blank lines. Paragraphs without any Chinese
    character are skipped. When a multi-line paragraph starts with a short
    Chinese line ending in a colon, that line is emitted as its own entry
    (a title) and the remaining lines are emitted as a second entry.
    """
    collected = []
    seen = set()  # Track to avoid duplicates

    def add_once(text):
        if text not in seen:
            seen.add(text)
            collected.append(text)

    for para in content.split('\n\n'):
        para = para.strip()
        if not para or not _CJK_RE.search(para):
            continue

        # Split by single newlines to handle titles on separate lines
        lines = [line.strip() for line in para.split('\n') if line.strip()]

        if len(lines) > 1:
            first_line = lines[0]
            is_title = (
                bool(_CJK_RE.search(first_line))
                and first_line.endswith(_TITLE_COLONS)
                and len(first_line) < 50
            )
            if is_title:
                add_once(first_line)
                remaining_content = '\n'.join(lines[1:]).strip()
                if remaining_content and _CJK_RE.search(remaining_content):
                    add_once(remaining_content)
                continue

        # Single-line paragraphs, or multi-line ones without a title line
        add_once(para)

    return collected


async def translate_document(docx_path: str, output_path: str = None):
    """Translate entire DOCX document to bilingual format.

    Args:
        docx_path: Path of the sermon/transcript DOCX file.
        output_path: Where to write the bilingual text. When ``None`` the file
            is written next to the input as ``<stem>_bilingual.txt``.

    Returns:
        The output path as a string, or ``None`` when the input does not
        exist, its filename or content looks like a worship program, or
        content extraction raises.
    """
    if not os.path.exists(docx_path):
        return None

    # Check if this looks like a worship program file (should not be translated)
    filename = os.path.basename(docx_path).lower()
    if 'worship_program' in filename or 'worship-program' in filename:
        print(f"Warning: File '{filename}' appears to be a worship program, not a sermon transcript.")
        print("Please upload the original sermon/transcript DOCX file, not a generated worship program.")
        return None

    # Initialize processor with OPUS-MT translation (Qwen disabled due to name translation issues)
    processor = DocumentProcessingAgent(GEMMA_BACKEND_URL, use_qwen_translation=False)

    # Extract content from DOCX
    try:
        content = await processor._extract_word(docx_path)
    except Exception as e:
        print(f"Error extracting content: {e}")
        return None

    # Validate that this looks like a sermon/transcript, not a worship program.
    # Worship programs typically have structured sections like
    # "## Call to Worship", "## Songs", etc.
    content_lower = content.lower()
    indicator_count = sum(
        1 for indicator in _WORSHIP_PROGRAM_INDICATORS if indicator in content_lower
    )
    if indicator_count >= 3:
        print(f"Warning: The DOCX file appears to be a worship program (found {indicator_count} program indicators), not a sermon transcript.")
        print("Please upload the original sermon/transcript DOCX file for translation.")
        return None

    # RUN EVERYTHING MODE: translate ALL paragraphs containing Chinese characters
    chinese_paragraphs = _collect_chinese_paragraphs(content)

    # Translate each paragraph; the source text is always kept, the
    # translation is appended only when the translator returned something.
    bilingual_content = []
    for chinese_para in chinese_paragraphs:
        translated = await processor._translate_text(chinese_para, 'zh', 'en')
        bilingual_content.append(chinese_para)
        if translated:
            bilingual_content.append(translated)

    # Determine output path
    if output_path is None:
        input_path = Path(docx_path)
        output_path = input_path.parent / f"{input_path.stem}_bilingual.txt"

    # Write output
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Bilingual Document Translation\n\n")
        f.write(f"Source: {docx_path}\n\n")
        f.write("=" * 60 + "\n\n")
        f.write("\n\n".join(bilingual_content))

    return str(output_path)


# ============================================================================
# DOCX Conversion Functions (embedded from markdown_to_docx.py)
# ============================================================================

def add_formatted_text(paragraph, text):
    """Add text with inline formatting (bold, italic).

    *text* is split on ``**bold**``, ``__bold__``, ``*italic*`` and
    ``_italic_`` spans; each piece is appended to *paragraph* as its own run
    with the matching emphasis, and everything else as a plain run.
    """
    for part in _INLINE_MARKUP_RE.split(text):
        if not part:
            continue
        is_bold = (
            (part.startswith('**') and part.endswith('**'))
            or (part.startswith('__') and part.endswith('__'))
        )
        is_italic = len(part) > 2 and (
            (part.startswith('*') and part.endswith('*'))
            or (part.startswith('_') and part.endswith('_'))
        )
        if is_bold:
            run = paragraph.add_run(part[2:-2])
            run.bold = True
        elif is_italic:
            run = paragraph.add_run(part[1:-1])
            run.italic = True
        else:
            paragraph.add_run(part)


def markdown_to_docx(markdown_path: str, docx_path: str):
    """Convert markdown file to DOCX format.

    Handles blank lines, ``---`` rules, ``#`` headings (levels above 4 are
    clamped to 4), numbered and bullet lists, and inline bold/italic.

    Args:
        markdown_path: Path of the UTF-8 markdown file to read.
        docx_path: Path the DOCX document is saved to.

    Returns:
        *docx_path*, unchanged.
    """
    # Imported lazily so the module loads even when python-docx is missing;
    # the caller treats a failure here as "DOCX conversion failed".
    from docx import Document
    from docx.shared import Pt

    # Read markdown file
    with open(markdown_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Create new document with the default body font
    doc = Document()
    font = doc.styles['Normal'].font
    font.name = 'Arial'
    font.size = Pt(11)

    lines = content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Empty lines become empty paragraphs, except a trailing one
        if not line:
            if i < len(lines) - 1:
                doc.add_paragraph()
            i += 1
            continue

        # Handle horizontal rules
        if line.startswith('---'):
            doc.add_paragraph('─' * 50)
            i += 1
            continue

        # Handle headings
        if line.startswith('#'):
            heading_text = line.lstrip('#')
            level = len(line) - len(heading_text)
            doc.add_heading(heading_text.strip(), level=min(level, 4))
            i += 1
            continue

        # Handle numbered lists: consume the whole run of consecutive items
        if _NUMBERED_ITEM_RE.match(line):
            while i < len(lines) and _NUMBERED_ITEM_RE.match(lines[i].strip()):
                item_text = _NUMBERED_ITEM_RE.sub('', lines[i].strip(), count=1)
                # Render inline emphasis instead of leaving raw markers in the item
                add_formatted_text(doc.add_paragraph(style='List Number'), item_text)
                i += 1
            continue

        # Handle bullet lists: consume the whole run of consecutive items
        if line.startswith(('- ', '* ')):
            while i < len(lines) and lines[i].strip().startswith(('- ', '* ')):
                item_text = lines[i].strip()[2:].strip()
                add_formatted_text(doc.add_paragraph(style='List Bullet'), item_text)
                i += 1
            continue

        # Regular paragraph. Lines that are entirely *italic* or **bold** are
        # handled here too: a separate whole-line italic shortcut would
        # mis-render "**bold**" as italic "*bold*".
        para = doc.add_paragraph()
        add_formatted_text(para, line)
        i += 1

    # Save document
    doc.save(docx_path)
    return docx_path


# ============================================================================
# Main Gradio Application
# ============================================================================


def extract_date_from_pdf(pdf_filename: str) -> str:
    """Extract date from PDF filename (format: RCCA-worship-bulletin-YYYY-MM-DD.pdf).

    Returns the first ``YYYY-MM-DD`` found in the file's base name, or today's
    date in that format when the name is empty or contains no such pattern.
    """
    if pdf_filename:
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', Path(pdf_filename).name)
        if date_match:
            return date_match.group(1)
    return datetime.now().strftime("%Y-%m-%d")


def _upload_path(upload) -> str:
    """Return the filesystem path of an uploaded file.

    The inputs are declared with ``type="filepath"``; depending on the Gradio
    version the handler may receive a plain path string or an object exposing
    the path as ``.name``, so both are accepted.
    """
    return str(getattr(upload, "name", upload))


def _translation_failure_message(filename: str) -> str:
    """Build the user-facing message shown when translation yields no file."""
    error_msg = "❌ Error: Translation failed. "
    lowered = filename.lower()
    if 'worship_program' in lowered or 'worship-program' in lowered:
        error_msg += f"\n\nThe file '{filename}' appears to be a previously generated worship program, not a sermon transcript.\n"
        error_msg += "Please upload the ORIGINAL sermon/transcript DOCX file for translation."
    else:
        error_msg += "Please check the DOCX file."
    return error_msg


async def process_worship_program(docx_file, pdf_file, progress=gr.Progress()):
    """Main processing function for Gradio interface.

    Translates the uploaded DOCX, generates the worship program from the
    bilingual text and the PDF, and writes markdown (and, when conversion
    succeeds, DOCX) output into the current working directory.

    Args:
        docx_file: Uploaded sermon/transcript DOCX (path or object with ``.name``).
        pdf_file: Uploaded bulletin PDF (path or object with ``.name``).
        progress: Gradio progress tracker.

    Returns:
        ``(status_message, files)`` where *files* is ``None`` on failure, the
        markdown path when only markdown was produced, or a list with the
        markdown and DOCX paths. Exceptions are caught and reported in the
        status message together with the traceback.
    """
    if docx_file is None:
        return "❌ Error: Please upload a DOCX file", None
    if pdf_file is None:
        return "❌ Error: Please upload a PDF file", None

    try:
        docx_upload = _upload_path(docx_file)
        pdf_upload = _upload_path(pdf_file)

        progress(0.1, desc="📄 Extracting content from DOCX file...")

        # Create temporary directory for processing
        with tempfile.TemporaryDirectory() as temp_dir:
            # Copy uploaded files to temp directory
            docx_path = os.path.join(temp_dir, os.path.basename(docx_upload))
            pdf_path = os.path.join(temp_dir, os.path.basename(pdf_upload))
            shutil.copy2(docx_upload, docx_path)
            shutil.copy2(pdf_upload, pdf_path)

            # Translate DOCX
            progress(0.2, desc="🌐 Translating DOCX content (this may take a few minutes)...")
            bilingual_path_temp = await translate_document(docx_path, output_path=None)

            if not bilingual_path_temp or not os.path.exists(bilingual_path_temp):
                return _translation_failure_message(os.path.basename(docx_upload)), None

            # Copy bilingual file to current directory for persistence and easy access
            bilingual_filename = os.path.basename(bilingual_path_temp)
            bilingual_path = bilingual_filename  # Save in current directory
            shutil.copy2(bilingual_path_temp, bilingual_path)
            progress(0.5, desc=f"💾 Saved bilingual translation to {bilingual_filename}...")

            progress(0.6, desc="✅ Translation complete! Generating worship program...")

            # Generate worship program from the bilingual file (for the Message
            # section) and the PDF path (for date extraction only).
            generator = WorshipProgramGenerator(GEMMA_BACKEND_URL, use_qwen_translation=False)
            sources = [bilingual_path, pdf_path]
            program_content = await generator.generate_program(sources)

            if not program_content:
                return "❌ Error: Failed to generate worship program.", None

            progress(0.9, desc="💾 Saving worship program...")

            # Save output file with date from PDF filename
            date_str = extract_date_from_pdf(pdf_upload)
            output_filename = f"worship_program_{date_str}.md"
            output_path = os.path.join(temp_dir, output_filename)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(str(program_content))

            # Copy to current directory for download (temp_dir is deleted on exit)
            final_output_path = output_filename
            shutil.copy2(output_path, final_output_path)

            progress(0.95, desc="📝 Converting to DOCX format...")

            # Convert markdown to DOCX; best effort, markdown alone is still returned
            docx_filename = output_filename.replace('.md', '.docx')
            docx_path_temp = os.path.join(temp_dir, docx_filename)
            try:
                markdown_to_docx(output_path, docx_path_temp)
                final_docx_path = docx_filename
                shutil.copy2(docx_path_temp, final_docx_path)
                docx_created = True
            except Exception as e:
                print(f"Warning: DOCX conversion failed: {e}")
                docx_created = False
                final_docx_path = None

            progress(1.0, desc="✅ Complete!")

            # Generate status message
            file_size = os.path.getsize(final_output_path)
            content_length = len(str(program_content))

            status_message = f"""✅ Worship program generated successfully!

📄 Markdown file: {final_output_path}
📊 Content length: {content_length:,} characters
💾 File size: {file_size:,} bytes
📅 Date: {date_str}"""

            if docx_created:
                docx_size = os.path.getsize(final_docx_path)
                status_message += f"""

📝 DOCX file: {final_docx_path}
💾 DOCX size: {docx_size:,} bytes"""

            status_message += "\n\nThe bilingual document has been integrated into the Message section."

            # Return both files if DOCX created
            if docx_created:
                return status_message, [final_output_path, final_docx_path]
            return status_message, final_output_path

    except Exception as e:
        import traceback
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
        return error_msg, None


def process_worship_program_sync(docx_file, pdf_file, progress=gr.Progress()):
    """Synchronous wrapper for async function.

    Runs ``process_worship_program`` to completion with ``asyncio.run`` and
    returns its ``(status_message, files)`` result.
    """
    return asyncio.run(process_worship_program(docx_file, pdf_file, progress))


# Create Gradio interface
with gr.Blocks(title="Worship Program Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 Worship Program Generator

    Upload your DOCX sermon/transcript and PDF worship bulletin to generate a complete bilingual worship program.

    **Features:**
    - ✅ Automatic Chinese-to-English translation using OPUS-MT
    - ✅ Structured worship program generation
    - ✅ Bilingual content integration
    - ✅ Date extraction from PDF filename
    - ✅ Markdown and DOCX output formats
    """)

    with gr.Row():
        with gr.Column():
            docx_input = gr.File(
                label="📄 DOCX Sermon/Transcript File",
                file_types=[".docx"],
                type="filepath"
            )
            pdf_input = gr.File(
                label="📋 PDF Worship Bulletin",
                file_types=[".pdf"],
                type="filepath"
            )
            process_btn = gr.Button("🚀 Generate Worship Program", variant="primary", size="lg")

        with gr.Column():
            status_output = gr.Textbox(
                label="Status",
                lines=10,
                interactive=False,
                placeholder="Status messages will appear here..."
            )
            download_output = gr.File(
                label="📥 Download Worship Program",
                visible=True,
                file_count="multiple"
            )

    # Process button click handler
    process_btn.click(
        fn=process_worship_program_sync,
        inputs=[docx_input, pdf_input],
        outputs=[status_output, download_output],
        show_progress=True
    )

    # Instructions
    gr.Markdown("""
    ### 📝 Instructions

    1. **Upload DOCX File**: Your sermon transcript or message document (Chinese content will be automatically translated)
    2. **Upload PDF File**: Your worship bulletin (should contain date in filename like `RCCA-worship-bulletin-2025-11-09.pdf`)
    3. **Click Generate**: The system will translate DOCX, process PDF, and generate the worship program
    4. **Download**: Get both markdown and DOCX files

    **Note**: Translation may take a few minutes depending on document size.
    """)

    # Footer
    gr.Markdown("""
    ---
    *Powered by Helsinki-NLP OPUS-MT for translation | Built with Gradio*
    """)


if __name__ == "__main__":
    demo.launch()