"""
ORU Transcript Formatter - Hugging Face Spaces Deployment
AI-Powered Transcript Formatting with ORU Branding
"""

import os
import re
import tempfile
from pathlib import Path

import anthropic
import gradio as gr
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Pt
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Splits a line into alternating plain / **bold** fragments. Because the
# pattern is a capturing group, re.split() returns the bold spans at the odd
# indices of the result.
_BOLD_SPAN_RE = re.compile(r'(\*\*[^*]+\*\*)')

# System prompt sent with every formatting request. The wording and whitespace
# are kept exactly as the file carries them; the adjacent literals below only
# wrap the source lines and do not change the resulting string.
_SYSTEM_PROMPT = (
    "You are a professional transcript formatter. Your task is to intelligently "
    "format transcripts while preserving all original content and meaning. "
    "FORMATTING REQUIREMENTS: "
    "1. SPEAKER NAMES: "
    "- Bold all speaker names using **Speaker Name:** format "
    "- Detect various speaker formats (Speaker:, SPEAKER, Speaker Name, etc.) "
    "- Maintain consistent formatting throughout "
    "2. SCRIPTURE REFERENCES: "
    "- Bold ALL Scripture references in ANY format using **reference** format "
    "- Examples to detect and format: "
    "* 1 John 2:18 → **1 John 2:18** "
    "* Mark chapter 13 verse 13 → **Mark chapter 13 verse 13** "
    "* Romans 8:28-30 → **Romans 8:28-30** "
    "* First Corinthians 15 → **First Corinthians 15** "
    "* Matt. 5:3-12 → **Matt. 5:3-12** "
    "- Include partial references, book names, and various formats "
    "3. CHARACTER ENCODING FIXES: "
    "- Fix common encoding issues "
    "- Convert smart quotes to proper Unicode "
    "- Fix any other character encoding problems "
    "4. MUSIC SYMBOLS: "
    "- Remove excessive music symbols (♪♪♪ → ♪ or remove entirely if appropriate) "
    "- Clean up music notations while preserving meaning "
    "5. PARAGRAPH STRUCTURE: "
    "- Create proper paragraph breaks at natural speech boundaries "
    "- Merge fragmented lines into coherent paragraphs "
    "- Maintain logical flow and readability "
    "6. \n"
    "CONTENT PRESERVATION: "
    "- Preserve ALL original content and meaning "
    "- Do not add, remove, or change the substance of what was said "
    "- Maintain the speaker's voice and style "
    "7. TIMESTAMP REMOVAL: "
    "- Remove timestamps if present (e.g., [00:15:30], (2:45), etc.) "
    "- Clean up any time markers that interrupt the flow "
    "8. OUTPUT FORMAT: "
    "- Return the formatted text in clean markdown format "
    "- Use proper markdown syntax "
    "- Ensure readability and professional appearance "
    "Remember: Your goal is to make the transcript more readable and professional "
    "while preserving every bit of the original meaning and content."
)


def format_with_claude(text: str) -> str:
    """Format transcript using Claude AI.

    Args:
        text: Raw transcript text to be formatted.

    Returns:
        The text of the first content block of the model's reply.

    Raises:
        ValueError: If the ANTHROPIC_API_KEY environment variable is not set.
    """
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY not found. Please add it to your Hugging Face Space secrets.")

    client = anthropic.Anthropic(api_key=api_key)

    message = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=8000,
        temperature=0.1,
        system=_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": f"Please format this transcript:\n\n{text}"}],
    )

    return message.content[0].text


def _add_markdown_runs(para, line):
    """Append *line* to *para*, turning ``**bold**`` spans into bold runs."""
    for index, fragment in enumerate(_BOLD_SPAN_RE.split(line)):
        if not fragment:
            # re.split() emits empty strings around adjacent or edge matches;
            # they would only produce empty runs.
            continue
        if index % 2:
            # Odd positions are the captured ``**...**`` spans.
            run = para.add_run(fragment[2:-2])  # Remove ** markers
            run.bold = True
        else:
            run = para.add_run(fragment)
        run.font.size = Pt(11)


def create_word_document(formatted_text: str, title: str):
    """Create a Word document from formatted text.

    The document gets 1-inch margins, a centered bold title, an italic
    attribution line, a separator rule, and then one paragraph per non-blank
    line of *formatted_text*. Markdown ``**bold**`` spans become bold runs.

    Args:
        formatted_text: Markdown-style text, one paragraph per line.
        title: Heading placed at the top of the document.

    Returns:
        The populated python-docx document object (not yet saved).
    """
    doc = Document()

    # Set margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add title
    title_para = doc.add_paragraph()
    title_run = title_para.add_run(title)
    title_run.font.size = Pt(16)
    title_run.bold = True
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Spacing lives on paragraph_format; assigning ``space_after`` directly on
    # the paragraph only creates an unused attribute and changes nothing.
    title_para.paragraph_format.space_after = Pt(24)

    # Add metadata
    meta_para = doc.add_paragraph()
    meta_run = meta_para.add_run("Formatted with AI • ORU Transcript Formatter")
    meta_run.font.size = Pt(10)
    meta_run.italic = True
    meta_para.paragraph_format.space_after = Pt(12)

    # Add separator
    doc.add_paragraph("_" * 50).paragraph_format.space_after = Pt(12)

    # Process the formatted text and add to document
    for raw_line in formatted_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        para = doc.add_paragraph()
        _add_markdown_runs(para, line)
        para.paragraph_format.space_after = Pt(6)

    return doc


def _upload_path(file) -> str:
    """Return the filesystem path of an uploaded file.

    Accepts either a path (``str`` / ``os.PathLike``), which is what a
    ``gr.File`` declared with ``type="filepath"`` passes, or an object
    exposing the path through a ``.name`` attribute.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def format_transcript(file):
    """Format a transcript file using AI.

    Args:
        file: The uploaded transcript, given as a path or as an object with a
            ``.name`` attribute holding the path. ``None`` means no upload.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the path
        of the generated ``.docx`` file, or ``None`` when nothing was
        produced; ``status_message`` is the text shown to the user.
    """
    if file is None:
        return None, "Please upload a transcript file."

    try:
        source_path = _upload_path(file)

        # Read the uploaded file
        if not source_path.lower().endswith('.txt'):
            return None, "Please upload a .txt file."

        # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM, which
        # would otherwise be sent to the model as part of the transcript.
        with open(source_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        if not content.strip():
            return None, "The uploaded file appears to be empty."

        # Format using AI
        formatted_text = format_with_claude(content)

        # Create Word document
        title = Path(source_path).stem.replace('_', ' ').replace('-', ' ')
        doc = create_word_document(formatted_text, title)

        # Save to temporary file. The file is created atomically (unlike the
        # deprecated tempfile.mktemp) and closed before python-docx writes it.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            output_path = tmp.name
        doc.save(output_path)

        # Return file and success message
        return output_path, "✅ Transcript formatted successfully! Download your Word document below."

    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        error_msg = str(e)
        if "ANTHROPIC_API_KEY" in error_msg:
            return None, "❌ API key not configured. Please contact the administrator."
        return None, f"❌ Error formatting transcript: {error_msg}"


# Custom CSS for ORU branding
css = (
    " .gradio-container { "
    "background: linear-gradient(135deg, #003366 0%, #002244 100%) !important; "
    "color: white !important; } "
    ".gr-button-primary { "
    "background: linear-gradient(135deg, #FFD700 0%, #FFC107 100%) !important; "
    "color: #003366 !important; "
    "border: none !important; "
    "font-weight: bold !important; } "
    ".gr-button-primary:hover { "
    "background: linear-gradient(135deg, #FFC107 0%, #FFB300 100%) !important; "
    "transform: translateY(-1px) !important; } "
    "h1 { "
    "color: #FFD700 !important; "
    "text-align: center !important; "
    "font-size: 2.5rem !important; "
    "margin-bottom: 1rem !important; } "
    ".gr-form { "
    "background: rgba(255, 255, 255, 0.1) !important; "
    "border-radius: 15px !important; "
    "padding: 2rem !important; "
    "backdrop-filter: blur(10px) !important; } "
    ".gr-file { "
    "border: 2px dashed #4A90E2 !important; "
    "border-radius: 10px !important; "
    "background: rgba(255, 255, 255, 0.05) !important; } "
    ".footer { "
    "text-align: center !important; "
    "color: #FFD700 !important; "
    "margin-top: 2rem !important; } "
)

# Create Gradio interface
with gr.Blocks(css=css, title="ORU Transcript Formatter") as demo:
    gr.HTML("""
AI-Powered Transcript Formatting • Oral Roberts University
""")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="📄 Upload Transcript File (.txt)",
                file_types=[".txt"],
                type="filepath",
            )

            format_btn = gr.Button(
                "🤖 Format Transcript",
                variant="primary",
                size="lg",
            )

            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=2,
            )

        with gr.Column():
            file_output = gr.File(
                label="📥 Download Formatted Document",
                interactive=False,
            )

    gr.HTML(""" """)

    # Connect the interface
    format_btn.click(
        fn=format_transcript,
        inputs=[file_input],
        outputs=[file_output, status_output],
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()