Spaces:

drbinna
/

oru-transcript-formatter

Sleeping

File size: 8,981 Bytes

"""
ORU Transcript Formatter - Hugging Face Spaces Deployment
AI-Powered Transcript Formatting with ORU Branding
"""

import os
import tempfile
from pathlib import Path
import gradio as gr
from dotenv import load_dotenv
import anthropic
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
import re

# Load environment variables via python-dotenv at import time.
# format_with_claude() later reads ANTHROPIC_API_KEY with os.getenv().
load_dotenv()

def format_with_claude(text):
    """Format a raw transcript with Claude and return the result as markdown.

    The transcript is sent as a single user message together with a fixed
    system prompt describing the formatting rules (bold speaker names and
    Scripture references, paragraph clean-up, timestamp removal, ...).

    Args:
        text: The raw transcript text to format.

    Returns:
        The formatted transcript: the concatenation of every text block in
        Claude's response.

    Raises:
        ValueError: If the ANTHROPIC_API_KEY environment variable is not set.
        RuntimeError: If the response was cut off at the output token limit,
            or if it contains no text at all.
    """
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY not found. Please add it to your Hugging Face Space secrets.")

    client = anthropic.Anthropic(api_key=api_key)

    system_prompt = """You are a professional transcript formatter. Your task is to intelligently format transcripts while preserving all original content and meaning.

FORMATTING REQUIREMENTS:

1. SPEAKER NAMES:
   - Bold all speaker names using **Speaker Name:** format
   - Detect various speaker formats (Speaker:, SPEAKER, Speaker Name, etc.)
   - Maintain consistent formatting throughout

2. SCRIPTURE REFERENCES:
   - Bold ALL Scripture references in ANY format using **reference** format
   - Examples to detect and format:
     * 1 John 2:18 → **1 John 2:18**
     * Mark chapter 13 verse 13 → **Mark chapter 13 verse 13**
     * Romans 8:28-30 → **Romans 8:28-30**
     * First Corinthians 15 → **First Corinthians 15**
     * Matt. 5:3-12 → **Matt. 5:3-12**
   - Include partial references, book names, and various formats

3. CHARACTER ENCODING FIXES:
   - Fix common encoding issues
   - Convert smart quotes to proper Unicode
   - Fix any other character encoding problems

4. MUSIC SYMBOLS:
   - Remove excessive music symbols (♪♪♪ → ♪ or remove entirely if appropriate)
   - Clean up music notations while preserving meaning

5. PARAGRAPH STRUCTURE:
   - Create proper paragraph breaks at natural speech boundaries
   - Merge fragmented lines into coherent paragraphs
   - Maintain logical flow and readability

6. CONTENT PRESERVATION:
   - Preserve ALL original content and meaning
   - Do not add, remove, or change the substance of what was said
   - Maintain the speaker's voice and style

7. TIMESTAMP REMOVAL:
   - Remove timestamps if present (e.g., [00:15:30], (2:45), etc.)
   - Clean up any time markers that interrupt the flow

8. OUTPUT FORMAT:
   - Return the formatted text in clean markdown format
   - Use proper markdown syntax
   - Ensure readability and professional appearance

Remember: Your goal is to make the transcript more readable and professional while preserving every bit of the original meaning and content."""

    message = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=8000,
        temperature=0.1,
        system=system_prompt,
        messages=[{"role": "user", "content": f"Please format this transcript:\n\n{text}"}]
    )

    # A response that stopped at the token limit is missing the end of the
    # transcript. Returning it would silently drop content, so fail loudly.
    if message.stop_reason == "max_tokens":
        raise RuntimeError(
            "The transcript is too long to format in one pass and the output was cut off. "
            "Please split the file into smaller parts and try again."
        )

    # Collect every text block instead of assuming the first block exists
    # and is text.
    formatted = "".join(
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    )
    if not formatted.strip():
        raise RuntimeError("The AI returned no formatted text. Please try again.")

    return formatted

def create_word_document(formatted_text, title):
    """Create a Word document from markdown-style formatted text.

    The document gets one-inch margins, a centered bold title, an italic
    attribution line and an underscore separator. Each non-blank line of
    ``formatted_text`` then becomes its own paragraph; ``**...**`` spans
    inside a line are rendered as bold runs and everything else as plain
    runs, all at 11 pt.

    Args:
        formatted_text: Text whose lines may contain ``**bold**`` markers.
        title: Heading placed at the top of the document.

    Returns:
        The populated ``Document`` object (not yet saved to disk).
    """
    doc = Document()

    # One-inch margins on every section.
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Title: centered, bold, 16 pt.
    title_para = doc.add_paragraph()
    title_run = title_para.add_run(title)
    title_run.font.size = Pt(16)
    title_run.bold = True
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Spacing is a ParagraphFormat property. Paragraph itself has no
    # space_after, so assigning it there never reaches the document.
    title_para.paragraph_format.space_after = Pt(24)

    # Attribution line: italic, 10 pt.
    meta_para = doc.add_paragraph()
    meta_run = meta_para.add_run("Formatted with AI • ORU Transcript Formatter")
    meta_run.font.size = Pt(10)
    meta_run.italic = True
    meta_para.paragraph_format.space_after = Pt(12)

    # Separator made of underscores.
    separator_para = doc.add_paragraph("_" * 50)
    separator_para.paragraph_format.space_after = Pt(12)

    # The capturing group makes split() keep the **...** spans: they land at
    # the odd indices of the result, plain text at the even indices.
    bold_pattern = re.compile(r'(\*\*[^*]+\*\*)')

    for raw_line in formatted_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        para = doc.add_paragraph()

        for index, part in enumerate(bold_pattern.split(line)):
            if not part:
                # split() yields '' around a span at the start or end of the
                # line; skip it rather than adding an empty run.
                continue
            if index % 2 == 1:
                # Captured **...** span: drop the markers and embolden.
                run = para.add_run(part[2:-2])
                run.bold = True
            else:
                run = para.add_run(part)
            run.font.size = Pt(11)

        para.paragraph_format.space_after = Pt(6)

    return doc

def format_transcript(file):
    """Format an uploaded transcript and produce a downloadable Word file.

    Args:
        file: The upload handed over by the UI. Either a filesystem path
            (``str`` / path-like) or an object exposing the path as
            ``.name``. ``None`` means nothing was uploaded.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        path of the generated ``.docx`` on success and ``None`` otherwise;
        ``status_message`` is always a user-facing string. Errors are
        reported through the message and never raised.
    """
    if file is None:
        return None, "Please upload a transcript file."

    try:
        # Accept both a plain path and a file-like wrapper with .name.
        if isinstance(file, (str, os.PathLike)):
            source = os.fspath(file)
        else:
            source = file.name

        # Case-insensitive so that e.g. "SERMON.TXT" is accepted too.
        if not source.lower().endswith('.txt'):
            return None, "Please upload a .txt file."

        # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise be sent to the model as part of the transcript.
        with open(source, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        if not content.strip():
            return None, "The uploaded file appears to be empty."

        # Format using AI
        formatted_text = format_with_claude(content)

        # Create Word document, titled after the upload's file name.
        title = Path(source).stem.replace('_', ' ').replace('-', ' ')
        doc = create_word_document(formatted_text, title)

        # Reserve the output file atomically; tempfile.mktemp() only returns
        # a name and is deprecated because another process can claim it first.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            output_path = tmp.name
        doc.save(output_path)

        # Return file and success message
        return output_path, "✅ Transcript formatted successfully! Download your Word document below."

    except Exception as e:
        # UI boundary: turn any failure into a status message for the user.
        error_msg = str(e)
        if "ANTHROPIC_API_KEY" in error_msg:
            return None, "❌ API key not configured. Please contact the administrator."
        else:
            return None, f"❌ Error formatting transcript: {error_msg}"

# Custom CSS for ORU branding, passed to gr.Blocks(css=...) below.
# It sets a navy gradient page background, a gold primary button with a
# hover state, a gold centered h1, translucent form and file-drop panels,
# and the styling of the ".footer" div used in the page footer HTML.
# NOTE(review): the ".gr-*" selectors rely on class names in Gradio's
# generated markup; confirm they match the Gradio version deployed.
css = """
.gradio-container {
    background: linear-gradient(135deg, #003366 0%, #002244 100%) !important;
    color: white !important;
}

.gr-button-primary {
    background: linear-gradient(135deg, #FFD700 0%, #FFC107 100%) !important;
    color: #003366 !important;
    border: none !important;
    font-weight: bold !important;
}

.gr-button-primary:hover {
    background: linear-gradient(135deg, #FFC107 0%, #FFB300 100%) !important;
    transform: translateY(-1px) !important;
}

h1 {
    color: #FFD700 !important;
    text-align: center !important;
    font-size: 2.5rem !important;
    margin-bottom: 1rem !important;
}

.gr-form {
    background: rgba(255, 255, 255, 0.1) !important;
    border-radius: 15px !important;
    padding: 2rem !important;
    backdrop-filter: blur(10px) !important;
}

.gr-file {
    border: 2px dashed #4A90E2 !important;
    border-radius: 10px !important;
    background: rgba(255, 255, 255, 0.05) !important;
}

.footer {
    text-align: center !important;
    color: #FFD700 !important;
    margin-top: 2rem !important;
}
"""

# Create Gradio interface: a header, a two-column row (upload, button and
# status on the left; download on the right), a footer, and the click wiring.
with gr.Blocks(css=css, title="ORU Transcript Formatter") as demo:
    # Page header
    gr.HTML("""
    <h1>🎓 ORU Transcript Formatter</h1>
    <p style="text-align: center; color: #FFD700; font-size: 1.2rem; margin-bottom: 2rem;">
        AI-Powered Transcript Formatting • Oral Roberts University
    </p>
    """)
    
    with gr.Row():
        # Left column: input controls and status message
        with gr.Column():
            file_input = gr.File(
                label="📄 Upload Transcript File (.txt)",
                file_types=[".txt"],
                type="filepath"
            )
            
            format_btn = gr.Button(
                "🤖 Format Transcript",
                variant="primary",
                size="lg"
            )
            
            # Read-only box that shows the message returned by format_transcript
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=2
            )
        
        # Right column: the generated document
        with gr.Column():
            file_output = gr.File(
                label="📥 Download Formatted Document",
                interactive=False
            )
    
    # Page footer (styled by the ".footer" rule in `css`)
    gr.HTML("""
    <div class="footer">
        <h3>✨ Features</h3>
        <p>🎯 AI-powered speaker detection • 📖 Scripture reference highlighting • 🎨 Professional formatting</p>
        <p>© 2025 Oral Roberts University • Powered by AI</p>
    </div>
    """)
    
    # Connect the interface: format_transcript returns (output_path, message),
    # which map in order onto file_output and status_output.
    format_btn.click(
        fn=format_transcript,
        inputs=[file_input],
        outputs=[file_output, status_output]
    )

# Launch the demo only when this file is run as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    demo.launch()