File size: 8,981 Bytes
451c519
 
 
 
 
 
 
 
 
b92b8f1
 
451c519
 
 
 
 
b92b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b94eec8
b92b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451c519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b92b8f1
 
 
451c519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b92b8f1
 
 
 
 
451c519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b94eec8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
"""
ORU Transcript Formatter - Hugging Face Spaces Deployment
AI-Powered Transcript Formatting with ORU Branding
"""

import os
import tempfile
from pathlib import Path
import gradio as gr
from dotenv import load_dotenv
import anthropic
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
import re

# Load environment variables
load_dotenv()

def format_with_claude(text):
    """Send a raw transcript to Claude and return the formatted result.

    Args:
        text: Raw transcript text; it is embedded verbatim in the user message.

    Returns:
        The ``text`` of the first content block of the model's reply.

    Raises:
        ValueError: If the ``ANTHROPIC_API_KEY`` environment variable is
            unset or empty.
    """
    key = os.getenv('ANTHROPIC_API_KEY')
    if not key:
        raise ValueError("ANTHROPIC_API_KEY not found. Please add it to your Hugging Face Space secrets.")

    # Formatting rules handed to the model as the system prompt.
    instructions = """You are a professional transcript formatter. Your task is to intelligently format transcripts while preserving all original content and meaning.

FORMATTING REQUIREMENTS:

1. SPEAKER NAMES:
   - Bold all speaker names using **Speaker Name:** format
   - Detect various speaker formats (Speaker:, SPEAKER, Speaker Name, etc.)
   - Maintain consistent formatting throughout

2. SCRIPTURE REFERENCES:
   - Bold ALL Scripture references in ANY format using **reference** format
   - Examples to detect and format:
     * 1 John 2:18 β†’ **1 John 2:18**
     * Mark chapter 13 verse 13 β†’ **Mark chapter 13 verse 13**
     * Romans 8:28-30 β†’ **Romans 8:28-30**
     * First Corinthians 15 β†’ **First Corinthians 15**
     * Matt. 5:3-12 β†’ **Matt. 5:3-12**
   - Include partial references, book names, and various formats

3. CHARACTER ENCODING FIXES:
   - Fix common encoding issues
   - Convert smart quotes to proper Unicode
   - Fix any other character encoding problems

4. MUSIC SYMBOLS:
   - Remove excessive music symbols (β™ͺβ™ͺβ™ͺ β†’ β™ͺ or remove entirely if appropriate)
   - Clean up music notations while preserving meaning

5. PARAGRAPH STRUCTURE:
   - Create proper paragraph breaks at natural speech boundaries
   - Merge fragmented lines into coherent paragraphs
   - Maintain logical flow and readability

6. CONTENT PRESERVATION:
   - Preserve ALL original content and meaning
   - Do not add, remove, or change the substance of what was said
   - Maintain the speaker's voice and style

7. TIMESTAMP REMOVAL:
   - Remove timestamps if present (e.g., [00:15:30], (2:45), etc.)
   - Clean up any time markers that interrupt the flow

8. OUTPUT FORMAT:
   - Return the formatted text in clean markdown format
   - Use proper markdown syntax
   - Ensure readability and professional appearance

Remember: Your goal is to make the transcript more readable and professional while preserving every bit of the original meaning and content."""

    request = {
        "model": "claude-3-5-sonnet-20240620",
        "max_tokens": 8000,
        "temperature": 0.1,
        "system": instructions,
        "messages": [
            {"role": "user", "content": f"Please format this transcript:\n\n{text}"},
        ],
    }

    reply = anthropic.Anthropic(api_key=key).messages.create(**request)

    # Only the first content block of the reply is used.
    first_block = reply.content[0]
    return first_block.text

# Matches one markdown bold span such as ``**Speaker:**``. The capture group
# makes re.split() keep the matched spans (markers included) in its output.
_BOLD_SPAN_RE = re.compile(r'(\*\*[^*]+\*\*)')


def _add_markdown_runs(para, line):
    """Append *line* to *para* as 11pt runs, rendering ``**text**`` in bold."""
    for piece in _BOLD_SPAN_RE.split(line):
        if not piece:
            # re.split() yields '' next to matches at the edges of the line
            # or between adjacent matches; do not emit empty runs for them.
            continue
        if _BOLD_SPAN_RE.fullmatch(piece):
            run = para.add_run(piece[2:-2])  # drop the ** markers
            run.bold = True
        else:
            # Anything that is not an exact bold span (including stray
            # asterisks) is kept verbatim so no source text is lost.
            run = para.add_run(piece)
        run.font.size = Pt(11)


def create_word_document(formatted_text, title):
    """Build a Word document from markdown-style formatted transcript text.

    The document gets 1-inch margins, a centred bold 16pt title, an italic
    10pt attribution line, an underscore separator, and then one paragraph
    per non-blank line of *formatted_text*. ``**bold**`` spans inside a line
    become bold runs; all body runs are 11pt.

    Args:
        formatted_text: Transcript text, one paragraph per line, optionally
            containing ``**bold**`` markdown spans.
        title: Text for the title paragraph.

    Returns:
        The populated ``Document`` object (not yet saved to disk).
    """
    doc = Document()

    # Set margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add title
    title_para = doc.add_paragraph()
    title_run = title_para.add_run(title)
    title_run.font.size = Pt(16)
    title_run.bold = True
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Spacing lives on paragraph_format; assigning space_after on the
    # Paragraph itself only creates an unused Python attribute.
    title_para.paragraph_format.space_after = Pt(24)

    # Add metadata
    meta_para = doc.add_paragraph()
    meta_run = meta_para.add_run("Formatted with AI β€’ ORU Transcript Formatter")
    meta_run.font.size = Pt(10)
    meta_run.italic = True
    meta_para.paragraph_format.space_after = Pt(12)

    # Add separator
    separator_para = doc.add_paragraph("_" * 50)
    separator_para.paragraph_format.space_after = Pt(12)

    # One paragraph per non-blank line of the formatted text
    for raw_line in formatted_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        para = doc.add_paragraph()
        _add_markdown_runs(para, line)
        para.paragraph_format.space_after = Pt(6)

    return doc

def format_transcript(file):
    """Format an uploaded transcript with AI and produce a Word document.

    Args:
        file: The upload handed over by the UI. May be ``None`` (nothing
            uploaded), a path given as ``str`` / ``os.PathLike``, or an
            object whose ``.name`` attribute holds the path.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        path of the generated ``.docx`` on success and ``None`` otherwise;
        ``status_message`` is always a user-facing string. Exceptions are
        not propagated; they are reported through the status message.
    """
    if file is None:
        return None, "Please upload a transcript file."

    try:
        # Accept both a bare path and a file-like wrapper exposing `.name`.
        # (pathlib.Path also has `.name`, but that is only the basename, so
        # path-like inputs are checked first.)
        if isinstance(file, (str, os.PathLike)):
            source_path = Path(file)
        else:
            source_path = Path(file.name)

        # Case-insensitive so that e.g. "NOTES.TXT" is accepted too.
        if not str(source_path).lower().endswith('.txt'):
            return None, "Please upload a .txt file."

        # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise end up in the text sent to the model.
        content = source_path.read_text(encoding='utf-8-sig')

        if not content.strip():
            return None, "The uploaded file appears to be empty."

        # Format using AI
        formatted_text = format_with_claude(content)

        # Create Word document
        title = source_path.stem.replace('_', ' ').replace('-', ' ')
        doc = create_word_document(formatted_text, title)

        # Reserve the output file atomically (tempfile.mktemp is deprecated
        # and race-prone). It is closed before saving and deliberately not
        # deleted, because the caller serves it for download.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            output_path = tmp.name
        doc.save(output_path)

        # Return file and success message
        return output_path, "βœ… Transcript formatted successfully! Download your Word document below."

    except Exception as e:
        # UI boundary: report every failure as a status message instead of
        # letting it propagate to the caller.
        error_msg = str(e)
        if "ANTHROPIC_API_KEY" in error_msg:
            return None, "❌ API key not configured. Please contact the administrator."
        return None, f"❌ Error formatting transcript: {error_msg}"

# Custom CSS for ORU branding.
# Passed to gr.Blocks(css=...) below. It styles the page container (dark blue
# gradient, white text), primary buttons (gold gradient with a hover state),
# the h1 heading, form panels, the file drop area, and the `.footer` block
# used by the HTML footer at the bottom of the page.
# NOTE(review): the `.gr-*` selectors depend on the class names the installed
# Gradio version emits - confirm they still match.
css = """
.gradio-container {
    background: linear-gradient(135deg, #003366 0%, #002244 100%) !important;
    color: white !important;
}

.gr-button-primary {
    background: linear-gradient(135deg, #FFD700 0%, #FFC107 100%) !important;
    color: #003366 !important;
    border: none !important;
    font-weight: bold !important;
}

.gr-button-primary:hover {
    background: linear-gradient(135deg, #FFC107 0%, #FFB300 100%) !important;
    transform: translateY(-1px) !important;
}

h1 {
    color: #FFD700 !important;
    text-align: center !important;
    font-size: 2.5rem !important;
    margin-bottom: 1rem !important;
}

.gr-form {
    background: rgba(255, 255, 255, 0.1) !important;
    border-radius: 15px !important;
    padding: 2rem !important;
    backdrop-filter: blur(10px) !important;
}

.gr-file {
    border: 2px dashed #4A90E2 !important;
    border-radius: 10px !important;
    background: rgba(255, 255, 255, 0.05) !important;
}

.footer {
    text-align: center !important;
    color: #FFD700 !important;
    margin-top: 2rem !important;
}
"""

# Create Gradio interface.
# Layout: an HTML header, one row with two columns (upload + button + status
# on the left, download on the right), and an HTML footer. Components are
# placed in the order they are created inside each `with` context, so the
# statement order below defines the page layout.
with gr.Blocks(css=css, title="ORU Transcript Formatter") as demo:
    # Page header
    gr.HTML("""
    <h1>πŸŽ“ ORU Transcript Formatter</h1>
    <p style="text-align: center; color: #FFD700; font-size: 1.2rem; margin-bottom: 2rem;">
        AI-Powered Transcript Formatting β€’ Oral Roberts University
    </p>
    """)
    
    with gr.Row():
        # Left column: input file, trigger button, status text
        with gr.Column():
            file_input = gr.File(
                label="πŸ“„ Upload Transcript File (.txt)",
                file_types=[".txt"],
                type="filepath"
            )
            
            format_btn = gr.Button(
                "πŸ€– Format Transcript",
                variant="primary",
                size="lg"
            )
            
            # Receives the second value returned by format_transcript
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=2
            )
        
        # Right column: generated document for download
        with gr.Column():
            # Receives the first value returned by format_transcript
            file_output = gr.File(
                label="πŸ“₯ Download Formatted Document",
                interactive=False
            )
    
    # Page footer (styled by the `.footer` rule in `css`)
    gr.HTML("""
    <div class="footer">
        <h3>✨ Features</h3>
        <p>🎯 AI-powered speaker detection β€’ πŸ“– Scripture reference highlighting β€’ 🎨 Professional formatting</p>
        <p>Β© 2025 Oral Roberts University β€’ Powered by AI</p>
    </div>
    """)
    
    # Connect the interface: clicking the button runs format_transcript on
    # the uploaded file; its (path, message) result fills the download
    # component and the status box, in that order.
    format_btn.click(
        fn=format_transcript,
        inputs=[file_input],
        outputs=[file_output, status_output]
    )

# Launch the demo only when this file is run as a script (not on import)
if __name__ == "__main__":
    demo.launch()