# Hugging Face Spaces page status: Sleeping
| """ | |
| ORU Transcript Formatter - Hugging Face Spaces Deployment | |
| AI-Powered Transcript Formatting with ORU Branding | |
| """ | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| import gradio as gr | |
| from dotenv import load_dotenv | |
| import anthropic | |
| from docx import Document | |
| from docx.shared import Inches, Pt | |
| from docx.enum.text import WD_ALIGN_PARAGRAPH | |
| import re | |
| # Load environment variables | |
| load_dotenv() | |
def format_with_claude(text):
    """Format a raw transcript with Claude and return the result as markdown.

    The transcript is sent as a single user message together with a system
    prompt that spells out the formatting rules (bold speaker names and
    Scripture references, encoding and music-symbol clean-up, paragraphing,
    timestamp removal).

    Args:
        text: Raw transcript text to format.

    Returns:
        The formatted transcript, i.e. the text blocks of the reply joined
        together.

    Raises:
        ValueError: If the ANTHROPIC_API_KEY environment variable is not set.
        RuntimeError: If the reply contains no text, or was cut off at the
            ``max_tokens`` limit (a truncated transcript would silently
            drop content).
    """
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY not found. Please add it to your Hugging Face Space secrets.")

    client = anthropic.Anthropic(api_key=api_key)

    system_prompt = """You are a professional transcript formatter. Your task is to intelligently format transcripts while preserving all original content and meaning.
FORMATTING REQUIREMENTS:
1. SPEAKER NAMES:
- Bold all speaker names using **Speaker Name:** format
- Detect various speaker formats (Speaker:, SPEAKER, Speaker Name, etc.)
- Maintain consistent formatting throughout
2. SCRIPTURE REFERENCES:
- Bold ALL Scripture references in ANY format using **reference** format
- Examples to detect and format:
* 1 John 2:18 → **1 John 2:18**
* Mark chapter 13 verse 13 → **Mark chapter 13 verse 13**
* Romans 8:28-30 → **Romans 8:28-30**
* First Corinthians 15 → **First Corinthians 15**
* Matt. 5:3-12 → **Matt. 5:3-12**
- Include partial references, book names, and various formats
3. CHARACTER ENCODING FIXES:
- Fix common encoding issues
- Convert smart quotes to proper Unicode
- Fix any other character encoding problems
4. MUSIC SYMBOLS:
- Remove excessive music symbols (♪♪♪ → ♪ or remove entirely if appropriate)
- Clean up music notations while preserving meaning
5. PARAGRAPH STRUCTURE:
- Create proper paragraph breaks at natural speech boundaries
- Merge fragmented lines into coherent paragraphs
- Maintain logical flow and readability
6. CONTENT PRESERVATION:
- Preserve ALL original content and meaning
- Do not add, remove, or change the substance of what was said
- Maintain the speaker's voice and style
7. TIMESTAMP REMOVAL:
- Remove timestamps if present (e.g., [00:15:30], (2:45), etc.)
- Clean up any time markers that interrupt the flow
8. OUTPUT FORMAT:
- Return the formatted text in clean markdown format
- Use proper markdown syntax
- Ensure readability and professional appearance
Remember: Your goal is to make the transcript more readable and professional while preserving every bit of the original meaning and content."""

    message = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=8000,
        temperature=0.1,
        system=system_prompt,
        messages=[{"role": "user", "content": f"Please format this transcript:\n\n{text}"}],
    )

    # A reply that hit the output cap is only a prefix of the transcript;
    # fail loudly instead of producing a document with missing content.
    if message.stop_reason == "max_tokens":
        raise RuntimeError(
            "The transcript is too long to format in one request "
            "(the reply was cut off at the output limit). "
            "Please split it into smaller files."
        )

    # Collect every text block rather than assuming exactly one at index 0.
    formatted = "".join(
        block.text for block in message.content if block.type == "text"
    )
    if not formatted.strip():
        raise RuntimeError("The formatting service returned an empty response.")
    return formatted
def create_word_document(formatted_text, title):
    """Build a Word document from markdown-style formatted text.

    The document gets 1-inch margins, a centered bold 16 pt title, a small
    italic attribution line, an underscore separator, and then one 11 pt
    paragraph per non-blank line of ``formatted_text``. Within a line,
    ``**text**`` spans are written as bold runs; any other markdown syntax
    is inserted literally.

    Args:
        formatted_text: Transcript text, optionally containing ``**bold**``
            markers.
        title: Text for the document's title paragraph.

    Returns:
        The populated ``Document`` object (not yet saved to disk).
    """
    doc = Document()

    # Set margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add title
    title_para = doc.add_paragraph()
    title_run = title_para.add_run(title)
    title_run.font.size = Pt(16)
    title_run.bold = True
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Spacing lives on paragraph_format; assigning space_after directly on
    # the paragraph object has no effect on the saved document.
    title_para.paragraph_format.space_after = Pt(24)

    # Add metadata
    meta_para = doc.add_paragraph()
    meta_run = meta_para.add_run("Formatted with AI • ORU Transcript Formatter")
    meta_run.font.size = Pt(10)
    meta_run.italic = True
    meta_para.paragraph_format.space_after = Pt(12)

    # Add separator
    separator = doc.add_paragraph("_" * 50)
    separator.paragraph_format.space_after = Pt(12)

    # The capturing group makes split() keep the **bold** spans as their
    # own list items, interleaved with the plain text around them.
    bold_span = re.compile(r'(\*\*[^*]+\*\*)')

    for raw_line in formatted_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        para = doc.add_paragraph()
        for part in bold_span.split(line):
            if not part:
                # split() yields '' next to a span at the line start/end.
                continue
            if bold_span.fullmatch(part):
                run = para.add_run(part[2:-2])  # drop the ** markers
                run.bold = True
            else:
                run = para.add_run(part)
            run.font.size = Pt(11)
        para.paragraph_format.space_after = Pt(6)

    return doc
def format_transcript(file):
    """Format an uploaded transcript and return a downloadable Word file.

    Args:
        file: The uploaded transcript: ``None`` when nothing was uploaded, a
            path (``str`` or ``os.PathLike``), or an object exposing the
            path as ``.name``.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        path of the generated ``.docx`` on success and ``None`` on any
        failure; ``status_message`` is always a user-facing string. Errors
        are reported through the message rather than raised.
    """
    if file is None:
        return None, "Please upload a transcript file."

    try:
        # Accept both plain paths and file-like wrappers with a .name path.
        if isinstance(file, (str, os.PathLike)):
            source_path = os.fspath(file)
        else:
            source_path = file.name

        if not source_path.lower().endswith('.txt'):
            return None, "Please upload a .txt file."

        # utf-8-sig reads plain UTF-8 too, and strips a leading BOM so it
        # neither reaches the model nor defeats the emptiness check below.
        with open(source_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        if not content.strip():
            return None, "The uploaded file appears to be empty."

        # Format using AI
        formatted_text = format_with_claude(content)

        # Create Word document
        title = Path(source_path).stem.replace('_', ' ').replace('-', ' ')
        doc = create_word_document(formatted_text, title)

        # Reserve the output file atomically. tempfile.mktemp() only returns
        # a name, so another process could claim it before we write.
        # delete=False keeps the file after the handle closes so it can be
        # written below and served for download.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            output_path = tmp.name
        doc.save(output_path)

        return output_path, "✅ Transcript formatted successfully! Download your Word document below."
    except Exception as e:
        # Top-level UI boundary: convert any failure into a status message.
        error_msg = str(e)
        if "ANTHROPIC_API_KEY" in error_msg:
            return None, "❌ API key not configured. Please contact the administrator."
        return None, f"❌ Error formatting transcript: {error_msg}"
# Custom CSS for ORU branding, passed to gr.Blocks(css=...) below.
# Palette: dark-blue gradient page background (#003366 -> #002244) with
# gold (#FFD700 / #FFC107) primary buttons, headings and footer text.
# NOTE(review): the .gr-button-primary / .gr-form / .gr-file selectors are
# not defined anywhere in this file; confirm they match the class names
# emitted by the installed Gradio version.
css = """
.gradio-container {
background: linear-gradient(135deg, #003366 0%, #002244 100%) !important;
color: white !important;
}
.gr-button-primary {
background: linear-gradient(135deg, #FFD700 0%, #FFC107 100%) !important;
color: #003366 !important;
border: none !important;
font-weight: bold !important;
}
.gr-button-primary:hover {
background: linear-gradient(135deg, #FFC107 0%, #FFB300 100%) !important;
transform: translateY(-1px) !important;
}
h1 {
color: #FFD700 !important;
text-align: center !important;
font-size: 2.5rem !important;
margin-bottom: 1rem !important;
}
.gr-form {
background: rgba(255, 255, 255, 0.1) !important;
border-radius: 15px !important;
padding: 2rem !important;
backdrop-filter: blur(10px) !important;
}
.gr-file {
border: 2px dashed #4A90E2 !important;
border-radius: 10px !important;
background: rgba(255, 255, 255, 0.05) !important;
}
.footer {
text-align: center !important;
color: #FFD700 !important;
margin-top: 2rem !important;
}
"""
# Create Gradio interface: a header, a two-column row (upload + button +
# status on the left, download on the right) and a footer.
# NOTE(review): the emoji and bullet characters below were restored from
# mis-decoded UTF-8. The heading, upload-label and Scripture icons could not
# be recovered exactly, so representative glyphs are used - confirm.
with gr.Blocks(css=css, title="ORU Transcript Formatter") as demo:
    gr.HTML("""
<h1>🎓 ORU Transcript Formatter</h1>
<p style="text-align: center; color: #FFD700; font-size: 1.2rem; margin-bottom: 2rem;">
AI-Powered Transcript Formatting • Oral Roberts University
</p>
""")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="📄 Upload Transcript File (.txt)",
                file_types=[".txt"],
                type="filepath",
            )
            format_btn = gr.Button(
                "🤖 Format Transcript",
                variant="primary",
                size="lg",
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=2,
            )
        with gr.Column():
            file_output = gr.File(
                label="📥 Download Formatted Document",
                interactive=False,
            )

    gr.HTML("""
<div class="footer">
<h3>✨ Features</h3>
<p>🎯 AI-powered speaker detection • 📖 Scripture reference highlighting • 🎨 Professional formatting</p>
<p>© 2025 Oral Roberts University • Powered by AI</p>
</div>
""")

    # Connect the interface: format_transcript returns (file path, status),
    # matching the order of the two output components.
    format_btn.click(
        fn=format_transcript,
        inputs=[file_input],
        outputs=[file_output, status_output],
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()