# Source: Hugging Face Spaces file viewer export (Space status: Sleeping).
# File size: 8,981 bytes
# Contributing commits: 451c519, b92b8f1, b94eec8
"""
ORU Transcript Formatter - Hugging Face Spaces Deployment
AI-Powered Transcript Formatting with ORU Branding
"""
import os
import tempfile
from pathlib import Path
import gradio as gr
from dotenv import load_dotenv
import anthropic
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
import re
# Load environment variables at import time, before any handler runs.
# format_with_claude() reads ANTHROPIC_API_KEY via os.getenv(); presumably this
# call makes a key stored in a local .env file visible to it — confirm against
# the deployment setup.
load_dotenv()
# System prompt sent with every formatting request. Kept at module level so the
# (long) text is built once and the function body stays readable.
_TRANSCRIPT_SYSTEM_PROMPT = """You are a professional transcript formatter. Your task is to intelligently format transcripts while preserving all original content and meaning.
FORMATTING REQUIREMENTS:
1. SPEAKER NAMES:
- Bold all speaker names using **Speaker Name:** format
- Detect various speaker formats (Speaker:, SPEAKER, Speaker Name, etc.)
- Maintain consistent formatting throughout
2. SCRIPTURE REFERENCES:
- Bold ALL Scripture references in ANY format using **reference** format
- Examples to detect and format:
* 1 John 2:18 → **1 John 2:18**
* Mark chapter 13 verse 13 → **Mark chapter 13 verse 13**
* Romans 8:28-30 → **Romans 8:28-30**
* First Corinthians 15 → **First Corinthians 15**
* Matt. 5:3-12 → **Matt. 5:3-12**
- Include partial references, book names, and various formats
3. CHARACTER ENCODING FIXES:
- Fix common encoding issues
- Convert smart quotes to proper Unicode
- Fix any other character encoding problems
4. MUSIC SYMBOLS:
- Remove excessive music symbols (♪♪♪ → ♪ or remove entirely if appropriate)
- Clean up music notations while preserving meaning
5. PARAGRAPH STRUCTURE:
- Create proper paragraph breaks at natural speech boundaries
- Merge fragmented lines into coherent paragraphs
- Maintain logical flow and readability
6. CONTENT PRESERVATION:
- Preserve ALL original content and meaning
- Do not add, remove, or change the substance of what was said
- Maintain the speaker's voice and style
7. TIMESTAMP REMOVAL:
- Remove timestamps if present (e.g., [00:15:30], (2:45), etc.)
- Clean up any time markers that interrupt the flow
8. OUTPUT FORMAT:
- Return the formatted text in clean markdown format
- Use proper markdown syntax
- Ensure readability and professional appearance
Remember: Your goal is to make the transcript more readable and professional while preserving every bit of the original meaning and content."""


def format_with_claude(text, *, model="claude-3-5-sonnet-20240620", max_tokens=8000):
    """Format a transcript with Claude and return the result as markdown text.

    Args:
        text: Raw transcript text to format.
        model: Anthropic model identifier to use for the request.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        The concatenated text of every text block in Claude's reply.

    Raises:
        ValueError: If the ANTHROPIC_API_KEY environment variable is not set.
        RuntimeError: If the reply contains no text content.
    """
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY not found. Please add it to your Hugging Face Space secrets.")

    client = anthropic.Anthropic(api_key=api_key)
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=0.1,
        system=_TRANSCRIPT_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": f"Please format this transcript:\n\n{text}"}],
    )

    # Join every text block instead of indexing content[0]: an empty reply would
    # otherwise surface as an opaque IndexError, and later text blocks would be
    # silently dropped.
    text_parts = [
        block.text
        for block in message.content
        if getattr(block, "type", "text") == "text"
    ]
    if not text_parts:
        raise RuntimeError("Claude returned no text content for the transcript.")
    return "".join(text_parts)
# Matches one markdown bold span such as **Speaker:**; the capture group makes
# re.split() keep the span in its output.
_BOLD_SPAN = re.compile(r'(\*\*[^*]+\*\*)')


def create_word_document(formatted_text, title):
    """Create a Word document from markdown-style formatted text.

    The document gets 1-inch margins, a centered bold title, a small italic
    attribution line and an underscore separator. Each non-blank line of
    ``formatted_text`` then becomes its own paragraph, with ``**bold**`` spans
    rendered as bold runs.

    Args:
        formatted_text: Transcript text, optionally containing ``**bold**`` markers.
        title: Heading placed at the top of the document.

    Returns:
        The populated ``Document`` object (not yet saved to disk).
    """
    doc = Document()

    # Set margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add title
    title_para = doc.add_paragraph()
    title_run = title_para.add_run(title)
    title_run.font.size = Pt(16)
    title_run.bold = True
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Spacing lives on paragraph_format; assigning `space_after` directly on the
    # paragraph only creates an unused attribute and leaves the layout unchanged.
    title_para.paragraph_format.space_after = Pt(24)

    # Add metadata
    meta_para = doc.add_paragraph()
    meta_run = meta_para.add_run("Formatted with AI • ORU Transcript Formatter")
    meta_run.font.size = Pt(10)
    meta_run.italic = True
    meta_para.paragraph_format.space_after = Pt(12)

    # Add separator
    separator = doc.add_paragraph("_" * 50)
    separator.paragraph_format.space_after = Pt(12)

    # Process the formatted text and add to document
    for raw_line in formatted_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        para = doc.add_paragraph()
        # Splitting a line without bold markers yields the line itself, so one
        # code path covers both plain and partially bold lines.
        for part in _BOLD_SPAN.split(line):
            if not part:
                continue
            if _BOLD_SPAN.fullmatch(part):
                run = para.add_run(part[2:-2])  # Remove ** markers
                run.bold = True
            else:
                # Includes stray "**" that is not a complete span; keep it as
                # literal text rather than emitting an empty bold run.
                run = para.add_run(part)
            run.font.size = Pt(11)

        para.paragraph_format.space_after = Pt(6)

    return doc
def format_transcript(file):
    """Format an uploaded transcript file and save the result as a .docx file.

    Args:
        file: The upload handed over by the UI. Either a filesystem path
            (``str`` or path-like) or an object exposing the path as ``.name``;
            ``None`` when nothing was uploaded.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the path
        of the generated Word document, or ``None`` when validation or
        formatting failed; ``status_message`` is the text shown to the user.
    """
    if file is None:
        return None, "Please upload a transcript file."

    try:
        # Plain paths are used as-is; only wrapper objects are unwrapped via
        # `.name` (a pathlib.Path's own `.name` would drop the directory).
        if isinstance(file, (str, os.PathLike)):
            source_path = os.fspath(file)
        else:
            source_path = file.name

        # Read the uploaded file
        if not source_path.lower().endswith('.txt'):
            return None, "Please upload a .txt file."
        # utf-8-sig transparently drops a leading byte-order mark, which would
        # otherwise end up in the transcript text.
        with open(source_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        if not content.strip():
            return None, "The uploaded file appears to be empty."

        # Format using AI
        formatted_text = format_with_claude(content)

        # Create Word document
        title = Path(source_path).stem.replace('_', ' ').replace('-', ' ')
        doc = create_word_document(formatted_text, title)

        # Save to temporary file. NamedTemporaryFile creates the file atomically,
        # unlike the deprecated tempfile.mktemp(), which only returns a name that
        # another process could claim first. The handle is closed before saving
        # so the path can be reopened on every platform.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            output_path = tmp.name
        doc.save(output_path)

        # Return file and success message
        return output_path, "✅ Transcript formatted successfully! Download your Word document below."
    except Exception as e:
        # UI boundary: report every failure as a status message instead of
        # letting the exception escape to the user.
        error_msg = str(e)
        if "ANTHROPIC_API_KEY" in error_msg:
            return None, "❌ API key not configured. Please contact the administrator."
        return None, f"❌ Error formatting transcript: {error_msg}"
# Custom CSS for ORU branding.
# Passed to gr.Blocks(css=...) below. It styles the page container (navy
# gradient background, white text), the primary button (gold gradient with a
# hover state), the <h1> heading, the form and file-upload areas, and the
# .footer block used by the features/copyright HTML.
css = """
.gradio-container {
background: linear-gradient(135deg, #003366 0%, #002244 100%) !important;
color: white !important;
}
.gr-button-primary {
background: linear-gradient(135deg, #FFD700 0%, #FFC107 100%) !important;
color: #003366 !important;
border: none !important;
font-weight: bold !important;
}
.gr-button-primary:hover {
background: linear-gradient(135deg, #FFC107 0%, #FFB300 100%) !important;
transform: translateY(-1px) !important;
}
h1 {
color: #FFD700 !important;
text-align: center !important;
font-size: 2.5rem !important;
margin-bottom: 1rem !important;
}
.gr-form {
background: rgba(255, 255, 255, 0.1) !important;
border-radius: 15px !important;
padding: 2rem !important;
backdrop-filter: blur(10px) !important;
}
.gr-file {
border: 2px dashed #4A90E2 !important;
border-radius: 10px !important;
background: rgba(255, 255, 255, 0.05) !important;
}
.footer {
text-align: center !important;
color: #FFD700 !important;
margin-top: 2rem !important;
}
"""
# Create Gradio interface.
# Layout: a header, then one row with two columns (upload + action + status on
# the left, download on the right), then a footer. The click handler is wired
# inside the Blocks context so it is registered on this app.
# NOTE(review): the emoji in the heading, the upload label and the "Scripture
# reference" feature were garbled beyond recovery in the source; the glyphs
# used below (🎓, 📁, 📖) are best guesses — confirm against the original.
with gr.Blocks(css=css, title="ORU Transcript Formatter") as demo:
    gr.HTML("""
    <h1>🎓 ORU Transcript Formatter</h1>
    <p style="text-align: center; color: #FFD700; font-size: 1.2rem; margin-bottom: 2rem;">
    AI-Powered Transcript Formatting • Oral Roberts University
    </p>
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="📁 Upload Transcript File (.txt)",
                file_types=[".txt"],
                type="filepath"
            )
            format_btn = gr.Button(
                "🤖 Format Transcript",
                variant="primary",
                size="lg"
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=2
            )
        with gr.Column():
            file_output = gr.File(
                label="📥 Download Formatted Document",
                interactive=False
            )

    gr.HTML("""
    <div class="footer">
    <h3>✨ Features</h3>
    <p>🎯 AI-powered speaker detection • 📖 Scripture reference highlighting • 🎨 Professional formatting</p>
    <p>© 2025 Oral Roberts University • Powered by AI</p>
    </div>
    """)

    # Connect the interface: format_transcript returns (file path, status text),
    # matching the order of the outputs below.
    format_btn.click(
        fn=format_transcript,
        inputs=[file_input],
        outputs=[file_output, status_output]
    )
# Launch the demo only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()