drbinna committed on
Commit
b92b8f1
·
verified ·
1 Parent(s): d953b52

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -7
app.py CHANGED
@@ -7,12 +7,92 @@ import os
7
  import tempfile
8
  from pathlib import Path
9
  import gradio as gr
10
- from transcript_formatter.core.claude_formatter import format_with_claude
 
11
  from docx import Document
12
  from docx.shared import Inches, Pt
13
  from docx.enum.text import WD_ALIGN_PARAGRAPH
14
  import re
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def create_word_document(formatted_text, title):
17
  """Create a Word document from formatted text."""
18
  doc = Document()
@@ -90,6 +170,9 @@ def format_transcript(file):
90
  else:
91
  return None, "Please upload a .txt file."
92
 
 
 
 
93
  # Format using AI
94
  formatted_text = format_with_claude(content)
95
 
@@ -105,7 +188,11 @@ def format_transcript(file):
105
  return output_path, "✅ Transcript formatted successfully! Download your Word document below."
106
 
107
  except Exception as e:
108
- return None, f"❌ Error formatting transcript: {str(e)}"
 
 
 
 
109
 
110
  # Custom CSS for ORU branding
111
  css = """
@@ -205,8 +292,4 @@ with gr.Blocks(css=css, title="ORU Transcript Formatter") as demo:
205
 
206
  # Launch the demo
207
  if __name__ == "__main__":
208
- demo.launch(
209
- server_name="0.0.0.0",
210
- server_port=7860,
211
- share=False
212
- )
 
7
  import tempfile
8
  from pathlib import Path
9
  import gradio as gr
10
+ from dotenv import load_dotenv
11
+ import anthropic
12
  from docx import Document
13
  from docx.shared import Inches, Pt
14
  from docx.enum.text import WD_ALIGN_PARAGRAPH
15
  import re
16
 
17
+ # Load environment variables
18
+ load_dotenv()
19
+
20
+ def format_with_claude(text):
21
+ """Format transcript using Claude AI."""
22
+ api_key = os.getenv('ANTHROPIC_API_KEY')
23
+ if not api_key:
24
+ raise ValueError("ANTHROPIC_API_KEY not found. Please add it to your Hugging Face Space secrets.")
25
+
26
+ client = anthropic.Anthropic(api_key=api_key)
27
+
28
+ system_prompt = """You are a professional transcript formatter. Your task is to intelligently format transcripts while preserving all original content and meaning.
29
+
30
+ FORMATTING REQUIREMENTS:
31
+
32
+ 1. SPEAKER NAMES:
33
+ - Bold all speaker names using **Speaker Name:** format
34
+ - Detect various speaker formats (Speaker:, SPEAKER, Speaker Name, etc.)
35
+ - Maintain consistent formatting throughout
36
+
37
+ 2. SCRIPTURE REFERENCES:
38
+ - Bold ALL Scripture references in ANY format using **reference** format
39
+ - Examples to detect and format:
40
+ * 1 John 2:18 → **1 John 2:18**
41
+ * Mark chapter 13 verse 13 → **Mark chapter 13 verse 13**
42
+ * Romans 8:28-30 → **Romans 8:28-30**
43
+ * First Corinthians 15 → **First Corinthians 15**
44
+ * Matt. 5:3-12 → **Matt. 5:3-12**
45
+ - Include partial references, book names, and various formats
46
+
47
+ 3. CHARACTER ENCODING FIXES:
48
+ - Fix common encoding issues:
49
+ * Γ’β„’Βͺ β†’ β™ͺ
50
+ * Ò€ℒ β†’ '
51
+ * Ò€œ β†’ "
52
+ * Ò€ β†’ "
53
+ * Ò€" β†’ β€”
54
+ * Ò€" β†’ –
55
+ - Convert smart quotes to proper Unicode
56
+ - Fix any other character encoding problems
57
+
58
+ 4. MUSIC SYMBOLS:
59
+ - Remove excessive music symbols (♪♪♪ → ♪ or remove entirely if appropriate)
60
+ - Clean up music notations while preserving meaning
61
+ - Keep single music symbols if they add context
62
+
63
+ 5. PARAGRAPH STRUCTURE:
64
+ - Create proper paragraph breaks at natural speech boundaries
65
+ - Merge fragmented lines into coherent paragraphs
66
+ - Maintain logical flow and readability
67
+ - Separate different speakers or topics appropriately
68
+
69
+ 6. CONTENT PRESERVATION:
70
+ - Preserve ALL original content and meaning
71
+ - Do not add, remove, or change the substance of what was said
72
+ - Maintain the speaker's voice and style
73
+ - Keep all important details and context
74
+
75
+ 7. TIMESTAMP REMOVAL:
76
+ - Remove timestamps if present (e.g., [00:15:30], (2:45), etc.)
77
+ - Clean up any time markers that interrupt the flow
78
+
79
+ 8. OUTPUT FORMAT:
80
+ - Return the formatted text in clean markdown format
81
+ - Use proper markdown syntax
82
+ - Ensure readability and professional appearance
83
+
84
+ Remember: Your goal is to make the transcript more readable and professional while preserving every bit of the original meaning and content."""
85
+
86
+ message = client.messages.create(
87
+ model="claude-3-5-sonnet-20240620",
88
+ max_tokens=8000,
89
+ temperature=0.1,
90
+ system=system_prompt,
91
+ messages=[{"role": "user", "content": f"Please format this transcript:\n\n{text}"}]
92
+ )
93
+
94
+ return message.content[0].text
95
+
96
  def create_word_document(formatted_text, title):
97
  """Create a Word document from formatted text."""
98
  doc = Document()
 
170
  else:
171
  return None, "Please upload a .txt file."
172
 
173
+ if not content.strip():
174
+ return None, "The uploaded file appears to be empty."
175
+
176
  # Format using AI
177
  formatted_text = format_with_claude(content)
178
 
 
188
  return output_path, "✅ Transcript formatted successfully! Download your Word document below."
189
 
190
  except Exception as e:
191
+ error_msg = str(e)
192
+ if "ANTHROPIC_API_KEY" in error_msg:
193
+ return None, "❌ API key not configured. Please contact the administrator."
194
+ else:
195
+ return None, f"❌ Error formatting transcript: {error_msg}"
196
 
197
  # Custom CSS for ORU branding
198
  css = """
 
292
 
293
  # Launch the demo
294
  if __name__ == "__main__":
295
+