NoLev committed on
Commit
f977db3
·
verified ·
1 Parent(s): 7d6b2b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -37
app.py CHANGED
@@ -1,6 +1,11 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  import torch
 
 
 
 
 
4
 
5
  # Global cache for pipelines to avoid reloading models
6
  pipelines = {}
@@ -26,42 +31,95 @@ def get_pipeline(model_id):
26
  )
27
  return pipelines[model_id]
28
 
29
- # Transcription function with chunking for long audio
30
- def transcribe_speech(audio_file, model_id, language="english", return_timestamps=False):
31
- if audio_file is None:
32
- return "Please upload an audio file."
33
 
34
- pipe = get_pipeline(model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- # Generate kwargs for transcription
37
- generate_kwargs = {"task": "transcribe", "language": language}
38
- if return_timestamps:
39
- generate_kwargs["return_timestamps"] = True
 
 
 
40
 
41
- # Transcribe with chunking for long files
42
- output = pipe(
43
- audio_file,
44
- max_new_tokens=128, # Per chunk for stability
45
- generate_kwargs=generate_kwargs,
46
- chunk_length_s=30,
47
- stride_length_s=5, # Overlap for smooth transitions
48
- batch_size=8 if "tiny" not in model_id and "base" not in model_id else 16, # Adjust batch for smaller models
49
- return_timestamps=return_timestamps,
50
- )
51
 
52
- if return_timestamps:
53
- # Format with timestamps if requested
54
- if "chunks" in output:
55
- formatted = []
56
- for chunk in output["chunks"]:
57
- start = f"{chunk['timestamp'][0]:.2f}s" if chunk['timestamp'][0] is not None else "0.00s"
58
- end = f"{chunk['timestamp'][1]:.2f}s" if chunk['timestamp'][1] is not None else "?.?s"
59
- formatted.append(f"[{start} - {end}] {chunk['text']}")
60
- return "\n".join(formatted)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  else:
62
- return output["text"] # Fallback
63
- else:
64
- return output["text"]
 
 
 
 
65
 
66
  # Create the Gradio app with a colorful, responsive theme
67
  theme = gr.themes.Soft(
@@ -75,7 +133,7 @@ with gr.Blocks(theme=theme, title="MP3 to Text Transcriber") as demo:
75
  gr.Markdown(
76
  """
77
  # 🎀 MP3 to Text Transcription Tool
78
- Upload an MP3 (or any audio file) and transcribe it to text using OpenAI's Whisper models.
79
  Supports long files up to hoursβ€”handles 45+ minutes effortlessly!
80
  Choose a model for speed vs. accuracy trade-off.
81
  """,
@@ -84,6 +142,7 @@ with gr.Blocks(theme=theme, title="MP3 to Text Transcriber") as demo:
84
 
85
  with gr.Row(variant="panel", elem_classes=["max-w-4xl mx-auto"]):
86
  with gr.Column(scale=1):
 
87
  audio_input = gr.Audio(
88
  sources="upload",
89
  type="filepath",
@@ -91,6 +150,13 @@ with gr.Blocks(theme=theme, title="MP3 to Text Transcriber") as demo:
91
  elem_classes=["w-full"]
92
  )
93
 
 
 
 
 
 
 
 
94
  model_dropdown = gr.Dropdown(
95
  choices=MODEL_OPTIONS,
96
  value=MODEL_OPTIONS[1], # Default to base
@@ -111,12 +177,15 @@ with gr.Blocks(theme=theme, title="MP3 to Text Transcriber") as demo:
111
  value=False,
112
  info="Adds [start - end] tags to the transcript."
113
  )
114
-
115
- transcribe_btn = gr.Button("πŸš€ Transcribe Audio", variant="primary", size="lg", elem_classes=["w-full"])
116
 
117
  with gr.Column(scale=1):
118
  status_output = gr.Markdown("Ready to transcribe! πŸ’¬", elem_classes=["text-center"])
119
 
 
 
 
 
 
120
  transcript_output = gr.Textbox(
121
  label="πŸ“ Transcript",
122
  lines=15,
@@ -130,11 +199,23 @@ with gr.Blocks(theme=theme, title="MP3 to Text Transcriber") as demo:
130
  def update_status(msg):
131
  return gr.Markdown(f"**{msg}**")
132
 
 
133
  transcribe_btn.click(
134
- fn=transcribe_speech,
135
  inputs=[audio_input, model_dropdown, language_dropdown, timestamps_checkbox],
136
- outputs=transcript_output,
137
- show_progress=True # Progress bar for long transcriptions
 
 
 
 
 
 
 
 
 
 
 
138
  ).then(
139
  fn=lambda: update_status("Transcription complete! πŸŽ‰"),
140
  outputs=status_output
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import torch
4
+ import requests
5
+ import re
6
+ import tempfile
7
+ import os
8
+ from io import BytesIO
9
 
10
  # Global cache for pipelines to avoid reloading models
11
  pipelines = {}
 
31
  )
32
  return pipelines[model_id]
33
 
34
# Function to fetch MP3 from Apple Podcasts episode URL
def fetch_podcast_mp3(podcast_url):
    """Resolve an Apple Podcasts episode URL to a downloaded local MP3 file.

    Args:
        podcast_url: Episode page URL on podcasts.apple.com.

    Returns:
        Tuple of (path, message): path to a temporary ``.mp3`` file plus a
        status message on success, or (None, error_message) on failure.
        The caller is responsible for deleting the temp file.
    """
    if not podcast_url or "podcasts.apple.com" not in podcast_url:
        return None, "Invalid Apple Podcasts URL. Please use a valid episode link (e.g., https://podcasts.apple.com/...)."

    headers = {"User-Agent": "Mozilla/5.0 (compatible; PodcastTranscriber/1.0)"}
    try:
        # Fetch the episode page HTML; timeout so a dead host can't hang the UI.
        response = requests.get(podcast_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text

        # Extract assetUrl (MP3) using regex - looks like "assetUrl":"https://...mp3"
        match = re.search(r'"assetUrl"\s*:\s*"([^"]+\.mp3[^"]*)"', html)
        if not match:
            return None, "Could not find MP3 URL. The episode might be private or the page structure changed."

        mp3_url = match.group(1)

        # Stream the MP3 to a temp file in chunks instead of buffering the
        # whole episode (long podcasts can be hundreds of MB) in memory.
        with requests.get(mp3_url, headers=headers, timeout=60, stream=True) as mp3_response:
            mp3_response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                for chunk in mp3_response.iter_content(chunk_size=1 << 20):
                    tmp_file.write(chunk)
                temp_path = tmp_file.name

        return temp_path, f"Downloaded episode: {os.path.getsize(temp_path) / (1024*1024):.1f} MB"

    except Exception as e:
        # Boundary function feeding a UI status box: report, never raise.
        return None, f"Error fetching MP3: {str(e)}"
64
+
65
# Transcription function with chunking for long audio
def transcribe_speech(audio_input, model_id, language="english", return_timestamps=False, podcast_url=None):
    """Transcribe an uploaded audio file or a fetched Apple Podcasts episode.

    Args:
        audio_input: Filepath of the uploaded audio (from gr.Audio), or None.
        model_id: Whisper model identifier passed to get_pipeline().
        language: Target language for Whisper's "transcribe" task.
        return_timestamps: When True, format output as "[start - end] text" lines.
        podcast_url: Optional Apple Podcasts episode URL; takes priority over
            the uploaded file when provided.

    Returns:
        Tuple of (transcript_or_error, status_message) for the two output widgets.
    """
    audio_file = None
    status_msg = ""

    # If podcast URL provided, fetch MP3 first
    if podcast_url:
        audio_file, status_msg = fetch_podcast_mp3(podcast_url)
        if not audio_file:
            return status_msg, status_msg  # Error message
    else:
        # Use uploaded file
        if audio_input is None:
            return "Please upload an audio file or provide a podcast URL.", "Ready to transcribe! 💬"
        audio_file = audio_input

    try:
        pipe = get_pipeline(model_id)

        # Generate kwargs for transcription. NOTE: return_timestamps is passed
        # only as a pipeline kwarg below — duplicating it inside generate_kwargs
        # is redundant and can raise "multiple values" errors in some
        # transformers versions.
        generate_kwargs = {"task": "transcribe", "language": language}

        # Transcribe with chunking for long files
        output = pipe(
            audio_file,
            max_new_tokens=128,  # Per chunk for stability
            generate_kwargs=generate_kwargs,
            chunk_length_s=30,
            stride_length_s=5,  # Overlap for smooth transitions
            batch_size=8 if "tiny" not in model_id and "base" not in model_id else 16,  # Adjust batch for smaller models
            return_timestamps=return_timestamps,
        )

        if return_timestamps:
            # Format with timestamps if requested
            if "chunks" in output:
                formatted = []
                for chunk in output["chunks"]:
                    # End timestamp can be None on the final chunk.
                    start = f"{chunk['timestamp'][0]:.2f}s" if chunk['timestamp'][0] is not None else "0.00s"
                    end = f"{chunk['timestamp'][1]:.2f}s" if chunk['timestamp'][1] is not None else "?.?s"
                    formatted.append(f"[{start} - {end}] {chunk['text']}")
                return "\n".join(formatted), "Transcription complete with timestamps! 🎉"
            else:
                return output["text"], "Transcription complete! 🎉"  # Fallback
        else:
            return output["text"], "Transcription complete! 🎉"

    except Exception as e:
        return f"Transcription error: {str(e)}", f"Error: {str(e)}"

    finally:
        # Clean up a downloaded temp file on every exit path (success, error,
        # or formatting failure) instead of duplicating the unlink in two places.
        if podcast_url and audio_file and os.path.exists(audio_file):
            os.unlink(audio_file)
123
 
124
  # Create the Gradio app with a colorful, responsive theme
125
  theme = gr.themes.Soft(
 
133
  gr.Markdown(
134
  """
135
  # 🎀 MP3 to Text Transcription Tool
136
+ Upload an MP3 (or any audio file) **or** paste an Apple Podcasts episode URL to fetch and transcribe it automatically!
137
  Supports long files up to hoursβ€”handles 45+ minutes effortlessly!
138
  Choose a model for speed vs. accuracy trade-off.
139
  """,
 
142
 
143
  with gr.Row(variant="panel", elem_classes=["max-w-4xl mx-auto"]):
144
  with gr.Column(scale=1):
145
+ # Option 1: File upload
146
  audio_input = gr.Audio(
147
  sources="upload",
148
  type="filepath",
 
150
  elem_classes=["w-full"]
151
  )
152
 
153
+ # Option 2: Podcast URL
154
+ podcast_input = gr.Textbox(
155
+ label="πŸ”— Apple Podcasts Episode URL (optional)",
156
+ placeholder="e.g., https://podcasts.apple.com/us/podcast/example/id123?i=456",
157
+ elem_classes=["w-full"]
158
+ )
159
+
160
  model_dropdown = gr.Dropdown(
161
  choices=MODEL_OPTIONS,
162
  value=MODEL_OPTIONS[1], # Default to base
 
177
  value=False,
178
  info="Adds [start - end] tags to the transcript."
179
  )
 
 
180
 
181
  with gr.Column(scale=1):
182
  status_output = gr.Markdown("Ready to transcribe! πŸ’¬", elem_classes=["text-center"])
183
 
184
+ # Buttons
185
+ with gr.Row(elem_classes=["w-full"]):
186
+ transcribe_btn = gr.Button("πŸš€ Transcribe Uploaded File", variant="secondary", elem_classes=["flex-1"])
187
+ podcast_btn = gr.Button("πŸ“‘ Fetch & Transcribe Podcast", variant="primary", elem_classes=["flex-1"])
188
+
189
  transcript_output = gr.Textbox(
190
  label="πŸ“ Transcript",
191
  lines=15,
 
199
  def update_status(msg):
200
  return gr.Markdown(f"**{msg}**")
201
 
202
+ # For uploaded file
203
  transcribe_btn.click(
204
+ fn=lambda audio, model, lang, ts: transcribe_speech(audio, model, lang, ts, None),
205
  inputs=[audio_input, model_dropdown, language_dropdown, timestamps_checkbox],
206
+ outputs=[transcript_output, status_output],
207
+ show_progress=True
208
+ ).then(
209
+ fn=lambda: update_status("Transcription complete! πŸŽ‰"),
210
+ outputs=status_output
211
+ )
212
+
213
+ # For podcast URL
214
+ podcast_btn.click(
215
+ fn=lambda url, model, lang, ts: transcribe_speech(None, model, lang, ts, url),
216
+ inputs=[podcast_input, model_dropdown, language_dropdown, timestamps_checkbox],
217
+ outputs=[transcript_output, status_output],
218
+ show_progress=True
219
  ).then(
220
  fn=lambda: update_status("Transcription complete! πŸŽ‰"),
221
  outputs=status_output