fix
Browse files
app.py
CHANGED
|
@@ -179,44 +179,56 @@ def transcribe(audio, task="transcribe"):
|
|
| 179 |
raise gr.Error("No audio file submitted!")
|
| 180 |
|
| 181 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 182 |
-
compute_type = "float16"
|
| 183 |
batch_size = 8 # reduced batch size to be conservative with memory
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
|
| 222 |
@spaces.GPU(duration=120)
|
|
@@ -330,13 +342,15 @@ erase_tab = gr.Interface(
|
|
| 330 |
transcribe_tab = gr.Interface(
|
| 331 |
fn=main,
|
| 332 |
inputs=[
|
| 333 |
-
gr.Number(6,
|
| 334 |
-
gr.Audio(type="filepath"),
|
| 335 |
-
gr.Radio(["transcribe", "translate"], label="Task",
|
| 336 |
],
|
| 337 |
-
outputs="
|
|
|
|
|
|
|
| 338 |
api_name="transcribe",
|
| 339 |
-
|
| 340 |
)
|
| 341 |
|
| 342 |
demo = gr.TabbedInterface(
|
|
|
|
| 179 |
raise gr.Error("No audio file submitted!")
|
| 180 |
|
| 181 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 182 |
+
compute_type = "float16"
|
| 183 |
batch_size = 8 # reduced batch size to be conservative with memory
|
| 184 |
|
| 185 |
+
try:
|
| 186 |
+
# 1. Load model and transcribe
|
| 187 |
+
model = whisperx.load_model("large-v2", device, compute_type=compute_type)
|
| 188 |
+
audio_input = whisperx.load_audio(audio)
|
| 189 |
+
result = model.transcribe(audio_input, batch_size=batch_size)
|
| 190 |
+
|
| 191 |
+
# Clear GPU memory
|
| 192 |
+
del model
|
| 193 |
+
gc.collect()
|
| 194 |
+
torch.cuda.empty_cache()
|
| 195 |
+
|
| 196 |
+
# 2. Align whisper output
|
| 197 |
+
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
|
| 198 |
+
result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
|
| 199 |
+
|
| 200 |
+
# Clear GPU memory
|
| 201 |
+
del model_a
|
| 202 |
+
gc.collect()
|
| 203 |
+
torch.cuda.empty_cache()
|
| 204 |
+
|
| 205 |
+
# 3. Assign speaker labels
|
| 206 |
+
diarize_model = whisperx.DiarizationPipeline(device=device)
|
| 207 |
+
diarize_segments = diarize_model(audio_input)
|
| 208 |
+
|
| 209 |
+
# Combine transcription with speaker diarization
|
| 210 |
+
result = whisperx.assign_word_speakers(diarize_segments, result)
|
| 211 |
+
|
| 212 |
+
# Format output with speaker labels and timestamps
|
| 213 |
+
formatted_text = []
|
| 214 |
+
for segment in result["segments"]:
|
| 215 |
+
if not isinstance(segment, dict):
|
| 216 |
+
continue
|
| 217 |
+
|
| 218 |
+
speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
|
| 219 |
+
start_time = f"{float(segment.get('start', 0)):.2f}"
|
| 220 |
+
end_time = f"{float(segment.get('end', 0)):.2f}"
|
| 221 |
+
text = segment.get('text', '').strip()
|
| 222 |
+
formatted_text.append(f"[{start_time}s - {end_time}s] {speaker}: {text}")
|
| 223 |
+
|
| 224 |
+
return "\n".join(formatted_text)
|
| 225 |
+
|
| 226 |
+
except Exception as e:
|
| 227 |
+
raise gr.Error(f"Transcription failed: {str(e)}")
|
| 228 |
+
finally:
|
| 229 |
+
# Ensure GPU memory is cleared even if an error occurs
|
| 230 |
+
gc.collect()
|
| 231 |
+
torch.cuda.empty_cache()
|
| 232 |
|
| 233 |
|
| 234 |
@spaces.GPU(duration=120)
|
|
|
|
| 342 |
transcribe_tab = gr.Interface(
|
| 343 |
fn=main,
|
| 344 |
inputs=[
|
| 345 |
+
gr.Number(value=6, visible=False, precision=0), # API number
|
| 346 |
+
gr.Audio(type="filepath", label="Audio File"),
|
| 347 |
+
gr.Radio(choices=["transcribe", "translate"], value="transcribe", label="Task", visible=True),
|
| 348 |
],
|
| 349 |
+
outputs=gr.Textbox(label="Transcription"),
|
| 350 |
+
title="Audio Transcription",
|
| 351 |
+
description="Upload an audio file to extract text using WhisperX with speaker diarization",
|
| 352 |
api_name="transcribe",
|
| 353 |
+
examples=[]
|
| 354 |
)
|
| 355 |
|
| 356 |
demo = gr.TabbedInterface(
|