Spaces:

not-lain
/

gpu-utils

Paused

App Files Community

not-lain committed on Apr 4, 2025

Commit

b8354e9

1 Parent(s): ad44b76

fix

Browse files

Files changed (1) hide show

app.py +55 -41

app.py CHANGED Viewed

@@ -179,44 +179,56 @@ def transcribe(audio, task="transcribe"):
         raise gr.Error("No audio file submitted!")
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float16"  # can be changed to "int8" if low on GPU memory
     batch_size = 8  # reduced batch size to be conservative with memory
-    # 1. Load model and transcribe
-    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-    audio_input = whisperx.load_audio(audio)
-    result = model.transcribe(audio_input, batch_size=batch_size)
-    # Clear GPU memory
-    del model
-    gc.collect()
-    torch.cuda.empty_cache()
-    # 2. Align whisper output
-    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
-    # Clear GPU memory
-    del model_a
-    gc.collect()
-    torch.cuda.empty_cache()
-    # 3. Assign speaker labels
-    diarize_model = whisperx.DiarizationPipeline(device=device)
-    diarize_segments = diarize_model(audio_input)
-    # Combine transcription with speaker diarization
-    result = whisperx.assign_word_speakers(diarize_segments, result)
-    # Format output with speaker labels and timestamps
-    formatted_text = ""
-    for segment in result["segments"]:
-        speaker = f"[Speaker {segment['speaker']}]" if "speaker" in segment else ""
-        start_time = f"{segment.get('start', 0):.2f}"
-        end_time = f"{segment.get('end', 0):.2f}"
-        formatted_text += f"[{start_time}s - {end_time}s] {speaker}: {segment['text']}\n"
-    return formatted_text
 @spaces.GPU(duration=120)
@@ -330,13 +342,15 @@ erase_tab = gr.Interface(
 transcribe_tab = gr.Interface(
     fn=main,
     inputs=[
-        gr.Number(6, interactive=False),
-        gr.Audio(type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
-    outputs="text",
     api_name="transcribe",
-    description="Upload an audio file to extract text using Whisper Large V3",
 )
 demo = gr.TabbedInterface(

         raise gr.Error("No audio file submitted!")
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float16"
     batch_size = 8  # reduced batch size to be conservative with memory
+    try:
+        # 1. Load model and transcribe
+        model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+        audio_input = whisperx.load_audio(audio)
+        result = model.transcribe(audio_input, batch_size=batch_size)
+        # Clear GPU memory
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+        # 2. Align whisper output
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+        result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+        # Clear GPU memory
+        del model_a
+        gc.collect()
+        torch.cuda.empty_cache()
+        # 3. Assign speaker labels
+        diarize_model = whisperx.DiarizationPipeline(device=device)
+        diarize_segments = diarize_model(audio_input)
+        # Combine transcription with speaker diarization
+        result = whisperx.assign_word_speakers(diarize_segments, result)
+        # Format output with speaker labels and timestamps
+        formatted_text = []
+        for segment in result["segments"]:
+            if not isinstance(segment, dict):
+                continue
+            speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
+            start_time = f"{float(segment.get('start', 0)):.2f}"
+            end_time = f"{float(segment.get('end', 0)):.2f}"
+            text = segment.get('text', '').strip()
+            formatted_text.append(f"[{start_time}s - {end_time}s] {speaker}: {text}")
+        return "\n".join(formatted_text)
+    except Exception as e:
+        raise gr.Error(f"Transcription failed: {str(e)}")
+    finally:
+        # Ensure GPU memory is cleared even if an error occurs
+        gc.collect()
+        torch.cuda.empty_cache()
 @spaces.GPU(duration=120)
 transcribe_tab = gr.Interface(
     fn=main,
     inputs=[
+        gr.Number(value=6, visible=False, precision=0),  # API number
+        gr.Audio(type="filepath", label="Audio File"),
+        gr.Radio(choices=["transcribe", "translate"], value="transcribe", label="Task", visible=True),
     ],
+    outputs=gr.Textbox(label="Transcription"),
+    title="Audio Transcription",
+    description="Upload an audio file to extract text using WhisperX with speaker diarization",
     api_name="transcribe",
+    examples=[]
 )
 demo = gr.TabbedInterface(