Spaces:

not-lain
/

gpu-utils

Paused

App Files Community

not-lain committed on Apr 4, 2025

Commit

b21d0d9

1 Parent(s): b5d1281

remove

Browse files

Files changed (2) hide show

app.py +52 -52
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ from PIL import Image, ImageOps
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
-import whisperx
 import gc
 @contextmanager
@@ -174,61 +174,61 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)
-def transcribe(audio):
-    if audio is None:
-        raise gr.Error("No audio file submitted!")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float16"
-    batch_size = 8  # reduced batch size to be conservative with memory
-    try:
-        # 1. Load model and transcribe
-        model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-        audio_input = whisperx.load_audio(audio)
-        result = model.transcribe(audio_input, batch_size=batch_size)
-        # Clear GPU memory
-        del model
-        gc.collect()
-        torch.cuda.empty_cache()
-        # 2. Align whisper output
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-        result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
-        # Clear GPU memory
-        del model_a
-        gc.collect()
-        torch.cuda.empty_cache()
-        # 3. Assign speaker labels
-        diarize_model = whisperx.DiarizationPipeline(device=device)
-        diarize_segments = diarize_model(audio_input)
-        # Combine transcription with speaker diarization
-        result = whisperx.assign_word_speakers(diarize_segments, result)
-        # Format output with speaker labels and timestamps
-        formatted_text = []
-        for segment in result["segments"]:
-            if not isinstance(segment, dict):
-                continue
-            speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
-            start_time = f"{float(segment.get('start', 0)):.2f}"
-            end_time = f"{float(segment.get('end', 0)):.2f}"
-            text = segment.get('text', '').strip()
-            formatted_text.append(f"[{start_time}s - {end_time}s] {speaker}: {text}")
-        return "\n".join(formatted_text)
-    except Exception as e:
-        raise gr.Error(f"Transcription failed: {str(e)}")
-    finally:
-        # Ensure GPU memory is cleared even if an error occurs
-        gc.collect()
-        torch.cuda.empty_cache()
 @spaces.GPU(duration=120)
@@ -245,8 +245,8 @@ def main(*args):
     #     return mask_generation(*args)
     elif api_num == 5:
         return erase(*args)
-    elif api_num == 6:
-        return transcribe(*args)
 rmbg_tab = gr.Interface(
@@ -367,7 +367,7 @@ demo = gr.TabbedInterface(
         "inpainting",
         #  "sam2",
         "erase",
-        "transcribe",
     ],
     title="Utilities that require GPU",
 )

 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
+# import whisperx
 import gc
 @contextmanager
     return simple_lama(image, mask)
+# def transcribe(audio):
+#     if audio is None:
+#         raise gr.Error("No audio file submitted!")
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     compute_type = "float16"
+#     batch_size = 8  # reduced batch size to be conservative with memory
+#     try:
+#         # 1. Load model and transcribe
+#         model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+#         audio_input = whisperx.load_audio(audio)
+#         result = model.transcribe(audio_input, batch_size=batch_size)
+#         # Clear GPU memory
+#         del model
+#         gc.collect()
+#         torch.cuda.empty_cache()
+#         # 2. Align whisper output
+#         model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+#         result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+#         # Clear GPU memory
+#         del model_a
+#         gc.collect()
+#         torch.cuda.empty_cache()
+#         # 3. Assign speaker labels
+#         diarize_model = whisperx.DiarizationPipeline(device=device)
+#         diarize_segments = diarize_model(audio_input)
+#         # Combine transcription with speaker diarization
+#         result = whisperx.assign_word_speakers(diarize_segments, result)
+#         # Format output with speaker labels and timestamps
+#         formatted_text = []
+#         for segment in result["segments"]:
+#             if not isinstance(segment, dict):
+#                 continue
+#             speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
+#             start_time = f"{float(segment.get('start', 0)):.2f}"
+#             end_time = f"{float(segment.get('end', 0)):.2f}"
+#             text = segment.get('text', '').strip()
+#             formatted_text.append(f"[{start_time}s - {end_time}s] {speaker}: {text}")
+#         return "\n".join(formatted_text)
+#     except Exception as e:
+#         raise gr.Error(f"Transcription failed: {str(e)}")
+#     finally:
+#         # Ensure GPU memory is cleared even if an error occurs
+#         gc.collect()
+#         torch.cuda.empty_cache()
 @spaces.GPU(duration=120)
     #     return mask_generation(*args)
     elif api_num == 5:
         return erase(*args)
+    # elif api_num == 6:
+    #     return transcribe(*args)
 rmbg_tab = gr.Interface(
         "inpainting",
         #  "sam2",
         "erase",
+        # "transcribe",
     ],
     title="Utilities that require GPU",
 )

requirements.txt CHANGED Viewed

@@ -22,4 +22,4 @@ einops
 # git+https://github.com/facebookresearch/sam2.git
 matplotlib
 simple-lama-inpainting
-git+https://github.com/m-bain/whisperX.git

 # git+https://github.com/facebookresearch/sam2.git
 matplotlib
 simple-lama-inpainting
+# git+https://github.com/m-bain/whisperX.git