Spaces:

banao-tech
/

MeetRecorderCommunity1

Paused

App Files Files Community

banao-tech committed 22 days ago

Commit

3bc012e

verified ·

1 Parent(s): e0d7644

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -14

app.py CHANGED Viewed

@@ -11,13 +11,15 @@ from pathlib import Path
 import gradio as gr
 import pandas as pd
 import torch
 from faster_whisper import WhisperModel
 from pyannote.audio import Pipeline
-DIAR_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-ASR_DEVICE = "cpu"
-ASR_COMPUTE_TYPE = "int8"
 BAD_PHRASES = [
     "transcribe exactly",
@@ -66,6 +68,13 @@ def to_wav_16k_mono(input_path: Path, output_path: Path, enhance_audio: bool):
     run_cmd(cmd)
     return output_path
 def normalize_spaces(text):
     text = (text or "").replace("\n", " ").replace("\r", " ")
     text = re.sub(r"\s+", " ", text).strip()
@@ -108,10 +117,12 @@ def format_hhmmss_mmm(seconds):
 def preflight(media_file, language, enhance_audio, num_speakers, min_speakers, max_speakers):
     lines = [
         "=== PREFLIGHT ===",
-        f"Diarization device: {DIAR_DEVICE}",
         f"ASR device: {ASR_DEVICE}",
         "Diarization model: pyannote/speaker-diarization-community-1",
-        "ASR model: medium (CPU)",
         f"Language: {language}",
         f"Enhance audio: {enhance_audio}",
         f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}",
@@ -132,7 +143,7 @@ def preflight(media_file, language, enhance_audio, num_speakers, min_speakers, m
         if dur is not None:
             lines.append(f"Estimated duration: {dur:.2f} sec")
             if dur > 1800:
-                lines.append("Warning: long file. Community-1 space uses CPU ASR for stability.")
     except Exception as e:
         lines.append(f"File inspection failed: {e}")
     return "\n".join(lines)
@@ -224,11 +235,11 @@ def process_media(media_file, language, enhance_audio, filter_known_bad, num_spe
         progress(0.05, desc="Preparing audio")
         to_wav_16k_mono(input_path, wav_path, enhance_audio=enhance_audio)
-        progress(0.16, desc="Loading ASR model: medium (CPU)")
         asr_model = WhisperModel("medium", device=ASR_DEVICE, compute_type=ASR_COMPUTE_TYPE, cpu_threads=4, num_workers=1)
         fw_language = None if language == "auto" else language
-        progress(0.28, desc="Transcribing")
         segments_iter, info = asr_model.transcribe(
             str(wav_path),
             language=fw_language,
@@ -282,8 +293,9 @@ def process_media(media_file, language, enhance_audio, filter_known_bad, num_spe
             if max_speakers and int(max_speakers) > 0:
                 diar_kwargs["max_speakers"] = int(max_speakers)
-        progress(0.72, desc="Running diarization")
-        output = pipeline(str(wav_path), **diar_kwargs)
         if hasattr(output, "exclusive_speaker_diarization"):
             diarization = output.exclusive_speaker_diarization
         elif hasattr(output, "speaker_diarization"):
@@ -345,6 +357,8 @@ def process_media(media_file, language, enhance_audio, filter_known_bad, num_spe
         preview_lines = [
             "=== RUN SUMMARY ===",
             f"Detected language: {info.language}",
             f"ASR segments kept: {asr_segment_count}",
             f"ASR words kept: {len(all_words)}",
             f"Raw transcript segments: {len(raw_segments)}",
@@ -367,15 +381,13 @@ with gr.Blocks(title="Diarized Speaker Segments Community-1") as demo:
     gr.Markdown(
         """
         # Diarized Speaker Segments Community-1
-        Uses **pyannote/speaker-diarization-community-1**.
         Cleanup rule:
         - if adjacent speaker segments are the same, merge them
         - otherwise do not touch them
-        Note:
-        - ASR runs on CPU for compatibility/stability
-        - diarization uses GPU if available
         """
     )
     with gr.Row():

 import gradio as gr
 import pandas as pd
+import soundfile as sf
 import torch
 from faster_whisper import WhisperModel
 from pyannote.audio import Pipeline
+GPU_AVAILABLE = torch.cuda.is_available()
+ASR_DEVICE = "cuda" if GPU_AVAILABLE else "cpu"
+DIAR_DEVICE = "cuda" if GPU_AVAILABLE else "cpu"
+ASR_COMPUTE_TYPE = "float16" if GPU_AVAILABLE else "int8"
 BAD_PHRASES = [
     "transcribe exactly",
     run_cmd(cmd)
     return output_path
+def load_waveform_for_pyannote(wav_path: Path):
+    audio, sample_rate = sf.read(str(wav_path), dtype="float32")
+    if audio.ndim > 1:
+        audio = audio.mean(axis=1)
+    waveform = torch.from_numpy(audio).unsqueeze(0)
+    return {"waveform": waveform, "sample_rate": int(sample_rate)}
 def normalize_spaces(text):
     text = (text or "").replace("\n", " ").replace("\r", " ")
     text = re.sub(r"\s+", " ", text).strip()
 def preflight(media_file, language, enhance_audio, num_speakers, min_speakers, max_speakers):
     lines = [
         "=== PREFLIGHT ===",
+        f"GPU available: {GPU_AVAILABLE}",
         f"ASR device: {ASR_DEVICE}",
+        f"Diarization device: {DIAR_DEVICE}",
         "Diarization model: pyannote/speaker-diarization-community-1",
+        "ASR model: medium",
+        f"ASR compute type: {ASR_COMPUTE_TYPE}",
         f"Language: {language}",
         f"Enhance audio: {enhance_audio}",
         f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}",
         if dur is not None:
             lines.append(f"Estimated duration: {dur:.2f} sec")
             if dur > 1800:
+                lines.append("Warning: long file on T4 small. GPU is used, but medium is still recommended.")
     except Exception as e:
         lines.append(f"File inspection failed: {e}")
     return "\n".join(lines)
         progress(0.05, desc="Preparing audio")
         to_wav_16k_mono(input_path, wav_path, enhance_audio=enhance_audio)
+        progress(0.16, desc="Loading ASR model: medium")
         asr_model = WhisperModel("medium", device=ASR_DEVICE, compute_type=ASR_COMPUTE_TYPE, cpu_threads=4, num_workers=1)
         fw_language = None if language == "auto" else language
+        progress(0.28, desc="Transcribing on GPU")
         segments_iter, info = asr_model.transcribe(
             str(wav_path),
             language=fw_language,
             if max_speakers and int(max_speakers) > 0:
                 diar_kwargs["max_speakers"] = int(max_speakers)
+        progress(0.70, desc="Running diarization on GPU")
+        media = load_waveform_for_pyannote(wav_path)
+        output = pipeline(media, **diar_kwargs)
         if hasattr(output, "exclusive_speaker_diarization"):
             diarization = output.exclusive_speaker_diarization
         elif hasattr(output, "speaker_diarization"):
         preview_lines = [
             "=== RUN SUMMARY ===",
             f"Detected language: {info.language}",
+            f"ASR device used: {ASR_DEVICE}",
+            f"Diarization device used: {DIAR_DEVICE}",
             f"ASR segments kept: {asr_segment_count}",
             f"ASR words kept: {len(all_words)}",
             f"Raw transcript segments: {len(raw_segments)}",
     gr.Markdown(
         """
         # Diarized Speaker Segments Community-1
+        Uses **pyannote/speaker-diarization-community-1** and **faster-whisper medium**.
         Cleanup rule:
         - if adjacent speaker segments are the same, merge them
         - otherwise do not touch them
+        This version uses GPU for both ASR and diarization when a GPU is available.
         """
     )
     with gr.Row():