Spaces:

vyluong
/

PoC_ASR_v5

Sleeping

App Files Community

colab-user committed on Jan 14

Commit

df0c7b6

1 Parent(s): 57cb7f6

Fix response & format

Browse files

Files changed (2) hide show

app/services/processor.py +50 -54
app/templates/index.html +2 -2

app/services/processor.py CHANGED Viewed

@@ -6,7 +6,7 @@ import logging
 import subprocess
 import time
 from pathlib import Path
-from typing import List, Dict, Optional
 from dataclasses import dataclass
 import numpy as np
@@ -69,7 +69,7 @@ def format_timestamp(seconds: float) -> str:
     return f"{minutes:02d}:{secs:05.2f}"
-def refine_segment_by_energy(
     waveform: torch.Tensor,
     sr: int,
     start_s: float,
@@ -77,40 +77,41 @@ def refine_segment_by_energy(
     pad_ms: int = 200,
     silence_db_delta: float = 16,
     min_duration_ms: int = 150,
-) -> Optional[tuple[int, int]]:
     """
-    Refine segment boundaries using RMS energy.
-    Input: seconds
-    Output: sample index (start_idx, end_idx) or None
     """
-    start_idx = max(int(start_s * sr - pad_ms / 1000 * sr), 0)
-    end_idx = min(int(end_s * sr + pad_ms / 1000 * sr), waveform.shape[1])
     if end_idx <= start_idx:
         return None
-    segment = waveform[0, start_idx:end_idx]
-    if segment.numel() == 0:
         return None
-    rms = 20 * torch.log10(torch.sqrt(torch.mean(segment ** 2)) + 1e-9)
-    silence_th = rms - silence_db_delta
-    energy = 20 * torch.log10(torch.abs(segment) + 1e-9)
-    valid = torch.nonzero(energy > silence_th)
     if valid.numel() == 0:
         return None
-    valid = valid.view(-1)
     refined_start = start_idx + valid[0].item()
     refined_end = start_idx + valid[-1].item()
-    pad = int(0.05 * sr)
-    refined_start = max(refined_start - pad, 0)
-    refined_end = min(refined_end + pad, waveform.shape[1])
-    if refined_end - refined_start < int(min_duration_ms / 1000 * sr):
         return None
     return refined_start, refined_end
@@ -126,7 +127,7 @@ class Processor:
         audio_path: Path,
         model_name: str = "PhoWhisper Large",
         language: str = "vi",
-        refine_segments: bool = True,
         # VAD options
         vad_filter: bool = True,
         vad_min_silence_ms: int = 1000,
@@ -149,61 +150,56 @@ class Processor:
         wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)
         # Step 2: Load audio
-        logger.info("Step 2: Loading audio...")
-        y_np, sr = await asyncio.get_event_loop().run_in_executor(
-            None, lambda: librosa.load(str(wav_path), sr=16000, mono=True)
-        )
-        if y_np.size == 0:
-            raise ValueError("Empty audio after librosa.load")
-        duration = len(y_np) / sr
-        logger.info(f"Audio loaded: {duration:.1f}s, {sr}Hz")
-        # convert to torch [1, T]
-        waveform = torch.from_numpy(y_np).unsqueeze(0).float()
         # Step 3: Diarization
         logger.info("Step 3: Running diarization...")
         try:
-            diarization_segments = await DiarizationService.diarize_async(wav_path)
         except Exception as e:
             logger.error(f"Diarization failed: {e}")
             # Fallback: create single segment for whole audio
-            diarization_segments = [SpeakerSegment(
                 start=0.0,
                 end=duration,
                 speaker="Speaker 1"
             )]
         # Sort by start time
-        diarization_segments.sort(key=lambda x: x.start)
-        # Step 4: Refine segment boundaries by energy
         refined_segments: List[SpeakerSegment] = []
-        for seg in diarization_segments:
-            if refine_segments:
-                result = refine_segment_by_energy(
-                    waveform=waveform,
-                    sr=sr,
-                    start_s=seg.start,
-                    end_s=seg.end,
-                )
-                if not result:
-                    continue
-                start_idx, end_idx = result
-                seg = SpeakerSegment(
                     start=start_idx / sr,
                     end=end_idx / sr,
                     speaker=seg.speaker
                 )
-            refined_segments.append(seg)
         # Step 5: Transcribe
         logger.info(f"Step 5: Transcribing {len(refined_segments)} segments...")

 import subprocess
 import time
 from pathlib import Path
+from typing import List, Dict, Optional, Tuple
 from dataclasses import dataclass
 import numpy as np
     return f"{minutes:02d}:{secs:05.2f}"
+def pad_and_refine_tensor(
     waveform: torch.Tensor,
     sr: int,
     start_s: float,
     pad_ms: int = 200,
     silence_db_delta: float = 16,
     min_duration_ms: int = 150,
+) -> Optional[Tuple[int, int]]:
     """
+    Refine segment using energy on TORCH tensor.
+    Returns sample indices or None.
     """
+    total_len = waveform.shape[1]
+    start_s = max(start_s - pad_ms / 1000, 0)
+    end_s = min(end_s + pad_ms / 1000, total_len / sr)
+    start_idx = int(start_s * sr)
+    end_idx = int(end_s * sr)
     if end_idx <= start_idx:
         return None
+    seg = waveform[:, start_idx:end_idx]
+    if seg.numel() == 0:
         return None
+    # RMS energy
+    rms = torch.sqrt(torch.mean(seg ** 2, dim=0))
+    if rms.numel() == 0:
+        return None
+    threshold = torch.quantile(rms, 0.2)
+    valid = torch.where(rms > threshold)[0]
     if valid.numel() == 0:
         return None
     refined_start = start_idx + valid[0].item()
     refined_end = start_idx + valid[-1].item()
+    if refined_end - refined_start < (min_duration_ms / 1000) * sr:
         return None
     return refined_start, refined_end
         audio_path: Path,
         model_name: str = "PhoWhisper Large",
         language: str = "vi",
+        pad_refine: bool = True,
         # VAD options
         vad_filter: bool = True,
         vad_min_silence_ms: int = 1000,
         wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)
         # Step 2: Load audio
+        y, sr = librosa.load(wav_path, sr=16000, mono=True)
+        if y.size == 0:
+            raise ValueError("Empty audio")
+        waveform = torch.from_numpy(y).unsqueeze(0).float()
+        duration = len(y) / sr
         # Step 3: Diarization
         logger.info("Step 3: Running diarization...")
         try:
+            diar_segments = await DiarizationService.diarize_async(wav_path)
         except Exception as e:
             logger.error(f"Diarization failed: {e}")
             # Fallback: create single segment for whole audio
+            diar_segments = [SpeakerSegment(
                 start=0.0,
                 end=duration,
                 speaker="Speaker 1"
             )]
         # Sort by start time
+        diar_segments.sort(key=lambda x: x.start)
+        # Step 4: Refine segment boundaries
         refined_segments: List[SpeakerSegment] = []
+        for seg in diar_segments:
+            start, end = seg.start, seg.end
+            if pad_refine:
+                refined = pad_and_refine_tensor(waveform, sr, start, end)
+                if refined is None:
+                    start_idx = int(start * sr)
+                    end_idx = int(end * sr)
+                else:
+                    start_idx, end_idx = refined
+            else:
+                start_idx = int(start * sr)
+                end_idx = int(end * sr)
+            if end_idx <= start_idx:
+                continue
+            refined_segments.append(
+                SpeakerSegment(
                     start=start_idx / sr,
                     end=end_idx / sr,
                     speaker=seg.speaker
                 )
+            )
         # Step 5: Transcribe
         logger.info(f"Step 5: Transcribing {len(refined_segments)} segments...")

app/templates/index.html CHANGED Viewed

@@ -109,13 +109,13 @@
                         </svg>
                         Download TXT
                     </a>
-                    <a href="#" id="download-srt" class="btn btn-outline" download>
                         <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                             <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
                             <polyline points="7 10 12 15 17 10" />
                             <line x1="12" y1="15" x2="12" y2="3" />
                         </svg>
-                        Download SRT
                     </a>
                 </div>

                         </svg>
                         Download TXT
                     </a>
+                    <a href="#" id="download-csv" class="btn btn-outline" download>
                         <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                             <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
                             <polyline points="7 10 12 15 17 10" />
                             <line x1="12" y1="15" x2="12" y2="3" />
                         </svg>
+                        Download CSV
                     </a>
                 </div>