Spaces:
Sleeping
Sleeping
colab-user committed on
Commit ·
df0c7b6
1
Parent(s): 57cb7f6
Fix response & format
Browse files- app/services/processor.py +50 -54
- app/templates/index.html +2 -2
app/services/processor.py
CHANGED
|
@@ -6,7 +6,7 @@ import logging
|
|
| 6 |
import subprocess
|
| 7 |
import time
|
| 8 |
from pathlib import Path
|
| 9 |
-
from typing import List, Dict, Optional
|
| 10 |
from dataclasses import dataclass
|
| 11 |
|
| 12 |
import numpy as np
|
|
@@ -69,7 +69,7 @@ def format_timestamp(seconds: float) -> str:
|
|
| 69 |
return f"{minutes:02d}:{secs:05.2f}"
|
| 70 |
|
| 71 |
|
| 72 |
-
def
|
| 73 |
waveform: torch.Tensor,
|
| 74 |
sr: int,
|
| 75 |
start_s: float,
|
|
@@ -77,40 +77,41 @@ def refine_segment_by_energy(
|
|
| 77 |
pad_ms: int = 200,
|
| 78 |
silence_db_delta: float = 16,
|
| 79 |
min_duration_ms: int = 150,
|
| 80 |
-
) -> Optional[
|
| 81 |
"""
|
| 82 |
-
Refine segment
|
| 83 |
-
|
| 84 |
-
Output: sample index (start_idx, end_idx) or None
|
| 85 |
"""
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
if end_idx <= start_idx:
|
| 90 |
return None
|
| 91 |
|
| 92 |
-
|
| 93 |
-
if
|
| 94 |
return None
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
-
valid = torch.
|
| 101 |
|
| 102 |
if valid.numel() == 0:
|
| 103 |
return None
|
| 104 |
|
| 105 |
-
valid = valid.view(-1)
|
| 106 |
refined_start = start_idx + valid[0].item()
|
| 107 |
refined_end = start_idx + valid[-1].item()
|
| 108 |
|
| 109 |
-
|
| 110 |
-
refined_start = max(refined_start - pad, 0)
|
| 111 |
-
refined_end = min(refined_end + pad, waveform.shape[1])
|
| 112 |
-
|
| 113 |
-
if refined_end - refined_start < int(min_duration_ms / 1000 * sr):
|
| 114 |
return None
|
| 115 |
|
| 116 |
return refined_start, refined_end
|
|
@@ -126,7 +127,7 @@ class Processor:
|
|
| 126 |
audio_path: Path,
|
| 127 |
model_name: str = "PhoWhisper Large",
|
| 128 |
language: str = "vi",
|
| 129 |
-
|
| 130 |
# VAD options
|
| 131 |
vad_filter: bool = True,
|
| 132 |
vad_min_silence_ms: int = 1000,
|
|
@@ -149,61 +150,56 @@ class Processor:
|
|
| 149 |
wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)
|
| 150 |
|
| 151 |
# Step 2: Load audio
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
)
|
| 156 |
-
|
| 157 |
-
if y_np.size == 0:
|
| 158 |
-
raise ValueError("Empty audio after librosa.load")
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
duration = len(y_np) / sr
|
| 162 |
-
logger.info(f"Audio loaded: {duration:.1f}s, {sr}Hz")
|
| 163 |
-
|
| 164 |
-
# convert to torch [1, T]
|
| 165 |
-
waveform = torch.from_numpy(y_np).unsqueeze(0).float()
|
| 166 |
|
| 167 |
# Step 3: Diarization
|
| 168 |
logger.info("Step 3: Running diarization...")
|
| 169 |
try:
|
| 170 |
-
|
| 171 |
except Exception as e:
|
| 172 |
logger.error(f"Diarization failed: {e}")
|
| 173 |
# Fallback: create single segment for whole audio
|
| 174 |
-
|
| 175 |
start=0.0,
|
| 176 |
end=duration,
|
| 177 |
speaker="Speaker 1"
|
| 178 |
)]
|
| 179 |
|
| 180 |
# Sort by start time
|
| 181 |
-
|
| 182 |
|
| 183 |
|
| 184 |
-
# Step 4: Refine segment boundaries
|
| 185 |
refined_segments: List[SpeakerSegment] = []
|
| 186 |
|
| 187 |
-
for seg in
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
start=start_idx / sr,
|
| 201 |
end=end_idx / sr,
|
| 202 |
speaker=seg.speaker
|
| 203 |
)
|
| 204 |
-
|
| 205 |
-
refined_segments.append(seg)
|
| 206 |
-
|
| 207 |
|
| 208 |
# Step 5: Transcribe
|
| 209 |
logger.info(f"Step 5: Transcribing {len(refined_segments)} segments...")
|
|
|
|
| 6 |
import subprocess
|
| 7 |
import time
|
| 8 |
from pathlib import Path
|
| 9 |
+
from typing import List, Dict, Optional, Tuple
|
| 10 |
from dataclasses import dataclass
|
| 11 |
|
| 12 |
import numpy as np
|
|
|
|
| 69 |
return f"{minutes:02d}:{secs:05.2f}"
|
| 70 |
|
| 71 |
|
| 72 |
+
def pad_and_refine_tensor(
|
| 73 |
waveform: torch.Tensor,
|
| 74 |
sr: int,
|
| 75 |
start_s: float,
|
|
|
|
| 77 |
pad_ms: int = 200,
|
| 78 |
silence_db_delta: float = 16,
|
| 79 |
min_duration_ms: int = 150,
|
| 80 |
+
) -> Optional[Tuple[int, int]]:
|
| 81 |
"""
|
| 82 |
+
Refine segment using energy on TORCH tensor.
|
| 83 |
+
Returns sample indices or None.
|
|
|
|
| 84 |
"""
|
| 85 |
+
total_len = waveform.shape[1]
|
| 86 |
+
|
| 87 |
+
start_s = max(start_s - pad_ms / 1000, 0)
|
| 88 |
+
end_s = min(end_s + pad_ms / 1000, total_len / sr)
|
| 89 |
+
|
| 90 |
+
start_idx = int(start_s * sr)
|
| 91 |
+
end_idx = int(end_s * sr)
|
| 92 |
|
| 93 |
if end_idx <= start_idx:
|
| 94 |
return None
|
| 95 |
|
| 96 |
+
seg = waveform[:, start_idx:end_idx]
|
| 97 |
+
if seg.numel() == 0:
|
| 98 |
return None
|
| 99 |
|
| 100 |
+
# RMS energy
|
| 101 |
+
rms = torch.sqrt(torch.mean(seg ** 2, dim=0))
|
| 102 |
+
if rms.numel() == 0:
|
| 103 |
+
return None
|
| 104 |
|
| 105 |
+
threshold = torch.quantile(rms, 0.2)
|
| 106 |
+
valid = torch.where(rms > threshold)[0]
|
| 107 |
|
| 108 |
if valid.numel() == 0:
|
| 109 |
return None
|
| 110 |
|
|
|
|
| 111 |
refined_start = start_idx + valid[0].item()
|
| 112 |
refined_end = start_idx + valid[-1].item()
|
| 113 |
|
| 114 |
+
if refined_end - refined_start < (min_duration_ms / 1000) * sr:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
return None
|
| 116 |
|
| 117 |
return refined_start, refined_end
|
|
|
|
| 127 |
audio_path: Path,
|
| 128 |
model_name: str = "PhoWhisper Large",
|
| 129 |
language: str = "vi",
|
| 130 |
+
pad_refine: bool = True,
|
| 131 |
# VAD options
|
| 132 |
vad_filter: bool = True,
|
| 133 |
vad_min_silence_ms: int = 1000,
|
|
|
|
| 150 |
wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)
|
| 151 |
|
| 152 |
# Step 2: Load audio
|
| 153 |
+
y, sr = librosa.load(wav_path, sr=16000, mono=True)
|
| 154 |
+
if y.size == 0:
|
| 155 |
+
raise ValueError("Empty audio")
|
| 156 |
+
waveform = torch.from_numpy(y).unsqueeze(0).float()
|
| 157 |
+
duration = len(y) / sr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
# Step 3: Diarization
|
| 160 |
logger.info("Step 3: Running diarization...")
|
| 161 |
try:
|
| 162 |
+
diar_segments = await DiarizationService.diarize_async(wav_path)
|
| 163 |
except Exception as e:
|
| 164 |
logger.error(f"Diarization failed: {e}")
|
| 165 |
# Fallback: create single segment for whole audio
|
| 166 |
+
diar_segments = [SpeakerSegment(
|
| 167 |
start=0.0,
|
| 168 |
end=duration,
|
| 169 |
speaker="Speaker 1"
|
| 170 |
)]
|
| 171 |
|
| 172 |
# Sort by start time
|
| 173 |
+
diar_segments.sort(key=lambda x: x.start)
|
| 174 |
|
| 175 |
|
| 176 |
+
# Step 4: Refine segment boundaries
|
| 177 |
refined_segments: List[SpeakerSegment] = []
|
| 178 |
|
| 179 |
+
for seg in diar_segments:
|
| 180 |
+
start, end = seg.start, seg.end
|
| 181 |
+
|
| 182 |
+
if pad_refine:
|
| 183 |
+
refined = pad_and_refine_tensor(waveform, sr, start, end)
|
| 184 |
+
if refined is None:
|
| 185 |
+
start_idx = int(start * sr)
|
| 186 |
+
end_idx = int(end * sr)
|
| 187 |
+
else:
|
| 188 |
+
start_idx, end_idx = refined
|
| 189 |
+
else:
|
| 190 |
+
start_idx = int(start * sr)
|
| 191 |
+
end_idx = int(end * sr)
|
| 192 |
+
|
| 193 |
+
if end_idx <= start_idx:
|
| 194 |
+
continue
|
| 195 |
|
| 196 |
+
refined_segments.append(
|
| 197 |
+
SpeakerSegment(
|
| 198 |
start=start_idx / sr,
|
| 199 |
end=end_idx / sr,
|
| 200 |
speaker=seg.speaker
|
| 201 |
)
|
| 202 |
+
)
|
|
|
|
|
|
|
| 203 |
|
| 204 |
# Step 5: Transcribe
|
| 205 |
logger.info(f"Step 5: Transcribing {len(refined_segments)} segments...")
|
app/templates/index.html
CHANGED
|
@@ -109,13 +109,13 @@
|
|
| 109 |
</svg>
|
| 110 |
Download TXT
|
| 111 |
</a>
|
| 112 |
-
<a href="#" id="download-
|
| 113 |
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 114 |
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
|
| 115 |
<polyline points="7 10 12 15 17 10" />
|
| 116 |
<line x1="12" y1="15" x2="12" y2="3" />
|
| 117 |
</svg>
|
| 118 |
-
Download
|
| 119 |
</a>
|
| 120 |
</div>
|
| 121 |
|
|
|
|
| 109 |
</svg>
|
| 110 |
Download TXT
|
| 111 |
</a>
|
| 112 |
+
<a href="#" id="download-csv" class="btn btn-outline" download>
|
| 113 |
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 114 |
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
|
| 115 |
<polyline points="7 10 12 15 17 10" />
|
| 116 |
<line x1="12" y1="15" x2="12" y2="3" />
|
| 117 |
</svg>
|
| 118 |
+
Download CSV
|
| 119 |
</a>
|
| 120 |
</div>
|
| 121 |
|