colab-user committed on
Commit
df0c7b6
·
1 Parent(s): 57cb7f6

Fix response & format

Browse files
app/services/processor.py CHANGED
@@ -6,7 +6,7 @@ import logging
6
  import subprocess
7
  import time
8
  from pathlib import Path
9
- from typing import List, Dict, Optional
10
  from dataclasses import dataclass
11
 
12
  import numpy as np
@@ -69,7 +69,7 @@ def format_timestamp(seconds: float) -> str:
69
  return f"{minutes:02d}:{secs:05.2f}"
70
 
71
 
72
- def refine_segment_by_energy(
73
  waveform: torch.Tensor,
74
  sr: int,
75
  start_s: float,
@@ -77,40 +77,41 @@ def refine_segment_by_energy(
77
  pad_ms: int = 200,
78
  silence_db_delta: float = 16,
79
  min_duration_ms: int = 150,
80
- ) -> Optional[tuple[int, int]]:
81
  """
82
- Refine segment boundaries using RMS energy.
83
- Input: seconds
84
- Output: sample index (start_idx, end_idx) or None
85
  """
86
- start_idx = max(int(start_s * sr - pad_ms / 1000 * sr), 0)
87
- end_idx = min(int(end_s * sr + pad_ms / 1000 * sr), waveform.shape[1])
 
 
 
 
 
88
 
89
  if end_idx <= start_idx:
90
  return None
91
 
92
- segment = waveform[0, start_idx:end_idx]
93
- if segment.numel() == 0:
94
  return None
95
 
96
- rms = 20 * torch.log10(torch.sqrt(torch.mean(segment ** 2)) + 1e-9)
97
- silence_th = rms - silence_db_delta
 
 
98
 
99
- energy = 20 * torch.log10(torch.abs(segment) + 1e-9)
100
- valid = torch.nonzero(energy > silence_th)
101
 
102
  if valid.numel() == 0:
103
  return None
104
 
105
- valid = valid.view(-1)
106
  refined_start = start_idx + valid[0].item()
107
  refined_end = start_idx + valid[-1].item()
108
 
109
- pad = int(0.05 * sr)
110
- refined_start = max(refined_start - pad, 0)
111
- refined_end = min(refined_end + pad, waveform.shape[1])
112
-
113
- if refined_end - refined_start < int(min_duration_ms / 1000 * sr):
114
  return None
115
 
116
  return refined_start, refined_end
@@ -126,7 +127,7 @@ class Processor:
126
  audio_path: Path,
127
  model_name: str = "PhoWhisper Large",
128
  language: str = "vi",
129
- refine_segments: bool = True,
130
  # VAD options
131
  vad_filter: bool = True,
132
  vad_min_silence_ms: int = 1000,
@@ -149,61 +150,56 @@ class Processor:
149
  wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)
150
 
151
  # Step 2: Load audio
152
- logger.info("Step 2: Loading audio...")
153
- y_np, sr = await asyncio.get_event_loop().run_in_executor(
154
- None, lambda: librosa.load(str(wav_path), sr=16000, mono=True)
155
- )
156
-
157
- if y_np.size == 0:
158
- raise ValueError("Empty audio after librosa.load")
159
-
160
-
161
- duration = len(y_np) / sr
162
- logger.info(f"Audio loaded: {duration:.1f}s, {sr}Hz")
163
-
164
- # convert to torch [1, T]
165
- waveform = torch.from_numpy(y_np).unsqueeze(0).float()
166
 
167
  # Step 3: Diarization
168
  logger.info("Step 3: Running diarization...")
169
  try:
170
- diarization_segments = await DiarizationService.diarize_async(wav_path)
171
  except Exception as e:
172
  logger.error(f"Diarization failed: {e}")
173
  # Fallback: create single segment for whole audio
174
- diarization_segments = [SpeakerSegment(
175
  start=0.0,
176
  end=duration,
177
  speaker="Speaker 1"
178
  )]
179
 
180
  # Sort by start time
181
- diarization_segments.sort(key=lambda x: x.start)
182
 
183
 
184
- # Step 4: Refine segment boundaries by energy
185
  refined_segments: List[SpeakerSegment] = []
186
 
187
- for seg in diarization_segments:
188
- if refine_segments:
189
- result = refine_segment_by_energy(
190
- waveform=waveform,
191
- sr=sr,
192
- start_s=seg.start,
193
- end_s=seg.end,
194
- )
195
- if not result:
196
- continue
 
 
 
 
 
 
197
 
198
- start_idx, end_idx = result
199
- seg = SpeakerSegment(
200
  start=start_idx / sr,
201
  end=end_idx / sr,
202
  speaker=seg.speaker
203
  )
204
-
205
- refined_segments.append(seg)
206
-
207
 
208
  # Step 5: Transcribe
209
  logger.info(f"Step 5: Transcribing {len(refined_segments)} segments...")
 
6
  import subprocess
7
  import time
8
  from pathlib import Path
9
+ from typing import List, Dict, Optional, Tuple
10
  from dataclasses import dataclass
11
 
12
  import numpy as np
 
69
  return f"{minutes:02d}:{secs:05.2f}"
70
 
71
 
72
+ def pad_and_refine_tensor(
73
  waveform: torch.Tensor,
74
  sr: int,
75
  start_s: float,
 
77
  pad_ms: int = 200,
78
  silence_db_delta: float = 16,
79
  min_duration_ms: int = 150,
80
+ ) -> Optional[Tuple[int, int]]:
81
  """
82
+ Refine segment using energy on TORCH tensor.
83
+ Returns sample indices or None.
 
84
  """
85
+ total_len = waveform.shape[1]
86
+
87
+ start_s = max(start_s - pad_ms / 1000, 0)
88
+ end_s = min(end_s + pad_ms / 1000, total_len / sr)
89
+
90
+ start_idx = int(start_s * sr)
91
+ end_idx = int(end_s * sr)
92
 
93
  if end_idx <= start_idx:
94
  return None
95
 
96
+ seg = waveform[:, start_idx:end_idx]
97
+ if seg.numel() == 0:
98
  return None
99
 
100
+ # RMS energy
101
+ rms = torch.sqrt(torch.mean(seg ** 2, dim=0))
102
+ if rms.numel() == 0:
103
+ return None
104
 
105
+ threshold = torch.quantile(rms, 0.2)
106
+ valid = torch.where(rms > threshold)[0]
107
 
108
  if valid.numel() == 0:
109
  return None
110
 
 
111
  refined_start = start_idx + valid[0].item()
112
  refined_end = start_idx + valid[-1].item()
113
 
114
+ if refined_end - refined_start < (min_duration_ms / 1000) * sr:
 
 
 
 
115
  return None
116
 
117
  return refined_start, refined_end
 
127
  audio_path: Path,
128
  model_name: str = "PhoWhisper Large",
129
  language: str = "vi",
130
+ pad_refine: bool = True,
131
  # VAD options
132
  vad_filter: bool = True,
133
  vad_min_silence_ms: int = 1000,
 
150
  wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)
151
 
152
  # Step 2: Load audio
153
+ y, sr = librosa.load(wav_path, sr=16000, mono=True)
154
+ if y.size == 0:
155
+ raise ValueError("Empty audio")
156
+ waveform = torch.from_numpy(y).unsqueeze(0).float()
157
+ duration = len(y) / sr
 
 
 
 
 
 
 
 
 
158
 
159
  # Step 3: Diarization
160
  logger.info("Step 3: Running diarization...")
161
  try:
162
+ diar_segments = await DiarizationService.diarize_async(wav_path)
163
  except Exception as e:
164
  logger.error(f"Diarization failed: {e}")
165
  # Fallback: create single segment for whole audio
166
+ diar_segments = [SpeakerSegment(
167
  start=0.0,
168
  end=duration,
169
  speaker="Speaker 1"
170
  )]
171
 
172
  # Sort by start time
173
+ diar_segments.sort(key=lambda x: x.start)
174
 
175
 
176
+ # Step 4: Refine segment boundaries
177
  refined_segments: List[SpeakerSegment] = []
178
 
179
+ for seg in diar_segments:
180
+ start, end = seg.start, seg.end
181
+
182
+ if pad_refine:
183
+ refined = pad_and_refine_tensor(waveform, sr, start, end)
184
+ if refined is None:
185
+ start_idx = int(start * sr)
186
+ end_idx = int(end * sr)
187
+ else:
188
+ start_idx, end_idx = refined
189
+ else:
190
+ start_idx = int(start * sr)
191
+ end_idx = int(end * sr)
192
+
193
+ if end_idx <= start_idx:
194
+ continue
195
 
196
+ refined_segments.append(
197
+ SpeakerSegment(
198
  start=start_idx / sr,
199
  end=end_idx / sr,
200
  speaker=seg.speaker
201
  )
202
+ )
 
 
203
 
204
  # Step 5: Transcribe
205
  logger.info(f"Step 5: Transcribing {len(refined_segments)} segments...")
app/templates/index.html CHANGED
@@ -109,13 +109,13 @@
109
  </svg>
110
  Download TXT
111
  </a>
112
- <a href="#" id="download-srt" class="btn btn-outline" download>
113
  <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
114
  <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
115
  <polyline points="7 10 12 15 17 10" />
116
  <line x1="12" y1="15" x2="12" y2="3" />
117
  </svg>
118
- Download SRT
119
  </a>
120
  </div>
121
 
 
109
  </svg>
110
  Download TXT
111
  </a>
112
+ <a href="#" id="download-csv" class="btn btn-outline" download>
113
  <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
114
  <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
115
  <polyline points="7 10 12 15 17 10" />
116
  <line x1="12" y1="15" x2="12" y2="3" />
117
  </svg>
118
+ Download CSV
119
  </a>
120
  </div>
121