colab-user committed
Commit 64efa14 · 1 Parent(s): e92df6d

fix processor & UI

Files changed (1):
  app/services/processor.py  +35 −61
app/services/processor.py CHANGED
@@ -140,96 +140,70 @@ class Processor:

         t0= time.time()

-        # Step 1: Convert to WAV
+        # 1: Convert to WAV
         logger.info("Step 1: Converting audio to WAV 16kHz...")
         wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)

-        # Step 2: Load audio
+        # 2: Load audio
         y, sr = librosa.load(wav_path, sr=16000, mono=True)
         if y.size == 0:
             raise ValueError("Empty audio")
         waveform = torch.from_numpy(y).unsqueeze(0).float()
         duration = len(y) / sr

-        # Step 3: Diarization
+        # 3: Diarization
         logger.info("Step 3: Running diarization...")

-        try:
-            diarization_result: DiarizationResult = (
-                await DiarizationService.diarize_async(wav_path)
-            )
-
-            diarization_segments = diarization_result.segments
-            speaker_count = diarization_result.speaker_count
-            speakers = diarization_result.speakers
-            roles = diarization_result.roles
-
+        diarization: DiarizationResult = await DiarizationService.diarize_async(wav_path)

-        except Exception as e:
-            logger.error(f"Diarization failed: {e}")
-            diarization_segments = []
+        diarization_segments = diarization.segments or []
+        speakers = diarization.speakers or []
+        roles = diarization.roles or {}

         if not diarization_segments:
-            diarization_segments = [SpeakerSegment(0.0, duration, "Speaker 1")]
-            speaker_count = 1
-            speakers = ["Speaker 1"]
-            roles = {"Speaker 1": "UNKNOWN"}
-
-        if not roles:
-            roles = {
-                speaker: "UNKNOWN"
-                for speaker in speakers
-            }
-        # Sort by start time
+            diarization_segments = [SpeakerSegment(0.0, duration, "SPEAKER_0")]
+            speakers = ["SPEAKER_0"]
+            roles = {"SPEAKER_0": "KH"}
+
         diarization_segments.sort(key=lambda x: x.start)

-        # Step 4: Refine segment boundaries
+        # 4: Refine segment boundaries
         refined_segments: List[SpeakerSegment] = []
-
         for seg in diarization_segments:
-            start_idx = int(seg.start * sr)
-            end_idx = int(seg.end * sr)
-
-            if pad_refine:
-                refined = pad_and_refine_tensor(
-                    waveform, sr, seg.start, seg.end
-                )
-                if refined:
-                    start_idx, end_idx = refined
-
-            if end_idx <= start_idx:
+            refined = pad_and_refine_tensor(waveform, sr, seg.start, seg.end)
+            if not refined:
                 continue
-
+            s, e = refined
             refined_segments.append(
                 SpeakerSegment(
-                    start=start_idx / sr,
-                    end=end_idx / sr,
-                    speaker=seg.speaker or "Speaker 1"
+                    start=s / sr,
+                    end=e / sr,
+                    speaker=seg.speaker,
                 )
             )
+
         if not refined_segments:
             refined_segments = diarization_segments

-        logger.info(f"Refined segments: {len(refined_segments)}")
-        speaker_duration = defaultdict(float)
-        for seg in refined_segments:
-            speaker_duration[seg.speaker] += seg.end - seg.start
+        # 5. Normalize speakers
+        speakers = sorted({seg.speaker for seg in refined_segments})
+        speaker_count = len(speakers)
+
+        # 6. Infer role ONLY if diarization did not provide
+        if not roles:
+            speaker_duration = defaultdict(float)
+            for seg in refined_segments:
+                speaker_duration[seg.speaker] += seg.end - seg.start

-        if speaker_duration:
             agent = max(speaker_duration, key=speaker_duration.get)
             roles = {
-                speaker: ("NV" if speaker == agent else "KH")
-                for speaker in speaker_duration
+                spk: ("NV" if spk == agent else "KH")
+                for spk in speaker_duration
             }
-        else:
-            roles = {}
-
         for spk in speakers:
             roles.setdefault(spk, "KH")

-        speaker_count = len(speakers)
-
-        # Step 5: Transcribe
+        # 7: Transcribe
         vad_options = None
         if vad_filter:
             vad_options = {

@@ -275,7 +249,7 @@ class Processor:
                         start=seg.start,
                         end=seg.end,
                         speaker=seg.speaker,
-                        role=roles.get(seg.speaker, "UNKNOWN"),
+                        role=roles.get(seg.speaker, "KH"),
                         text=text.strip(),
                     )
                 )

@@ -285,8 +259,8 @@ class Processor:
                 TranscriptSegment(
                     start=0.0,
                     end=duration,
-                    speaker="Speaker 1",
-                    role="UNKNOWN",
+                    speaker=speakers[0],
+                    role=roles[speakers[0]],
                     text="(No speech detected)"
                 )
             ]

@@ -299,8 +273,8 @@ class Processor:
             txt_content = cls._generate_txt(
                 processed_segments,
                 speaker_count,
-                processing_time,
                 duration,
+                processing_time,
                 roles
            )
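For reference, a minimal standalone sketch of the duration-based role fallback introduced in step 6 of the new code: when DiarizationService returns no roles, the speaker with the most accumulated speech is labelled "NV" (the agent role in this pipeline) and every other speaker gets the default "KH". The helper name infer_roles and the plain (start, end, speaker) tuples below are illustrative assumptions; the actual code works on SpeakerSegment objects inside Processor.

from collections import defaultdict
from typing import Dict, List, Tuple

# Hypothetical helper for illustration only; not part of app/services/processor.py.
def infer_roles(segments: List[Tuple[float, float, str]]) -> Dict[str, str]:
    """Label the speaker with the most total talk time "NV" and the rest "KH"."""
    speaker_duration: Dict[str, float] = defaultdict(float)
    for start, end, speaker in segments:
        speaker_duration[speaker] += end - start
    if not speaker_duration:
        return {}
    agent = max(speaker_duration, key=speaker_duration.get)
    return {spk: ("NV" if spk == agent else "KH") for spk in speaker_duration}

# SPEAKER_0 speaks for 9 s in total, SPEAKER_1 for 4 s, so SPEAKER_0 becomes "NV".
print(infer_roles([(0.0, 5.0, "SPEAKER_0"), (5.0, 9.0, "SPEAKER_1"), (9.0, 13.0, "SPEAKER_0")]))
# -> {'SPEAKER_0': 'NV', 'SPEAKER_1': 'KH'}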