Upload folder using huggingface_hub
Browse files- asr_diarization/pipeline.py +9 -11
asr_diarization/pipeline.py
CHANGED
|
@@ -44,7 +44,7 @@ class ASR_Diarization:
|
|
| 44 |
def run_diarization(self, audio_path):
|
| 45 |
diarization = self.diar_pipeline(audio_path)
|
| 46 |
return [
|
| 47 |
-
{"
|
| 48 |
for t, _, spk in diarization.itertracks(yield_label=True)
|
| 49 |
]
|
| 50 |
|
|
@@ -54,8 +54,8 @@ class ASR_Diarization:
|
|
| 54 |
speaker_segments = {}
|
| 55 |
|
| 56 |
for seg in diar_json:
|
| 57 |
-
|
| 58 |
-
start_sample, end_sample = int(
|
| 59 |
chunk = audio[0, start_sample:end_sample].numpy()
|
| 60 |
|
| 61 |
reduced = nr.reduce_noise(y=chunk, sr=sr)
|
|
@@ -66,21 +66,19 @@ class ASR_Diarization:
|
|
| 66 |
for word_info in result["chunks"]:
|
| 67 |
start_ts, end_ts = word_info.get("timestamp", (None, None)) or (None, None)
|
| 68 |
tokens.append({
|
| 69 |
-
"tag": "w",
|
| 70 |
"start": start_ts,
|
| 71 |
"end": end_ts,
|
| 72 |
-
"text": word_info["text"]
|
|
|
|
| 73 |
})
|
| 74 |
|
| 75 |
seg_dict = {
|
| 76 |
"speaker": spk,
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
"tokens": tokens
|
| 80 |
}
|
| 81 |
merged_segments.append(seg_dict)
|
| 82 |
-
print("Sample merged segment:", merged_segments[0])
|
| 83 |
-
|
| 84 |
|
| 85 |
if spk not in speaker_segments:
|
| 86 |
speaker_segments[spk] = []
|
|
@@ -101,8 +99,8 @@ class ASR_Diarization:
|
|
| 101 |
with open(rttm_path, "w") as f:
|
| 102 |
for seg in diar_json:
|
| 103 |
f.write(
|
| 104 |
-
f"SPEAKER {base_name} 1 {seg['
|
| 105 |
-
f"{seg['
|
| 106 |
f"{seg['speaker']} <NA>\n"
|
| 107 |
)
|
| 108 |
|
|
|
|
| 44 |
def run_diarization(self, audio_path):
|
| 45 |
diarization = self.diar_pipeline(audio_path)
|
| 46 |
return [
|
| 47 |
+
{"start": t.start, "end": t.end, "speaker": spk}
|
| 48 |
for t, _, spk in diarization.itertracks(yield_label=True)
|
| 49 |
]
|
| 50 |
|
|
|
|
| 54 |
speaker_segments = {}
|
| 55 |
|
| 56 |
for seg in diar_json:
|
| 57 |
+
start, end, spk = seg["start"], seg["end"], seg["speaker"]
|
| 58 |
+
start_sample, end_sample = int(start * sr), int(end * sr)
|
| 59 |
chunk = audio[0, start_sample:end_sample].numpy()
|
| 60 |
|
| 61 |
reduced = nr.reduce_noise(y=chunk, sr=sr)
|
|
|
|
| 66 |
for word_info in result["chunks"]:
|
| 67 |
start_ts, end_ts = word_info.get("timestamp", (None, None)) or (None, None)
|
| 68 |
tokens.append({
|
|
|
|
| 69 |
"start": start_ts,
|
| 70 |
"end": end_ts,
|
| 71 |
+
"text": word_info["text"],
|
| 72 |
+
"tag": "w"
|
| 73 |
})
|
| 74 |
|
| 75 |
seg_dict = {
|
| 76 |
"speaker": spk,
|
| 77 |
+
"start": start,
|
| 78 |
+
"end": end,
|
| 79 |
"tokens": tokens
|
| 80 |
}
|
| 81 |
merged_segments.append(seg_dict)
|
|
|
|
|
|
|
| 82 |
|
| 83 |
if spk not in speaker_segments:
|
| 84 |
speaker_segments[spk] = []
|
|
|
|
| 99 |
with open(rttm_path, "w") as f:
|
| 100 |
for seg in diar_json:
|
| 101 |
f.write(
|
| 102 |
+
f"SPEAKER {base_name} 1 {seg['start']:.6f} "
|
| 103 |
+
f"{seg['end']-seg['start']:.6f} <NA> <NA> "
|
| 104 |
f"{seg['speaker']} <NA>\n"
|
| 105 |
)
|
| 106 |
|