Update app.py
Browse files
app.py
CHANGED
|
@@ -49,10 +49,9 @@ pipe = pipeline(
|
|
| 49 |
|
| 50 |
|
| 51 |
|
| 52 |
-
def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.
|
| 53 |
word_segments = transcription_result['chunks']
|
| 54 |
diarization_segments = list(diarization.itertracks(yield_label=True))
|
| 55 |
-
|
| 56 |
speaker_transcription = []
|
| 57 |
current_speaker = None
|
| 58 |
current_text = []
|
|
@@ -62,51 +61,53 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
|
|
| 62 |
def flush_current_segment():
|
| 63 |
nonlocal current_speaker, current_text
|
| 64 |
if current_speaker and current_text:
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
current_text = []
|
| 67 |
|
| 68 |
for word in word_segments:
|
| 69 |
word_start, word_end = word['timestamp']
|
| 70 |
word_text = word['text']
|
| 71 |
-
|
| 72 |
assigned = False
|
|
|
|
| 73 |
for i in range(last_segment_index, len(diarization_segments)):
|
| 74 |
segment, _, speaker = diarization_segments[i]
|
| 75 |
if segment.start - tolerance <= word_start < segment.end + tolerance:
|
| 76 |
if speaker != current_speaker:
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
current_text.append(word_text)
|
| 80 |
-
else:
|
| 81 |
-
flush_current_segment()
|
| 82 |
-
current_speaker = speaker
|
| 83 |
current_text.append(word_text)
|
| 84 |
last_segment_index = i
|
| 85 |
assigned = True
|
| 86 |
break
|
| 87 |
-
|
| 88 |
if not assigned:
|
| 89 |
unassigned_words.append((word_start, word_text))
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
# Traitement des mots non assignés
|
|
|
|
| 92 |
for word_start, word_text in unassigned_words:
|
| 93 |
-
closest_segment = min(diarization_segments, key=lambda x: abs(x[0].start - word_start))
|
| 94 |
speaker = closest_segment[2]
|
| 95 |
if speaker != current_speaker:
|
| 96 |
flush_current_segment()
|
| 97 |
current_speaker = speaker
|
| 98 |
current_text.append(word_text)
|
| 99 |
-
|
| 100 |
flush_current_segment()
|
| 101 |
-
|
| 102 |
# Fusion des segments courts
|
| 103 |
merged_transcription = []
|
| 104 |
for speaker, text in speaker_transcription:
|
| 105 |
-
if not merged_transcription or merged_transcription[-1][0] != speaker
|
| 106 |
merged_transcription.append((speaker, text))
|
| 107 |
else:
|
| 108 |
merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)
|
| 109 |
-
|
| 110 |
return merged_transcription
|
| 111 |
|
| 112 |
def simplify_diarization_output(speaker_transcription):
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
|
| 52 |
+
def associate_speakers_with_timestamps(
    transcription_result,
    diarization,
    tolerance=0.05,
    min_segment_duration=0.1,
):
    """Attribute every transcribed chunk to a speaker and group them into turns.

    Args:
        transcription_result: Mapping with a ``'chunks'`` list. Each chunk is
            a mapping with ``'text'`` and ``'timestamp'`` = ``(start, end)``.
            ``end`` may be ``None``; ``start`` is assumed to be a number
            (NOTE(review): confirm the ASR output never yields a ``None``
            start).
        diarization: Object exposing ``itertracks(yield_label=True)`` that
            yields ``(segment, track, speaker)`` triples, where ``segment``
            has numeric ``start`` and ``end`` attributes (presumably a
            pyannote annotation -- verify against the caller).
        tolerance: Slack, in the same unit as the timestamps, added on both
            sides of a diarization segment when matching a chunk's start.
        min_segment_duration: A run of consecutive chunks matched to the same
            speaker that spans less than this is not trusted; its chunks are
            re-attributed to the speaker of the nearest diarization segment.

    Returns:
        A list of ``(speaker, text)`` tuples in chunk order, where adjacent
        chunks with the same speaker are joined with a single space. Returns
        an empty list when there are no chunks or no diarization segments,
        since no speaker can be attributed in that case.
    """
    word_segments = transcription_result['chunks']
    diarization_segments = list(diarization.itertracks(yield_label=True))
    if not word_segments or not diarization_segments:
        return []

    # Private marker so that any speaker label (even a falsy one) stays valid.
    unassigned = object()

    def nearest_speaker(moment):
        """Return the speaker whose segment boundary is closest to *moment*."""
        closest_segment = min(
            diarization_segments,
            key=lambda track: min(
                abs(track[0].start - moment), abs(track[0].end - moment)
            ),
        )
        return closest_segment[2]

    # Pass 1: match each chunk's start against the diarization segments.
    # The scan resumes from the last matched segment, so earlier segments
    # are never revisited.
    speakers = []
    last_segment_index = 0
    for word in word_segments:
        word_start = word['timestamp'][0]
        speaker_for_word = unassigned
        for i in range(last_segment_index, len(diarization_segments)):
            segment, _, speaker = diarization_segments[i]
            if segment.start - tolerance <= word_start < segment.end + tolerance:
                speaker_for_word = speaker
                last_segment_index = i
                break
        speakers.append(speaker_for_word)

    # Pass 2: collect runs of consecutive matched chunks sharing a speaker.
    # Unmatched chunks do not interrupt a run.
    runs = []
    for index, speaker in enumerate(speakers):
        if speaker is unassigned:
            continue
        if runs and speakers[runs[-1][-1]] == speaker:
            runs[-1].append(index)
        else:
            runs.append([index])

    # Runs that are too short are released so pass 3 re-attributes them.
    # The duration is measured on the run itself, not on the whole transcript.
    for run in runs:
        run_start = word_segments[run[0]]['timestamp'][0]
        last_start, last_end = word_segments[run[-1]]['timestamp']
        if last_end is None:
            # An open-ended chunk contributes no measurable length.
            last_end = last_start
        if last_end - run_start < min_segment_duration:
            for index in run:
                speakers[index] = unassigned

    # Pass 3: every chunk still without a speaker takes the nearest one.
    for index, word in enumerate(word_segments):
        if speakers[index] is unassigned:
            speakers[index] = nearest_speaker(word['timestamp'][0])

    # Pass 4: merge adjacent chunks of the same speaker, keeping chunk order
    # so fallback-attributed words stay at their position in the transcript.
    turns = []
    for word, speaker in zip(word_segments, speakers):
        if turns and turns[-1][0] == speaker:
            turns[-1][1].append(word['text'])
        else:
            turns.append((speaker, [word['text']]))
    return [(speaker, ' '.join(texts)) for speaker, texts in turns]
|
| 112 |
|
| 113 |
def simplify_diarization_output(speaker_transcription):
|