Update app.py
Browse files
app.py
CHANGED
|
@@ -49,61 +49,58 @@ pipe = pipeline(
|
|
| 49 |
|
| 50 |
|
| 51 |
|
| 52 |
-
def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.
|
| 53 |
word_segments = transcription_result['chunks']
|
| 54 |
diarization_segments = list(diarization.itertracks(yield_label=True))
|
| 55 |
speaker_transcription = []
|
| 56 |
current_speaker = None
|
| 57 |
current_text = []
|
| 58 |
-
|
| 59 |
-
last_segment_index = 0
|
| 60 |
|
| 61 |
def flush_current_segment():
|
| 62 |
nonlocal current_speaker, current_text
|
| 63 |
if current_speaker and current_text:
|
| 64 |
-
|
| 65 |
-
if segment_duration >= min_segment_duration:
|
| 66 |
-
speaker_transcription.append((current_speaker, ' '.join(current_text)))
|
| 67 |
-
else:
|
| 68 |
-
unassigned_words.extend([(word['timestamp'][0], word['text']) for word in word_segments])
|
| 69 |
current_text = []
|
| 70 |
|
| 71 |
for word in word_segments:
|
| 72 |
word_start, word_end = word['timestamp']
|
| 73 |
word_text = word['text']
|
| 74 |
-
assigned = False
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
|
|
|
| 78 |
if segment.start - tolerance <= word_start < segment.end + tolerance:
|
| 79 |
-
|
| 80 |
-
flush_current_segment()
|
| 81 |
-
current_speaker = speaker
|
| 82 |
-
current_text.append(word_text)
|
| 83 |
-
last_segment_index = i
|
| 84 |
-
assigned = True
|
| 85 |
break
|
| 86 |
|
| 87 |
-
if
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
# Traitement des mots non assignés
|
| 93 |
-
unassigned_words.sort(key=lambda x: x[0]) # Trier par timestamp
|
| 94 |
-
for word_start, word_text in unassigned_words:
|
| 95 |
-
closest_segment = min(diarization_segments, key=lambda x: min(abs(x[0].start - word_start), abs(x[0].end - word_start)))
|
| 96 |
-
speaker = closest_segment[2]
|
| 97 |
-
if speaker != current_speaker:
|
| 98 |
-
flush_current_segment()
|
| 99 |
-
current_speaker = speaker
|
| 100 |
-
current_text.append(word_text)
|
| 101 |
flush_current_segment()
|
| 102 |
|
| 103 |
-
#
|
| 104 |
merged_transcription = []
|
| 105 |
for speaker, text in speaker_transcription:
|
| 106 |
-
if not merged_transcription or merged_transcription[-1][0] != speaker:
|
| 107 |
merged_transcription.append((speaker, text))
|
| 108 |
else:
|
| 109 |
merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
|
| 52 |
+
def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05):
    """Align transcribed words with diarization speaker turns.

    Args:
        transcription_result: dict with a 'chunks' list; each chunk is a dict
            with 'timestamp' -> (start, end) seconds and 'text' -> str
            (Hugging Face ASR pipeline word-timestamp format).
        diarization: object exposing ``itertracks(yield_label=True)`` yielding
            ``(segment, track, label)`` tuples where ``segment`` has ``.start``
            and ``.end`` in seconds (pyannote.audio-style annotation —
            TODO confirm exact type against the caller).
        tolerance: seconds of slack allowed around each diarization segment
            when matching a word's start time to a speaker turn.
        min_segment_duration: retained for backward compatibility with
            existing callers; currently unused by this implementation.

    Returns:
        List of ``(speaker, text)`` tuples in chronological order, with
        consecutive short (<= 3 word) entries from the same speaker merged.
    """
    word_segments = transcription_result['chunks']
    diarization_segments = list(diarization.itertracks(yield_label=True))
    speaker_transcription = []
    current_speaker = None
    current_text = []
    last_word_end = 0

    def flush_current_segment():
        # Emit the accumulated words as one (speaker, text) entry, then
        # reset the word buffer (the speaker is kept for continuation).
        nonlocal current_speaker, current_text
        if current_speaker and current_text:
            speaker_transcription.append((current_speaker, ' '.join(current_text)))
        current_text = []

    for word in word_segments:
        word_start, word_end = word['timestamp']
        word_text = word['text']

        # Find the diarization segment covering this word's start time.
        matching_segment = None
        for segment, _, speaker in diarization_segments:
            if segment.start - tolerance <= word_start < segment.end + tolerance:
                matching_segment = (segment, speaker)
                break

        if matching_segment:
            segment, speaker = matching_segment
            if speaker != current_speaker:
                flush_current_segment()
                current_speaker = speaker

            # Handle long pauses: break the running segment when more than
            # one second of silence separates consecutive words.
            if word_start - last_word_end > 1.0:
                flush_current_segment()

            current_text.append(word_text)
            last_word_end = word_end
        else:
            # No segment matched: attribute the word to the last known
            # speaker, or to a placeholder when no speaker was seen yet.
            if not current_speaker:
                current_speaker = "SPEAKER_UNKNOWN"
            current_text.append(word_text)

    flush_current_segment()

    # Merge short (<= 3 word) consecutive entries from the same speaker.
    merged_transcription = []
    for speaker, text in speaker_transcription:
        if not merged_transcription or merged_transcription[-1][0] != speaker or len(text.split()) > 3:
            merged_transcription.append((speaker, text))
        else:
            merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)

    # BUG FIX: the merged result was computed but never returned, so the
    # whole function produced None and the alignment work was discarded.
    return merged_transcription