Spaces:

Woziii
/

scribe

Running

App Files Files Community

Woziii committed on Aug 20, 2024

Commit

b2a20a3

verified ·

1 Parent(s): 2b05a52

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -14

app.py CHANGED Viewed

@@ -49,7 +49,7 @@ pipe = pipeline(
-def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05):
     word_segments = transcription_result['chunks']
     diarization_segments = list(diarization.itertracks(yield_label=True))
     speaker_transcription = []
@@ -60,14 +60,18 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
     def flush_current_segment():
         nonlocal current_speaker, current_text
         if current_speaker and current_text:
-            speaker_transcription.append((current_speaker, ' '.join(current_text)))
             current_text = []
     for word in word_segments:
         word_start, word_end = word['timestamp']
         word_text = word['text']
-        # Trouver le segment de diarisation correspondant
         matching_segment = None
         for segment, _, speaker in diarization_segments:
             if segment.start - tolerance <= word_start < segment.end + tolerance:
@@ -80,32 +84,77 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
                 flush_current_segment()
                 current_speaker = speaker
-            # Gérer les pauses longues
             if word_start - last_word_end > 1.0:  # Pause de plus d'une seconde
                 flush_current_segment()
             current_text.append(word_text)
             last_word_end = word_end
         else:
-            # Si aucun segment ne correspond, attribuer au dernier locuteur connu
             if current_speaker:
                 current_text.append(word_text)
             else:
-                # Si c'est le premier mot sans correspondance, créer un nouveau segment
                 current_speaker = "SPEAKER_UNKNOWN"
                 current_text.append(word_text)
     flush_current_segment()
-    # Fusionner les segments courts du même locuteur
-    merged_transcription = []
-    for speaker, text in speaker_transcription:
-        if not merged_transcription or merged_transcription[-1][0] != speaker or len(text.split()) > 3:
-            merged_transcription.append((speaker, text))
-        else:
-            merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)
-    return merged_transcription
 def simplify_diarization_output(speaker_transcription):
     simplified = []

+def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05, max_words_to_merge=20):
     word_segments = transcription_result['chunks']
     diarization_segments = list(diarization.itertracks(yield_label=True))
     speaker_transcription = []
     def flush_current_segment():
         nonlocal current_speaker, current_text
         if current_speaker and current_text:
+            speaker_transcription.append({
+                "speaker": current_speaker,
+                "text": ' '.join(current_text),
+                "start": word_segments[len(speaker_transcription)]['timestamp'][0],
+                "end": word_segments[len(speaker_transcription) + len(current_text) - 1]['timestamp'][1]
+            })
             current_text = []
     for word in word_segments:
         word_start, word_end = word['timestamp']
         word_text = word['text']
         matching_segment = None
         for segment, _, speaker in diarization_segments:
             if segment.start - tolerance <= word_start < segment.end + tolerance:
                 flush_current_segment()
                 current_speaker = speaker
             if word_start - last_word_end > 1.0:  # Pause de plus d'une seconde
                 flush_current_segment()
             current_text.append(word_text)
             last_word_end = word_end
         else:
             if current_speaker:
                 current_text.append(word_text)
             else:
                 current_speaker = "SPEAKER_UNKNOWN"
                 current_text.append(word_text)
     flush_current_segment()
+    def detect_interruptions(transcription, time_threshold=0.5):
+        for i in range(len(transcription) - 1):
+            current_end = transcription[i]['end']
+            next_start = transcription[i+1]['start']
+            if next_start - current_end < time_threshold:
+                transcription[i]['text'] += ' [...]'
+                transcription[i+1]['text'] = '[...] ' + transcription[i+1]['text']
+        return transcription
+    speaker_transcription = detect_interruptions(speaker_transcription)
+    def post_process_transcription(transcription, max_words):
+        processed = []
+        current_speaker = None
+        current_text = []
+        current_start = None
+        current_end = None
+        for segment in transcription:
+            if segment['speaker'] == current_speaker and len(' '.join(current_text + [segment['text']]).split()) <= max_words:
+                current_text.append(segment['text'])
+                current_end = segment['end']
+            else:
+                if current_speaker:
+                    processed.append({
+                        "speaker": current_speaker,
+                        "text": ' '.join(current_text),
+                        "start": current_start,
+                        "end": current_end
+                    })
+                current_speaker = segment['speaker']
+                current_text = [segment['text']]
+                current_start = segment['start']
+                current_end = segment['end']
+        if current_speaker:
+            processed.append({
+                "speaker": current_speaker,
+                "text": ' '.join(current_text),
+                "start": current_start,
+                "end": current_end
+            })
+        return processed
+    merged_transcription = post_process_transcription(speaker_transcription, max_words_to_merge)
+    speakers = sorted(set(segment['speaker'] for segment in merged_transcription))
+    metadata = {
+        "speaker_count": len(speakers),
+        "speakers": speakers
+    }
+    return {
+        "transcription": merged_transcription,
+        "metadata": metadata
+    }
 def simplify_diarization_output(speaker_transcription):
     simplified = []