Update app.py
app.py CHANGED
@@ -49,7 +49,7 @@ pipe = pipeline(
 
 
 
-def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05
+def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05):
     word_segments = transcription_result['chunks']
     diarization_segments = list(diarization.itertracks(yield_label=True))
     speaker_transcription = []
@@ -60,18 +60,14 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
     def flush_current_segment():
         nonlocal current_speaker, current_text
         if current_speaker and current_text:
-            speaker_transcription.append({
-                "speaker": current_speaker,
-                "text": ' '.join(current_text),
-                "start": word_segments[len(speaker_transcription)]['timestamp'][0],
-                "end": word_segments[len(speaker_transcription) + len(current_text) - 1]['timestamp'][1]
-            })
+            speaker_transcription.append((current_speaker, ' '.join(current_text)))
             current_text = []
 
     for word in word_segments:
         word_start, word_end = word['timestamp']
         word_text = word['text']
 
+        # Find the matching diarization segment
         matching_segment = None
         for segment, _, speaker in diarization_segments:
             if segment.start - tolerance <= word_start < segment.end + tolerance:
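
For reference, a minimal sketch (not part of the commit) of the two input shapes this function consumes: `chunks` mirrors what a transformers ASR pipeline returns when called with `return_timestamps="word"`, and the diarization side is a hypothetical stand-in for pyannote's `Annotation`, whose `itertracks(yield_label=True)` yields `(segment, track, label)` triples. With the function above in scope, the call should return something like `[('SPEAKER_00', 'bonjour tout le monde')]`.

```python
# Hypothetical fixtures illustrating the expected input shapes.
from types import SimpleNamespace

transcription_result = {
    'text': 'bonjour tout le monde',
    'chunks': [  # word-level timestamps: {'text': ..., 'timestamp': (start, end)}
        {'text': 'bonjour', 'timestamp': (0.0, 0.4)},
        {'text': 'tout', 'timestamp': (0.5, 0.7)},
        {'text': 'le', 'timestamp': (0.7, 0.8)},
        {'text': 'monde', 'timestamp': (0.8, 1.1)},
    ],
}

class FakeDiarization:
    """Stand-in for pyannote.core.Annotation (illustration only)."""
    def __init__(self, tracks):
        self._tracks = tracks

    def itertracks(self, yield_label=False):
        # pyannote yields (segment, track_name, speaker_label) triples
        yield from self._tracks

diarization = FakeDiarization([
    (SimpleNamespace(start=0.0, end=1.2), 'A', 'SPEAKER_00'),
])

print(associate_speakers_with_timestamps(transcription_result, diarization))
```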
@@ -84,77 +80,32 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
                 flush_current_segment()
                 current_speaker = speaker
 
+            # Handle long pauses
             if word_start - last_word_end > 1.0:  # pause longer than one second
                 flush_current_segment()
 
             current_text.append(word_text)
             last_word_end = word_end
         else:
+            # If no segment matches, attribute the word to the last known speaker
             if current_speaker:
                 current_text.append(word_text)
             else:
+                # If this is the first unmatched word, start a new segment
                 current_speaker = "SPEAKER_UNKNOWN"
                 current_text.append(word_text)
 
     flush_current_segment()
 
-
-
-
-
-
-
-
-    return transcription
-
-    speaker_transcription = detect_interruptions(speaker_transcription)
-
-    def post_process_transcription(transcription, max_words):
-        processed = []
-        current_speaker = None
-        current_text = []
-        current_start = None
-        current_end = None
-
-        for segment in transcription:
-            if segment['speaker'] == current_speaker and len(' '.join(current_text + [segment['text']]).split()) <= max_words:
-                current_text.append(segment['text'])
-                current_end = segment['end']
-            else:
-                if current_speaker:
-                    processed.append({
-                        "speaker": current_speaker,
-                        "text": ' '.join(current_text),
-                        "start": current_start,
-                        "end": current_end
-                    })
-                current_speaker = segment['speaker']
-                current_text = [segment['text']]
-                current_start = segment['start']
-                current_end = segment['end']
-
-        if current_speaker:
-            processed.append({
-                "speaker": current_speaker,
-                "text": ' '.join(current_text),
-                "start": current_start,
-                "end": current_end
-            })
-
-        return processed
-
-    merged_transcription = post_process_transcription(speaker_transcription, max_words_to_merge)
-
-    speakers = sorted(set(segment['speaker'] for segment in merged_transcription))
-    metadata = {
-        "speaker_count": len(speakers),
-        "speakers": speakers
-    }
+    # Merge short segments from the same speaker
+    merged_transcription = []
+    for speaker, text in speaker_transcription:
+        if not merged_transcription or merged_transcription[-1][0] != speaker or len(text.split()) > 3:
+            merged_transcription.append((speaker, text))
+        else:
+            merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)
 
-    return {
-        "transcription": merged_transcription,
-        "metadata": metadata
-    }
+    return merged_transcription
 
 def simplify_diarization_output(speaker_transcription):
     simplified = []
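
The dead post-processing block is replaced by an inline merge over `(speaker, text)` tuples: a chunk of three words or fewer is glued onto the previous turn when the speaker is unchanged, while longer chunks always open a new entry. The merge logic, copied verbatim and run on a toy list:

```python
# Standalone run of the new merge rule (toy data, logic copied from above).
speaker_transcription = [
    ('SPEAKER_00', 'bonjour à tous'),
    ('SPEAKER_00', 'euh oui'),            # 2 words, same speaker -> merged
    ('SPEAKER_01', "merci de m'inviter"), # new speaker -> new entry
]

merged_transcription = []
for speaker, text in speaker_transcription:
    if not merged_transcription or merged_transcription[-1][0] != speaker or len(text.split()) > 3:
        merged_transcription.append((speaker, text))
    else:
        merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)

print(merged_transcription)
# [('SPEAKER_00', 'bonjour à tous euh oui'), ('SPEAKER_01', "merci de m'inviter")]
```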
@@ -245,46 +196,31 @@ def transcribe_and_diarize(file_path, task, progress=gr.Progress()):
     progress(1.0, desc="Terminé!")
     return "Transcription terminée!", transcription_result['text'], speaker_transcription
 
-def format_to_markdown(
-
-
-
-
-
-
-
-
-
-
-    metadata_text = "\n".join([
-        f"- **Date de traitement** : '{datetime.now().strftime('%d/%m/%Y %H:%M')}'",
-        f"- **Durée de l'audio** : '{audio_duration if audio_duration else '[à remplir]'} secondes'",
-        f"- **Lieu** : '{location if location else '[non spécifié]'}'",
-        f"- **Âge de l'intervenant** : '{f'{speaker_age} ans' if speaker_age else '[non spécifié]'}'",
-        f"- **Contexte** : '{context if context else '[non spécifié]'}'",
-        f"- **Nombre d'interlocuteurs** : '{speaker_count}'",
-        f"- **Interlocuteurs bruts** : '{', '.join(speakers)}'"
-    ])
-
+def format_to_markdown(transcription_text, speaker_transcription, audio_duration=None, location=None, speaker_age=None, context=None):
+    metadata = {
+        "Date de traitement": datetime.now().strftime('%d/%m/%Y %H:%M'),
+        "Durée de l'audio": f"{audio_duration} secondes" if audio_duration else "[à remplir]",
+        "Lieu": location if location else "[non spécifié]",
+        "Âge de l'intervenant": f"{speaker_age} ans" if speaker_age else "[non spécifié]",
+        "Contexte": context if context else "[non spécifié]"
+    }
+
+    metadata_text = "\n".join([f"- **{key}** : '{value}'" for key, value in metadata.items()])
+
     try:
-
-
-
-
-
-
-
-
-
-
-            display_speaker = speaker
-
-            formatted_transcription.append(f"**[{start_time} - {end_time}] {display_speaker}**: {text}")
-
-        transcription_text = "\n\n".join(formatted_transcription)
+        if isinstance(speaker_transcription, str):
+            speaker_transcription = parse_simplified_diarization(speaker_transcription)
+
+        if isinstance(speaker_transcription, list) and all(isinstance(item, tuple) and len(item) == 2 for item in speaker_transcription):
+            formatted_transcription = []
+            for speaker, text in speaker_transcription:
+                formatted_transcription.append(f"**{speaker}**: {text}")
+            transcription_text = "\n\n".join(formatted_transcription)
+        else:
+            raise ValueError("Invalid speaker transcription format")
     except Exception as e:
         print(f"Error formatting speaker transcription: {e}")
-        transcription_text = "Error formatting speaker transcription. Using raw transcription instead.\n\n" +
+        transcription_text = "Error formatting speaker transcription. Using raw transcription instead.\n\n" + transcription_text
 
     formatted_output = f"""
 # Transcription Formatée
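
The rewrite builds the metadata once as a dict and renders it with a single comprehension instead of seven hand-written f-strings. A quick standalone check (field values assumed) of what `metadata_text` comes out as:

```python
# Assumed sample values; the comprehension is copied from the new code.
from datetime import datetime

metadata = {
    "Date de traitement": datetime.now().strftime('%d/%m/%Y %H:%M'),
    "Durée de l'audio": "[à remplir]",
    "Âge de l'intervenant": "42 ans",
}
metadata_text = "\n".join([f"- **{key}** : '{value}'" for key, value in metadata.items()])
print(metadata_text)
# - **Date de traitement** : '18/06/2024 10:30'  (for example)
# - **Durée de l'audio** : '[à remplir]'
# - **Âge de l'intervenant** : '42 ans'
```

Note that the new signature also takes `speaker_transcription` and, when it arrives as a string, routes it through `parse_simplified_diarization` (defined elsewhere in app.py) before formatting.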
@@ -297,9 +233,6 @@ def format_to_markdown(transcription_result, audio_duration=None, location=None,
 """
     return formatted_output
 
-def format_time(seconds):
-    return f"{int(seconds // 60):02d}:{int(seconds % 60):02d}"
-
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -464,7 +397,6 @@ with demo:
         audio_duration = gr.Textbox(label="⏱️ Durée de l'audio (mm:ss)")
         location = gr.Textbox(label="📍 Lieu de l'enregistrement")
         speaker_age = gr.Number(label="👤 Âge de l'intervenant principal")
-        custom_speaker_names = gr.TextArea(label="Noms personnalisés des locuteurs (format: SPEAKER_00: Nom1, SPEAKER_01: Nom2)")
         context = gr.Textbox(label="📝 Contexte de l'enregistrement")
 
         format_button = gr.Button("✨ Générer la transcription formatée", elem_classes="button-secondary")
@@ -529,7 +461,7 @@ with demo:
     - Modèles :
       - [Whisper-médium](https://huggingface.co/openai/whisper-medium) : Model size - 764M params - Tensor type F32 -
       - [speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) : Model size - Unknow - Tensor type F32 -
-    - Version : V.2.0.
+    - Version : V.2.0.2-Bêta
     - Langues : FR, EN
     - Copyright : cc-by-nc-4.0
     - [En savoir +](https://huggingface.co/spaces/Woziii/scribe/blob/main/README.md)
@@ -543,9 +475,9 @@ with demo:
     )
 
     format_button.click(
-
-
-
+        format_to_markdown,
+        inputs=[raw_output, speaker_output, audio_duration, location, speaker_age, context],
+        outputs=formatted_output
     )
 
     mic_transcribe_button.click(
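
The click handler now passes the raw and speaker outputs plus the metadata fields positionally into `format_to_markdown`. As a reminder of the pattern (toy names, not the app's components), `Button.click` reads the current values of `inputs`, calls the function with them, and writes the return value to `outputs`:

```python
# Toy sketch of the Blocks click wiring used above; names are illustrative.
import gradio as gr

def fmt(raw_text, notes):
    # Stand-in for format_to_markdown: combine two inputs into markdown.
    return f"# Note\n\n{raw_text}\n\n*{notes}*"

with gr.Blocks() as demo:
    raw = gr.Textbox(label="Raw text")
    notes = gr.Textbox(label="Notes")
    out = gr.Markdown()
    btn = gr.Button("Format")
    btn.click(fmt, inputs=[raw, notes], outputs=out)

if __name__ == "__main__":
    demo.launch()
```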
@@ -574,4 +506,4 @@ with demo:
 
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()