Woziii committed on
Commit
a8ecd5f
·
verified ·
1 Parent(s): d0523d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -68
app.py CHANGED
@@ -1,15 +1,23 @@
 
 
 
 
 
1
  import os
2
  import shutil
3
  import zipfile
4
- from pathlib import Path
5
  import torch
 
 
6
  import gradio as gr
7
  from pydub import AudioSegment
8
  from transformers import pipeline
9
 
 
10
  # Configuration
 
11
  MODEL_NAME = "openai/whisper-large-v3"
12
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
14
  pipe = pipeline(
15
  task="automatic-speech-recognition",
@@ -21,49 +29,81 @@ pipe = pipeline(
21
  TEMP_DIR = "./temp_audio"
22
  os.makedirs(TEMP_DIR, exist_ok=True)
23
 
24
- # Initialisation de l'état
25
def init_metadata_state():
    """Initial value for the Gradio metadata State: an empty segment list."""
    return list()
27
 
28
def transcribe_audio(audio_path):
    """Run Whisper on *audio_path* and seed the segment-editing table.

    Returns a 3-tuple consumed by the Gradio wiring:
    (full transcription text, initial table rows, audio path).
    """
    # Guard clause: nothing uploaded yet.
    if not audio_path:
        return "Aucun fichier audio fourni", [], None

    result = pipe(audio_path, return_timestamps="word", generate_kwargs={"language": "french"})
    word_chunks = result.get("chunks", [])
    full_text = result.get("text", "")

    # One editable row per recognized word: [text, start, end, id],
    # with times and id left for the user to fill in.
    initial_rows = [[chunk["text"], None, None, ""] for chunk in word_chunks]

    return full_text, initial_rows, audio_path
 
 
41
 
42
- def validate_segments(audio_path, table_data, metadata_state):
43
- if not audio_path:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  return [], metadata_state
45
 
46
  if os.path.exists(TEMP_DIR):
47
  shutil.rmtree(TEMP_DIR)
48
  os.makedirs(TEMP_DIR, exist_ok=True)
49
-
50
  original_audio = AudioSegment.from_file(audio_path)
51
  segment_paths = []
52
  updated_metadata = []
53
-
54
- for i, row in enumerate(table_data):
55
- if len(row) < 4:
56
- continue
57
- text, start_time, end_time, segment_id = row
58
-
59
- if not text or start_time is None or end_time is None:
60
- continue
61
- if not segment_id:
62
- segment_id = f"seg_{i+1:02d}"
63
 
64
- start_ms = int(float(start_time) * 1000)
65
- end_ms = int(float(end_time) * 1000)
 
 
 
 
66
  if start_ms < 0 or end_ms <= start_ms:
 
67
  continue
68
 
69
  segment_filename = f"{Path(audio_path).stem}_{segment_id}.wav"
@@ -80,53 +120,31 @@ def validate_segments(audio_path, table_data, metadata_state):
80
  "end_time": end_time,
81
  "id": segment_id,
82
  })
 
83
 
84
  return segment_paths, updated_metadata
85
 
86
def generate_zip(metadata_state):
    """Bundle metadata.csv plus every referenced audio segment into dataset.zip.

    Returns the archive path inside TEMP_DIR, or None when there is nothing
    to export.
    """
    if not metadata_state:
        return None

    archive_path = os.path.join(TEMP_DIR, "dataset.zip")
    # Drop any stale archive left over from a previous export.
    if os.path.exists(archive_path):
        os.remove(archive_path)

    # Write the pipe-delimited metadata file expected downstream.
    csv_path = os.path.join(TEMP_DIR, "metadata.csv")
    header = "audio_file|text|speaker_name|API\n"
    rows = [
        f"{seg['audio_file']}|{seg['text']}|projectname|/API_PHONETIC/\n"
        for seg in metadata_state
    ]
    with open(csv_path, "w", encoding="utf-8") as csv_file:
        csv_file.write(header)
        csv_file.writelines(rows)

    # Pack the CSV plus every segment file that actually exists on disk.
    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        archive.write(csv_path, "metadata.csv")
        for seg in metadata_state:
            segment_path = os.path.join(TEMP_DIR, seg["audio_file"])
            if os.path.exists(segment_path):
                archive.write(segment_path, seg["audio_file"])

    return archive_path
109
-
110
# Gradio UI: upload audio, review the Whisper transcription, define segments
# in a 4-column grid, then cut the audio and export a dataset ZIP.
with gr.Blocks() as demo:
    gr.Markdown("# Application de Découpe Audio (Fixe)")
    # Session-scoped list of validated segment metadata dicts.
    metadata_state = gr.State(init_metadata_state())

    with gr.Column():
        gr.Markdown("### 1. Téléversez un fichier audio")
        audio_input = gr.Audio(type="filepath", label="Fichier audio")

        raw_transcription = gr.Textbox(label="Transcription (Whisper)", interactive=False)

        gr.Markdown("### 2. Définissez vos segments")
        # Editing grid: text, start (s), end (s), segment id.
        table = gr.Dataframe(headers=["Texte", "Début (s)", "Fin (s)", "ID"], datatype=["str", "number", "number", "str"], row_count=5)

        validate_button = gr.Button("Valider et générer les extraits")
        generate_button = gr.Button("Générer le fichier ZIP")
        zip_file = gr.File(label="Télécharger le ZIP")

        # Fixed bank of 5 players wired as the first 5 outputs of validate_segments.
        # NOTE(review): presumably validate_segments must yield exactly 5 paths for
        # this wiring to behave — confirm against its return value.
        audio_players = [gr.Audio(label=f"Extrait {i+1}", interactive=False) for i in range(5)]

        validate_button.click(fn=validate_segments, inputs=[audio_input, table, metadata_state], outputs=audio_players + [metadata_state])
        generate_button.click(fn=generate_zip, inputs=metadata_state, outputs=zip_file)

demo.queue().launch()
 
1
+ # Version: Corrected After Test 3 (V2 - Improved with Scribe-based Timestamp Handling, Debugging Logs Added)
2
+ # Description: Cette version intègre l'affichage des timestamps pour chaque mot,
3
+ # permet une correction manuelle des erreurs, et ajoute une étape intermédiaire
4
+ # avant la validation finale avec des logs détaillés pour le débogage.
5
+
6
  import os
7
  import shutil
8
  import zipfile
 
9
  import torch
10
+ import numpy as np
11
+ from pathlib import Path
12
  import gradio as gr
13
  from pydub import AudioSegment
14
  from transformers import pipeline
15
 
16
+ # -------------------------------------------------
17
  # Configuration
18
+ # -------------------------------------------------
19
  MODEL_NAME = "openai/whisper-large-v3"
20
+ device = "cuda" if torch.cuda.is_available() else "cpu"
21
 
22
  pipe = pipeline(
23
  task="automatic-speech-recognition",
 
29
  TEMP_DIR = "./temp_audio"
30
  os.makedirs(TEMP_DIR, exist_ok=True)
31
 
 
32
def init_metadata_state():
    """Return the initial (empty) list backing the metadata gr.State."""
    return list()
34
 
35
def transcribe_audio(audio_path):
    """Transcribe *audio_path* with Whisper at word granularity.

    Returns the 5-tuple consumed by the Gradio wiring:
    (raw transcription text, empty table seed, audio path,
     [(word, (start, end)), ...], display string with per-word timestamps).
    On missing input or empty recognition, returns a French status message
    with empty/None placeholders.
    """
    if not audio_path:
        print("[LOG] Aucun fichier audio fourni.")
        return "Aucun fichier audio fourni", [], None, [], ""

    print(f"[LOG] Début de la transcription de {audio_path}...")
    result = pipe(audio_path, return_timestamps="word")
    words = result.get("chunks", [])

    if not words:
        print("[LOG ERROR] Erreur : Aucun timestamp détecté.")
        return "Erreur : Aucun timestamp détecté.", [], None, [], ""

    raw_transcription = " ".join(w["text"] for w in words)

    # Bug fix: Whisper may report the final word's end timestamp as None;
    # the previous f"{end:.2f}" formatting then raised TypeError. Fall back
    # to the word's start so formatting and downstream arithmetic are safe.
    word_timestamps = []
    for w in words:
        start, end = w["timestamp"]
        word_timestamps.append((w["text"], (start, end if end is not None else start)))

    transcription_with_timestamps = " ".join(
        f"{word}[{span[0]:.2f}-{span[1]:.2f}]" for word, span in word_timestamps
    )

    print(f"[LOG] Transcription brute : {raw_transcription}")
    print(f"[LOG DETAIL] Timestamps associés : {word_timestamps}")
    return raw_transcription, [], audio_path, word_timestamps, transcription_with_timestamps
56
 
57
def preprocess_segments(table_data, word_timestamps):
    """Fill in start/end times and IDs for each user-entered segment text.

    Args:
        table_data: table rows whose first cell is the segment text.
        word_timestamps: [(word, (start, end)), ...] from transcribe_audio,
            or None while no audio has been transcribed yet.

    Returns:
        Rows shaped [text, start_time, end_time, segment_id]; times stay None
        when no transcript word matched the segment text.
    """
    print("[LOG] Début du prétraitement des segments...")
    # Bug fix: the word_timestamps gr.State defaults to None before any
    # transcription; iterating it raised TypeError. Treat None as empty.
    word_timestamps = word_timestamps or []

    formatted_data = []
    for i, row in enumerate(table_data):
        if not row or len(row) < 1 or not row[0].strip():
            print(f"[LOG WARNING] Ignoré : ligne vide à l'index {i}.")
            continue

        text = row[0].strip()
        segment_id = f"seg_{i+1:02d}"

        start_time, end_time = None, None
        words_in_segment = text.split()
        # NOTE(review): matching is bag-of-words over the WHOLE transcript,
        # so a word repeated elsewhere in the audio stretches the segment;
        # a contiguous-subsequence match would be more precise — confirm
        # whether that looseness is acceptable for this dataset.
        segment_indices = []
        for j, (word, (start, end)) in enumerate(word_timestamps):
            if word in words_in_segment:
                segment_indices.append((j, start, end))

        if segment_indices:
            start_time = segment_indices[0][1]
            last_end = segment_indices[-1][2]
            # Bug fix: explicit None check — a legitimate end time of 0.0 is
            # falsy and was previously replaced by start_time + 0.5.
            end_time = last_end if last_end is not None else start_time + 0.5

        formatted_data.append([text, start_time, end_time, segment_id])
        print(f"[LOG] Segment ajouté : {text} | Début: {start_time}, Fin: {end_time}, ID: {segment_id}")

    return formatted_data
84
+
85
+ def validate_segments(audio_path, table_data, metadata_state, word_timestamps):
86
+ print("[LOG] Début de la validation des segments...")
87
+ if not audio_path or not word_timestamps:
88
+ print("[LOG ERROR] Erreur : Aucun timestamp valide trouvé !")
89
  return [], metadata_state
90
 
91
  if os.path.exists(TEMP_DIR):
92
  shutil.rmtree(TEMP_DIR)
93
  os.makedirs(TEMP_DIR, exist_ok=True)
94
+
95
  original_audio = AudioSegment.from_file(audio_path)
96
  segment_paths = []
97
  updated_metadata = []
 
 
 
 
 
 
 
 
 
 
98
 
99
+ for text, start_time, end_time, segment_id in table_data:
100
+ if start_time is None or end_time is None:
101
+ print(f"[LOG ERROR] Timestamp manquant pour : {text}")
102
+ continue
103
+
104
+ start_ms, end_ms = int(float(start_time) * 1000), int(float(end_time) * 1000)
105
  if start_ms < 0 or end_ms <= start_ms:
106
+ print(f"[LOG ERROR] Problème de découpage : {text} | {start_time}s - {end_time}s")
107
  continue
108
 
109
  segment_filename = f"{Path(audio_path).stem}_{segment_id}.wav"
 
120
  "end_time": end_time,
121
  "id": segment_id,
122
  })
123
+ print(f"[LOG] Extrait généré : {segment_filename}")
124
 
125
  return segment_paths, updated_metadata
126
 
127
+ # -------------------------------------------------
128
+ # Interface Gradio
129
+ # -------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
# Gradio UI: upload audio -> auto-transcribe; type segment texts in the table;
# generate timestamps; validate to cut the audio; export a dataset ZIP.
with gr.Blocks() as demo:
    gr.Markdown("# Application de Découpe Audio")
    # Persistent per-session state: validated segment metadata and cut clips.
    metadata_state = gr.State(init_metadata_state())
    extracted_segments = gr.State([])

    audio_input = gr.Audio(type="filepath", label="Fichier audio")
    raw_transcription = gr.Textbox(label="Transcription", interactive=False)
    transcription_timestamps = gr.Textbox(label="Transcription avec Timestamps", interactive=False)
    # Single-column grid: one segment text per row.
    # NOTE(review): preprocess_segments returns 4-column rows
    # [text, start, end, id] into this 1-column Dataframe — confirm Gradio
    # accepts the shape change or widen the component.
    table = gr.Dataframe(headers=["Texte"], datatype=["str"], row_count=(1, "dynamic"), col_count=1)
    generate_timestamps_button = gr.Button("Générer les timestamps")
    validate_button = gr.Button("Valider")
    generate_button = gr.Button("Générer ZIP")
    zip_file = gr.File(label="Télécharger le ZIP")
    # Holds [(word, (start, end)), ...] from transcribe_audio; None until then.
    word_timestamps = gr.State()

    # Uploading audio triggers transcription and fills the states/textboxes.
    audio_input.change(transcribe_audio, inputs=audio_input, outputs=[raw_transcription, table, audio_input, word_timestamps, transcription_timestamps])
    generate_timestamps_button.click(preprocess_segments, inputs=[table, word_timestamps], outputs=table)
    validate_button.click(validate_segments, inputs=[audio_input, table, metadata_state, word_timestamps], outputs=[extracted_segments, metadata_state])
    # NOTE(review): generate_zip is not defined anywhere in this version of the
    # file (the previous implementation was deleted in this commit), so this
    # line raises NameError at startup — restore or re-implement generate_zip.
    generate_button.click(generate_zip, inputs=metadata_state, outputs=zip_file)

demo.queue().launch()