Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,11 @@ from transformers import logging
|
|
| 10 |
import math
|
| 11 |
import json
|
| 12 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# Suprimir advertencias
|
| 15 |
warnings.filterwarnings("ignore")
|
|
@@ -31,21 +36,34 @@ MODELS = {
|
|
| 31 |
"facebook/wav2vec2-large-xlsr-53-portuguese",
|
| 32 |
"openai/whisper-medium",
|
| 33 |
"jonatasgrosman/wav2vec2-xlsr-53-portuguese"
|
|
|
|
|
|
|
|
|
|
| 34 |
]
|
| 35 |
}
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# Función para verificar si ffmpeg está instalado
|
| 38 |
def verify_ffmpeg_installation():
|
| 39 |
try:
|
| 40 |
subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
|
| 41 |
except subprocess.CalledProcessError as e:
|
| 42 |
-
|
| 43 |
raise e
|
| 44 |
|
| 45 |
def convert_audio_to_wav(audio_path):
|
| 46 |
if os.path.isdir(audio_path):
|
| 47 |
raise ValueError(f"La ruta proporcionada es un directorio, no un archivo: {audio_path}")
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# Añadir la opción '-y' para sobrescribir el archivo existente sin preguntar
|
| 51 |
command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
|
|
@@ -53,8 +71,8 @@ def convert_audio_to_wav(audio_path):
|
|
| 53 |
process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 54 |
|
| 55 |
# Imprimir resultados para depuración
|
| 56 |
-
|
| 57 |
-
|
| 58 |
|
| 59 |
if process.returncode != 0:
|
| 60 |
raise ValueError(f"Error al convertir el archivo de audio a wav: {process.stderr.decode()}")
|
|
@@ -68,7 +86,7 @@ def detect_language(audio_path):
|
|
| 68 |
raise ValueError(f"Error al cargar el archivo de audio con librosa: {e}")
|
| 69 |
|
| 70 |
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
|
| 71 |
-
model =
|
| 72 |
|
| 73 |
input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
|
| 74 |
predicted_ids = model.generate(input_features)
|
|
@@ -90,45 +108,26 @@ def transcribe_audio_stream(audio, model_name):
|
|
| 90 |
duration = len(speech) / rate
|
| 91 |
|
| 92 |
transcriptions = []
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
end = min(i + chunk_duration, duration)
|
| 102 |
-
chunk = speech[int(i * rate):int(end * rate)]
|
| 103 |
-
|
| 104 |
-
input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
|
| 105 |
-
predicted_ids = model.generate(input_features)
|
| 106 |
-
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
| 107 |
-
|
| 108 |
-
progress = min(100, (end / duration) * 100)
|
| 109 |
-
transcriptions.append({
|
| 110 |
-
"start_time": i,
|
| 111 |
-
"end_time": end,
|
| 112 |
-
"text": transcription
|
| 113 |
-
})
|
| 114 |
-
yield transcriptions, progress
|
| 115 |
-
else:
|
| 116 |
-
transcriber = pipeline("automatic-speech-recognition", model=model_name)
|
| 117 |
|
| 118 |
-
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
"start_time": i,
|
| 128 |
-
"end_time": end,
|
| 129 |
-
"text": result["text"]
|
| 130 |
-
})
|
| 131 |
-
yield transcriptions, progress
|
| 132 |
|
| 133 |
def detect_and_select_model(audio):
|
| 134 |
wav_audio = convert_audio_to_wav(audio)
|
|
@@ -146,62 +145,66 @@ def save_transcription(transcriptions, file_format):
|
|
| 146 |
for entry in transcriptions:
|
| 147 |
tmp.write(f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n".encode())
|
| 148 |
file_path = tmp.name
|
| 149 |
-
|
| 150 |
return file_path
|
| 151 |
|
| 152 |
-
def combined_interface(audio, file_format):
|
| 153 |
try:
|
| 154 |
-
|
| 155 |
-
verify_ffmpeg_installation()
|
| 156 |
|
| 157 |
language, model_options = detect_and_select_model(audio)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
# Primer yield: Añadir None para la séptima salida (Archivo de Descarga)
|
| 163 |
-
yield
|
| 164 |
|
| 165 |
transcriptions = []
|
| 166 |
-
for partial_transcriptions, progress in transcribe_audio_stream(audio,
|
| 167 |
transcriptions = partial_transcriptions
|
| 168 |
full_transcription = " ".join([t["text"] for t in transcriptions])
|
| 169 |
progress_int = math.floor(progress)
|
| 170 |
status = f"Transcribing... {progress_int}% complete"
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
yield language, model_options, selected_model, full_transcription.strip(), progress_int, status, None
|
| 174 |
|
| 175 |
-
|
| 176 |
-
# Guardar transcripción
|
| 177 |
file_path = save_transcription(transcriptions, file_format)
|
| 178 |
-
print(f"Transcripci贸n guardada en: {file_path}")
|
| 179 |
|
| 180 |
-
# Verificar que file_path no es un directorio
|
| 181 |
if os.path.isdir(file_path):
|
| 182 |
raise ValueError(f"El archivo de transcripci贸n deber铆a ser un archivo, pero es un directorio: {file_path}")
|
| 183 |
|
| 184 |
-
# Verificar que el archivo existe
|
| 185 |
if not os.path.isfile(file_path):
|
| 186 |
raise ValueError(f"El archivo de transcripci贸n no existe: {file_path}")
|
| 187 |
|
| 188 |
-
# Limpiar archivos temporales
|
| 189 |
os.remove("converted_audio.wav")
|
| 190 |
-
|
| 191 |
|
| 192 |
-
|
| 193 |
-
yield language, model_options, selected_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path
|
| 194 |
|
| 195 |
except Exception as e:
|
| 196 |
-
|
| 197 |
-
# Asegurarse de que el yield de error también devuelva 7 valores
|
| 198 |
yield str(e), [], "", "An error occurred during processing.", 0, "Error", ""
|
| 199 |
|
| 200 |
iface = gr.Interface(
|
| 201 |
fn=combined_interface,
|
| 202 |
inputs=[
|
| 203 |
gr.Audio(type="filepath", label="Upload Audio File"),
|
| 204 |
-
gr.Radio(choices=["JSON", "TXT"], label="Choose output format")
|
|
|
|
|
|
|
|
|
|
| 205 |
],
|
| 206 |
outputs=[
|
| 207 |
gr.Textbox(label="Detected Language"),
|
|
@@ -213,7 +216,7 @@ iface = gr.Interface(
|
|
| 213 |
gr.File(label="Download Transcription")
|
| 214 |
],
|
| 215 |
title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
|
| 216 |
-
description="Upload an audio file to detect the language,
|
| 217 |
live=True
|
| 218 |
)
|
| 219 |
|
|
|
|
| 10 |
import math
|
| 11 |
import json
|
| 12 |
import tempfile
|
| 13 |
+
import logging
|
| 14 |
+
import concurrent.futures
|
| 15 |
+
|
| 16 |
+
# Configurar logging
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
|
| 19 |
# Suprimir advertencias
|
| 20 |
warnings.filterwarnings("ignore")
|
|
|
|
| 36 |
"facebook/wav2vec2-large-xlsr-53-portuguese",
|
| 37 |
"openai/whisper-medium",
|
| 38 |
"jonatasgrosman/wav2vec2-xlsr-53-portuguese"
|
| 39 |
+
],
|
| 40 |
+
"fr": [
|
| 41 |
+
"jonatasgrosman/wav2vec2-large-xlsr-53-french"
|
| 42 |
]
|
| 43 |
}
|
| 44 |
|
| 45 |
+
# Model cache so that each checkpoint is loaded at most once per process.
model_cache = {}


def get_model(model_name):
    """Return the model registered under *model_name*, loading it on first use.

    The first request for a name loads the checkpoint with
    ``WhisperForConditionalGeneration.from_pretrained`` and stores the result
    in the module-level ``model_cache``; later requests return the stored
    object without loading again.

    NOTE(review): every name is loaded through the Whisper class, while
    callers also pass names taken from ``MODELS`` -- confirm that non-Whisper
    checkpoints are meant to go through this path.

    Args:
        model_name: Checkpoint identifier, also used as the cache key.

    Returns:
        The cached model instance for ``model_name``.
    """
    # Cache hit: hand back the instance loaded earlier.
    if model_name in model_cache:
        return model_cache[model_name]

    # Cache miss: load once, remember it, and return the fresh instance.
    loaded_model = WhisperForConditionalGeneration.from_pretrained(model_name)
    model_cache[model_name] = loaded_model
    return loaded_model
|
| 52 |
+
|
| 53 |
# Función para verificar si ffmpeg está instalado
|
| 54 |
def verify_ffmpeg_installation():
    """Check that the ``ffmpeg`` executable can be started and exits cleanly.

    Runs ``ffmpeg -version`` with stdout and stderr captured. On failure an
    error is logged and the original exception is re-raised unchanged.

    Raises:
        subprocess.CalledProcessError: ffmpeg started but exited with a
            non-zero status.
        OSError: ffmpeg could not be started at all, e.g.
            ``FileNotFoundError`` when the binary is not on ``PATH``.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # A missing binary surfaces as OSError (FileNotFoundError), not as
        # CalledProcessError, so both must be caught for the "not installed"
        # message to be logged in the case it describes.
        logging.error("ffmpeg no está instalado o no se puede ejecutar correctamente.")
        # Bare raise keeps the original exception and traceback intact.
        raise
|
| 60 |
|
| 61 |
def convert_audio_to_wav(audio_path):
|
| 62 |
if os.path.isdir(audio_path):
|
| 63 |
raise ValueError(f"La ruta proporcionada es un directorio, no un archivo: {audio_path}")
|
| 64 |
+
|
| 65 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
| 66 |
+
wav_path = tmp.name
|
| 67 |
|
| 68 |
# Añadir la opción '-y' para sobrescribir el archivo existente sin preguntar
|
| 69 |
command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
|
|
|
|
| 71 |
process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 72 |
|
| 73 |
# Imprimir resultados para depuración
|
| 74 |
+
logging.info(process.stdout.decode())
|
| 75 |
+
logging.error(process.stderr.decode())
|
| 76 |
|
| 77 |
if process.returncode != 0:
|
| 78 |
raise ValueError(f"Error al convertir el archivo de audio a wav: {process.stderr.decode()}")
|
|
|
|
| 86 |
raise ValueError(f"Error al cargar el archivo de audio con librosa: {e}")
|
| 87 |
|
| 88 |
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
|
| 89 |
+
model = get_model("openai/whisper-base")
|
| 90 |
|
| 91 |
input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
|
| 92 |
predicted_ids = model.generate(input_features)
|
|
|
|
| 108 |
duration = len(speech) / rate
|
| 109 |
|
| 110 |
transcriptions = []
|
| 111 |
+
processor = WhisperProcessor.from_pretrained(model_name)
|
| 112 |
+
model = get_model(model_name)
|
| 113 |
+
|
| 114 |
+
chunk_duration = 30 # segundos
|
| 115 |
+
|
| 116 |
+
for i in range(0, int(duration), chunk_duration):
|
| 117 |
+
end = min(i + chunk_duration, duration)
|
| 118 |
+
chunk = speech[int(i * rate):int(end * rate)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
+
input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
|
| 121 |
+
predicted_ids = model.generate(input_features)
|
| 122 |
+
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
| 123 |
|
| 124 |
+
progress = min(100, (end / duration) * 100)
|
| 125 |
+
transcriptions.append({
|
| 126 |
+
"start_time": i,
|
| 127 |
+
"end_time": end,
|
| 128 |
+
"text": transcription
|
| 129 |
+
})
|
| 130 |
+
yield transcriptions, progress
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
def detect_and_select_model(audio):
|
| 133 |
wav_audio = convert_audio_to_wav(audio)
|
|
|
|
| 145 |
for entry in transcriptions:
|
| 146 |
tmp.write(f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n".encode())
|
| 147 |
file_path = tmp.name
|
| 148 |
+
logging.info(f"Archivo de transcripci贸n guardado en: {file_path}")
|
| 149 |
return file_path
|
| 150 |
|
| 151 |
+
def combined_interface(audio, file_format, confirmed_language, chosen_model):
    """Run the full pipeline for one upload, streaming UI updates as it goes.

    Generator used as the Gradio callback. Every yielded tuple has seven
    items, in this order: language, model options, selected model,
    transcription text so far, integer progress (0-100), status message, and
    the path of the saved transcription (``None`` until the final yield).

    Args:
        audio: Path of the uploaded audio file.
        file_format: Output format, passed through to ``save_transcription``.
        confirmed_language: Language chosen by the user; when falsy, the
            detected language is used instead.
        chosen_model: Model chosen by the user; when falsy, the first of the
            suggested models is used.

    Yields:
        Seven-item tuples as described above. Exceptions are not propagated:
        any failure is logged and reported as a single error tuple whose first
        item is the error text.
    """
    try:
        logging.info(f"Ruta del archivo de audio subido: {audio}")
        verify_ffmpeg_installation()

        language, model_options = detect_and_select_model(audio)

        # Prefer the language confirmed by the user; otherwise use the
        # automatically detected one.
        if not confirmed_language:
            confirmed_language = language

        # Prefer the model picked by the user; otherwise suggest the first
        # available option.
        if not chosen_model:
            chosen_model = model_options[0]

        logging.info(f"Idioma detectado: {confirmed_language}")
        logging.info(f"Modelos disponibles: {model_options}")
        logging.info(f"Modelo seleccionado: {chosen_model}")

        # First yield: None fills the seventh output (download file).
        yield confirmed_language, model_options, chosen_model, "", 0, "Initializing...", None

        transcriptions = []
        # Pre-bind so the final yield still works when the stream produces no
        # chunks at all (e.g. audio shorter than one second).
        full_transcription = ""
        for partial_transcriptions, progress in transcribe_audio_stream(audio, chosen_model):
            transcriptions = partial_transcriptions
            full_transcription = " ".join(t["text"] for t in transcriptions)
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            logging.info(f"Progreso: {progress_int}%")
            yield confirmed_language, model_options, chosen_model, full_transcription.strip(), progress_int, status, None

        logging.info("Guardando transcripción.")
        file_path = save_transcription(transcriptions, file_format)

        if os.path.isdir(file_path):
            raise ValueError(f"El archivo de transcripción debería ser un archivo, pero es un directorio: {file_path}")

        if not os.path.isfile(file_path):
            raise ValueError(f"El archivo de transcripción no existe: {file_path}")

        # Best-effort cleanup of the legacy fixed-name WAV. The conversion
        # step writes to a tempfile path, so this file may not exist; a
        # missing file must not turn a finished transcription into an error.
        try:
            os.remove("converted_audio.wav")
        except FileNotFoundError:
            pass
        logging.info("Archivos temporales limpiados.")

        yield confirmed_language, model_options, chosen_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path

    except Exception as e:
        logging.error(f"Error: {e}")
        # Keep seven values on the error path too; None (not "") leaves the
        # file output empty, matching every other yield.
        yield str(e), [], "", "An error occurred during processing.", 0, "Error", None
|
| 199 |
|
| 200 |
iface = gr.Interface(
|
| 201 |
fn=combined_interface,
|
| 202 |
inputs=[
|
| 203 |
gr.Audio(type="filepath", label="Upload Audio File"),
|
| 204 |
+
gr.Radio(choices=["JSON", "TXT"], label="Choose output format"),
|
| 205 |
+
gr.Dropdown(choices=["", "es", "en", "pt", "fr"], label="Confirm detected language (optional)"),
|
| 206 |
+
gr.Dropdown(choices=["", "openai/whisper-large-v3", "facebook/wav2vec2-large-xlsr-53-spanish",
|
| 207 |
+
"jonatasgrosman/wav2vec2-xls-r-1b-spanish", "microsoft/wav2vec2-base-960h"], label="Choose model (optional)")
|
| 208 |
],
|
| 209 |
outputs=[
|
| 210 |
gr.Textbox(label="Detected Language"),
|
|
|
|
| 216 |
gr.File(label="Download Transcription")
|
| 217 |
],
|
| 218 |
title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
|
| 219 |
+
description="Upload an audio file to detect the language, confirm the detection or choose a model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
|
| 220 |
live=True
|
| 221 |
)
|
| 222 |
|