Spaces:

pierreguillou
/

conversion_audio_vers_mp3

Sleeping

App Files Files Community

conversion_audio_vers_mp3 / app.py

pierreguillou

Update app.py

bb459b1 verified 3 months ago

raw

history blame contribute delete

8.38 kB

	import gradio as gr
	import torch
	from transformers import pipeline
	from pydub import AudioSegment, effects, silence
	import os
	from langdetect import detect
	from langdetect.lang_detect_exception import LangDetectException

	# --- Configuration ---
	# Whisper checkpoint used for language detection. "openai/whisper-tiny" is a
	# lighter alternative that was used previously.
	LANG_MODEL_NAME = "openai/whisper-base"

	# Run on CUDA device 0 in half precision when a GPU is available, otherwise
	# on the CPU in single precision.
	if torch.cuda.is_available():
	    device, torch_dtype = 0, torch.float16
	else:
	    device, torch_dtype = "cpu", torch.float32

	# Speech-recognition pipeline used only to detect the spoken language.
	lang_pipe = pipeline(
	    "automatic-speech-recognition",
	    model=LANG_MODEL_NAME,
	    torch_dtype=torch_dtype,
	    device=device,
	)

	# --- Fonctions utilitaires ---

	def convert_to_wav(audio_path):
	    """
	    Convert any audio file to a mono, 16 kHz WAV file.

	    The WAV file is written next to the input, using the input's path with
	    its extension replaced by ".wav".

	    NOTE(review): when ``audio_path`` already ends in ".wav" the output path
	    equals the input path, so the input file is overwritten - confirm this
	    is acceptable.

	    Args:
	        audio_path: Path of the audio file to convert.

	    Returns:
	        The path of the WAV file, or None if loading or exporting failed
	        (the error is printed).
	    """
	    try:
	        mono_16k = AudioSegment.from_file(audio_path).set_channels(1).set_frame_rate(16000)
	        stem, _ = os.path.splitext(audio_path)
	        wav_path = stem + ".wav"
	        mono_16k.export(wav_path, format="wav")
	    except Exception as e:
	        print(f"Erro ao converter para WAV: {e}")
	        return None
	    return wav_path

	def make_speech_head_wav(input_wav_path, max_seconds=7):
	    """
	    Write a short excerpt taken from the start of a WAV file.

	    The audio is normalized, leading silence is skipped (best effort), and
	    the first ``max_seconds`` seconds of what remains are exported next to
	    the input as "<input stem>_head_<max_seconds>s.wav".

	    Args:
	        input_wav_path: Path of the WAV file to read.
	        max_seconds: Maximum length of the excerpt, in seconds.

	    Returns:
	        The path of the excerpt, or ``input_wav_path`` unchanged when the
	        audio is no longer than ``max_seconds`` or when any step fails
	        (the error is printed).
	    """
	    try:
	        audio = AudioSegment.from_wav(input_wav_path)

	        # Already short enough: nothing to cut.
	        if len(audio) <= max_seconds * 1000:
	            return input_wav_path

	        normalized = effects.normalize(audio)

	        # Best-effort removal of the leading silence.
	        try:
	            silence_thresh = normalized.dBFS - 20
	            # Only scan the first 30 seconds to keep the search bounded.
	            search_audio = normalized[:30000]
	            start_trim = silence.detect_leading_silence(
	                search_audio,
	                silence_threshold=silence_thresh,
	                chunk_size=100
	            )
	            # Never drop more than 15 seconds, even if more was detected as silent.
	            start_trim = min(start_trim, 15000)
	            trimmed = normalized[start_trim:]
	        except Exception:
	            # Silence detection failed: keep the normalized audio untrimmed.
	            # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
	            # still propagate.
	            trimmed = normalized

	        # If less than 2 seconds remain after trimming, use the untrimmed audio.
	        if len(trimmed) < 2000:
	            trimmed = normalized

	        clip = trimmed[:max_seconds * 1000]

	        short_path = os.path.splitext(input_wav_path)[0] + f"_head_{max_seconds}s.wav"
	        clip.export(short_path, format="wav")
	        return short_path

	    except Exception as e:
	        print(f"Erro ao criar o trecho: {e}")
	        # Fall back to the full file so the caller can still analyse it.
	        return input_wav_path

	def detect_language_on_upload(filepath):
	    """
	    Guess the spoken language of an uploaded audio file.

	    The file is converted to WAV, a short excerpt from its start is
	    transcribed with ``lang_pipe``, and the language is taken from the
	    pipeline output when it reports a supported one, otherwise from running
	    langdetect on the transcribed text. No timeout is enforced here; the
	    work is only bounded by the short excerpt and the ``max_new_tokens``
	    limit.

	    Args:
	        filepath: Path of the uploaded audio file, or None.

	    Returns:
	        One of "fr", "en", "es", "de", "it", "pt", "nl", "pl", "ru", "ja",
	        "ko", "zh", or "auto" when ``filepath`` is None, the language is not
	        supported, the transcript is too short, or any step fails (the
	        error is printed).
	    """
	    if filepath is None:
	        return "auto"

	    # Raw language codes mapped to the values this function may return.
	    # Anything else would not be a valid entry for the language dropdown
	    # this function feeds.
	    supported_languages = {
	        'fr': 'fr', 'en': 'en', 'es': 'es', 'de': 'de', 'it': 'it',
	        'pt': 'pt', 'nl': 'nl', 'pl': 'pl', 'ru': 'ru', 'ja': 'ja',
	        'ko': 'ko', 'zh-cn': 'zh', 'zh-tw': 'zh', 'zh': 'zh',
	    }

	    try:
	        print(f"Début détection langue pour: {filepath}")

	        wav_filepath = convert_to_wav(filepath)
	        if not wav_filepath:
	            print("Échec conversion WAV")
	            return "auto"

	        # Build a short excerpt (7 seconds at most) to keep transcription fast.
	        short_wav = make_speech_head_wav(wav_filepath, max_seconds=7)
	        if not short_wav:
	            short_wav = wav_filepath

	        print(f"Analyse du fichier: {short_wav}")

	        # Transcribe with conservative settings.
	        outputs = lang_pipe(
	            short_wav,
	            chunk_length_s=5,
	            return_timestamps=False,
	            generate_kwargs={"max_new_tokens": 50}  # cap generation length
	        )

	        transcribed_text = outputs.get("text", "").strip()
	        print(f"Texte transcrit: {transcribed_text[:100]}...")

	        # Prefer a language reported by the pipeline, but only when it maps
	        # to a supported code; otherwise fall through to the text-based
	        # detection instead of returning an unsupported value.
	        whisper_lang = outputs.get("language")
	        if isinstance(whisper_lang, str):
	            normalized_lang = supported_languages.get(whisper_lang.strip().lower())
	            if normalized_lang:
	                print(f"Langue Whisper détectée: {whisper_lang}")
	                return normalized_lang

	        # Too little text for a reliable guess.
	        if len(transcribed_text) < 10:
	            print("Texte trop court, retour auto")
	            return "auto"

	        # Fallback: detect the language from the transcript.
	        detected_lang = detect(transcribed_text)
	        print(f"Langue LangDetect: {detected_lang}")

	        result = supported_languages.get(detected_lang, "auto")
	        print(f"Résultat final: {result}")
	        return result

	    except Exception as e:
	        # UI callback: never propagate, fall back to "auto".
	        print(f"Erreur détection langue: {e}")
	        return "auto"

	def ensure_mp3_same_name_as_input(input_path, source_wav_path):
	    """
	    Export a WAV file as a 192k MP3 named after the original input file.

	    The MP3 is named "<base name of input_path without extension>.mp3" and
	    is written as a relative path, i.e. into the current working directory.

	    Args:
	        input_path: Path of the original upload; only its base name is used.
	        source_wav_path: Path of the WAV file whose audio is exported.

	    Returns:
	        The relative path of the MP3 file, or None if the export failed
	        (the error is printed).
	    """
	    try:
	        stem = os.path.splitext(os.path.basename(input_path))[0]
	        mp3_path = stem + ".mp3"
	        AudioSegment.from_wav(source_wav_path).export(mp3_path, format="mp3", bitrate="192k")
	    except Exception as e:
	        print(f"Erro ao exportar MP3: {e}")
	        return None
	    return mp3_path

	# --- Fonction principale ---

	def make_output_mp3(filepath, language_choice):
	    """
	    Convert the uploaded audio file to MP3 and describe the language choice.

	    Args:
	        filepath: Path of the uploaded audio file, or None.
	        language_choice: Value of the language dropdown ("auto" or a code).

	    Returns:
	        A tuple (mp3 path, mp3 path, language message). The path is given
	        twice because it feeds both the download and the playback widgets.
	        Returns (None, None, "") when there is no input or the WAV
	        conversion failed.
	    """
	    nothing = (None, None, "")
	    if filepath is None:
	        return nothing

	    converted_wav = convert_to_wav(filepath)
	    if not converted_wav:
	        return nothing

	    output_mp3 = ensure_mp3_same_name_as_input(filepath, converted_wav)

	    # Message shown in the language information box.
	    language_info = (
	        "Langue détectée automatiquement"
	        if language_choice == "auto"
	        else f"Langue détectée: {language_choice}"
	    )

	    return output_mp3, output_mp3, language_info

	# --- Interface Gradio ---

	with gr.Blocks() as demo:
	    gr.HTML("<div style='text-align:center;'><h1>Conversion audio vers format MP3</h1></div>")
	    gr.Markdown("Uploadez un fichier audio. La sortie sera toujours un .mp3 avec le même nom de base, écoutable en ligne et téléchargeable.")

	    gr.Markdown("""
	## ⚡ Version optimisée
	- Détection rapide : Analyse les 7 premières secondes (hors silence initial)
	- Robuste : Fonctionne avec tous types de fichiers
	- Timeout protection : Évite les blocages
	""")

	    with gr.Row():
	        # Left column: input file, language selection and action buttons.
	        with gr.Column():
	            audio_input = gr.Audio(type="filepath", label="Envoyer un fichier audio")
	            language_dropdown = gr.Dropdown(
	                choices=["auto", "fr", "en", "es", "de", "it", "pt", "nl", "pl", "ru", "ja", "ko", "zh"],
	                value="auto",
	                label="Langue (auto = détection automatique)",
	                info="Détection automatique rapide après upload"
	            )
	            submit_btn = gr.Button("Générer MP3", variant="primary")
	            reset_btn = gr.Button("Reset", variant="secondary")
	        # Right column: language message plus the MP3 as download and player.
	        with gr.Column():
	            language_info_output = gr.Textbox(label="Information sur la langue", lines=1)
	            mp3_download = gr.File(label="Télécharger la sortie (.mp3)")
	            mp3_playback = gr.Audio(label="Écouter la sortie (.mp3)", type="filepath")

	    # Detect the language whenever the audio input changes and put the
	    # result into the dropdown.
	    audio_input.change(
	        fn=detect_language_on_upload,
	        inputs=audio_input,
	        outputs=language_dropdown
	    )

	    submit_btn.click(
	        fn=make_output_mp3,
	        inputs=[audio_input, language_dropdown],
	        outputs=[mp3_download, mp3_playback, language_info_output]
	    )

	    def reset_fields():
	        """Return the cleared values for every widget wired to the Reset button."""
	        # Order matches the `outputs` list of reset_btn.click below:
	        # audio input, download, playback, dropdown, language message.
	        return None, None, None, "auto", ""

	    # The playback widget is reset together with the download so that no
	    # previous MP3 stays loaded in the player after a reset.
	    reset_btn.click(
	        fn=reset_fields,
	        inputs=[],
	        outputs=[audio_input, mp3_download, mp3_playback, language_dropdown, language_info_output]
	    )

	# NOTE(review): share=True asks Gradio for a public share link; presumably
	# unnecessary when the app is already hosted - confirm before changing.
	demo.launch(share=True)