Spaces:

jojonocode
/

yawostt

Sleeping

App Files Files Community

yawostt / app.py

jojonocode

Update app.py

782ca36 verified about 1 month ago

raw

history blame contribute delete

4.31 kB

	import os
	import gradio as gr
	import librosa
	import numpy as np
	import ctranslate2
	from faster_whisper import WhisperModel

	# --- 1. MODEL CONFIGURATION AND CONVERSION ---
	MODEL_NAME = "abiyo27/whisper-small-ewe-2"
	CT2_MODEL_DIR = "whisper-small-ewe-2-ct2"

	# Run the conversion at startup unless the output directory already exists.
	_needs_conversion = not os.path.exists(CT2_MODEL_DIR)
	if _needs_conversion:
	    print(f"⏳ Conversion de {MODEL_NAME} au format CTranslate2 (int8)...")
	    print("Cela prendra environ une minute au premier lancement.")
	    # Convert the Hugging Face model named by MODEL_NAME with int8
	    # quantization and write the result to CT2_MODEL_DIR.
	    converter = ctranslate2.converters.TransformersConverter(MODEL_NAME)
	    converter.convert(
	        output_dir=CT2_MODEL_DIR,
	        quantization="int8",
	    )
	    print("✅ Conversion terminée !")

	# --- 2. OPTIMIZED LOADING (FASTER-WHISPER) ---
	print("🚀 Chargement du modèle faster-whisper en mémoire...")
	# The model is loaded on CPU with int8 compute, which the original author
	# chose for speed, and limited to two CPU threads.
	model = WhisperModel(
	    CT2_MODEL_DIR,
	    device="cpu",
	    compute_type="int8",
	    cpu_threads=2,
	)

	# --- 3. FONCTIONS DE TRAITEMENT ---
	def preprocess_audio(audio):
	    """Turn a ``(sample_rate, samples)`` pair into peak-normalized 16 kHz mono float32.

	    Args:
	        audio: ``None`` or a ``(sample_rate, samples)`` tuple where
	            ``samples`` is array-like, either 1-D or 2-D.

	    Returns:
	        A new 1-D ``float32`` array scaled so its peak absolute value is 1
	        (all-zero input is returned unscaled), resampled to 16000 Hz when
	        the incoming rate differs. ``None`` when ``audio`` is ``None`` or
	        holds no samples.
	    """
	    if audio is None:
	        return None
	    sr, y = audio
	    # astype() copies, so the in-place scaling below never touches the
	    # caller's buffer.
	    y = np.asarray(y).astype(np.float32)
	    if y.size == 0:
	        # np.max() raises on an empty array; treat it like missing audio.
	        return None
	    if y.ndim > 1:
	        # Downmix to mono by averaging channels. Assumes a
	        # (samples, channels) layout, as Gradio's numpy audio is documented
	        # to use - TODO confirm. Without this, resampling would run along
	        # the channel axis and a 2-D array would reach the model.
	        y = y.mean(axis=1)
	    # Peak normalization; the peak is computed once and reused.
	    peak = np.max(np.abs(y))
	    if peak > 0:
	        y /= peak
	    # The original author notes that faster-whisper requires 16000 Hz input.
	    if sr != 16000:
	        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
	    return y

	def transcribe(audio, state=""):
	    """Transcribe a complete uploaded file or microphone recording.

	    Args:
	        audio: Value forwarded to ``preprocess_audio``.
	        state: Text returned unchanged when preprocessing yields ``None``.

	    Returns:
	        The segment texts joined with single spaces and stripped of
	        surrounding whitespace, or ``state`` when there is no audio.
	    """
	    samples = preprocess_audio(audio)
	    if samples is None:
	        return state

	    # beam_size=5 is the original author's accuracy-oriented setting; the
	    # task is forced to "transcribe".
	    segments, _info = model.transcribe(samples, beam_size=5, task="transcribe")

	    # Stitch the generated segment texts together.
	    joined = " ".join(piece.text for piece in segments)
	    return joined.strip()

	def stream_transcribe(audio, state=""):
	    """Transcribe one streamed microphone input, favoring speed.

	    Args:
	        audio: Value forwarded to ``preprocess_audio``.
	        state: Text returned unchanged when preprocessing yields ``None``.

	    Returns:
	        The segment texts joined with single spaces and stripped of
	        surrounding whitespace, or ``state`` when there is no audio.
	    """
	    samples = preprocess_audio(audio)
	    if samples is None:
	        return state

	    # beam_size=1 is the original author's speed-oriented setting for
	    # streaming.
	    segments, _info = model.transcribe(samples, beam_size=1, task="transcribe")

	    joined = " ".join(piece.text for piece in segments)
	    return joined.strip()

	# --- 4. GRADIO INTERFACE ---
	# NOTE(review): the nesting below was reconstructed from statement order;
	# confirm that the output textbox belongs after (not inside) the button row.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    # Plain string: the original f-string had no placeholders.
	    gr.Markdown("# 🎙️ Ewe STT - Faster Whisper CPU")
	    gr.Markdown("Transcription ultra-rapide optimisée pour processeur. Traduction automatique du français vers l'Ewe ou transcription directe.")

	    with gr.Tabs():
	        # Tab 1: file upload and classic (non-streaming) recording.
	        with gr.TabItem("Upload ou Enregistrement"):
	            with gr.Row():
	                audio_input = gr.Audio(label="Audio (Fichier ou Micro)", type="numpy")
	            with gr.Row():
	                transcribe_btn = gr.Button("Transcrire", variant="primary")
	            output_text = gr.Textbox(label="Transcription Ewe", placeholder="Le texte apparaîtra ici...")

	            # Clicking the button runs transcribe() on the audio input and
	            # shows the result in the textbox; exposed under API name "predict".
	            transcribe_btn.click(
	                fn=transcribe,
	                inputs=audio_input,
	                outputs=output_text,
	                api_name="predict",
	            )

	        # Tab 2: real-time streaming from the microphone.
	        with gr.TabItem("Temps Réel (Streaming)"):
	            gr.Markdown("Note : Le streaming sur CPU gratuit reste expérimental, parlez clairement.")
	            stream_input = gr.Audio(
	                label="Microphone",
	                sources=["microphone"],
	                streaming=True,
	                type="numpy",
	            )
	            stream_output = gr.Textbox(label="Flux de transcription direct")

	            # Each stream event calls stream_transcribe() and writes its
	            # return value to the output textbox.
	            stream_input.stream(
	                fn=stream_transcribe,
	                inputs=stream_input,
	                outputs=stream_output,
	                show_progress="hidden",
	            )

	    # Footer. The separator is a plain "|": the previous "\|" was an invalid
	    # escape sequence in a non-raw string and left a stray backslash in the
	    # rendered HTML.
	    gr.HTML("""
	    <div style="text-align: center; color: #666; margin-top: 20px;">
	    Modèle utilisé : <b>yawo stt-ewe-2</b> | Optimisation : <b>CTranslate2 (INT8)</b>
	    </div>
	    """)

	if __name__ == "__main__":
	    # Enable the request queue before launching; the original author relies
	    # on it to handle several requests without overwhelming the CPU.
	    queued_demo = demo.queue()
	    queued_demo.launch()