"""Gradio demo app: European Portuguese speech recognition with WhisperX.

Loads a Portuguese fine-tuned Whisper checkpoint once at import time and
exposes a single-function Gradio interface that turns a recorded or uploaded
audio file into text.
"""

from typing import Optional

import gradio as gr
import whisperx

# -----------------------------
# Device and compute settings
# -----------------------------
device = "cpu"  # Free-tier Spaces only have CPU
compute_type = "int8"  # float16 only works on GPU

# -----------------------------
# Transcription settings
# -----------------------------
LANGUAGE = "pt"
TASK = "transcribe"
SAMPLE_RATE = 16000  # sample rate (Hz) requested from whisperx.load_audio
BATCH_SIZE = 4

# -----------------------------
# Load WhisperX model
# -----------------------------
model_name = "inesc-id/WhisperLv3-EP-X"  # Portuguese fine-tuned Whisper model
model = whisperx.load_model(
    model_name,
    device=device,
    compute_type=compute_type,
    language=LANGUAGE,
    task=TASK,
)


# -----------------------------
# Transcription function
# -----------------------------
def transcribe(audio_file: Optional[str]) -> str:
    """Transcribe an audio file to Portuguese text.

    Args:
        audio_file: Path to the audio file handed over by the Gradio audio
            component, or ``None`` when the user submitted without recording
            or uploading anything.

    Returns:
        The text of all transcribed segments joined by single spaces, or an
        empty string when there is no audio or no segment was produced.
    """
    # Gradio passes None when nothing was recorded/uploaded; loading it would
    # raise, so answer with an empty transcript instead.
    if not audio_file:
        return ""

    audio = whisperx.load_audio(audio_file, sr=SAMPLE_RATE)
    outputs = model.transcribe(
        audio,
        batch_size=BATCH_SIZE,
        language=LANGUAGE,
        task=TASK,
    )

    # Strip each segment before joining so the transcript has no leading
    # space or doubled spaces; joining an empty sequence already yields "".
    pieces = (segment["text"].strip() for segment in outputs["segments"])
    return " ".join(piece for piece in pieces if piece)


# -----------------------------
# Gradio interface
# -----------------------------
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="CAMÕES European Portuguese Automatic Speech Recognition Demo",
    description=(
        " This is a demo for **CAMÕES**, a Whisper Model fine-tuned on around "
        "420h of European Portuguese by the HLT lab at INESC-ID. "
        'The model being used here is "WhisperLv3-X". '
        "For more details about CAMÕES check out the "
        "[paper here](https://arxiv.org/abs/2508.19721). "
    ),
)

# Launched unconditionally at module level, exactly as the original script did.
demo.launch()