import gradio as gr
import whisperx

# -----------------------------
# Device and compute settings
# -----------------------------

# Inference runs on CPU: free-tier Spaces only have CPU.
device = "cpu"
# int8 is used because float16 only works on GPU.
compute_type = "int8"

# -----------------------------
# Load WhisperX model
# -----------------------------

# Portuguese fine-tuned Whisper model, loaded once at import time and
# pinned to Portuguese ("pt") transcription.
model_name = "inesc-id/WhisperLv3-EP-X"
model = whisperx.load_model(
    model_name,
    device=device,
    compute_type=compute_type,
    language="pt",
    task="transcribe",
)

# -----------------------------
# Transcription function
# -----------------------------

def transcribe(audio_file):
    """Transcribe an audio file to Portuguese text.

    Args:
        audio_file: Filesystem path of the clip to transcribe. ``None`` or an
            empty path (no audio was provided) is accepted and yields an
            empty transcript.

    Returns:
        The transcript as a single string, with the text of the individual
        segments separated by one space, or ``""`` when there is no audio or
        no segment was produced.
    """
    # The UI hands over None when the form is submitted without a recording
    # or upload; return early instead of letting the audio loader fail on it.
    if not audio_file:
        return ""

    # Load audio and resample to 16 kHz
    audio = whisperx.load_audio(audio_file, sr=16000)

    outputs = model.transcribe(audio, batch_size=4, language="pt", task="transcribe")

    # Strip every segment before joining so that whitespace carried by a
    # segment does not become a leading space or doubled spaces in the output.
    pieces = (segment["text"].strip() for segment in outputs["segments"] or ())
    return " ".join(piece for piece in pieces if piece)


# -----------------------------
# Gradio interface
# -----------------------------

# Web UI: one audio input (microphone or upload, passed to `transcribe` as a
# file path) and a plain-text transcript as output.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="CAMÕES European Portuguese Automatic Speech Recognition Demo",
    # The model name shown to users matches the checkpoint id this app loads
    # (`inesc-id/WhisperLv3-EP-X`).
    description="""
This is a demo for **CAMÕES**, a Whisper Model fine-tuned on around 420h of European Portuguese by the HLT lab at INESC-ID.

The model being used here is "WhisperLv3-EP-X". For more details about CAMÕES check out the [paper here](https://arxiv.org/abs/2508.19721).
""",
)

# Start serving the interface.
demo.launch()