# ipa-recognizer
#
# Sleeping
#
# File size: 3,989 Bytes

import gradio as gr
import torch
import shutil
from pathlib import Path
from huggingface_hub import hf_hub_download
import spaces


# Optional NeMo import: when the toolkit is missing, the module still imports
# and ASRModel is left as None so that setup_model() can report the problem
# with an explicit ImportError instead of failing here.
try:
    from nemo.collections.asr.models import ASRModel
except ImportError:
    print("Warning: NeMo toolkit not found. Please ensure it is installed via requirements.txt")
    # Sentinel checked by setup_model().
    ASRModel = None

def setup_model():
    """Download, assemble and return the fine-tuned IPA phoneme recognizer.

    Steps, in order:
      1. Fetch the checkpoint and tokenizer files from the
         ``boldvoice/nemotron-phoneme-ipa-v1`` Hugging Face repo.
      2. Copy the tokenizer files into a local ``tokenizer/`` directory
         (relative to the current working directory).
      3. Load the ``nvidia/nemotron-speech-streaming-en-0.6b`` base model and
         switch it to the downloaded tokenizer.
      4. Overlay the fine-tuned weights non-strictly, printing a warning for
         any parameter names that did not match.
      5. Adjust the greedy decoding config, set the encoder attention context
         size, move the model to CUDA when available (CPU otherwise) and put
         it in eval mode.

    Returns:
        The configured model, in eval mode.

    Raises:
        ImportError: If the NeMo toolkit could not be imported.
    """
    if ASRModel is None:
        raise ImportError("NeMo toolkit is required but not installed.")

    repo_id = "boldvoice/nemotron-phoneme-ipa-v1"

    print("Downloading model files...")
    checkpoint_path = hf_hub_download(repo_id=repo_id, filename="checkpoint.ckpt")
    tokenizer_model = hf_hub_download(repo_id=repo_id, filename="tokenizer/tokenizer.model")
    vocab_txt = hf_hub_download(repo_id=repo_id, filename="tokenizer/vocab.txt")

    print("Setting up tokenizer...")
    # change_vocabulary() is given a directory, so both files are gathered
    # side by side in one local folder.
    tokenizer_dir = Path("tokenizer")
    tokenizer_dir.mkdir(exist_ok=True)
    shutil.copy(tokenizer_model, tokenizer_dir / "tokenizer.model")
    shutil.copy(vocab_txt, tokenizer_dir / "vocab.txt")

    print("Loading base model...")
    # Load base model and change vocabulary
    model = ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
    model.change_vocabulary(new_tokenizer_dir=str(tokenizer_dir), new_tokenizer_type="bpe")

    print("Loading fine-tuned weights...")
    # Using weights_only=False as per model card instructions.
    # NOTE(security): weights_only=False allows arbitrary objects to be
    # unpickled from the checkpoint; this is only acceptable while the source
    # repository is trusted.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    state_dict = checkpoint.get("state_dict", checkpoint)

    # Drop a single leading "model." from parameter names so they line up with
    # the names the bare model uses.
    prefix = "model."
    cleaned_state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

    # Non-strict loading tolerates mismatches, so surface them: otherwise a
    # checkpoint whose keys do not match would leave the base weights in place
    # without any indication.
    load_result = model.load_state_dict(cleaned_state_dict, strict=False)
    missing_keys = list(getattr(load_result, "missing_keys", None) or [])
    unexpected_keys = list(getattr(load_result, "unexpected_keys", None) or [])
    if missing_keys:
        print(
            f"Warning: {len(missing_keys)} model parameter(s) were not found in the "
            f"checkpoint (first few: {missing_keys[:5]})"
        )
    if unexpected_keys:
        print(
            f"Warning: {len(unexpected_keys)} checkpoint parameter(s) were not used by "
            f"the model (first few: {unexpected_keys[:5]})"
        )

    # Disable CUDA graphs for inference
    if hasattr(model, 'decoding') and model.decoding is not None:
        decoding_cfg = model.cfg.decoding
        decoding_cfg.greedy.loop_labels = False
        decoding_cfg.greedy.use_cuda_graph_decoder = False
        model.change_decoding_strategy(decoding_cfg)

    # Configure streaming latency (Lowest latency option as default)
    model.encoder.set_default_att_context_size([70, 0])

    # Move to GPU when one is available and set eval mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    return model

# Module-wide model slot. Starts as None; it is filled either by the
# __main__ startup code or by predict() the first time it finds it unset.
model = None

@spaces.GPU
def predict(audio_file):
    """Transcribe one audio file and return the result as text.

    Args:
        audio_file: Path of the audio file to transcribe. Any falsy value
            (``None``, ``""``) is treated as "no input".

    Returns:
        The transcription with SentencePiece ``▁`` markers removed and
        surrounding whitespace stripped. An empty string when there is no
        input or the model returns no predictions. If transcription raises,
        a string starting with ``"Error during transcription: "``.

    Raises:
        Whatever ``setup_model()`` raises when the model has to be loaded
        here and loading fails; that call is outside the ``try`` block.
    """
    global model

    # Check the input first so an empty submission does not trigger a full
    # model download/load just to return an empty string.
    if not audio_file:
        return ""

    if model is None:
        model = setup_model()

    try:
        # Transcribe audio
        predictions = model.transcribe([audio_file])
        if not predictions:
            return ""

        # One file went in, so only the first prediction is used.
        pred = predictions[0]
        text = pred.text if hasattr(pred, 'text') else str(pred)

        # Remove SentencePiece artifacts
        return text.replace("▁", "").strip()
    except Exception as e:
        # UI boundary: the user gets the message as text, while the full
        # traceback goes to the server log so the failure can be debugged.
        import traceback

        traceback.print_exc()
        return f"Error during transcription: {str(e)}"

# Gradio UI: a single audio input wired to predict(), with the result shown
# in a text box. The audio component is configured with type="filepath".
iface = gr.Interface(
    title="Nemotron Phoneme IPA Recognition",
    description="Nemotron-based phoneme recognition model fine-tuned for raw IPA transcription with full diacritics preserved. Based on boldvoice/nemotron-phoneme-ipa-v1.",
    fn=predict,
    inputs=gr.Audio(label="Input Audio", type="filepath"),
    outputs=gr.Textbox(label="IPA Transcription"),
)

if __name__ == "__main__":
    # Best-effort warm-up: try to have the model ready before the UI starts.
    # A failure here is only reported; predict() loads the model itself when
    # it finds the global still unset.
    try:
        model = setup_model()
        print("Model loaded successfully.")
    except Exception as exc:
        print(f"Model loading will happen on first request. Error: {exc}")

    # Listen on all interfaces, port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    iface.launch(**launch_options)