ipa-recognizer / app.py
gagndeep's picture
Update app.py
3b991b8 verified
Raw
History Blame Contribute Delete
3.99 kB
import gradio as gr
import torch
import shutil
from pathlib import Path
from huggingface_hub import hf_hub_download
import spaces
# NeMo is treated as optional at import time: when it is missing we only warn
# here and leave ASRModel as None, so setup_model() can raise a clear error.
ASRModel = None
try:
    from nemo.collections.asr.models import ASRModel
except ImportError:
    print("Warning: NeMo toolkit not found. Please ensure it is installed via requirements.txt")
def setup_model():
    """Download, assemble and return the fine-tuned IPA phoneme ASR model.

    Steps performed:
      1. Fetch the checkpoint and tokenizer files from the Hugging Face Hub.
      2. Copy the tokenizer files into a local ``tokenizer/`` directory.
      3. Load the base NeMo model and switch it to the new tokenizer.
      4. Load the fine-tuned weights non-strictly and report any key mismatch.
      5. Adjust the greedy decoding config, set the encoder attention context,
         move the model to CUDA (when available) and put it in eval mode.

    Returns:
        The configured model, in eval mode, on CUDA if available, else CPU.

    Raises:
        ImportError: If the NeMo toolkit could not be imported.
    """
    if ASRModel is None:
        raise ImportError("NeMo toolkit is required but not installed.")

    repo_id = "boldvoice/nemotron-phoneme-ipa-v1"

    print("Downloading model files...")
    checkpoint_path = hf_hub_download(repo_id=repo_id, filename="checkpoint.ckpt")
    tokenizer_model = hf_hub_download(repo_id=repo_id, filename="tokenizer/tokenizer.model")
    vocab_txt = hf_hub_download(repo_id=repo_id, filename="tokenizer/vocab.txt")

    print("Setting up tokenizer...")
    # change_vocabulary() is given a directory, so gather both tokenizer files
    # (downloaded into the hub cache) under one local folder.
    tokenizer_dir = Path("tokenizer")
    tokenizer_dir.mkdir(exist_ok=True)
    shutil.copy(tokenizer_model, tokenizer_dir / "tokenizer.model")
    shutil.copy(vocab_txt, tokenizer_dir / "vocab.txt")

    print("Loading base model...")
    # Load base model and change vocabulary
    model = ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
    model.change_vocabulary(new_tokenizer_dir=str(tokenizer_dir), new_tokenizer_type="bpe")

    print("Loading fine-tuned weights...")
    # Using weights_only=False as per model card instructions.
    # SECURITY NOTE: weights_only=False performs full unpickling, which can
    # execute arbitrary code embedded in the file. This is acceptable only
    # while the checkpoint comes from a trusted repository.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    state_dict = checkpoint.get("state_dict", checkpoint)

    # Drop a single leading "model." prefix so the checkpoint keys line up
    # with the parameter names of the model loaded above.
    prefix = "model."
    cleaned_state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

    # strict=False tolerates key mismatches; surface them instead of silently
    # ending up with partially (or not at all) fine-tuned weights.
    load_result = model.load_state_dict(cleaned_state_dict, strict=False)
    missing_keys = list(getattr(load_result, "missing_keys", None) or [])
    unexpected_keys = list(getattr(load_result, "unexpected_keys", None) or [])
    if missing_keys or unexpected_keys:
        print(
            "Warning: checkpoint did not match the model exactly: "
            f"{len(missing_keys)} missing key(s), "
            f"{len(unexpected_keys)} unexpected key(s)."
        )

    # Disable CUDA graphs for inference
    if getattr(model, "decoding", None) is not None:
        decoding_cfg = model.cfg.decoding
        decoding_cfg.greedy.loop_labels = False
        decoding_cfg.greedy.use_cuda_graph_decoder = False
        model.change_decoding_strategy(decoding_cfg)

    # Configure streaming latency (lowest latency option as default)
    model.encoder.set_default_att_context_size([70, 0])

    # Move to GPU when one is available and set eval mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    return model
# Process-wide model cache: None until setup_model() has succeeded once.
model = None


@spaces.GPU
def predict(audio_file):
    """Transcribe an audio file into an IPA string.

    Args:
        audio_file: Path to the audio file to transcribe. Falsy (``None`` or
            an empty string) when no audio was supplied.

    Returns:
        The IPA transcription with SentencePiece "▁" markers removed and
        surrounding whitespace stripped. Returns "" when no audio was given
        or the model produced no predictions, and a string starting with
        "Error during transcription: " when transcription raised.
    """
    global model

    # Nothing to transcribe: bail out before paying for a model download/load.
    if not audio_file:
        return ""

    # Lazy initialisation for the case where start-up loading did not happen.
    if model is None:
        model = setup_model()

    try:
        # Transcribe audio; the result is indexed per input file.
        predictions = model.transcribe([audio_file])
        if not predictions:
            return ""

        pred = predictions[0]
        # Results may be objects exposing ``.text`` or plain values.
        text = pred.text if hasattr(pred, "text") else str(pred)

        # Remove SentencePiece artifacts
        return text.replace("▁", "").strip()
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error during transcription: {str(e)}"
# Gradio UI definition: a single audio input (handed to predict() as a file
# path) mapped to a single text box holding the IPA transcription.
iface = gr.Interface(
    predict,
    gr.Audio(type="filepath", label="Input Audio"),
    gr.Textbox(label="IPA Transcription"),
    title="Nemotron Phoneme IPA Recognition",
    description=(
        "Nemotron-based phoneme recognition model fine-tuned for raw IPA "
        "transcription with full diacritics preserved. "
        "Based on boldvoice/nemotron-phoneme-ipa-v1."
    ),
)
if __name__ == "__main__":
    # Best-effort warm-up: try to have the model ready before serving. On
    # failure `model` stays None and predict() loads it on the first request.
    try:
        model = setup_model()
        print("Model loaded successfully.")
    except Exception as exc:
        print(f"Model loading will happen on first request. Error: {exc}")

    # Listen on all interfaces on port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    iface.launch(**launch_options)