"""Gradio app that transcribes audio to IPA with a fine-tuned NeMo ASR model.

The base model is fetched with ``ASRModel.from_pretrained``, its vocabulary is
swapped for the tokenizer shipped in the fine-tuned repository, and the
fine-tuned checkpoint weights are loaded on top.
"""

import shutil
import traceback
from pathlib import Path

import gradio as gr
import spaces
import torch
from huggingface_hub import hf_hub_download

# NeMo import
try:
    from nemo.collections.asr.models import ASRModel
except ImportError:
    print("Warning: NeMo toolkit not found. Please ensure it is installed via requirements.txt")
    ASRModel = None

# Hub repository holding the fine-tuned checkpoint and its tokenizer files.
FINETUNED_REPO_ID = "boldvoice/nemotron-phoneme-ipa-v1"
# Pretrained model whose architecture the fine-tuned weights are loaded into.
BASE_MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
# Local directory (relative to the working directory) handed to NeMo.
TOKENIZER_DIR = Path("tokenizer")
# Checkpoint keys carry this prefix; it is stripped before loading.
CHECKPOINT_KEY_PREFIX = "model."
# Streaming attention context (lowest latency option as default).
DEFAULT_ATT_CONTEXT_SIZE = [70, 0]
# SentencePiece marker character removed from the decoded text.
SENTENCEPIECE_MARKER = "▁"


def _strip_key_prefix(state_dict, prefix=CHECKPOINT_KEY_PREFIX):
    """Return a copy of ``state_dict`` with one leading ``prefix`` removed from each key."""
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }


def _prediction_to_text(prediction):
    """Convert one transcription result (object with ``.text`` or any value) to clean text."""
    raw = prediction.text if hasattr(prediction, "text") else str(prediction)
    # Remove SentencePiece artifacts
    return raw.replace(SENTENCEPIECE_MARKER, "").strip()


def setup_model():
    """Download, assemble and return the fine-tuned model in eval mode.

    Returns:
        The NeMo model, moved to CUDA when available (CPU otherwise).

    Raises:
        ImportError: If the NeMo toolkit could not be imported.
    """
    if ASRModel is None:
        raise ImportError("NeMo toolkit is required but not installed.")

    print("Downloading model files...")
    checkpoint_path = hf_hub_download(repo_id=FINETUNED_REPO_ID, filename="checkpoint.ckpt")
    tokenizer_model = hf_hub_download(repo_id=FINETUNED_REPO_ID, filename="tokenizer/tokenizer.model")
    vocab_txt = hf_hub_download(repo_id=FINETUNED_REPO_ID, filename="tokenizer/vocab.txt")

    print("Setting up tokenizer...")
    # Both tokenizer files are gathered into one directory for change_vocabulary.
    TOKENIZER_DIR.mkdir(exist_ok=True)
    shutil.copy(tokenizer_model, TOKENIZER_DIR / "tokenizer.model")
    shutil.copy(vocab_txt, TOKENIZER_DIR / "vocab.txt")

    print("Loading base model...")
    # Load base model and change vocabulary
    model = ASRModel.from_pretrained(BASE_MODEL_ID)
    model.change_vocabulary(new_tokenizer_dir=str(TOKENIZER_DIR), new_tokenizer_type="bpe")

    print("Loading fine-tuned weights...")
    # Using weights_only=False as per model card instructions.
    # SECURITY: weights_only=False unpickles arbitrary objects from the
    # downloaded file; only keep this while the source repository is trusted.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    state_dict = checkpoint.get("state_dict", checkpoint)
    load_result = model.load_state_dict(_strip_key_prefix(state_dict), strict=False)

    # strict=False hides key mismatches, so report them instead of silently
    # continuing with partially loaded weights.
    missing_keys = getattr(load_result, "missing_keys", None) or []
    unexpected_keys = getattr(load_result, "unexpected_keys", None) or []
    if missing_keys:
        print(f"Warning: {len(missing_keys)} model keys missing from checkpoint, e.g. {missing_keys[:5]}")
    if unexpected_keys:
        print(f"Warning: {len(unexpected_keys)} unexpected checkpoint keys ignored, e.g. {unexpected_keys[:5]}")

    # Disable CUDA graphs for inference
    if getattr(model, "decoding", None) is not None:
        decoding_cfg = model.cfg.decoding
        decoding_cfg.greedy.loop_labels = False
        decoding_cfg.greedy.use_cuda_graph_decoder = False
        model.change_decoding_strategy(decoding_cfg)

    # Configure streaming latency (Lowest latency option as default)
    model.encoder.set_default_att_context_size(DEFAULT_ATT_CONTEXT_SIZE)

    # Move to GPU and set eval mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    return model


# Global model variable; populated at startup or lazily on the first request.
model = None


@spaces.GPU
def predict(audio_file) -> str:
    """Transcribe one audio file to an IPA string.

    Args:
        audio_file: Path of the audio file supplied by the Gradio input, or a
            falsy value when nothing was provided.

    Returns:
        The cleaned transcription, ``""`` when there is no input or no
        prediction, or an ``"Error during transcription: ..."`` message when
        transcription raises.
    """
    global model
    if model is None:
        model = setup_model()

    if not audio_file:
        return ""

    try:
        # Transcribe audio; output is a list with one entry per input file.
        predictions = model.transcribe([audio_file])
        if not predictions:
            return ""
        return _prediction_to_text(predictions[0])
    except Exception as e:
        # The UI only shows the message below, so keep the full traceback in
        # the server log for debugging.
        traceback.print_exc()
        return f"Error during transcription: {e}"


# Create Gradio interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath", label="Input Audio"),
    outputs=gr.Textbox(label="IPA Transcription"),
    title="Nemotron Phoneme IPA Recognition",
    description="Nemotron-based phoneme recognition model fine-tuned for raw IPA transcription with full diacritics preserved. Based on boldvoice/nemotron-phoneme-ipa-v1.",
)

if __name__ == "__main__":
    # Attempt to load model at startup; on failure predict() retries lazily.
    try:
        model = setup_model()
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Model loading will happen on first request. Error: {e}")

    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )