Spaces: Sleeping
| import gradio as gr | |
| import torch | |
| import shutil | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
| import spaces | |
| # NeMo import | |
| try: | |
| from nemo.collections.asr.models import ASRModel | |
| except ImportError: | |
| print("Warning: NeMo toolkit not found. Please ensure it is installed via requirements.txt") | |
| ASRModel = None | |
def setup_model():
    """Build the phoneme-recognition model and return it ready for inference.

    Steps, in order:
      1. Download the fine-tuned checkpoint and the tokenizer files from the
         Hugging Face Hub.
      2. Copy the tokenizer files into a local ``tokenizer`` directory
         (relative to the current working directory).
      3. Load the pretrained base model and swap its vocabulary for the
         downloaded BPE tokenizer.
      4. Overlay the fine-tuned weights, stripping a leading ``model.`` from
         checkpoint keys.
      5. Turn off ``loop_labels`` and the CUDA-graph decoder, set the encoder
         attention context size, move the model to GPU when one is available
         (CPU otherwise) and switch it to eval mode.

    Returns:
        The configured model, in eval mode, on the selected device.

    Raises:
        ImportError: If NeMo could not be imported (``ASRModel`` is ``None``).
    """
    if ASRModel is None:
        raise ImportError("NeMo toolkit is required but not installed.")

    repo_id = "boldvoice/nemotron-phoneme-ipa-v1"
    key_prefix = "model."

    print("Downloading model files...")
    checkpoint_path = hf_hub_download(repo_id=repo_id, filename="checkpoint.ckpt")
    tokenizer_model = hf_hub_download(repo_id=repo_id, filename="tokenizer/tokenizer.model")
    vocab_txt = hf_hub_download(repo_id=repo_id, filename="tokenizer/vocab.txt")

    print("Setting up tokenizer...")
    # change_vocabulary() takes a directory, so gather both files in one place.
    tokenizer_dir = Path("tokenizer")
    tokenizer_dir.mkdir(exist_ok=True)
    shutil.copy(tokenizer_model, tokenizer_dir / "tokenizer.model")
    shutil.copy(vocab_txt, tokenizer_dir / "vocab.txt")

    print("Loading base model...")
    model = ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
    model.change_vocabulary(new_tokenizer_dir=str(tokenizer_dir), new_tokenizer_type="bpe")

    print("Loading fine-tuned weights...")
    # SECURITY NOTE: weights_only=False performs full unpickling of the
    # checkpoint (kept as per the model card instructions). Only use this
    # with a checkpoint from a source you trust.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    # The weights may be nested under "state_dict" or be the top-level mapping.
    state_dict = checkpoint.get("state_dict", checkpoint)
    # Drop a single leading "model." so keys line up with the bare model.
    cleaned_state_dict = {
        (key[len(key_prefix):] if key.startswith(key_prefix) else key): value
        for key, value in state_dict.items()
    }
    load_result = model.load_state_dict(cleaned_state_dict, strict=False)

    # strict=False tolerates key mismatches; report them so a naming mismatch
    # does not silently leave base-model weights in place.
    missing_keys = getattr(load_result, "missing_keys", None) or []
    unexpected_keys = getattr(load_result, "unexpected_keys", None) or []
    if missing_keys:
        print(f"Warning: {len(missing_keys)} model parameter(s) were not found in the checkpoint.")
    if unexpected_keys:
        print(f"Warning: {len(unexpected_keys)} checkpoint key(s) were not used by the model.")

    # Disable CUDA graphs for inference
    if hasattr(model, "decoding") and model.decoding is not None:
        decoding_cfg = model.cfg.decoding
        decoding_cfg.greedy.loop_labels = False
        decoding_cfg.greedy.use_cuda_graph_decoder = False
        model.change_decoding_strategy(decoding_cfg)

    # Configure streaming latency (lowest latency option as default)
    model.encoder.set_default_att_context_size([70, 0])

    # Move to GPU when available and set eval mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    return model
# Model shared by all requests. Stays None until it is loaded, either by
# predict() on first use or by the startup code when run as a script.
model = None


def predict(audio_file):
    """Transcribe one audio file to an IPA string.

    Args:
        audio_file: Path to the audio file, or a falsy value (``None`` or
            ``""``) when nothing was submitted.

    Returns:
        The transcription with SentencePiece ``▁`` markers removed and
        surrounding whitespace stripped; ``""`` when there is no input or no
        prediction; or an ``"Error during transcription: ..."`` message when
        transcription raises.
    """
    global model

    # Check the input before touching the model, so an empty submission
    # never triggers (or fails on) the expensive model load.
    if not audio_file:
        return ""

    if model is None:
        model = setup_model()

    try:
        predictions = model.transcribe([audio_file])
        if not predictions:
            return ""

        pred = predictions[0]
        # Results may be objects carrying a ``text`` attribute or plain values.
        text = pred.text if hasattr(pred, "text") else str(pred)
        # Remove SentencePiece artifacts
        return text.replace("▁", "").strip()
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        return f"Error during transcription: {str(e)}"
# Gradio UI: one audio input, handed to predict() as a file path, and one
# text box showing the resulting transcription.
_audio_input = gr.Audio(type="filepath", label="Input Audio")
_ipa_output = gr.Textbox(label="IPA Transcription")

iface = gr.Interface(
    fn=predict,
    inputs=_audio_input,
    outputs=_ipa_output,
    title="Nemotron Phoneme IPA Recognition",
    description=(
        "Nemotron-based phoneme recognition model fine-tuned for raw IPA "
        "transcription with full diacritics preserved. "
        "Based on boldvoice/nemotron-phoneme-ipa-v1."
    ),
)
if __name__ == "__main__":
    # Try to load the model eagerly; if that fails, leave `model` unset so
    # predict() retries the load on the first request.
    try:
        model = setup_model()
    except Exception as e:
        print(f"Model loading will happen on first request. Error: {e}")
    else:
        print("Model loaded successfully.")

    # Listen on all interfaces, port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    iface.launch(**launch_options)