File size: 3,989 Bytes
7fcc44f
 
 
 
 
3b991b8
 
7fcc44f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b991b8
7fcc44f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804294b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import gradio as gr
import torch
import shutil
from pathlib import Path
from huggingface_hub import hf_hub_download
import spaces


# NeMo import is guarded so this module can still be imported when the toolkit
# is missing: ASRModel is then bound to None, and setup_model() raises
# ImportError once the model is actually needed.
try:
    from nemo.collections.asr.models import ASRModel
except ImportError:
    print("Warning: NeMo toolkit not found. Please ensure it is installed via requirements.txt")
    ASRModel = None

def setup_model():
    """Build the fine-tuned phoneme recognition model, ready for inference.

    Downloads the checkpoint and tokenizer files from the
    ``boldvoice/nemotron-phoneme-ipa-v1`` Hub repository, copies the tokenizer
    files into a local ``tokenizer/`` directory (relative to the current
    working directory), loads the ``nvidia/nemotron-speech-streaming-en-0.6b``
    base model, swaps in the new vocabulary, overlays the fine-tuned weights,
    and moves the model to CUDA when available (CPU otherwise).

    Returns:
        The configured model, in eval mode.

    Raises:
        ImportError: If the NeMo toolkit could not be imported.
    """
    if ASRModel is None:
        raise ImportError("NeMo toolkit is required but not installed.")

    repo_id = "boldvoice/nemotron-phoneme-ipa-v1"
    key_prefix = "model."

    print("Downloading model files...")
    checkpoint_path = hf_hub_download(repo_id=repo_id, filename="checkpoint.ckpt")
    tokenizer_model = hf_hub_download(repo_id=repo_id, filename="tokenizer/tokenizer.model")
    vocab_txt = hf_hub_download(repo_id=repo_id, filename="tokenizer/vocab.txt")

    print("Setting up tokenizer...")
    # change_vocabulary() is given a directory, so both files are gathered
    # into one local folder.
    tokenizer_dir = Path("tokenizer")
    tokenizer_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(tokenizer_model, tokenizer_dir / "tokenizer.model")
    shutil.copy(vocab_txt, tokenizer_dir / "vocab.txt")

    print("Loading base model...")
    # Load base model and change vocabulary
    model = ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
    model.change_vocabulary(new_tokenizer_dir=str(tokenizer_dir), new_tokenizer_type="bpe")

    print("Loading fine-tuned weights...")
    # Using weights_only=False as per model card instructions.
    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, so
    # this is only acceptable because the checkpoint comes from a trusted repo.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    state_dict = checkpoint.get("state_dict", checkpoint)

    # Drop a single leading "model." from keys that carry it; other keys are
    # passed through unchanged.
    cleaned_state_dict = {
        (key[len(key_prefix):] if key.startswith(key_prefix) else key): value
        for key, value in state_dict.items()
    }

    # strict=False tolerates key mismatches, so surface them instead of
    # silently running with weights that were never applied.
    load_result = model.load_state_dict(cleaned_state_dict, strict=False)
    missing_keys = list(getattr(load_result, "missing_keys", None) or [])
    unexpected_keys = list(getattr(load_result, "unexpected_keys", None) or [])
    if missing_keys or unexpected_keys:
        print(
            "Warning: checkpoint did not match the model exactly "
            f"({len(missing_keys)} missing, {len(unexpected_keys)} unexpected keys)."
        )
        if missing_keys:
            print(f"  Missing keys (first 5): {missing_keys[:5]}")
        if unexpected_keys:
            print(f"  Unexpected keys (first 5): {unexpected_keys[:5]}")

    # Disable CUDA graphs for inference
    if getattr(model, "decoding", None) is not None:
        decoding_cfg = model.cfg.decoding
        decoding_cfg.greedy.loop_labels = False
        decoding_cfg.greedy.use_cuda_graph_decoder = False
        model.change_decoding_strategy(decoding_cfg)

    # Configure streaming latency (Lowest latency option as default)
    model.encoder.set_default_att_context_size([70, 0])

    # Move to GPU and set eval mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    return model

# Shared model instance: None until loaded at startup or by the first predict() call
model = None

@spaces.GPU
def predict(audio_file):
    """Transcribe one audio file and return the cleaned transcription text.

    Args:
        audio_file: Value supplied by the Gradio audio input (configured with
            ``type="filepath"``). A falsy value means no audio was provided.

    Returns:
        The first prediction's text with SentencePiece "▁" markers removed and
        surrounding whitespace stripped; an empty string when there is no
        audio or no prediction; or a message starting with
        "Error during transcription:" if transcription raises.

    Raises:
        Whatever ``setup_model()`` raises if the model has to be loaded here
        and loading fails (loading is deliberately outside the try block).
    """
    global model

    # Nothing to transcribe: return before paying for a model download/load.
    if not audio_file:
        return ""

    # Lazy load for the case where startup loading was skipped or failed.
    if model is None:
        model = setup_model()

    try:
        predictions = model.transcribe([audio_file])
        if not predictions:
            return ""

        first = predictions[0]
        # Results may be objects exposing `.text` or plain values.
        text = first.text if hasattr(first, "text") else str(first)

        # Remove SentencePiece artifacts
        return text.replace("▁", "").strip()
    except Exception as e:
        # Report the failure in the output box rather than crashing the UI.
        return f"Error during transcription: {e}"

# Create Gradio interface: one audio input, handed to predict() as a file path
# (type="filepath"), and one text box that displays the string predict() returns.
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath", label="Input Audio"),
    outputs=gr.Textbox(label="IPA Transcription"),
    title="Nemotron Phoneme IPA Recognition",
    description="Nemotron-based phoneme recognition model fine-tuned for raw IPA transcription with full diacritics preserved. Based on boldvoice/nemotron-phoneme-ipa-v1.",
)

if __name__ == "__main__":
    # Best-effort warm-up: try to have the model ready before the first
    # request. On failure `model` stays None and predict() loads it lazily.
    try:
        model = setup_model()
    except Exception as e:
        print(f"Model loading will happen on first request. Error: {e}")
    else:
        print("Model loaded successfully.")

    # Listen on all interfaces, port 7860, without a public share link.
    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)