Spaces:
Sleeping
Sleeping
Commit ·
9496a85
1
Parent(s): 2a517e7
Add inference model + Gradio app
Browse files- app.py +24 -78
- model_inference.py +100 -49
- requirements.txt +2 -6
app.py
CHANGED
|
@@ -1,95 +1,41 @@
|
|
| 1 |
# app.py
|
| 2 |
import gradio as gr
|
| 3 |
import os
|
| 4 |
-
import numpy as np
|
| 5 |
-
import soundfile as sf
|
| 6 |
-
import tempfile
|
| 7 |
from model_inference import NetTALKWrapper
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
|
| 12 |
-
#
|
| 13 |
-
|
| 14 |
-
model = NetTALKWrapper(
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
def synthesize_gtts(phoneme_text):
    """Synthesize speech for *phoneme_text* with gTTS and return an audio file path.

    gTTS only emits mp3, so we synthesize to mp3 first and convert to wav via
    pydub when available; otherwise the mp3 path is returned (Gradio accepts
    mp3 as audio). The caller owns (and should eventually delete) the file.
    """
    from gtts import gTTS
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    # Close the handle so pydub can write to the path (required on Windows,
    # where an open NamedTemporaryFile cannot be reopened by another writer).
    tmp.close()
    mp3_tmp = tmp.name + ".mp3"
    tts = gTTS(phoneme_text, lang="en")
    tts.save(mp3_tmp)
    try:
        import pydub
        audio = pydub.AudioSegment.from_mp3(mp3_tmp)
        audio.export(tmp.name, format="wav")
    except Exception:
        # fallback: return mp3 (Gradio accepts mp3 as audio)
        return mp3_tmp
    # Conversion succeeded; drop the intermediate mp3 so temp files don't pile up.
    os.remove(mp3_tmp)
    return tmp.name
| 33 |
-
|
| 34 |
-
# optional: Coqui TTS (phoneme-aware) - heavier but can take ARPAbet inputs
def synthesize_coqui(arpabet):
    """Synthesize *arpabet* text with Coqui TTS and return a wav file path.

    Raises RuntimeError if the optional `TTS` package is not installed.
    NOTE(review): the model name below is a placeholder and `phoneme_input=False`
    means the input is treated as plain text, not phonemes — confirm against the
    chosen model's documentation before relying on phoneme-aware synthesis.
    """
    # This requires the `TTS` package and an appropriate model that accepts phoneme input.
    try:
        from TTS.api import TTS
    except Exception as e:
        raise RuntimeError("TTS package not installed or failed to import.") from e

    # choose a model name you installed / that exists; example placeholder:
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)
    # Some TTS models accept `phoneme` argument or `phoneme_input=True`. Check the model docs.
    wav = tts.tts(arpabet, speaker=None, phoneme_input=False)
    # wav is a numpy array and sample rate accessible via tts.synthesizer.output_sample_rate
    sr = tts.synthesizer.output_sample_rate if hasattr(tts.synthesizer, "output_sample_rate") else 22050
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(tmp.name, wav, sr)
    return tmp.name
| 51 |
-
|
| 52 |
-
def predict_and_speak(word):
    """Predict phonemes for *word* and best-effort synthesize audio.

    Returns (phoneme_string_or_error_message, audio_path_or_None). Synthesis
    failures are swallowed deliberately so the phoneme prediction still shows.
    """
    if not word or not word.strip():
        return "Please enter a word", None
    phonemes = model.predict(word)
    audio_path = None
    # Try preferred backend
    # NOTE(review): TTS_BACKEND is defined elsewhere in this file (not visible
    # in this span) — verify it exists before calling.
    try:
        if TTS_BACKEND == "coqui":
            audio_path = synthesize_coqui(phonemes)
        else:
            audio_path = synthesize_gtts(phonemes)
    except Exception as e:
        # If synth fails, still return phonemes and a None audio
        print("Synthesis failed:", e)
        audio_path = None
    # gr.Audio accepts: filename (wav/mp3), numpy array, or (np, sr)
    return phonemes, audio_path
| 72 |
-
|
| 73 |
-
# ---- Gradio UI ----
|
| 74 |
css = """
|
| 75 |
-
|
| 76 |
-
|
| 77 |
"""
|
| 78 |
|
| 79 |
-
with gr.Blocks(css=css, theme=gr.themes.
|
| 80 |
-
gr.Markdown("# 🧠 NetTALK
|
| 81 |
-
gr.Markdown("Enter a word
|
| 82 |
-
|
| 83 |
with gr.Row():
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
run_btn.click(fn=predict_and_speak, inputs=[word_in], outputs=[phoneme_out, audio_out])
|
| 91 |
|
| 92 |
-
|
| 93 |
|
| 94 |
if __name__ == "__main__":
|
| 95 |
-
demo.launch()
|
|
|
|
| 1 |
# app.py
import gradio as gr
import os
from model_inference import NetTALKWrapper

# Optional: set env var NETTALK_STATE_DICT to different filename if needed
STATE_DICT = os.environ.get("NETTALK_STATE_DICT", "nettalk_state_dict.pt")

# Instantiate the model once at import time so every request reuses it.
try:
    model = NetTALKWrapper(state_dict_path=STATE_DICT)
except Exception as e:
    # Gradio will show this on startup logs — helpful for debugging.
    # Chain with `from e` so the original failure keeps its traceback.
    raise RuntimeError(f"Failed to load model: {e}") from e
|
| 15 |
|
| 16 |
+
def predict_phonemes(word: str):
    """Predict ARPAbet phonemes for *word*.

    Returns a (phoneme_string, audio) pair; the audio slot is always None for
    now so a TTS backend can be wired in later without changing the signature.
    """
    # Guard clause: reject empty or whitespace-only input up front.
    if not (word and word.strip()):
        return "Please enter a word", None
    return model.predict_string(word), None
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
PAGE_CSS = """
.gradio-container { max-width: 900px; margin: auto; }
body { background: linear-gradient(135deg,#071024,#081226); color: #e6eef8; }
"""

# ---- Gradio UI ----
with gr.Blocks(css=PAGE_CSS, theme=gr.themes.Base()) as demo:
    gr.Markdown("# 🧠 NetTALK phoneme predictor")
    gr.Markdown("Enter a word and get ARPAbet phonemes predicted by the trained model.")
    with gr.Row():
        word_box = gr.Textbox(label="Enter word", placeholder="example: 'computer'", lines=1)
        predict_btn = gr.Button("Predict")
    phoneme_box = gr.Textbox(label="Predicted ARPAbet Phonemes")
    # Hidden placeholder so audio output can be enabled later without
    # changing the callback's (phonemes, audio) return contract.
    out_audio = gr.Audio(label="Synthesized audio (optional)", visible=False)

    predict_btn.click(predict_phonemes, inputs=word_box, outputs=[phoneme_box, out_audio])

# Bind to all interfaces on port 7860 — the address Hugging Face Spaces expects.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
model_inference.py
CHANGED
|
@@ -1,66 +1,117 @@
|
|
| 1 |
# model_inference.py
|
|
|
|
| 2 |
import torch
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
class NetTALKWrapper:
|
| 6 |
-
def __init__(self,
|
| 7 |
-
# pick device automatically
|
| 8 |
if device is None:
|
| 9 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 10 |
self.device = device
|
| 11 |
|
| 12 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
try:
|
| 14 |
-
|
|
|
|
| 15 |
except Exception as e:
|
| 16 |
-
#
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def __init__(self):
|
| 22 |
-
super().__init__()
|
| 23 |
-
self.dummy = nn.Linear(10, 10)
|
| 24 |
-
def forward(self, x):
|
| 25 |
-
return torch.randn(1, 20) # placeholder
|
| 26 |
-
m = DummyModel()
|
| 27 |
-
sd = torch.load(model_path, map_location="cpu")
|
| 28 |
-
try:
|
| 29 |
-
m.load_state_dict(sd)
|
| 30 |
-
self.model = m.to(self.device)
|
| 31 |
-
except Exception:
|
| 32 |
-
raise RuntimeError("Could not load model. Please update model_inference.py to use your architecture.")
|
| 33 |
|
| 34 |
self.model.eval()
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
""
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
return torch.from_numpy(arr).to(self.device).float()
|
| 49 |
-
|
| 50 |
-
def decode_to_arpabet(self, model_output):
|
| 51 |
-
"""
|
| 52 |
-
Convert model raw output to an ARPAbet string (e.g., "HH AH0 L OW1").
|
| 53 |
-
Replace this with your decoder logic (argmax, beam search, label mapping, etc).
|
| 54 |
-
"""
|
| 55 |
-
# PLACEHOLDER: just return dummy tokens
|
| 56 |
-
return "AH0 N T EH1 R P AH0 B EH1 T"
|
| 57 |
-
|
| 58 |
-
def predict(self, word: str):
|
| 59 |
-
# basic sanitization
|
| 60 |
word = word.strip()
|
| 61 |
if not word:
|
| 62 |
-
return
|
| 63 |
-
|
| 64 |
with torch.no_grad():
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# model_inference.py
import os
import torch
import torch.nn as nn
import numpy as np

# Window and hidden sizes must match your training config
WINDOW_SIZE = 7    # characters of context fed to the classifier per prediction
HIDDEN_SIZE = 128  # embedding dim and hidden-layer width

# Path to CMU dict in the repo root (must be present)
CMUDICT_PATH = "cmudict.dict.txt"
# Weights file; override with the NETTALK_STATE_DICT env var.
STATE_DICT_PATH = os.environ.get("NETTALK_STATE_DICT", "nettalk_state_dict.pt")
|
| 14 |
+
|
| 15 |
+
# --- 1) Rebuild vocab from CMUdict (same method you used in notebook) ---
def build_vocab(cmudict_path=CMUDICT_PATH):
    """Build character and phoneme index mappings from a CMU pronouncing dictionary.

    Each non-comment line is expected as "WORD PH1 PH2 ...". Returns
    (char_to_idx, idx_to_char, phone_to_idx, idx_to_phone); character index 0
    is reserved for <PAD>/unknown.
    NOTE(review): this must reproduce the notebook's vocab construction exactly,
    or index<->phone mappings will not match the trained weights — confirm.
    """
    words = []
    phones_all = []
    with open(cmudict_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if line.strip() and not line.startswith(";;;"):
                parts = line.strip().split()
                # Skip malformed entries (a word with no pronunciation) instead
                # of letting them pollute the vocab with empty phone lists.
                if len(parts) < 2:
                    continue
                words.append(parts[0])
                phones_all.append(parts[1:])

    # character vocab from words (include space for padding)
    char_vocab = set("".join(words))
    char_vocab.add(" ")  # ensure space exists (used to pad word windows)
    char_to_idx = {c: i + 1 for i, c in enumerate(sorted(char_vocab))}  # reserve 0 for unknown/pad
    char_to_idx["<PAD>"] = 0
    idx_to_char = {i: c for c, i in char_to_idx.items()}

    phone_vocab = set(phone for p_list in phones_all for phone in p_list)
    phone_to_idx = {p: i for i, p in enumerate(sorted(phone_vocab))}
    idx_to_phone = {i: p for p, i in phone_to_idx.items()}

    return char_to_idx, idx_to_char, phone_to_idx, idx_to_phone
|
| 40 |
+
|
| 41 |
+
# Build the vocab once at import time; these globals are shared by the wrapper
# below and must agree with the vocab used when the model was trained.
CHAR_TO_IDX, IDX_TO_CHAR, PHONE_TO_IDX, IDX_TO_PHONE = build_vocab()

VOCAB_SIZE = len(CHAR_TO_IDX)  # includes PAD token
NUM_PHONES = len(PHONE_TO_IDX)
|
| 45 |
+
|
| 46 |
+
# --- 2) Architecture matching your notebook ---
class PhonemeClassifier(nn.Module):
    """NetTALK-style window classifier: a window of character indices -> one phoneme.

    Embeds each character, flattens the window, and applies a 2-layer MLP.
    NOTE: attribute names (embedding, fc1, fc2) must not be renamed — they are
    the keys of the saved state_dict this app loads.
    """

    def __init__(self, vocab_size, hidden_size, num_phones, window_size=WINDOW_SIZE):
        super().__init__()
        self.window_size = window_size
        # padding_idx=0 matches CHAR_TO_IDX["<PAD>"] == 0 (pad embedding stays zero)
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.fc1 = nn.Linear(hidden_size * window_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_phones)

    def forward(self, x):
        # x: (batch, window_size)
        x = self.embedding(x)  # (batch, window, hidden)
        x = x.view(x.size(0), -1)  # flatten window
        x = self.relu(self.fc1(x))
        x = self.fc2(x)  # raw logits over phoneme classes, (batch, num_phones)
        return x
|
| 63 |
+
|
| 64 |
+
# --- 3) Wrapper that loads state_dict and provides predict(word) ---
class NetTALKWrapper:
    """Loads the trained PhonemeClassifier and exposes word -> phoneme prediction.

    Raises FileNotFoundError when the weights file is missing and RuntimeError
    when the file cannot be interpreted as a (possibly checkpoint-wrapped)
    state_dict.
    """

    def __init__(self, state_dict_path=STATE_DICT_PATH, device=None):
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        # instantiate model with same architecture as training
        self.model = PhonemeClassifier(VOCAB_SIZE, HIDDEN_SIZE, NUM_PHONES, WINDOW_SIZE).to(self.device)

        if not os.path.exists(state_dict_path):
            raise FileNotFoundError(f"State dict not found at {state_dict_path}. Please upload it to the repo or set NETTALK_STATE_DICT env var.")
        # NOTE: torch.load unpickles arbitrary objects — only load trusted weight files.
        sd = torch.load(state_dict_path, map_location=self.device)
        try:
            # sd could be a dict directly (the state_dict itself)
            self.model.load_state_dict(sd)
        except Exception as e:
            # Checkpoint files commonly nest the weights under one of these keys.
            nested = None
            if isinstance(sd, dict):
                nested = sd.get("model_state_dict") or sd.get("state_dict")
            if nested is None:
                raise RuntimeError("Failed to load state_dict. Ensure you saved with torch.save(model.state_dict(), ...)") from e
            self.model.load_state_dict(nested)

        self.model.eval()

    def _word_to_windows(self, word):
        """Encode *word* as a (len(word), WINDOW_SIZE) long tensor of char indices."""
        # pad with spaces on both sides so every letter gets a full context window
        pad = " " * (WINDOW_SIZE // 2)
        padded = pad + word.lower() + pad
        windows = []
        for i in range(len(word)):
            w = padded[i:i + WINDOW_SIZE]
            # characters absent from the training vocab map to PAD/unknown (0)
            windows.append([CHAR_TO_IDX.get(ch, 0) for ch in w])
        return torch.tensor(windows, dtype=torch.long, device=self.device)

    def predict(self, word):
        """Return a list of ARPAbet tokens, one per character of *word* ([] if empty)."""
        word = word.strip()
        if not word:
            return []
        windows = self._word_to_windows(word)  # (L, window_size)
        with torch.no_grad():
            logits = self.model(windows)  # (L, num_phones)
            # argmax over logits directly — softmax is monotonic, so applying it
            # first (as before) was redundant work.
            preds = torch.argmax(logits, dim=-1).cpu().numpy().tolist()

        # map class indices to ARPAbet tokens
        return [IDX_TO_PHONE[p] for p in preds]

    def predict_string(self, word):
        """Return the predicted phonemes as a single space-joined string."""
        return " ".join(self.predict(word))
|
requirements.txt
CHANGED
|
@@ -1,12 +1,8 @@
|
|
| 1 |
torch
|
| 2 |
gradio>=3.0
|
| 3 |
numpy
|
| 4 |
-
scipy
|
| 5 |
soundfile
|
| 6 |
-
#
|
| 7 |
-
# For a fast fallback TTS:
|
| 8 |
gTTS
|
| 9 |
-
|
| 10 |
TTS
|
| 11 |
-
# Helpful: phonemizer if you want alternative phoneme utilities
|
| 12 |
-
phonemizer
|
|
|
|
| 1 |
torch
|
| 2 |
gradio>=3.0
|
| 3 |
numpy
|
|
|
|
| 4 |
soundfile
|
| 5 |
+
# optional (for audio synthesis later):
|
|
|
|
| 6 |
gTTS
|
| 7 |
+
pydub
|
| 8 |
TTS
|
|
|
|
|
|