Spaces:
Sleeping
Sleeping
Commit ·
c5a6262
1
Parent(s): 2379562
Fixed model_inference.py
Browse files- model_inference.py +33 -35
model_inference.py
CHANGED
|
@@ -1,35 +1,37 @@
|
|
| 1 |
-
# model_inference.py
|
| 2 |
import os
|
|
|
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
| 5 |
import numpy as np
|
| 6 |
|
| 7 |
-
#
|
| 8 |
WINDOW_SIZE = 7
|
| 9 |
HIDDEN_SIZE = 128
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
CMUDICT_PATH = "cmudict.dict.txt"
|
| 13 |
STATE_DICT_PATH = os.environ.get("NETTALK_STATE_DICT", "nettalk_state_dict.pt")
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
# --- 1)
|
| 16 |
-
import json
|
| 17 |
-
|
| 18 |
def load_vocab():
|
| 19 |
-
with open("
|
| 20 |
char_to_idx = json.load(f)
|
| 21 |
-
|
| 22 |
-
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
CHAR_TO_IDX, IDX_TO_CHAR, PHONE_TO_IDX, IDX_TO_PHONE = build_vocab()
|
| 28 |
|
| 29 |
-
|
|
|
|
| 30 |
NUM_PHONES = len(PHONE_TO_IDX)
|
| 31 |
|
| 32 |
-
|
|
|
|
| 33 |
class PhonemeClassifier(nn.Module):
|
| 34 |
def __init__(self, vocab_size, hidden_size, num_phones, window_size=WINDOW_SIZE):
|
| 35 |
super().__init__()
|
|
@@ -41,63 +43,59 @@ class PhonemeClassifier(nn.Module):
|
|
| 41 |
|
| 42 |
def forward(self, x):
|
| 43 |
# x: (batch, window_size)
|
| 44 |
-
x = self.embedding(x)
|
| 45 |
-
x = x.view(x.size(0), -1)
|
| 46 |
x = self.relu(self.fc1(x))
|
| 47 |
x = self.fc2(x)
|
| 48 |
return x
|
| 49 |
|
| 50 |
-
|
|
|
|
| 51 |
class NetTALKWrapper:
|
| 52 |
def __init__(self, state_dict_path=STATE_DICT_PATH, device=None):
|
| 53 |
if device is None:
|
| 54 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 55 |
self.device = device
|
| 56 |
|
| 57 |
-
# instantiate model with same architecture
|
| 58 |
self.model = PhonemeClassifier(VOCAB_SIZE, HIDDEN_SIZE, NUM_PHONES, WINDOW_SIZE).to(self.device)
|
| 59 |
|
| 60 |
-
#
|
| 61 |
if not os.path.exists(state_dict_path):
|
| 62 |
-
raise FileNotFoundError(f"
|
|
|
|
| 63 |
sd = torch.load(state_dict_path, map_location=self.device)
|
| 64 |
try:
|
| 65 |
-
# sd could be a dict directly (state_dict)
|
| 66 |
self.model.load_state_dict(sd)
|
| 67 |
except Exception as e:
|
| 68 |
-
# If the saved file contains extra keys (e.g., a checkpoint dict), try to extract 'model_state_dict'
|
| 69 |
if isinstance(sd, dict) and "model_state_dict" in sd:
|
| 70 |
self.model.load_state_dict(sd["model_state_dict"])
|
| 71 |
else:
|
| 72 |
-
raise RuntimeError(
|
|
|
|
|
|
|
| 73 |
|
| 74 |
self.model.eval()
|
| 75 |
|
| 76 |
def _word_to_windows(self, word):
|
| 77 |
-
# pad with spaces on both sides
|
| 78 |
pad = " " * (WINDOW_SIZE // 2)
|
| 79 |
padded = pad + word.lower() + pad
|
| 80 |
windows = []
|
| 81 |
for i in range(len(word)):
|
| 82 |
-
w = padded[i:i + WINDOW_SIZE]
|
| 83 |
idxs = [CHAR_TO_IDX.get(ch, 0) for ch in w]
|
| 84 |
windows.append(idxs)
|
| 85 |
-
return torch.tensor(windows, dtype=torch.long, device=self.device)
|
| 86 |
|
| 87 |
def predict(self, word):
|
| 88 |
word = word.strip()
|
| 89 |
if not word:
|
| 90 |
return []
|
| 91 |
-
windows = self._word_to_windows(word)
|
| 92 |
with torch.no_grad():
|
| 93 |
-
logits = self.model(windows)
|
| 94 |
-
|
| 95 |
-
preds = torch.argmax(probs, dim=-1).cpu().numpy().tolist()
|
| 96 |
-
|
| 97 |
-
# map indices to ARPAbet tokens
|
| 98 |
phones = [IDX_TO_PHONE[p] for p in preds]
|
| 99 |
return phones
|
| 100 |
|
| 101 |
def predict_string(self, word):
|
| 102 |
-
|
| 103 |
-
return " ".join(phones)
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import json
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
| 5 |
import numpy as np
|
| 6 |
|
| 7 |
+
# --- Config ---
|
| 8 |
WINDOW_SIZE = 7
|
| 9 |
HIDDEN_SIZE = 128
|
| 10 |
|
| 11 |
+
# Paths
|
|
|
|
| 12 |
STATE_DICT_PATH = os.environ.get("NETTALK_STATE_DICT", "nettalk_state_dict.pt")
|
| 13 |
+
CHAR_VOCAB_PATH = "char_vocab.json"
|
| 14 |
+
PHONE_VOCAB_PATH = "phone_vocab.json"
|
| 15 |
|
| 16 |
+
# --- 1) Load vocabularies (must match training) ---
|
|
|
|
|
|
|
| 17 |
def load_vocab(char_vocab_path=None, phone_vocab_path=None):
    """Load the character and phoneme vocabularies saved at training time.

    Args:
        char_vocab_path: JSON file mapping characters to indices.
            Defaults to the module-level CHAR_VOCAB_PATH.
        phone_vocab_path: JSON file mapping ARPAbet phones to indices.
            Defaults to the module-level PHONE_VOCAB_PATH.

    Returns:
        Tuple (char_to_idx, idx_to_char, phone_to_idx, idx_to_phone).
        The inverse maps use int keys, since JSON object keys are
        always strings on disk.
    """
    # Sentinel defaults keep the zero-argument call backward compatible
    # while letting callers point at alternative vocab files.
    if char_vocab_path is None:
        char_vocab_path = CHAR_VOCAB_PATH
    if phone_vocab_path is None:
        phone_vocab_path = PHONE_VOCAB_PATH

    with open(char_vocab_path, "r") as f:
        char_to_idx = json.load(f)
    with open(phone_vocab_path, "r") as f:
        phone_to_idx = json.load(f)

    # Rebuild inverse maps with int keys so index lookups work directly.
    idx_to_char = {int(v): k for k, v in char_to_idx.items()}
    idx_to_phone = {int(v): k for k, v in phone_to_idx.items()}

    return char_to_idx, idx_to_char, phone_to_idx, idx_to_phone
|
| 27 |
|
|
|
|
| 28 |
|
| 29 |
+
CHAR_TO_IDX, IDX_TO_CHAR, PHONE_TO_IDX, IDX_TO_PHONE = load_vocab()
|
| 30 |
+
VOCAB_SIZE = len(CHAR_TO_IDX)
|
| 31 |
NUM_PHONES = len(PHONE_TO_IDX)
|
| 32 |
|
| 33 |
+
|
| 34 |
+
# --- 2) Model architecture (must match training) ---
|
| 35 |
class PhonemeClassifier(nn.Module):
|
| 36 |
def __init__(self, vocab_size, hidden_size, num_phones, window_size=WINDOW_SIZE):
|
| 37 |
super().__init__()
|
|
|
|
| 43 |
|
| 44 |
def forward(self, x):
    """Map a batch of character windows to phoneme logits.

    x: LongTensor of shape (batch, window_size).
    Returns raw (unnormalized) logits of shape (batch, num_phones).
    """
    embedded = self.embedding(x)
    # Concatenate the per-character embeddings into one feature vector.
    flat = embedded.flatten(start_dim=1)
    hidden = self.relu(self.fc1(flat))
    return self.fc2(hidden)
|
| 51 |
|
| 52 |
+
|
| 53 |
+
# --- 3) Wrapper for inference ---
|
| 54 |
class NetTALKWrapper:
|
| 55 |
def __init__(self, state_dict_path=STATE_DICT_PATH, device=None):
    """Build the model, load trained weights, and switch to eval mode.

    state_dict_path: file produced by torch.save(model.state_dict(), ...)
        or a checkpoint dict containing a 'model_state_dict' entry.
    device: torch.device to run on; auto-selects CUDA when available.

    Raises FileNotFoundError when the weights file is missing and
    RuntimeError when the file cannot be interpreted as a state_dict.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = device

    # Hyperparameters here must match the training run exactly,
    # otherwise load_state_dict below will reject the weights.
    self.model = PhonemeClassifier(VOCAB_SIZE, HIDDEN_SIZE, NUM_PHONES, WINDOW_SIZE).to(self.device)

    # Fail fast with a clear error when the weights file is absent.
    if not os.path.exists(state_dict_path):
        raise FileNotFoundError(f"Missing model weights at {state_dict_path}")

    checkpoint = torch.load(state_dict_path, map_location=self.device)
    try:
        # Common case: the file holds a bare state_dict.
        self.model.load_state_dict(checkpoint)
    except Exception as err:
        # Fallback: a full training checkpoint wrapping the weights.
        if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
            self.model.load_state_dict(checkpoint["model_state_dict"])
        else:
            raise RuntimeError(
                "Failed to load state_dict. Ensure you saved with torch.save(model.state_dict(), ...)"
            ) from err

    self.model.eval()
|
| 78 |
|
| 79 |
def _word_to_windows(self, word):
    """Slice *word* into overlapping character windows, one per letter.

    The word is lowercased and padded with spaces on both sides so every
    character sits at the center of a WINDOW_SIZE-wide window.

    Returns a LongTensor of shape (len(word), WINDOW_SIZE) on self.device.
    """
    half = WINDOW_SIZE // 2
    padded = " " * half + word.lower() + " " * half
    # NOTE(review): characters absent from the vocab fall back to index 0 —
    # confirm index 0 is the pad/unknown slot used during training.
    rows = [
        [CHAR_TO_IDX.get(ch, 0) for ch in padded[start:start + WINDOW_SIZE]]
        for start in range(len(word))
    ]
    return torch.tensor(rows, dtype=torch.long, device=self.device)
|
| 88 |
|
| 89 |
def predict(self, word):
    """Predict one ARPAbet phone per character of *word*.

    Args:
        word: input word; surrounding whitespace is stripped first.

    Returns:
        List of phone token strings, one per character; an empty or
        whitespace-only input yields [].
    """
    word = word.strip()
    if not word:
        return []
    windows = self._word_to_windows(word)
    with torch.no_grad():
        logits = self.model(windows)
        # softmax is strictly monotonic along the class dim, so argmax over
        # raw logits gives identical predictions without the extra pass.
        preds = torch.argmax(logits, dim=-1).cpu().numpy().tolist()
    # map indices to ARPAbet tokens
    phones = [IDX_TO_PHONE[p] for p in preds]
    return phones
|
| 99 |
|
| 100 |
def predict_string(self, word):
    """Return the predicted phone sequence for *word* as one space-joined string."""
    phones = self.predict(word)
    return " ".join(phones)
|
|
|