Commit 08836fe (parent: 941a3bf)
Fix: load safetensors checkpoints correctly
Changed file: edit_seq_speech/inference.py (+89, -31)
|
@@ -9,48 +9,106 @@ import pytorch_lightning as pl
|
|
| 9 |
|
| 10 |
from .model import PhonemeCorrector
|
| 11 |
from transformers import Wav2Vec2Processor, HubertModel
|
|
|
|
| 12 |
|
| 13 |
class PhonemeCorrectionInference:
    """Inference wrapper for the PhonemeCorrector model.

    Loads the vocab/config JSON, restores model weights from a checkpoint
    (Lightning ``.ckpt`` via ``load_from_checkpoint`` or plain ``.safetensors``),
    and prepares the HuBERT audio tokenizer.

    Args:
        checkpoint_path: Path to a ``.ckpt`` or ``.safetensors`` checkpoint.
        vocab_path: Path to the JSON config holding ``op_to_id`` / ``insert_to_id``.
        audio_model_name: HF model id for the audio feature extractor.
        device: Optional torch device; defaults to CUDA when available.

    Raises:
        FileNotFoundError: If ``checkpoint_path`` does not exist.
    """

    def __init__(self, checkpoint_path, vocab_path, audio_model_name="facebook/hubert-large-ls960-ft", device=None):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # 1) Load vocab / config
        print(f"Loading config from {vocab_path}...")
        with open(vocab_path, "r") as f:
            self.config = json.load(f)

        self.op_map = self.config["op_to_id"]
        self.ins_map = self.config["insert_to_id"]

        # Create reverse maps (ID -> string)
        self.id2op = {v: k for k, v in self.op_map.items()}
        self.id2ins = {v: k for k, v in self.ins_map.items()}

        # 2) G2P frontend
        self.g2p = G2p()

        # 3) Load model
        if not os.path.exists(checkpoint_path):
            raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}")

        vocab_size = max(self.ins_map.values()) + 1

        if checkpoint_path.lower().endswith(".safetensors"):
            # FIX: torch.load() cannot parse safetensors files — use the
            # dedicated loader and restore weights via load_state_dict.
            from safetensors.torch import load_file as _safetensors_load_file
            state_dict = _safetensors_load_file(checkpoint_path, device="cpu")
            # Recover the audio vocab size from the weights when possible.
            audio_emb = state_dict.get("audio_embedding.weight")
            audio_vocab_size = audio_emb.shape[0] if audio_emb is not None else 2048
            self.model = PhonemeCorrector(
                vocab_size=vocab_size,
                audio_vocab_size=audio_vocab_size,
            )
            self.model.load_state_dict(state_dict, strict=False)
        else:
            # Lightning-style checkpoint: hyperparameters are embedded.
            checkpoint = torch.load(checkpoint_path, map_location=self.device)
            hparams = checkpoint.get('hyper_parameters', {}) if isinstance(checkpoint, dict) else {}
            audio_vocab_size = hparams.get('audio_vocab_size', 2048)

            self.model = PhonemeCorrector.load_from_checkpoint(
                checkpoint_path,
                map_location=self.device,
                vocab_size=vocab_size,
                audio_vocab_size=audio_vocab_size
            )

        self.model.to(self.device)
        self.model.eval()

        # 4) Load audio tokenizer
        print(f"Loading Audio Tokenizer: {audio_model_name}")
        self.audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_name)
        self.audio_model = HubertModel.from_pretrained(audio_model_name).eval().to(self.device)
|
|
|
| 9 |
|
| 10 |
from .model import PhonemeCorrector
|
| 11 |
from transformers import Wav2Vec2Processor, HubertModel
|
| 12 |
+
from safetensors.torch import load_file as safetensors_load_file
|
| 13 |
|
| 14 |
class PhonemeCorrectionInference:
    """Inference wrapper for the PhonemeCorrector model.

    Restores model weights from either a ``.safetensors`` file or a
    torch/Lightning checkpoint (``.ckpt``/``.pt``/``.pth``), resolving
    hyperparameters from an ``hparams.json`` next to (or one level above)
    the checkpoint, falling back to the checkpoint's embedded
    ``hyper_parameters`` and finally to the weight tensors themselves.

    Args:
        checkpoint_path: Path to the checkpoint file.
        vocab_path: Path to the JSON config holding ``op_to_id`` / ``insert_to_id``.
        audio_model_name: HF model id for the audio feature extractor.
        device: Optional torch device; defaults to CUDA when available.

    Raises:
        FileNotFoundError: If ``checkpoint_path`` does not exist.
        ValueError: On an unsupported checkpoint extension, or when the
            vocab file disagrees with the weight shapes.
    """

    def __init__(self, checkpoint_path, vocab_path, audio_model_name="facebook/hubert-large-ls960-ft", device=None):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # 1) Load vocab
        print(f"Loading config from {vocab_path}...")
        with open(vocab_path, "r") as f:
            self.config = json.load(f)

        self.op_map = self.config["op_to_id"]
        self.ins_map = self.config["insert_to_id"]
        self.id2op = {v: k for k, v in self.op_map.items()}
        self.id2ins = {v: k for k, v in self.ins_map.items()}

        # 2) Load G2P
        self.g2p = G2p()

        # 3) Load hparams.json (prefer same dir as checkpoint, fallback to parent)
        if not os.path.exists(checkpoint_path):
            raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}")

        hparams = {}
        hp_candidates = [
            os.path.join(os.path.dirname(checkpoint_path), "hparams.json"),
            os.path.join(os.path.dirname(os.path.dirname(checkpoint_path)), "hparams.json"),
        ]
        for hp in hp_candidates:
            if os.path.exists(hp):
                with open(hp, "r") as f:
                    hparams = json.load(f)
                break

        # 4) Load weights/state_dict
        print(f"Loading model weights from {checkpoint_path}...")
        lower = checkpoint_path.lower()
        if lower.endswith(".safetensors"):
            state_dict = safetensors_load_file(checkpoint_path, device="cpu")
        elif lower.endswith(".ckpt") or lower.endswith(".pt") or lower.endswith(".pth"):
            # NOTE: weights_only=False is needed for Lightning-style checkpoints in PyTorch 2.6+
            ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
            # FIX: guard against non-dict payloads (a raw torch.save'd object
            # has no .get and would raise AttributeError here).
            if isinstance(ckpt, dict):
                state_dict = ckpt.get("state_dict", ckpt)
                if not hparams:
                    hparams = ckpt.get("hyper_parameters", {}) or {}
            else:
                state_dict = ckpt
        else:
            raise ValueError(f"Unsupported checkpoint format: {checkpoint_path}")

        # 5) Build model with correct hyperparams
        vocab_size_from_vocab = max(self.ins_map.values()) + 1

        # Prefer hparams.json, but also sanity-check against state_dict shapes
        vocab_size = int(hparams.get("vocab_size", vocab_size_from_vocab))
        audio_vocab_size = int(hparams.get("audio_vocab_size", 2048))
        d_model = int(hparams.get("d_model", 256))
        nhead = int(hparams.get("nhead", 4))
        num_layers = int(hparams.get("num_layers", 4))
        dropout = float(hparams.get("dropout", 0.1))
        lr = float(hparams.get("lr", 1e-4))
        weight_decay = float(hparams.get("weight_decay", 0.01))
        scheduler_config = hparams.get("scheduler_config", None)
        optimizer_config = hparams.get("optimizer_config", None)

        # Hard check: vocab.json and weights must agree
        if "text_embedding.weight" in state_dict:
            vsd, dsd = state_dict["text_embedding.weight"].shape
            if vsd != vocab_size_from_vocab:
                raise ValueError(
                    f"vocab.json (vocab_size={vocab_size_from_vocab}) does not match weights (vocab_size={vsd}). "
                    "Please upload the matching vocab.json."
                )
            # Override to match weights exactly (safer)
            vocab_size = vsd
            d_model = dsd
            # FIX: only override audio_vocab_size when the key actually exists;
            # the unconditional index would KeyError on checkpoints without it.
            audio_emb = state_dict.get("audio_embedding.weight")
            if audio_emb is not None:
                audio_vocab_size = audio_emb.shape[0]

        self.model = PhonemeCorrector(
            vocab_size=vocab_size,
            audio_vocab_size=audio_vocab_size,
            d_model=d_model,
            nhead=nhead,
            num_layers=num_layers,
            dropout=dropout,
            lr=lr,
            weight_decay=weight_decay,
            scheduler_config=scheduler_config,
            optimizer_config=optimizer_config,
        )
        # strict=False: tolerate auxiliary keys (e.g. loss buffers) while
        # still surfacing a summary of any mismatches for debugging.
        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
        if missing or unexpected:
            print(f"[load_state_dict] missing={len(missing)} unexpected={len(unexpected)}")
            if missing[:5]:
                print("  missing (first 5):", missing[:5])
            if unexpected[:5]:
                print("  unexpected (first 5):", unexpected[:5])

        self.model.to(self.device).eval()

        # 6) Load Audio Tokenizer
        print(f"Loading Audio Tokenizer: {audio_model_name}")
        self.audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_name)
        self.audio_model = HubertModel.from_pretrained(audio_model_name).eval().to(self.device)
|