Aleksei Žavoronkov commited on
Commit ·
8cf218e
1
Parent(s): ed26f9c
update model architecture to the latest
Browse files- app.py +37 -74
- constants.py +3 -3
- gop_model.py +209 -153
- models.py +40 -20
- utils.py +62 -32
app.py
CHANGED
|
@@ -1,58 +1,60 @@
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
|
|
|
| 3 |
from utils import load_model_and_processor, run_inference, validate_phonemes
|
| 4 |
-
from multiprocessing import Process, Queue, set_start_method
|
| 5 |
-
import logging
|
| 6 |
|
| 7 |
logging.basicConfig(
|
| 8 |
level=logging.INFO,
|
| 9 |
format="%(asctime)s - %(levelname)s - %(message)s",
|
| 10 |
-
handlers=[logging.StreamHandler()]
|
| 11 |
)
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
| 14 |
-
logger.info("Loading
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
logger.info("Models loaded successfully.")
|
| 18 |
|
| 19 |
css = """
|
| 20 |
.phoneme-scores { display: flex; flex-wrap: wrap; justify-content: center; gap: 15px; }
|
| 21 |
.phoneme-container { text-align: center; padding: 10px; border: 1px solid #ddd; border-radius: 8px; }
|
| 22 |
.phoneme { font-size: 1.5em; font-weight: bold; margin-bottom: 5px; }
|
| 23 |
.score { padding: 8px 12px; border-radius: 5px; color: white; font-weight: bold; }
|
| 24 |
-
.good { background-color: #28a745; }
|
| 25 |
-
.medium { background-color: #ffc107; }
|
| 26 |
-
.bad { background-color: #dc3545; }
|
| 27 |
"""
|
| 28 |
|
| 29 |
|
| 30 |
def get_score_class(score, score_type):
|
| 31 |
if score_type == "quality":
|
| 32 |
-
if score == 1:
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
return
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def generate_html_output(result, score_type):
|
| 40 |
if isinstance(result, str):
|
| 41 |
return result
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
score = int(score) + 1
|
| 50 |
-
score_class = get_score_class(score, score_type)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
| 52 |
html_output += f"""
|
| 53 |
<div class='phoneme-container'>
|
| 54 |
<div class='phoneme'>{token}</div>
|
| 55 |
-
<div class='score {score_class}'>{
|
| 56 |
</div>
|
| 57 |
"""
|
| 58 |
|
|
@@ -60,24 +62,7 @@ def generate_html_output(result, score_type):
|
|
| 60 |
return html_output
|
| 61 |
|
| 62 |
|
| 63 |
-
def
|
| 64 |
-
result = run_inference(audio_path, transcript, model, processor)
|
| 65 |
-
|
| 66 |
-
if isinstance(result, str):
|
| 67 |
-
queue.put((model_type, result))
|
| 68 |
-
return
|
| 69 |
-
|
| 70 |
-
predicted_scores, tokens, token_lengths = result
|
| 71 |
-
|
| 72 |
-
scores_list = predicted_scores.cpu().tolist()
|
| 73 |
-
lengths_list = token_lengths.cpu().tolist()
|
| 74 |
-
|
| 75 |
-
safe_result = (scores_list, tokens, lengths_list)
|
| 76 |
-
|
| 77 |
-
queue.put((model_type, safe_result))
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def score_phonemes_in_parallel(phoneme_text, audio_file):
|
| 81 |
if audio_file is None:
|
| 82 |
return "<p style='text-align:center; color:red;'>Please upload a .wav audio file.</p>", ""
|
| 83 |
|
|
@@ -85,27 +70,9 @@ def score_phonemes_in_parallel(phoneme_text, audio_file):
|
|
| 85 |
if phonemes_validation_error:
|
| 86 |
return phonemes_validation_error, ""
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
target=inference_wrapper,
|
| 92 |
-
args=("quality", phoneme_model, phoneme_processor, audio_file, phoneme_text, results_queue)
|
| 93 |
-
)
|
| 94 |
-
duration_process = Process(
|
| 95 |
-
target=inference_wrapper,
|
| 96 |
-
args=("duration", duration_model, duration_processor, audio_file, phoneme_text, results_queue)
|
| 97 |
-
)
|
| 98 |
-
|
| 99 |
-
quality_process.start()
|
| 100 |
-
duration_process.start()
|
| 101 |
-
|
| 102 |
-
quality_process.join()
|
| 103 |
-
duration_process.join()
|
| 104 |
-
|
| 105 |
-
results = {}
|
| 106 |
-
while not results_queue.empty():
|
| 107 |
-
key, value = results_queue.get()
|
| 108 |
-
results[key] = value
|
| 109 |
|
| 110 |
quality_result = results.get("quality")
|
| 111 |
duration_result = results.get("duration")
|
|
@@ -120,8 +87,9 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
|
| 120 |
gr.Markdown(
|
| 121 |
"""
|
| 122 |
# Phoneme Pronunciation and Duration Scorer
|
| 123 |
-
Enter
|
| 124 |
-
|
|
|
|
| 125 |
The application will provide a pronunciation (quality) and duration score for each phoneme.
|
| 126 |
|
| 127 |
Scores legend:
|
|
@@ -144,7 +112,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
|
| 144 |
["aja (L2 speaker)", "a j a", "./audio/L2/03ac-e45b-ec8a-6fa0_aja_take1.wav"],
|
| 145 |
["sõpra (L2 speaker)", "s õ pp r a", "./audio/L2/4071-0c77-e1d3-9587_sõpra_take1.wav"],
|
| 146 |
],
|
| 147 |
-
inputs=[word_input, phoneme_text_input, audio_input]
|
| 148 |
)
|
| 149 |
|
| 150 |
gr.Markdown("---")
|
|
@@ -155,16 +123,11 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
|
| 155 |
duration_output_html = gr.HTML()
|
| 156 |
|
| 157 |
btn.click(
|
| 158 |
-
fn=
|
| 159 |
inputs=[phoneme_text_input, audio_input],
|
| 160 |
-
outputs=[phoneme_output_html, duration_output_html]
|
|
|
|
| 161 |
)
|
| 162 |
|
| 163 |
if __name__ == "__main__":
|
| 164 |
-
try:
|
| 165 |
-
set_start_method("fork", force=True)
|
| 166 |
-
logger.info("Multiprocessing start method set to 'fork'.")
|
| 167 |
-
except RuntimeError:
|
| 168 |
-
logger.warning("Start method has already been set.")
|
| 169 |
-
|
| 170 |
demo.queue(default_concurrency_limit=2).launch()
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
import gradio as gr
|
| 4 |
+
|
| 5 |
+
from constants import ALL_PHONEMES, MODEL_REPO_ID
|
| 6 |
from utils import load_model_and_processor, run_inference, validate_phonemes
|
|
|
|
|
|
|
| 7 |
|
| 8 |
logging.basicConfig(
|
| 9 |
level=logging.INFO,
|
| 10 |
format="%(asctime)s - %(levelname)s - %(message)s",
|
| 11 |
+
handlers=[logging.StreamHandler()],
|
| 12 |
)
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
| 15 |
+
logger.info("Loading model into memory globally")
|
| 16 |
+
model, processor = load_model_and_processor(MODEL_REPO_ID)
|
| 17 |
+
logger.info("Model loaded successfully")
|
|
|
|
| 18 |
|
| 19 |
css = """
|
| 20 |
.phoneme-scores { display: flex; flex-wrap: wrap; justify-content: center; gap: 15px; }
|
| 21 |
.phoneme-container { text-align: center; padding: 10px; border: 1px solid #ddd; border-radius: 8px; }
|
| 22 |
.phoneme { font-size: 1.5em; font-weight: bold; margin-bottom: 5px; }
|
| 23 |
.score { padding: 8px 12px; border-radius: 5px; color: white; font-weight: bold; }
|
| 24 |
+
.good { background-color: #28a745; }
|
| 25 |
+
.medium { background-color: #ffc107; }
|
| 26 |
+
.bad { background-color: #dc3545; }
|
| 27 |
"""
|
| 28 |
|
| 29 |
|
| 30 |
def get_score_class(score, score_type):
|
| 31 |
if score_type == "quality":
|
| 32 |
+
if score == 1:
|
| 33 |
+
return "good"
|
| 34 |
+
if score == 2:
|
| 35 |
+
return "medium"
|
| 36 |
+
return "bad"
|
| 37 |
+
return "good" if score == 1 else "bad"
|
| 38 |
|
| 39 |
|
| 40 |
def generate_html_output(result, score_type):
|
| 41 |
if isinstance(result, str):
|
| 42 |
return result
|
| 43 |
|
| 44 |
+
if not result:
|
| 45 |
+
return "<p style='text-align:center; color:red;'>No scores were produced.</p>"
|
| 46 |
|
| 47 |
+
predicted_scores, tokens = result
|
| 48 |
+
title = "Quality Scores" if score_type == "quality" else "Duration Scores"
|
| 49 |
+
html_output = f"<div class='phoneme-section'><h3 class='scores-title'>{title}</h3></div><div class='phoneme-scores'>"
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
for token, score in zip(tokens, predicted_scores):
|
| 52 |
+
display_score = int(score) + 1
|
| 53 |
+
score_class = get_score_class(display_score, score_type)
|
| 54 |
html_output += f"""
|
| 55 |
<div class='phoneme-container'>
|
| 56 |
<div class='phoneme'>{token}</div>
|
| 57 |
+
<div class='score {score_class}'>{display_score}</div>
|
| 58 |
</div>
|
| 59 |
"""
|
| 60 |
|
|
|
|
| 62 |
return html_output
|
| 63 |
|
| 64 |
|
| 65 |
+
def score_phonemes(phoneme_text, audio_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
if audio_file is None:
|
| 67 |
return "<p style='text-align:center; color:red;'>Please upload a .wav audio file.</p>", ""
|
| 68 |
|
|
|
|
| 70 |
if phonemes_validation_error:
|
| 71 |
return phonemes_validation_error, ""
|
| 72 |
|
| 73 |
+
results = run_inference(audio_file, phoneme_text, model, processor)
|
| 74 |
+
if isinstance(results, str):
|
| 75 |
+
return results, ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
quality_result = results.get("quality")
|
| 78 |
duration_result = results.get("duration")
|
|
|
|
| 87 |
gr.Markdown(
|
| 88 |
"""
|
| 89 |
# Phoneme Pronunciation and Duration Scorer
|
| 90 |
+
Enter phonemes directly into the text box, separated by spaces.
|
| 91 |
+
Use `|` between words if you want to score a multi-word sequence.
|
| 92 |
+
Then upload a `.wav` file or record the audio of the pronounced word.
|
| 93 |
The application will provide a pronunciation (quality) and duration score for each phoneme.
|
| 94 |
|
| 95 |
Scores legend:
|
|
|
|
| 112 |
["aja (L2 speaker)", "a j a", "./audio/L2/03ac-e45b-ec8a-6fa0_aja_take1.wav"],
|
| 113 |
["sõpra (L2 speaker)", "s õ pp r a", "./audio/L2/4071-0c77-e1d3-9587_sõpra_take1.wav"],
|
| 114 |
],
|
| 115 |
+
inputs=[word_input, phoneme_text_input, audio_input],
|
| 116 |
)
|
| 117 |
|
| 118 |
gr.Markdown("---")
|
|
|
|
| 123 |
duration_output_html = gr.HTML()
|
| 124 |
|
| 125 |
btn.click(
|
| 126 |
+
fn=score_phonemes,
|
| 127 |
inputs=[phoneme_text_input, audio_input],
|
| 128 |
+
outputs=[phoneme_output_html, duration_output_html],
|
| 129 |
+
api_name=False,
|
| 130 |
)
|
| 131 |
|
| 132 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
demo.queue(default_concurrency_limit=2).launch()
|
constants.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
DURATION_MODEL_REPO_ID = "alzavo/sayest-duration"
|
| 3 |
SAMPLING_RATE = 16000
|
| 4 |
MONO_CHANNEL = 1
|
| 5 |
MAX_AUDIO_DURATION_SECONDS = 5
|
|
@@ -94,4 +93,5 @@ ALL_PHONEMES = {
|
|
| 94 |
"u:",
|
| 95 |
"w",
|
| 96 |
"j:j",
|
| 97 |
-
|
|
|
|
|
|
| 1 |
+
MODEL_REPO_ID = "alzavo/sayest-latest"
|
|
|
|
| 2 |
SAMPLING_RATE = 16000
|
| 3 |
MONO_CHANNEL = 1
|
| 4 |
MAX_AUDIO_DURATION_SECONDS = 5
|
|
|
|
| 93 |
"u:",
|
| 94 |
"w",
|
| 95 |
"j:j",
|
| 96 |
+
"|",
|
| 97 |
+
}
|
gop_model.py
CHANGED
|
@@ -12,9 +12,6 @@ from models import OrdinalLogLoss
|
|
| 12 |
|
| 13 |
|
| 14 |
class GOPWav2Vec2Config(PretrainedConfig):
|
| 15 |
-
"""
|
| 16 |
-
Configuration for GOP-enhanced model that wraps a Wav2Vec2ForCTC backbone.
|
| 17 |
-
"""
|
| 18 |
model_type = "gop-wav2vec2"
|
| 19 |
|
| 20 |
def __init__(
|
|
@@ -33,6 +30,7 @@ class GOPWav2Vec2Config(PretrainedConfig):
|
|
| 33 |
unk_id: Optional[int] = None,
|
| 34 |
bos_id: Optional[int] = None,
|
| 35 |
eos_id: Optional[int] = None,
|
|
|
|
| 36 |
token_id_vocab: Optional[List[int]] = None,
|
| 37 |
ctc_config: Optional[dict] = None,
|
| 38 |
**kwargs,
|
|
@@ -57,21 +55,17 @@ class GOPWav2Vec2Config(PretrainedConfig):
|
|
| 57 |
self.unk_id = unk_id
|
| 58 |
self.bos_id = bos_id
|
| 59 |
self.eos_id = eos_id
|
|
|
|
| 60 |
self.token_id_vocab = token_id_vocab
|
| 61 |
self.ctc_config = ctc_config
|
| 62 |
|
| 63 |
|
| 64 |
class GOPPhonemeClassifier(PreTrainedModel):
|
| 65 |
-
"""
|
| 66 |
-
GOP classifier that wraps a pretrained Wav2Vec2ForCTC backbone.
|
| 67 |
-
Computes per-phoneme scores using GOP-derived features + a small Transformer + classifier head.
|
| 68 |
-
"""
|
| 69 |
-
|
| 70 |
config_class = GOPWav2Vec2Config
|
| 71 |
-
|
| 72 |
def __init__(self, config: GOPWav2Vec2Config, load_pretrained_backbone: bool = False):
|
| 73 |
super().__init__(config)
|
| 74 |
-
|
| 75 |
if config.ctc_config is not None:
|
| 76 |
backbone_config = Wav2Vec2Config.from_dict(config.ctc_config)
|
| 77 |
elif config.base_model_name_or_path is not None:
|
|
@@ -82,35 +76,43 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 82 |
self.ctc_model = Wav2Vec2ForCTC(backbone_config)
|
| 83 |
self.config.ctc_config = backbone_config.to_dict()
|
| 84 |
|
| 85 |
-
# Special ids
|
| 86 |
self.blank_id = config.pad_id
|
| 87 |
self.unk_id = config.unk_id
|
| 88 |
self.bos_id = config.bos_id
|
| 89 |
self.eos_id = config.eos_id
|
| 90 |
self.pad_id = self.blank_id
|
|
|
|
| 91 |
|
| 92 |
special_ids = {self.blank_id, self.unk_id, self.bos_id, self.eos_id, self.pad_id}
|
| 93 |
self.special_ids = {i for i in special_ids if i is not None}
|
| 94 |
|
| 95 |
vocab_size = int(self.ctc_model.config.vocab_size)
|
| 96 |
self.token_id_vocab = (
|
| 97 |
-
config.token_id_vocab
|
|
|
|
|
|
|
| 98 |
)
|
| 99 |
|
| 100 |
self.gop_feature_dim = 1 + len(self.token_id_vocab) + 1
|
| 101 |
self.embedding_dim = int(config.gop_embedding_dim)
|
| 102 |
-
self.token_embedding = nn.Embedding(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
self.combined_feature_dim = self.gop_feature_dim + self.embedding_dim
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
batch_first=True,
|
| 112 |
)
|
| 113 |
-
self.gop_transformer_encoder = nn.TransformerEncoder(enc_layer, num_layers=config.gop_transformer_nlayers)
|
| 114 |
|
| 115 |
head_label_config = getattr(config, "gop_head_labels", None)
|
| 116 |
if head_label_config is None:
|
|
@@ -118,13 +120,24 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 118 |
raise ValueError("Config must provide gop_head_labels or num_gop_labels for the classifier.")
|
| 119 |
head_label_config = {"default": int(config.num_gop_labels)}
|
| 120 |
self.head_label_config = {str(k): int(v) for k, v in head_label_config.items()}
|
| 121 |
-
self.
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
self._init_losses()
|
| 127 |
-
|
| 128 |
self.post_init()
|
| 129 |
|
| 130 |
if load_pretrained_backbone:
|
|
@@ -152,11 +165,15 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 152 |
elif weights is not None:
|
| 153 |
head_weights = weights
|
| 154 |
if head_weights is not None:
|
| 155 |
-
head_weights =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
loss_modules[head] = OrdinalLogLoss(
|
| 157 |
num_classes=int(num_labels),
|
| 158 |
alpha=alpha,
|
| 159 |
-
reduction=
|
| 160 |
class_weights=head_weights,
|
| 161 |
)
|
| 162 |
self.loss_fns = nn.ModuleDict(loss_modules)
|
|
@@ -168,7 +185,6 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 168 |
target_ids: torch.Tensor,
|
| 169 |
target_lengths: torch.Tensor,
|
| 170 |
) -> torch.Tensor:
|
| 171 |
-
"""CTC log p(target|input) per item for a batch."""
|
| 172 |
target_ids_cpu = target_ids.cpu()
|
| 173 |
target_lengths_cpu = target_lengths.cpu()
|
| 174 |
log_probs_cpu = log_probs_TNC.cpu()
|
|
@@ -176,23 +192,22 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 176 |
|
| 177 |
targets_flat = []
|
| 178 |
for i in range(target_ids_cpu.size(0)):
|
| 179 |
-
valid_targets = target_ids_cpu[i, :target_lengths_cpu[i]]
|
| 180 |
targets_flat.append(valid_targets)
|
| 181 |
targets_cat = torch.cat(targets_flat) if targets_flat else torch.tensor([], dtype=torch.long)
|
| 182 |
|
| 183 |
if target_lengths_cpu.sum() == 0:
|
| 184 |
-
return torch.full((log_probs_TNC.size(1),), -float(
|
| 185 |
|
| 186 |
-
ctc_loss_fn = torch.nn.CTCLoss(blank=self.blank_id, reduction=
|
| 187 |
try:
|
| 188 |
loss_per_item = ctc_loss_fn(log_probs_cpu, targets_cat, input_lengths_cpu, target_lengths_cpu)
|
| 189 |
return -loss_per_item.to(log_probs_TNC.device)
|
| 190 |
-
except Exception as
|
| 191 |
-
warnings.warn(f"CTCLoss calculation failed: {
|
| 192 |
-
return torch.full((log_probs_TNC.size(1),), -float(
|
| 193 |
|
| 194 |
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
|
| 195 |
-
"""Compute time dimension after backbone feature extractor."""
|
| 196 |
def _conv_out_length(input_length, kernel_size, stride):
|
| 197 |
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
|
| 198 |
|
|
@@ -200,6 +215,92 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 200 |
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
|
| 201 |
return input_lengths
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
def forward(
|
| 204 |
self,
|
| 205 |
input_values: torch.Tensor,
|
|
@@ -212,10 +313,11 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 212 |
return_dict: Optional[bool] = None,
|
| 213 |
labels: Optional[torch.Tensor] = None,
|
| 214 |
) -> SequenceClassifierOutput:
|
| 215 |
-
|
| 216 |
device = input_values.device
|
|
|
|
| 217 |
|
| 218 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
|
|
|
| 219 |
|
| 220 |
if self.training or labels is not None:
|
| 221 |
if canonical_token_ids is None or token_lengths is None or token_mask is None:
|
|
@@ -224,7 +326,15 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 224 |
if token_mask is None:
|
| 225 |
raise ValueError("`token_mask` must be provided to GOPPhonemeClassifier.forward.")
|
| 226 |
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
outputs = self.ctc_model.wav2vec2(
|
| 229 |
input_values,
|
| 230 |
attention_mask=attention_mask,
|
|
@@ -234,150 +344,96 @@ class GOPPhonemeClassifier(PreTrainedModel):
|
|
| 234 |
)
|
| 235 |
hidden_states = outputs.last_hidden_state
|
| 236 |
|
| 237 |
-
# 2) Frame-level logits for CTC
|
| 238 |
logits_ctc = self.ctc_model.lm_head(hidden_states)
|
| 239 |
log_probs_ctc = F.log_softmax(logits_ctc, dim=-1)
|
| 240 |
log_probs_TNC = log_probs_ctc.permute(1, 0, 2).contiguous()
|
| 241 |
|
| 242 |
-
# 3) Frame lengths
|
| 243 |
batch_size = input_values.size(0)
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
input_lengths_samples = attention_mask.sum(dim=-1)
|
| 248 |
-
input_lengths_frames = self._get_feat_extract_output_lengths(input_lengths_samples)
|
| 249 |
-
input_lengths_frames = torch.clamp(input_lengths_frames, max=log_probs_TNC.size(0))
|
| 250 |
|
| 251 |
-
# 4) GOP feature calculation over tokens
|
| 252 |
max_token_len = canonical_token_ids.size(1) if canonical_token_ids is not None else 0
|
| 253 |
-
batch_combined_features_list = [
|
| 254 |
-
|
|
|
|
| 255 |
|
| 256 |
lpp_log_prob_batch = self._calculate_log_prob(
|
| 257 |
log_probs_TNC, input_lengths_frames, canonical_token_ids, token_lengths
|
| 258 |
)
|
| 259 |
|
| 260 |
for token_idx in range(max_token_len):
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
mask_column = token_mask_bool[:, token_idx] if token_mask_bool.dim() == 2 else token_mask_bool
|
| 265 |
-
skip_mask = token_out_of_bounds_mask | ~mask_column
|
| 266 |
-
|
| 267 |
-
if skip_mask.all():
|
| 268 |
-
continue
|
| 269 |
-
|
| 270 |
-
active_mask = ~skip_mask
|
| 271 |
|
| 272 |
all_sub_log_probs = []
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
sub_lpr_batch[skip_mask, :] = 0.0
|
| 287 |
-
else:
|
| 288 |
-
sub_lpr_batch = torch.zeros((batch_size, 0), device=device)
|
| 289 |
|
| 290 |
-
# Deletion GOP component
|
| 291 |
del_lpr_list = []
|
| 292 |
for b_idx in range(batch_size):
|
| 293 |
if skip_mask[b_idx]:
|
| 294 |
del_lpr_list.append(torch.tensor(-1e10, device=device))
|
| 295 |
-
continue
|
| 296 |
-
item_tokens = canonical_token_ids[b_idx, : token_lengths[b_idx]].tolist()
|
| 297 |
-
del_tokens_list = item_tokens[:token_idx] + item_tokens[token_idx + 1:]
|
| 298 |
-
if not del_tokens_list:
|
| 299 |
-
log_prob_del_item = torch.tensor(-float('inf'), device=device)
|
| 300 |
else:
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
del_lpr_batch = torch.stack(del_lpr_list)
|
| 314 |
|
| 315 |
-
gop_part = torch.cat(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
combined_features = torch.cat([gop_part, current_token_embeddings], dim=1)
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
feature_lengths_list = [len(seq_list) for seq_list in batch_combined_features_list]
|
| 323 |
-
if feature_lengths_list:
|
| 324 |
-
feature_lengths = torch.tensor(feature_lengths_list, dtype=torch.long, device=device)
|
| 325 |
-
target_pad_len = int(feature_lengths.max().item())
|
| 326 |
-
else:
|
| 327 |
-
feature_lengths = torch.zeros((batch_size,), dtype=torch.long, device=device)
|
| 328 |
-
target_pad_len = 0
|
| 329 |
-
|
| 330 |
-
padded_sequences = []
|
| 331 |
-
for seq_list in batch_combined_features_list:
|
| 332 |
-
if seq_list:
|
| 333 |
-
seq_tensor = torch.stack(seq_list, dim=0)
|
| 334 |
-
pad_len = target_pad_len - seq_tensor.size(0)
|
| 335 |
-
if pad_len > 0:
|
| 336 |
-
seq_tensor = F.pad(seq_tensor, (0, 0, 0, pad_len))
|
| 337 |
-
padded_sequences.append(seq_tensor)
|
| 338 |
-
else:
|
| 339 |
-
padded_sequences.append(torch.zeros((target_pad_len, self.combined_feature_dim), device=device))
|
| 340 |
-
transformer_input = torch.stack(padded_sequences, dim=0) if padded_sequences else torch.zeros((0, 0, self.combined_feature_dim), device=device)
|
| 341 |
-
transformer_padding_mask = torch.arange(target_pad_len, device=device)[None, :] >= feature_lengths[:, None]
|
| 342 |
-
|
| 343 |
-
# 6) GOP transformer
|
| 344 |
-
gop_transformer_output = self.gop_transformer_encoder(
|
| 345 |
-
transformer_input,
|
| 346 |
-
src_key_padding_mask=transformer_padding_mask
|
| 347 |
-
)
|
| 348 |
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
}
|
| 354 |
|
| 355 |
-
# 8) Loss
|
| 356 |
loss = None
|
| 357 |
if labels is not None:
|
| 358 |
-
|
| 359 |
-
label_map = {next(iter(final_logits.keys())): labels}
|
| 360 |
-
elif isinstance(labels, dict):
|
| 361 |
-
label_map = labels
|
| 362 |
-
else:
|
| 363 |
-
raise TypeError("labels must be a Tensor or a dict of Tensors when provided.")
|
| 364 |
-
|
| 365 |
-
active_mask = ~transformer_padding_mask.view(-1)
|
| 366 |
for head, head_logits in final_logits.items():
|
| 367 |
-
head_labels =
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
logits_flat = head_logits.view(-1, head_logits.size(-1))
|
| 371 |
-
labels_flat = head_labels.view(-1)
|
| 372 |
-
active_logits = logits_flat[active_mask]
|
| 373 |
-
#breakpoint()
|
| 374 |
-
active_labels = labels_flat[active_mask]
|
| 375 |
-
if active_labels.numel() == 0:
|
| 376 |
-
continue
|
| 377 |
-
head_loss = self.loss_fns[head](active_logits, active_labels)
|
| 378 |
-
loss = head_loss if loss is None else loss + head_loss
|
| 379 |
-
if loss is None:
|
| 380 |
-
loss = torch.tensor(0.0, device=device, requires_grad=True)
|
| 381 |
|
| 382 |
if not return_dict:
|
| 383 |
output = (final_logits,)
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class GOPWav2Vec2Config(PretrainedConfig):
|
|
|
|
|
|
|
|
|
|
| 15 |
model_type = "gop-wav2vec2"
|
| 16 |
|
| 17 |
def __init__(
|
|
|
|
| 30 |
unk_id: Optional[int] = None,
|
| 31 |
bos_id: Optional[int] = None,
|
| 32 |
eos_id: Optional[int] = None,
|
| 33 |
+
word_boundary_id: Optional[int] = None,
|
| 34 |
token_id_vocab: Optional[List[int]] = None,
|
| 35 |
ctc_config: Optional[dict] = None,
|
| 36 |
**kwargs,
|
|
|
|
| 55 |
self.unk_id = unk_id
|
| 56 |
self.bos_id = bos_id
|
| 57 |
self.eos_id = eos_id
|
| 58 |
+
self.word_boundary_id = word_boundary_id
|
| 59 |
self.token_id_vocab = token_id_vocab
|
| 60 |
self.ctc_config = ctc_config
|
| 61 |
|
| 62 |
|
| 63 |
class GOPPhonemeClassifier(PreTrainedModel):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
config_class = GOPWav2Vec2Config
|
| 65 |
+
|
| 66 |
def __init__(self, config: GOPWav2Vec2Config, load_pretrained_backbone: bool = False):
|
| 67 |
super().__init__(config)
|
| 68 |
+
|
| 69 |
if config.ctc_config is not None:
|
| 70 |
backbone_config = Wav2Vec2Config.from_dict(config.ctc_config)
|
| 71 |
elif config.base_model_name_or_path is not None:
|
|
|
|
| 76 |
self.ctc_model = Wav2Vec2ForCTC(backbone_config)
|
| 77 |
self.config.ctc_config = backbone_config.to_dict()
|
| 78 |
|
|
|
|
| 79 |
self.blank_id = config.pad_id
|
| 80 |
self.unk_id = config.unk_id
|
| 81 |
self.bos_id = config.bos_id
|
| 82 |
self.eos_id = config.eos_id
|
| 83 |
self.pad_id = self.blank_id
|
| 84 |
+
self.word_boundary_id = config.word_boundary_id
|
| 85 |
|
| 86 |
special_ids = {self.blank_id, self.unk_id, self.bos_id, self.eos_id, self.pad_id}
|
| 87 |
self.special_ids = {i for i in special_ids if i is not None}
|
| 88 |
|
| 89 |
vocab_size = int(self.ctc_model.config.vocab_size)
|
| 90 |
self.token_id_vocab = (
|
| 91 |
+
config.token_id_vocab
|
| 92 |
+
if config.token_id_vocab is not None
|
| 93 |
+
else [i for i in range(vocab_size) if i not in self.special_ids]
|
| 94 |
)
|
| 95 |
|
| 96 |
self.gop_feature_dim = 1 + len(self.token_id_vocab) + 1
|
| 97 |
self.embedding_dim = int(config.gop_embedding_dim)
|
| 98 |
+
self.token_embedding = nn.Embedding(
|
| 99 |
+
vocab_size,
|
| 100 |
+
self.embedding_dim,
|
| 101 |
+
padding_idx=self.pad_id if self.pad_id is not None else 0,
|
| 102 |
+
)
|
| 103 |
self.combined_feature_dim = self.gop_feature_dim + self.embedding_dim
|
| 104 |
+
self.gop_part_dropout = nn.Dropout(config.gop_transformer_dropout)
|
| 105 |
+
self.gop_part_norm = nn.LayerNorm(self.gop_feature_dim)
|
| 106 |
+
|
| 107 |
+
self.lstm_hidden_size = int(config.gop_transformer_dim_feedforward)
|
| 108 |
+
self.gop_rnn = nn.LSTM(
|
| 109 |
+
input_size=self.combined_feature_dim,
|
| 110 |
+
hidden_size=self.lstm_hidden_size,
|
| 111 |
+
num_layers=config.gop_transformer_nlayers,
|
| 112 |
+
dropout=config.gop_transformer_dropout if config.gop_transformer_nlayers > 1 else 0.0,
|
| 113 |
+
bidirectional=True,
|
| 114 |
batch_first=True,
|
| 115 |
)
|
|
|
|
| 116 |
|
| 117 |
head_label_config = getattr(config, "gop_head_labels", None)
|
| 118 |
if head_label_config is None:
|
|
|
|
| 120 |
raise ValueError("Config must provide gop_head_labels or num_gop_labels for the classifier.")
|
| 121 |
head_label_config = {"default": int(config.num_gop_labels)}
|
| 122 |
self.head_label_config = {str(k): int(v) for k, v in head_label_config.items()}
|
| 123 |
+
self.head_hidden_dim = self.lstm_hidden_size * 2
|
| 124 |
+
self.head_mlps = nn.ModuleDict(
|
| 125 |
+
{
|
| 126 |
+
head: nn.Sequential(
|
| 127 |
+
nn.Linear(self.head_hidden_dim, self.head_hidden_dim),
|
| 128 |
+
nn.LeakyReLU(),
|
| 129 |
+
)
|
| 130 |
+
for head in self.head_label_config.keys()
|
| 131 |
+
}
|
| 132 |
+
)
|
| 133 |
+
self.classifiers = nn.ModuleDict(
|
| 134 |
+
{
|
| 135 |
+
head: nn.Linear(self.head_hidden_dim, num_labels)
|
| 136 |
+
for head, num_labels in self.head_label_config.items()
|
| 137 |
+
}
|
| 138 |
+
)
|
| 139 |
|
| 140 |
self._init_losses()
|
|
|
|
| 141 |
self.post_init()
|
| 142 |
|
| 143 |
if load_pretrained_backbone:
|
|
|
|
| 165 |
elif weights is not None:
|
| 166 |
head_weights = weights
|
| 167 |
if head_weights is not None:
|
| 168 |
+
head_weights = (
|
| 169 |
+
head_weights
|
| 170 |
+
if isinstance(head_weights, torch.Tensor)
|
| 171 |
+
else torch.tensor(head_weights, dtype=torch.float)
|
| 172 |
+
)
|
| 173 |
loss_modules[head] = OrdinalLogLoss(
|
| 174 |
num_classes=int(num_labels),
|
| 175 |
alpha=alpha,
|
| 176 |
+
reduction="mean",
|
| 177 |
class_weights=head_weights,
|
| 178 |
)
|
| 179 |
self.loss_fns = nn.ModuleDict(loss_modules)
|
|
|
|
| 185 |
target_ids: torch.Tensor,
|
| 186 |
target_lengths: torch.Tensor,
|
| 187 |
) -> torch.Tensor:
|
|
|
|
| 188 |
target_ids_cpu = target_ids.cpu()
|
| 189 |
target_lengths_cpu = target_lengths.cpu()
|
| 190 |
log_probs_cpu = log_probs_TNC.cpu()
|
|
|
|
| 192 |
|
| 193 |
targets_flat = []
|
| 194 |
for i in range(target_ids_cpu.size(0)):
|
| 195 |
+
valid_targets = target_ids_cpu[i, : target_lengths_cpu[i]]
|
| 196 |
targets_flat.append(valid_targets)
|
| 197 |
targets_cat = torch.cat(targets_flat) if targets_flat else torch.tensor([], dtype=torch.long)
|
| 198 |
|
| 199 |
if target_lengths_cpu.sum() == 0:
|
| 200 |
+
return torch.full((log_probs_TNC.size(1),), -float("inf"), device=log_probs_TNC.device)
|
| 201 |
|
| 202 |
+
ctc_loss_fn = torch.nn.CTCLoss(blank=self.blank_id, reduction="none", zero_infinity=True)
|
| 203 |
try:
|
| 204 |
loss_per_item = ctc_loss_fn(log_probs_cpu, targets_cat, input_lengths_cpu, target_lengths_cpu)
|
| 205 |
return -loss_per_item.to(log_probs_TNC.device)
|
| 206 |
+
except Exception as exc:
|
| 207 |
+
warnings.warn(f"CTCLoss calculation failed: {exc}. Returning -inf for batch.")
|
| 208 |
+
return torch.full((log_probs_TNC.size(1),), -float("inf"), device=log_probs_TNC.device)
|
| 209 |
|
| 210 |
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
|
|
|
|
| 211 |
def _conv_out_length(input_length, kernel_size, stride):
|
| 212 |
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
|
| 213 |
|
|
|
|
| 215 |
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
|
| 216 |
return input_lengths
|
| 217 |
|
| 218 |
+
def _prepare_labels(
|
| 219 |
+
self, labels: Optional[Union[torch.Tensor, Dict[str, torch.Tensor]]]
|
| 220 |
+
) -> Optional[Dict[str, torch.Tensor]]:
|
| 221 |
+
if labels is None:
|
| 222 |
+
return None
|
| 223 |
+
if isinstance(labels, torch.Tensor):
|
| 224 |
+
if len(self.head_label_config) != 1:
|
| 225 |
+
raise ValueError("Multi-head setup requires `labels` to be a dict keyed by head name.")
|
| 226 |
+
head_name = next(iter(self.head_label_config))
|
| 227 |
+
return {head_name: labels}
|
| 228 |
+
if not isinstance(labels, dict):
|
| 229 |
+
raise ValueError("`labels` must be a Tensor for single-head setups or a dict for multi-head setups.")
|
| 230 |
+
return labels
|
| 231 |
+
|
| 232 |
+
    def _validate_inputs(
        self,
        input_values: torch.Tensor,
        attention_mask: torch.Tensor,
        canonical_token_ids: torch.Tensor,
        token_lengths: torch.Tensor,
        token_mask: torch.Tensor,
        labels: Optional[Dict[str, torch.Tensor]],
    ) -> torch.Tensor:
        """Validate the shapes/consistency of all forward() inputs.

        Returns `token_mask` converted to a bool tensor on the device of
        `canonical_token_ids`. Raises ValueError on any inconsistency, so
        shape bugs surface here with a precise message rather than as an
        opaque failure deeper in the forward pass.
        """
        # Audio must be a (batch, time) waveform batch with a matching mask.
        if input_values.dim() != 2:
            raise ValueError(f"`input_values` must be 2D (batch, time); got shape {tuple(input_values.shape)}.")
        if attention_mask is None or attention_mask.shape != input_values.shape:
            raise ValueError("`attention_mask` must be provided and match the shape of `input_values`.")

        if canonical_token_ids is None or token_lengths is None or token_mask is None:
            raise ValueError("`canonical_token_ids`, `token_lengths`, and `token_mask` are required.")

        if canonical_token_ids.dim() != 2:
            raise ValueError(
                f"`canonical_token_ids` must be 2D (batch, tokens); got shape {tuple(canonical_token_ids.shape)}."
            )
        batch_size, max_tokens = canonical_token_ids.shape
        if batch_size != input_values.shape[0]:
            raise ValueError("Batch size mismatch between `input_values` and `canonical_token_ids`.")

        if token_mask.dim() != 2 or token_mask.shape != canonical_token_ids.shape:
            raise ValueError("`token_mask` must be the same shape as `canonical_token_ids`.")

        # Per-item token counts must be 1D, non-negative, and fit in the pad width.
        if token_lengths.dim() != 1 or token_lengths.shape[0] != batch_size:
            raise ValueError("`token_lengths` must be 1D with length equal to batch size.")
        if torch.any(token_lengths < 0):
            raise ValueError("`token_lengths` must be non-negative.")
        if torch.any(token_lengths > max_tokens):
            raise ValueError("`token_lengths` cannot exceed the number of provided tokens.")

        # The mask may only mark positions inside each item's real token span:
        # positions at index >= token_lengths[i] are padding and must be False.
        token_mask_bool = token_mask.to(device=canonical_token_ids.device, dtype=torch.bool)
        arange_positions = torch.arange(max_tokens, device=canonical_token_ids.device)
        padded_active = token_mask_bool & (arange_positions.unsqueeze(0) >= token_lengths.unsqueeze(1))
        if torch.any(padded_active):
            raise ValueError("`token_mask` marks padded positions as valid (indices >= token_lengths).")
        if torch.any(token_mask_bool.sum(dim=1) > token_lengths):
            raise ValueError("`token_mask` has more active positions than `token_lengths` for some batch items.")

        if labels is not None:
            # `labels` arrives here already normalized by _prepare_labels, so it
            # must be a dict whose keys exactly match the configured heads.
            if not isinstance(labels, dict):
                raise ValueError("`labels` must be a dict keyed by head name after normalization.")
            expected_heads = set(self.head_label_config.keys())
            label_heads = set(labels.keys())
            unknown_heads = label_heads - expected_heads
            missing_heads = expected_heads - label_heads
            if unknown_heads:
                raise ValueError(f"Unexpected label heads provided: {sorted(unknown_heads)}.")
            if missing_heads:
                raise ValueError(f"Missing label heads: {sorted(missing_heads)}.")
            for head, head_labels in labels.items():
                # Each head's labels align 1:1 with canonical tokens.
                if head_labels.shape != canonical_token_ids.shape:
                    raise ValueError(
                        f"Labels for head '{head}' must match `canonical_token_ids` shape "
                        f"{tuple(canonical_token_ids.shape)}; got {tuple(head_labels.shape)}."
                    )
                if head_labels.dtype not in (torch.int64, torch.long):
                    raise ValueError(f"Labels for head '{head}' must be integer tensors; got dtype {head_labels.dtype}.")
                # Masked-out positions must carry the ignore value (-100) so the
                # loss never trains on them.
                masked_positions = token_mask_bool.logical_not()
                bad_mask = masked_positions & (head_labels != -100)
                if torch.any(bad_mask):
                    bad_count = int(bad_mask.sum().item())
                    raise ValueError(
                        f"Labels for head '{head}' must be -100 at masked positions; found {bad_count} mismatches."
                    )

        return token_mask_bool
|
| 303 |
+
|
| 304 |
def forward(
|
| 305 |
self,
|
| 306 |
input_values: torch.Tensor,
|
|
|
|
| 313 |
return_dict: Optional[bool] = None,
|
| 314 |
labels: Optional[torch.Tensor] = None,
|
| 315 |
) -> SequenceClassifierOutput:
|
|
|
|
| 316 |
device = input_values.device
|
| 317 |
+
assert attention_mask is not None
|
| 318 |
|
| 319 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 320 |
+
labels = self._prepare_labels(labels)
|
| 321 |
|
| 322 |
if self.training or labels is not None:
|
| 323 |
if canonical_token_ids is None or token_lengths is None or token_mask is None:
|
|
|
|
| 326 |
if token_mask is None:
|
| 327 |
raise ValueError("`token_mask` must be provided to GOPPhonemeClassifier.forward.")
|
| 328 |
|
| 329 |
+
token_mask_bool = self._validate_inputs(
|
| 330 |
+
input_values=input_values,
|
| 331 |
+
attention_mask=attention_mask,
|
| 332 |
+
canonical_token_ids=canonical_token_ids,
|
| 333 |
+
token_lengths=token_lengths,
|
| 334 |
+
token_mask=token_mask,
|
| 335 |
+
labels=labels,
|
| 336 |
+
).to(device=device)
|
| 337 |
+
|
| 338 |
outputs = self.ctc_model.wav2vec2(
|
| 339 |
input_values,
|
| 340 |
attention_mask=attention_mask,
|
|
|
|
| 344 |
)
|
| 345 |
hidden_states = outputs.last_hidden_state
|
| 346 |
|
|
|
|
| 347 |
logits_ctc = self.ctc_model.lm_head(hidden_states)
|
| 348 |
log_probs_ctc = F.log_softmax(logits_ctc, dim=-1)
|
| 349 |
log_probs_TNC = log_probs_ctc.permute(1, 0, 2).contiguous()
|
| 350 |
|
|
|
|
| 351 |
batch_size = input_values.size(0)
|
| 352 |
+
input_lengths_samples = attention_mask.sum(dim=-1)
|
| 353 |
+
input_lengths_frames = self._get_feat_extract_output_lengths(input_lengths_samples)
|
| 354 |
+
input_lengths_frames = torch.clamp(input_lengths_frames, max=log_probs_TNC.size(0))
|
|
|
|
|
|
|
|
|
|
| 355 |
|
|
|
|
| 356 |
max_token_len = canonical_token_ids.size(1) if canonical_token_ids is not None else 0
|
| 357 |
+
batch_combined_features_list = []
|
| 358 |
+
|
| 359 |
+
token_embeddings = self.token_embedding(canonical_token_ids)
|
| 360 |
|
| 361 |
lpp_log_prob_batch = self._calculate_log_prob(
|
| 362 |
log_probs_TNC, input_lengths_frames, canonical_token_ids, token_lengths
|
| 363 |
)
|
| 364 |
|
| 365 |
for token_idx in range(max_token_len):
|
| 366 |
+
current_token_embeddings = token_embeddings[:, token_idx, :]
|
| 367 |
+
active_mask = token_mask_bool[:, token_idx]
|
| 368 |
+
skip_mask = ~active_mask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
|
| 370 |
all_sub_log_probs = []
|
| 371 |
+
for sub_token_id in self.token_id_vocab:
|
| 372 |
+
sub_ids_batch = canonical_token_ids.clone()
|
| 373 |
+
sub_ids_batch[active_mask, token_idx] = sub_token_id
|
| 374 |
+
log_prob_sub_batch = self._calculate_log_prob(
|
| 375 |
+
log_probs_TNC, input_lengths_frames, sub_ids_batch, token_lengths
|
| 376 |
+
)
|
| 377 |
+
all_sub_log_probs.append(log_prob_sub_batch)
|
| 378 |
+
|
| 379 |
+
sub_log_probs_batch = torch.stack(all_sub_log_probs, dim=1)
|
| 380 |
+
sub_log_probs_batch = F.log_softmax(sub_log_probs_batch, dim=1)
|
| 381 |
+
sub_log_probs_batch = torch.nan_to_num(sub_log_probs_batch, nan=0.0, posinf=1e10, neginf=-1e10)
|
| 382 |
+
if skip_mask.any():
|
| 383 |
+
sub_log_probs_batch[skip_mask, :] = 0.0
|
|
|
|
|
|
|
|
|
|
| 384 |
|
|
|
|
| 385 |
del_lpr_list = []
|
| 386 |
for b_idx in range(batch_size):
|
| 387 |
if skip_mask[b_idx]:
|
| 388 |
del_lpr_list.append(torch.tensor(-1e10, device=device))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
else:
|
| 390 |
+
item_tokens = canonical_token_ids[b_idx, : token_lengths[b_idx]].tolist()
|
| 391 |
+
del_tokens_list = item_tokens[:token_idx] + item_tokens[token_idx + 1 :]
|
| 392 |
+
if not del_tokens_list:
|
| 393 |
+
log_prob_del_item = torch.tensor(-float("inf"), device=device)
|
| 394 |
+
else:
|
| 395 |
+
del_ids_tensor = torch.tensor(
|
| 396 |
+
[del_tokens_list], dtype=torch.long, device=canonical_token_ids.device
|
| 397 |
+
)
|
| 398 |
+
del_len_tensor = torch.tensor(
|
| 399 |
+
[len(del_tokens_list)], dtype=torch.long, device=canonical_token_ids.device
|
| 400 |
+
)
|
| 401 |
+
log_probs_item_TNC = log_probs_TNC[:, b_idx : b_idx + 1, :]
|
| 402 |
+
input_len_item = input_lengths_frames[b_idx : b_idx + 1]
|
| 403 |
+
log_prob_del_item = self._calculate_log_prob(
|
| 404 |
+
log_probs_item_TNC, input_len_item, del_ids_tensor, del_len_tensor
|
| 405 |
+
)
|
| 406 |
+
if log_prob_del_item.dim() > 0:
|
| 407 |
+
log_prob_del_item = log_prob_del_item[0]
|
| 408 |
+
lpr_del_item = lpp_log_prob_batch[b_idx] - log_prob_del_item
|
| 409 |
+
lpr_del_item = torch.nan_to_num(lpr_del_item, nan=0.0, posinf=1e10, neginf=-1e10)
|
| 410 |
+
del_lpr_list.append(lpr_del_item)
|
| 411 |
del_lpr_batch = torch.stack(del_lpr_list)
|
| 412 |
|
| 413 |
+
gop_part = torch.cat(
|
| 414 |
+
[lpp_log_prob_batch.unsqueeze(1), sub_log_probs_batch, del_lpr_batch.unsqueeze(1)], dim=1
|
| 415 |
+
)
|
| 416 |
+
gop_part = self.gop_part_norm(gop_part)
|
| 417 |
+
gop_part = self.gop_part_dropout(gop_part)
|
| 418 |
combined_features = torch.cat([gop_part, current_token_embeddings], dim=1)
|
| 419 |
+
batch_combined_features_list.append(combined_features)
|
| 420 |
+
|
| 421 |
+
transformer_input = torch.stack(batch_combined_features_list, dim=1)
|
| 422 |
+
|
| 423 |
+
gop_rnn_output, _ = self.gop_rnn(transformer_input)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
|
| 425 |
+
final_logits = {}
|
| 426 |
+
for head, classifier in self.classifiers.items():
|
| 427 |
+
head_features = self.head_mlps[head](gop_rnn_output)
|
| 428 |
+
final_logits[head] = classifier(head_features)
|
|
|
|
| 429 |
|
|
|
|
| 430 |
loss = None
|
| 431 |
if labels is not None:
|
| 432 |
+
loss = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
for head, head_logits in final_logits.items():
|
| 434 |
+
head_labels = labels[head]
|
| 435 |
+
head_loss = self.loss_fns[head](head_logits, head_labels)
|
| 436 |
+
loss += head_loss
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
|
| 438 |
if not return_dict:
|
| 439 |
output = (final_logits,)
|
models.py
CHANGED
|
@@ -4,19 +4,21 @@ import torch.nn as nn
|
|
| 4 |
|
| 5 |
class OrdinalLogLoss(nn.Module):
|
| 6 |
def __init__(
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
| 14 |
):
|
| 15 |
super(OrdinalLogLoss, self).__init__()
|
| 16 |
self.num_classes = num_classes
|
| 17 |
self.alpha = alpha
|
| 18 |
self.reduction = reduction
|
| 19 |
self.eps = eps
|
|
|
|
| 20 |
|
| 21 |
if distance_matrix is not None:
|
| 22 |
assert distance_matrix.shape == (num_classes, num_classes), \
|
|
@@ -35,21 +37,39 @@ class OrdinalLogLoss(nn.Module):
|
|
| 35 |
self.class_weights = None
|
| 36 |
|
| 37 |
def forward(self, logits, target):
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
if self.class_weights is not None:
|
| 45 |
-
sample_weights = self.class_weights[
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
# Apply reduction
|
| 49 |
if self.reduction == 'mean':
|
| 50 |
-
return
|
| 51 |
elif self.reduction == 'sum':
|
| 52 |
-
return
|
| 53 |
else:
|
| 54 |
-
|
| 55 |
-
|
|
|
|
| 4 |
|
| 5 |
class OrdinalLogLoss(nn.Module):
|
| 6 |
def __init__(
|
| 7 |
+
self,
|
| 8 |
+
num_classes,
|
| 9 |
+
alpha=1.0,
|
| 10 |
+
reduction='mean',
|
| 11 |
+
distance_matrix=None,
|
| 12 |
+
class_weights=None,
|
| 13 |
+
eps=1e-8,
|
| 14 |
+
ignore_index=-100,
|
| 15 |
):
|
| 16 |
super(OrdinalLogLoss, self).__init__()
|
| 17 |
self.num_classes = num_classes
|
| 18 |
self.alpha = alpha
|
| 19 |
self.reduction = reduction
|
| 20 |
self.eps = eps
|
| 21 |
+
self.ignore_index = ignore_index
|
| 22 |
|
| 23 |
if distance_matrix is not None:
|
| 24 |
assert distance_matrix.shape == (num_classes, num_classes), \
|
|
|
|
| 37 |
self.class_weights = None
|
| 38 |
|
| 39 |
def forward(self, logits, target):
|
| 40 |
+
if logits.numel() == 0:
|
| 41 |
+
return logits.new_tensor(0.0)
|
| 42 |
+
|
| 43 |
+
probs = torch.softmax(logits, dim=-1).clamp(max=1 - self.eps)
|
| 44 |
+
|
| 45 |
+
if self.ignore_index is not None:
|
| 46 |
+
valid_mask = target != self.ignore_index
|
| 47 |
+
else:
|
| 48 |
+
valid_mask = torch.ones_like(target, dtype=torch.bool)
|
| 49 |
+
|
| 50 |
+
if not valid_mask.any():
|
| 51 |
+
if self.reduction == 'none':
|
| 52 |
+
return logits.new_zeros(target.shape, dtype=logits.dtype)
|
| 53 |
+
return logits.new_tensor(0.0)
|
| 54 |
+
|
| 55 |
+
active_probs = probs[valid_mask]
|
| 56 |
+
active_target = target[valid_mask]
|
| 57 |
+
distances = self.distance_matrix[active_target] ** self.alpha
|
| 58 |
+
per_class_loss = -torch.log(1 - active_probs + self.eps)
|
| 59 |
+
loss_active = (per_class_loss * distances).sum(dim=-1)
|
| 60 |
+
|
| 61 |
if self.class_weights is not None:
|
| 62 |
+
sample_weights = self.class_weights[active_target]
|
| 63 |
+
loss_active = loss_active * sample_weights
|
| 64 |
+
|
| 65 |
+
if self.reduction == 'none':
|
| 66 |
+
full_loss = logits.new_zeros(target.shape, dtype=logits.dtype)
|
| 67 |
+
full_loss[valid_mask] = loss_active
|
| 68 |
+
return full_loss
|
| 69 |
|
|
|
|
| 70 |
if self.reduction == 'mean':
|
| 71 |
+
return loss_active.mean()
|
| 72 |
elif self.reduction == 'sum':
|
| 73 |
+
return loss_active.sum()
|
| 74 |
else:
|
| 75 |
+
raise ValueError(f"Unsupported reduction: {self.reduction}")
|
|
|
utils.py
CHANGED
|
@@ -1,24 +1,27 @@
|
|
| 1 |
-
from typing import List
|
|
|
|
|
|
|
|
|
|
| 2 |
import torch
|
| 3 |
import torchaudio
|
| 4 |
-
from transformers import
|
|
|
|
| 5 |
from constants import MAX_AUDIO_DURATION_SECONDS, MONO_CHANNEL, SAMPLING_RATE
|
| 6 |
from gop_model import GOPPhonemeClassifier
|
| 7 |
-
import logging
|
| 8 |
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
|
| 12 |
def load_model_and_processor(model_repo_id: str):
|
| 13 |
-
logger.info(
|
| 14 |
|
| 15 |
quantization_config = QuantoConfig(weights="int8")
|
| 16 |
-
logger.info("Applying INT8 dynamic quantization during model loading
|
| 17 |
|
| 18 |
model = GOPPhonemeClassifier.from_pretrained(
|
| 19 |
model_repo_id,
|
| 20 |
quantization_config=quantization_config,
|
| 21 |
-
device_map="auto"
|
| 22 |
)
|
| 23 |
processor = Wav2Vec2Processor.from_pretrained(model_repo_id)
|
| 24 |
model.eval()
|
|
@@ -36,6 +39,46 @@ def validate_phonemes(phoneme_text, allowed_phonemes):
|
|
| 36 |
return None
|
| 37 |
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def run_inference(audio_file_path: str, transcript: str, model: GOPPhonemeClassifier, processor: Wav2Vec2Processor):
|
| 40 |
if not audio_file_path or not transcript:
|
| 41 |
return "<p style='text-align:center; color:red;'>Please provide both an audio file and the transcript.</p>"
|
|
@@ -56,19 +99,14 @@ def run_inference(audio_file_path: str, transcript: str, model: GOPPhonemeClassi
|
|
| 56 |
|
| 57 |
audio_input = waveform.squeeze(0)
|
| 58 |
processed_audio = processor(audio_input, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding=True)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
ids = [ids]
|
| 68 |
-
ids = [i if i is not None else unk_id for i in ids]
|
| 69 |
-
canonical_token_ids = torch.tensor([ids], dtype=torch.long).to(model.device)
|
| 70 |
-
token_lengths = torch.tensor([len(ids)], dtype=torch.long).to(model.device)
|
| 71 |
-
token_mask = torch.ones_like(canonical_token_ids).to(model.device)
|
| 72 |
|
| 73 |
with torch.no_grad():
|
| 74 |
outputs = model(
|
|
@@ -76,19 +114,11 @@ def run_inference(audio_file_path: str, transcript: str, model: GOPPhonemeClassi
|
|
| 76 |
attention_mask=attention_mask,
|
| 77 |
canonical_token_ids=canonical_token_ids,
|
| 78 |
token_lengths=token_lengths,
|
| 79 |
-
token_mask=token_mask
|
| 80 |
)
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
head_name = next(iter(logits))
|
| 85 |
-
scores_tensor = logits[head_name]
|
| 86 |
-
predicted_scores = torch.argmax(scores_tensor, dim=-1)
|
| 87 |
-
|
| 88 |
-
tokens = processor.tokenizer.convert_ids_to_tokens(canonical_token_ids[0])
|
| 89 |
-
|
| 90 |
-
return predicted_scores, tokens, token_lengths
|
| 91 |
|
| 92 |
-
except Exception as
|
| 93 |
-
logger.error(
|
| 94 |
-
return f"<p style='text-align:center; color:red;'>An error occurred: {
|
|
|
|
| 1 |
+
from typing import Dict, List, Tuple
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
import torch
|
| 6 |
import torchaudio
|
| 7 |
+
from transformers import QuantoConfig, Wav2Vec2Processor
|
| 8 |
+
|
| 9 |
from constants import MAX_AUDIO_DURATION_SECONDS, MONO_CHANNEL, SAMPLING_RATE
|
| 10 |
from gop_model import GOPPhonemeClassifier
|
|
|
|
| 11 |
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
| 14 |
|
| 15 |
def load_model_and_processor(model_repo_id: str):
|
| 16 |
+
logger.info("Loading model and processor from Hugging Face Hub: %s", model_repo_id)
|
| 17 |
|
| 18 |
quantization_config = QuantoConfig(weights="int8")
|
| 19 |
+
logger.info("Applying INT8 dynamic quantization during model loading")
|
| 20 |
|
| 21 |
model = GOPPhonemeClassifier.from_pretrained(
|
| 22 |
model_repo_id,
|
| 23 |
quantization_config=quantization_config,
|
| 24 |
+
device_map="auto",
|
| 25 |
)
|
| 26 |
processor = Wav2Vec2Processor.from_pretrained(model_repo_id)
|
| 27 |
model.eval()
|
|
|
|
| 39 |
return None
|
| 40 |
|
| 41 |
|
| 42 |
+
def _prepare_canonical_tokens(transcript: str, processor: Wav2Vec2Processor, device: torch.device):
    """Convert a space-separated phoneme transcript into model-ready tensors.

    ``"|"`` tokens (word boundaries) stay in the id sequence but are switched
    off in ``token_mask`` so they are never scored. Returns a 4-tuple
    ``(canonical_token_ids, token_lengths, token_mask, display_tokens)``,
    each tensor shaped for a batch of one.
    """
    phonemes: List[str] = transcript.strip().split()
    if not phonemes:
        raise ValueError("Please enter at least one phoneme.")

    # True for scoreable phonemes, False for "|" boundary markers.
    token_mask_values = [token != "|" for token in phonemes]
    if not any(token_mask_values):
        raise ValueError("The phoneme sequence must contain at least one non-boundary token.")

    vocab = processor.tokenizer
    fallback_id = getattr(vocab, "unk_token_id", None)
    raw_ids = vocab.convert_tokens_to_ids(phonemes)
    if isinstance(raw_ids, int):
        # A single token can come back as a bare int rather than a list.
        raw_ids = [raw_ids]
    resolved_ids = [fallback_id if tok_id is None else tok_id for tok_id in raw_ids]

    canonical_token_ids = torch.tensor([resolved_ids], dtype=torch.long, device=device)
    token_lengths = torch.tensor([len(resolved_ids)], dtype=torch.long, device=device)
    token_mask = torch.tensor([token_mask_values], dtype=torch.bool, device=device)

    # Only the mask-active phonemes are shown to the user.
    display_tokens = [tok for tok, keep in zip(phonemes, token_mask_values) if keep]
    return canonical_token_ids, token_lengths, token_mask, display_tokens
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _extract_head_predictions(
|
| 67 |
+
logits_by_head: Dict[str, torch.Tensor],
|
| 68 |
+
token_mask: torch.Tensor,
|
| 69 |
+
display_tokens: List[str],
|
| 70 |
+
) -> Dict[str, Tuple[List[int], List[str]]]:
|
| 71 |
+
active_mask = token_mask[0].bool()
|
| 72 |
+
results: Dict[str, Tuple[List[int], List[str]]] = {}
|
| 73 |
+
|
| 74 |
+
for head_name, head_logits in logits_by_head.items():
|
| 75 |
+
predicted_scores = torch.argmax(head_logits, dim=-1)[0]
|
| 76 |
+
filtered_scores = predicted_scores[active_mask].detach().cpu().tolist()
|
| 77 |
+
results[head_name] = (filtered_scores, display_tokens)
|
| 78 |
+
|
| 79 |
+
return results
|
| 80 |
+
|
| 81 |
+
|
| 82 |
def run_inference(audio_file_path: str, transcript: str, model: GOPPhonemeClassifier, processor: Wav2Vec2Processor):
|
| 83 |
if not audio_file_path or not transcript:
|
| 84 |
return "<p style='text-align:center; color:red;'>Please provide both an audio file and the transcript.</p>"
|
|
|
|
| 99 |
|
| 100 |
audio_input = waveform.squeeze(0)
|
| 101 |
processed_audio = processor(audio_input, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding=True)
|
| 102 |
+
|
| 103 |
+
model_device = next(model.parameters()).device
|
| 104 |
+
input_values = processed_audio.input_values.to(model_device)
|
| 105 |
+
attention_mask = processed_audio.attention_mask.to(model_device)
|
| 106 |
+
|
| 107 |
+
canonical_token_ids, token_lengths, token_mask, display_tokens = _prepare_canonical_tokens(
|
| 108 |
+
transcript, processor, model_device
|
| 109 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
with torch.no_grad():
|
| 112 |
outputs = model(
|
|
|
|
| 114 |
attention_mask=attention_mask,
|
| 115 |
canonical_token_ids=canonical_token_ids,
|
| 116 |
token_lengths=token_lengths,
|
| 117 |
+
token_mask=token_mask,
|
| 118 |
)
|
| 119 |
|
| 120 |
+
return _extract_head_predictions(outputs.logits, token_mask, display_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
+
except Exception as exc:
|
| 123 |
+
logger.error("An error occurred during inference: %s", exc, exc_info=True)
|
| 124 |
+
return f"<p style='text-align:center; color:red;'>An error occurred: {exc}</p>"
|