File size: 34,890 Bytes

781c3b4

#!/usr/bin/env python3
"""
Custom ViSoNorm model class for BartPho-based models.

This preserves the custom heads needed for text normalization and
is loadable via auto_map without a custom model_type.
"""

import math
import torch
import torch.nn as nn
from transformers import MBartModel, MBartConfig, MBartPreTrainedModel
from transformers.modeling_outputs import Seq2SeqLMOutput
# Define constants locally to avoid external dependencies.
# Output width of the mask-count head (the final projection of
# ``BartMaskNPredictionHead``); also stored on the model as ``num_labels_n_mask``.
# NOTE(review): presumably class k means "insert k masks" -- confirm against training code.
NUM_LABELS_N_MASKS = 5


def gelu(x):
    """Apply the erf-based Gaussian Error Linear Unit element-wise.

    Computes ``x * 0.5 * (1 + erf(x / sqrt(2)))`` for a tensor ``x`` and
    returns a tensor of the same shape.
    """
    # Standard-normal CDF evaluated at x, expressed through the error function.
    normal_cdf = 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return x * normal_cdf


class MBartLMHead(nn.Module):
    """Token-prediction head: dense -> GELU -> LayerNorm -> vocabulary projection.

    The projection weight is the embedding matrix passed to the constructor
    (the same ``Parameter`` object, not a copy); a separate zero-initialised
    bias is added on top of it.
    """

    def __init__(self, config, bart_model_embedding_weights):
        """
        Args:
            config: Accepted for interface compatibility; not read here.
            bart_model_embedding_weights: 2-D embedding parameter; its first
                dimension sets the number of output classes and its second
                dimension sets the hidden width of every layer in this head.
        """
        super().__init__()
        # Sizes come from the embedding matrix itself rather than from the config.
        vocab_rows = bart_model_embedding_weights.size(0)
        hidden_dim = bart_model_embedding_weights.size(1)

        self.dense = nn.Linear(hidden_dim, hidden_dim)
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=1e-12)

        # Created without a bias, then given the shared weight and its own bias.
        self.decoder = nn.Linear(hidden_dim, vocab_rows, bias=False)
        self.decoder.weight = bart_model_embedding_weights
        self.decoder.bias = nn.Parameter(torch.zeros(vocab_rows))

    def forward(self, features):
        """Return vocabulary logits for ``features`` (last dim = hidden width)."""
        transformed = self.layer_norm(gelu(self.dense(features)))
        return self.decoder(transformed)


class BartMaskNPredictionHead(nn.Module):
    """Per-position classifier with ``NUM_LABELS_N_MASKS`` output classes.

    Architecture: Linear(hidden -> 50) -> GELU -> Linear(50 -> NUM_LABELS_N_MASKS).
    """

    def __init__(self, config, actual_hidden_size):
        """
        Args:
            config: Accepted for interface compatibility; not read here.
            actual_hidden_size: Width of the incoming hidden states.
        """
        super().__init__()
        self.mask_predictor_dense = nn.Linear(actual_hidden_size, 50)
        self.mask_predictor_proj = nn.Linear(50, NUM_LABELS_N_MASKS)
        self.activation = gelu

    def forward(self, sequence_output):
        """Return logits whose last dimension is ``NUM_LABELS_N_MASKS``."""
        bottleneck = self.mask_predictor_dense(sequence_output)
        return self.mask_predictor_proj(self.activation(bottleneck))


class BartBinaryPredictor(nn.Module):
    """Per-position two-class classifier: Linear -> GELU -> Linear(dense_dim -> 2)."""

    def __init__(self, hidden_size, dense_dim=100):
        """
        Args:
            hidden_size: Width of the incoming hidden states.
            dense_dim: Width of the intermediate layer (default 100).
        """
        super().__init__()
        self.dense = nn.Linear(hidden_size, dense_dim)
        # The attribute must stay named 'predictor' so checkpoint keys line up.
        self.predictor = nn.Linear(dense_dim, 2)
        self.activation = gelu

    def forward(self, sequence_output):
        """Return logits whose last dimension has size 2."""
        intermediate = self.dense(sequence_output)
        return self.predictor(self.activation(intermediate))


class ViSoNormBartPhoForMaskedLM(MBartPreTrainedModel):
    config_class = MBartConfig

    def __init__(self, config: MBartConfig):
        """
        Build the MBart backbone and the three ViSoNorm heads.

        Args:
            config: Configuration used both for the ``MBartModel`` backbone and
                for this wrapper.

        Attributes created:
            bart: ``MBartModel`` built from ``config``.
            cls: Normalization head whose output projection shares its weight
                with ``bart.shared`` (the backbone's token embedding).
            mask_n_predictor: Head with ``NUM_LABELS_N_MASKS`` classes per position.
            nsw_detector: Two-class head per position.
            num_labels_n_mask: Copy of ``NUM_LABELS_N_MASKS``.
        """
        super().__init__(config)

        # The backbone is built from the very config this model was given.
        # (A second, never-used MBartConfig was previously assembled here from
        # BERT-style attributes such as ``intermediate_size``; reading those
        # raised AttributeError on configs that do not define them.)
        self.bart = MBartModel(self.config)

        # Size the heads from the real embedding matrix rather than the config.
        actual_hidden_size = self.bart.shared.weight.size(1)

        # Normalization head, tied to the shared token embedding.
        self.cls = MBartLMHead(self.config, self.bart.shared.weight)

        # Auxiliary heads.
        self.mask_n_predictor = BartMaskNPredictionHead(self.config, actual_hidden_size)
        self.nsw_detector = BartBinaryPredictor(actual_hidden_size, dense_dim=100)
        self.num_labels_n_mask = NUM_LABELS_N_MASKS

        # Initialize per HF conventions
        self.post_init()
    
    def _load_state_dict(self, state_dict, strict=True):
        """
        Load ``state_dict`` after adapting positional-embedding shapes to it.

        For the encoder and the decoder independently: if the checkpoint holds
        an ``embed_positions.weight`` whose shape differs from the model's, the
        model's table is replaced by a zero tensor of the checkpoint's shape
        (same dtype and device as before) so the subsequent copy succeeds.

        Args:
            state_dict: Mapping of parameter names to tensors.
            strict: Accepted for signature compatibility only. Loading is always
                performed with ``strict=False``.

        Returns:
            Tuple ``(missing_keys, unexpected_keys)`` as reported by
            ``load_state_dict``.
        """
        for stack_name in ('encoder', 'decoder'):
            key = f'bart.{stack_name}.embed_positions.weight'
            if key not in state_dict:
                continue

            current = getattr(self.bart, stack_name).embed_positions.weight
            checkpoint_shape = state_dict[key].shape
            if checkpoint_shape != current.shape:
                # Swap the storage in place so the Parameter object (and its
                # registration on the module) stays the same; the values are
                # overwritten by load_state_dict below.
                current.data = torch.zeros(
                    checkpoint_shape, dtype=current.dtype, device=current.device
                )

        # strict=False tolerates missing/unexpected keys; remaining shape
        # mismatches still raise inside load_state_dict.
        missing_keys, unexpected_keys = self.load_state_dict(state_dict, strict=False)

        return missing_keys, unexpected_keys
    
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """
        Build the model and load its weights through ``_load_state_dict``.

        Weight lookup order:
            1. If ``pretrained_model_name_or_path`` is an existing directory,
               read ``pytorch_model.bin`` or, failing that, ``model.safetensors``
               from it. No network access is attempted.
            2. Otherwise download the same two file names, in the same order,
               with ``huggingface_hub.hf_hub_download``.

        Args:
            pretrained_model_name_or_path: Local directory or Hub repo id.
            *args: Ignored.
            **kwargs: ``config`` (an ``MBartConfig`` instance, or a path/id to
                load one from) is honoured. ``cache_dir``, ``force_download``,
                ``local_files_only``, ``revision`` and ``token`` are forwarded
                to the config loader and to ``hf_hub_download`` when given and
                not ``None``. Every other keyword is ignored.

        Returns:
            The model in evaluation mode, as the stock Hugging Face
            ``from_pretrained`` does.

        Raises:
            FileNotFoundError: No weight file could be located or loaded. For
                Hub lookups the last underlying error is chained as the cause.
        """
        import os

        weight_filenames = ("pytorch_model.bin", "model.safetensors")

        def _read_weights(path):
            """Read a state dict from a .safetensors or torch-pickled file."""
            if str(path).endswith(".safetensors"):
                from safetensors.torch import load_file
                return load_file(path)
            # NOTE(security): torch.load unpickles the file; only load
            # checkpoints from sources you trust.
            return torch.load(path, map_location='cpu')

        hub_kwargs = {
            name: kwargs[name]
            for name in ("cache_dir", "force_download", "local_files_only", "revision", "token")
            if kwargs.get(name) is not None
        }

        # Reuse a ready-made config if the caller (e.g. AutoModel) supplied one.
        config = kwargs.get("config")
        if not isinstance(config, MBartConfig):
            config_source = (
                config if isinstance(config, (str, os.PathLike)) else pretrained_model_name_or_path
            )
            config = MBartConfig.from_pretrained(config_source, **hub_kwargs)

        model = cls(config)

        state_dict = None
        if os.path.isdir(pretrained_model_name_or_path):
            # Local checkpoint: never touch the network.
            for filename in weight_filenames:
                candidate = os.path.join(pretrained_model_name_or_path, filename)
                if os.path.exists(candidate):
                    state_dict = _read_weights(candidate)
                    break
            if state_dict is None:
                raise FileNotFoundError(f"No model file found in {pretrained_model_name_or_path}")
        else:
            from huggingface_hub import hf_hub_download

            last_error = None
            for filename in weight_filenames:
                try:
                    state_dict = _read_weights(
                        hf_hub_download(pretrained_model_name_or_path, filename, **hub_kwargs)
                    )
                    break
                except Exception as err:
                    # Best effort: fall through to the next file name, but keep
                    # the reason so it is not lost if every attempt fails.
                    last_error = err
            if state_dict is None:
                raise FileNotFoundError(
                    f"Model file not found for {pretrained_model_name_or_path}"
                ) from last_error

        # Use our custom state dict loading
        model._load_state_dict(state_dict)
        model.eval()

        return model
    
    def fix_classification_head_for_tokenizer(self, tokenizer):
        """
        Reconcile the tokenizer with the model when ``<space>`` is missing.

        Nothing happens when ``len(tokenizer)`` already equals
        ``config.vocab_size`` or when the tokenizer already knows ``<space>``.
        Otherwise ``<space>`` is added to the tokenizer (the tokenizer is
        mutated). The embedding matrix is then grown only if it has fewer rows
        than the enlarged tokenizer; rows that already exist are never
        overwritten or dropped, so an embedding that was trained for
        ``<space>`` is preserved. Newly created rows are set to the mean of the
        pre-existing rows.

        NOTE(review): when the embedding matrix is actually grown, the output
        projection ``self.cls.decoder`` keeps its previous weight and bias
        (this method does not touch it) -- confirm whether it should be re-tied.

        Args:
            tokenizer: Tokenizer used for inference; must support ``len()``,
                ``get_vocab()`` and ``add_tokens()``.
        """
        if len(tokenizer) == self.config.vocab_size:
            return
        if '<space>' in tokenizer.get_vocab():
            return

        tokenizer.add_tokens(['<space>'])
        new_vocab_size = len(tokenizer)

        existing_rows = self.bart.shared.weight.size(0)
        if new_vocab_size <= existing_rows:
            # The checkpoint already has a row for every tokenizer id,
            # including the one just assigned to <space>; keep its weights.
            return

        self.bart.resize_token_embeddings(new_vocab_size)

        with torch.no_grad():
            shared_weight = self.bart.shared.weight
            # Mean of the rows that existed before the resize: a deterministic
            # starting point for every newly added row.
            shared_weight[existing_rows:] = shared_weight[:existing_rows].mean(dim=0)
    

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Run the backbone and apply all three heads to the encoder output.

        All arguments are forwarded unchanged to ``self.bart``. The heads read
        ``encoder_last_hidden_state``, the same tensor ``normalize_text`` and
        ``detect_nsw`` use, so logits from ``forward`` match the inference
        helpers.

        Returns:
            If ``return_dict`` resolves to True: an object with attributes
            ``logits`` (alias of ``logits_norm``), ``logits_norm``,
            ``logits_n_masks_pred``, ``logits_nsw_detection``,
            ``hidden_states`` (encoder hidden states or None) and
            ``attentions`` (encoder attentions or None).
            Otherwise: the tuple ``(logits_norm, logits_n_masks_pred,
            logits_nsw_detection)`` followed by every backbone output except
            the first.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Always ask the backbone for a ModelOutput so the encoder state can be
        # read by name; the tuple form is rebuilt below when requested.
        outputs = self.bart(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # The heads consume the encoder's final hidden state, not the decoder's.
        sequence_output = outputs.encoder_last_hidden_state

        # Calculate all three prediction heads
        logits_norm = self.cls(sequence_output)
        logits_n_masks_pred = self.mask_n_predictor(sequence_output)
        logits_nsw_detection = self.nsw_detector(sequence_output)

        if not return_dict:
            # to_tuple() drops None fields, mirroring the backbone's tuple output.
            return (logits_norm, logits_n_masks_pred, logits_nsw_detection) + outputs.to_tuple()[1:]

        # Lightweight container exposing all three heads.
        class ViSoNormOutput:
            def __init__(self, logits_norm, logits_n_masks_pred, logits_nsw_detection, hidden_states=None, attentions=None):
                self.logits = logits_norm
                self.logits_norm = logits_norm
                self.logits_n_masks_pred = logits_n_masks_pred
                self.logits_nsw_detection = logits_nsw_detection
                self.hidden_states = hidden_states
                self.attentions = attentions

        return ViSoNormOutput(
            logits_norm=logits_norm,
            logits_n_masks_pred=logits_n_masks_pred,
            logits_nsw_detection=logits_nsw_detection,
            hidden_states=outputs.encoder_hidden_states,
            attentions=outputs.encoder_attentions,
        )
    
    def normalize_text(self, tokenizer, text, device='cpu'):
        """
        Normalize ``text`` using the NSW-detection and normalization heads.

        Procedure:
            1. Encode ``text`` and run the backbone once; the NSW head flags
               the non-special tokens that need normalization.
            2. For each flagged position, greedily try inserting one ``<mask>``
               right after it and keep the insertion if it raises the sequence
               score (mean over positions of the maximum log-probability from
               the normalization head).
            3. Run the normalization head on the resulting sequence and take
               the arg-max token at every position except ``<s>``, ``</s>``,
               ``<pad>`` and ``<unk>``, which are copied from the source.
               Predicted ids beyond the tokenizer range are clamped to the
               last tokenizer id.
            4. Decode, dropping special tokens and ``<space>``, and collapse
               whitespace.

        Side effects: the model is moved to ``device`` and put in eval mode;
        ``fix_classification_head_for_tokenizer`` may add ``<space>`` to the
        tokenizer.

        Args:
            tokenizer: HuggingFace tokenizer (should be BartphoTokenizer)
            text: Input text to normalize
            device: Device to run inference on

        Returns:
            Tuple of (normalized_text, source_tokens, prediction_tokens), where
            the two token lists describe the final (possibly mask-extended)
            sequence and have equal length.
        """
        nsw_exempt = {'<s>', '</s>', '<pad>', '<unk>', '<mask>'}
        copied_from_source = {'<s>', '</s>', '<pad>', '<unk>'}
        dropped_from_text = {'<s>', '</s>', '<pad>', '<unk>', '<mask>', '<space>'}

        self.to(device)
        self.fix_classification_head_for_tokenizer(tokenizer)

        encoded = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
        input_tokens_tensor, _, _, input_mask = self._truncate_and_build_masks(encoded.to(device))

        self.eval()

        def _norm_logits(ids):
            """Normalization-head logits, shape (1, len(ids), vocab), for one id list."""
            input_ids_tensor = torch.tensor([ids], device=device)
            with torch.no_grad():
                bart_outputs = self.bart(
                    input_ids_tensor,
                    attention_mask=torch.ones_like(input_ids_tensor),
                    output_hidden_states=True,
                )
                return self.cls(bart_outputs.encoder_last_hidden_state)

        def _score_sequence(ids):
            """Mean over positions of the best log-probability (approximate likelihood)."""
            log_probs = torch.log_softmax(_norm_logits(ids)[0], dim=-1)
            position_scores, _ = torch.max(log_probs, dim=-1)
            return float(position_scores.mean().item())

        # --- 1. NSW detection on the unmodified input -------------------------
        with torch.no_grad():
            bart_outputs = self.bart(input_tokens_tensor, attention_mask=input_mask, output_hidden_states=True)
            logits_nsw_detection = self.nsw_detector(bart_outputs.encoder_last_hidden_state)

        working_ids = input_tokens_tensor[0].detach().clone().cpu().tolist()
        tokens = tokenizer.convert_ids_to_tokens(working_ids)
        nsw_flags = (torch.argmax(logits_nsw_detection[0], dim=-1) == 1).tolist()
        nsw_indices = [
            i for i, (token, flagged) in enumerate(zip(tokens, nsw_flags))
            if flagged and token not in nsw_exempt
        ]

        # --- 2. Greedy 0/1 mask insertion after each NSW position -------------
        mask_token_id = tokenizer.convert_tokens_to_ids('<mask>')
        # The baseline score only changes when an insertion is accepted, so it
        # is carried across iterations instead of being recomputed each time.
        best_score = _score_sequence(working_ids) if nsw_indices else None
        offset = 0
        for i in nsw_indices:
            pos = i + offset
            candidate = working_ids[:pos + 1] + [mask_token_id] + working_ids[pos + 1:]
            candidate_score = _score_sequence(candidate)
            if candidate_score > best_score:
                working_ids = candidate
                best_score = candidate_score
                offset += 1

        # --- 3. Final prediction on the chosen sequence ------------------------
        pred_ids = torch.argmax(_norm_logits(working_ids), dim=-1)[0].cpu().tolist()

        decoded_source = tokenizer.convert_ids_to_tokens(working_ids)
        last_token_id = len(tokenizer) - 1
        final_tokens = [
            src_id if src_token in copied_from_source else min(pred_id, last_token_id)
            for src_id, src_token, pred_id in zip(working_ids, decoded_source, pred_ids)
        ]
        decoded_pred = tokenizer.convert_ids_to_tokens(final_tokens)

        # --- 4. Decode ---------------------------------------------------------
        pred_str = ""
        try:
            kept_tokens = [token for token in decoded_pred if token not in dropped_from_text]
            if kept_tokens:
                pred_str = tokenizer.convert_tokens_to_string(kept_tokens)
        except Exception:
            # Deliberate best effort: a decoding failure yields empty text while
            # the token lists are still returned for inspection.
            pred_str = ""
        # Collapse repeated whitespace
        pred_str = ' '.join(pred_str.split())

        return pred_str, decoded_source, decoded_pred
    
    def detect_nsw(self, tokenizer, text, device='cpu'):
        """
        Detect Non-Standard Words (NSW) in text and return detailed information.

        The same pipeline as ``normalize_text`` is run (NSW head -> greedy
        ``<mask>`` insertion -> normalization head, with predicted ids clamped
        to the tokenizer range). Every non-special source token whose cleaned
        prediction differs from its cleaned source form is reported; the
        predictions of ``<mask>`` tokens that directly follow it are appended
        to its prediction, separated by spaces.

        Side effects: the model is moved to ``device`` and put in eval mode;
        ``fix_classification_head_for_tokenizer`` may add ``<space>`` to the
        tokenizer.

        Args:
            tokenizer: HuggingFace tokenizer
            text: Input text to analyze
            device: Device to run inference on

        Returns:
            List of dictionaries containing NSW information:
            [{'index': int, 'start_index': int, 'end_index': int, 'nsw': str,
              'prediction': str, 'confidence_score': float}, ...]

            ``index`` is the position in the final (mask-extended) token
            sequence. ``start_index``/``end_index`` locate the token in
            ``text``, searching forward from the previous token's location so
            repeated words get distinct offsets; if the token cannot be found
            at all, ``start_index`` is -1 and ``end_index`` is the token
            length. ``confidence_score`` is the mean of the NSW-head
            probability at the token's original position and the
            normalization-head probability of the predicted token, rounded to
            4 decimals.
        """
        nsw_exempt = {'<s>', '</s>', '<pad>', '<unk>', '<mask>'}
        copied_from_source = {'<s>', '</s>', '<pad>', '<unk>'}

        def _clean(token):
            """Strip whitespace and the leading '▁' marker; None for special tokens."""
            if token in nsw_exempt:
                return None
            return token.strip().lstrip('▁')

        self.to(device)
        self.fix_classification_head_for_tokenizer(tokenizer)

        encoded = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
        input_tokens_tensor, _, _, input_mask = self._truncate_and_build_masks(encoded.to(device))

        self.eval()

        def _norm_logits(ids):
            """Normalization-head logits, shape (1, len(ids), vocab), for one id list."""
            input_ids_tensor = torch.tensor([ids], device=device)
            with torch.no_grad():
                bart_outputs = self.bart(
                    input_ids_tensor,
                    attention_mask=torch.ones_like(input_ids_tensor),
                    output_hidden_states=True,
                )
                return self.cls(bart_outputs.encoder_last_hidden_state)

        def _score_sequence(ids):
            """Mean over positions of the best log-probability (approximate likelihood)."""
            log_probs = torch.log_softmax(_norm_logits(ids)[0], dim=-1)
            position_scores, _ = torch.max(log_probs, dim=-1)
            return float(position_scores.mean().item())

        # --- NSW detection on the unmodified input -----------------------------
        with torch.no_grad():
            bart_outputs = self.bart(input_tokens_tensor, attention_mask=input_mask, output_hidden_states=True)
            logits_nsw_detection = self.nsw_detector(bart_outputs.encoder_last_hidden_state)

        nsw_confidence = torch.softmax(logits_nsw_detection[0], dim=-1)[:, 1]
        nsw_flags = (torch.argmax(logits_nsw_detection[0], dim=-1) == 1).tolist()

        working_ids = input_tokens_tensor[0].detach().clone().cpu().tolist()
        tokens = tokenizer.convert_ids_to_tokens(working_ids)
        nsw_indices = [
            i for i, (token, flagged) in enumerate(zip(tokens, nsw_flags))
            if flagged and token not in nsw_exempt
        ]

        # --- Greedy 0/1 mask insertion (same strategy as normalize_text) -------
        mask_token_id = tokenizer.convert_tokens_to_ids('<mask>')
        # origin[k] is the index in the original sequence of working_ids[k],
        # or None for an inserted mask; used to look up NSW confidences.
        origin = list(range(len(working_ids)))
        # The baseline score only changes when an insertion is accepted.
        best_score = _score_sequence(working_ids) if nsw_indices else None
        offset = 0
        for i in nsw_indices:
            pos = i + offset
            candidate = working_ids[:pos + 1] + [mask_token_id] + working_ids[pos + 1:]
            candidate_score = _score_sequence(candidate)
            if candidate_score > best_score:
                working_ids = candidate
                origin = origin[:pos + 1] + [None] + origin[pos + 1:]
                best_score = candidate_score
                offset += 1

        # --- Final prediction on the chosen sequence ---------------------------
        logits_final = _norm_logits(working_ids)
        pred_ids = torch.argmax(logits_final, dim=-1)[0].cpu().tolist()

        decoded_source = tokenizer.convert_ids_to_tokens(working_ids)
        # Clamp to the tokenizer range, exactly as normalize_text does, so both
        # methods agree and every id can be converted back to a token.
        last_token_id = len(tokenizer) - 1
        final_tokens = [
            src_id if src_token in copied_from_source else min(pred_id, last_token_id)
            for src_id, src_token, pred_id in zip(working_ids, decoded_source, pred_ids)
        ]
        decoded_pred = tokenizer.convert_ids_to_tokens(final_tokens)

        # --- Walk source/prediction pairs and collect changed tokens -----------
        nsw_results = []
        norm_confidence = None  # softmax over logits_final, computed on first use
        search_from = 0         # forward-moving cursor into ``text``
        total = len(decoded_source)
        i = 0
        while i < total:
            clean_src = _clean(decoded_source[i])
            if clean_src is None:
                i += 1
                continue

            # Locate this token after the previous one so repeated words map
            # to their own occurrence; fall back to a global search otherwise.
            start_idx = text.find(clean_src, search_from)
            if start_idx != -1:
                search_from = start_idx + len(clean_src)
            else:
                start_idx = text.find(clean_src)

            clean_pred = _clean(decoded_pred[i])
            if clean_pred is None or clean_src == clean_pred:
                i += 1
                continue

            # Conservative grouping: only masks inserted right after this
            # token are treated as part of its expansion.
            expansion_tokens = [clean_pred]
            j = i + 1
            while j < total and decoded_source[j] == '<mask>':
                filled = _clean(decoded_pred[j])
                if filled is not None:
                    expansion_tokens.append(filled)
                j += 1
            expansion_text = ' '.join(expansion_tokens)

            end_idx = start_idx + len(clean_src) if start_idx != -1 else len(clean_src)

            # NSW probability at the token's position in the original input.
            orig_pos = origin[i]
            if orig_pos is not None and orig_pos < len(nsw_confidence):
                nsw_conf = nsw_confidence[orig_pos].item()
            else:
                nsw_conf = 0.5  # Default if position not found

            # Probability the normalization head assigns to the chosen token.
            if norm_confidence is None:
                norm_confidence = torch.softmax(logits_final[0], dim=-1)
            norm_conf = norm_confidence[i][final_tokens[i]].item()
            combined_confidence = (nsw_conf + norm_conf) / 2

            nsw_results.append({
                'index': i,
                'start_index': start_idx,
                'end_index': end_idx,
                'nsw': clean_src,
                'prediction': expansion_text,
                'confidence_score': round(combined_confidence, 4)
            })

            # Move to the next unprocessed token
            i = j

        return nsw_results
    
    def _truncate_and_build_masks(self, input_tokens_tensor, output_tokens_tensor=None):
        """
        Build the attention mask for ``input_tokens_tensor``.

        No truncation is performed: both token tensors are returned unchanged.
        The attention mask is all ones with the shape of the input (every
        position is attended to) and no token-type ids are produced.

        Args:
            input_tokens_tensor: Tensor of input token ids.
            output_tokens_tensor: Optional tensor of target ids; passed through.

        Returns:
            Tuple ``(input_tokens_tensor, output_tokens_tensor, token_type_ids,
            input_mask)`` where ``token_type_ids`` is always ``None``.
        """
        input_mask = torch.ones_like(input_tokens_tensor)
        token_type_ids = None
        return input_tokens_tensor, output_tokens_tensor, token_type_ids, input_mask


# Public API: only the model class is exported by ``from <module> import *``.
__all__ = ["ViSoNormBartPhoForMaskedLM"]