dleemiller committed
Commit b121266 · verified · 1 Parent(s): 04c98bd

Upload folder using huggingface_hub
config.json CHANGED
@@ -15,6 +15,9 @@
   "n_heads": 12,
   "n_layers": 12,
   "pad_token_id": 0,
+  "path_input_dim": 6,
+  "predict_char": true,
+  "predict_length": true,
   "predict_path": true,
   "sep_token_id": 2,
   "transformers_version": "4.57.3",
@@ -22,6 +25,7 @@
   "vocab_size": 43,
   "auto_map": {
     "AutoConfig": "configuration_swipe.SwipeTransformerConfig",
-    "AutoModel": "modeling_swipe.SwipeTransformerModel"
+    "AutoModel": "modeling_swipe.SwipeTransformerModel",
+    "AutoModelForCausalLM": "modeling_swipe.SwipeTransformerModel"
   }
 }
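
The updated auto_map registers the same remote class for both AutoModel and AutoModelForCausalLM. A minimal loading sketch, not part of this commit; the repository id is a placeholder:

```python
# Hypothetical repo id; trust_remote_code is required for auto_map-resolved classes.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("dleemiller/swipe-model", trust_remote_code=True)
model = AutoModel.from_pretrained("dleemiller/swipe-model", trust_remote_code=True)
print(type(model).__name__)  # SwipeTransformerModel, per the AutoModel mapping above
```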
configuration_swipe.py CHANGED
@@ -20,6 +20,7 @@ class SwipeTransformerConfig(PretrainedConfig):
         vocab_size (int, optional): Size of vocabulary. Defaults to 100.
         max_path_len (int, optional): Maximum path sequence length. Defaults to 64.
         max_char_len (int, optional): Maximum character sequence length. Defaults to 38.
+        path_input_dim (int, optional): Path feature dimension. Defaults to 6 for (x, y, dx, dy, ds, log_dt).
         predict_path (bool, optional): Whether to predict path coordinates. Defaults to True.
         pad_token_id (int, optional): Padding token ID. Defaults to 0.
         cls_token_id (int, optional): CLS token ID. Defaults to 1.
@@ -41,7 +42,10 @@
         vocab_size: int = 100,
         max_path_len: int = 64,
         max_char_len: int = 38,
+        path_input_dim: int = 6,
+        predict_char: bool = True,
         predict_path: bool = True,
+        predict_length: bool = True,
         pad_token_id: int = 0,
         cls_token_id: int = 1,
         sep_token_id: int = 2,
@@ -63,83 +67,12 @@
         self.vocab_size = vocab_size
         self.max_path_len = max_path_len
         self.max_char_len = max_char_len
+        self.path_input_dim = path_input_dim

         # Model capabilities
+        self.predict_char = predict_char
         self.predict_path = predict_path
-
-        # Special tokens
-        self.cls_token_id = cls_token_id
-        self.sep_token_id = sep_token_id
-        self.mask_token_id = mask_token_id
-        self.unk_token_id = unk_token_id
-
-
- class SwipeCrossEncoderConfig(PretrainedConfig):
-     """
-     Configuration class for SwipeCrossEncoderForSequenceClassification.
-
-     This configuration extends the base SwipeTransformer config for use in
-     cross-encoder tasks (e.g., path-word similarity scoring).
-
-     Args:
-         d_model (int, optional): Hidden dimension size. Defaults to 256.
-         n_layers (int, optional): Number of transformer layers. Defaults to 4.
-         n_heads (int, optional): Number of attention heads. Defaults to 4.
-         d_ff (int, optional): Feedforward dimension. Defaults to 1024.
-         dropout (float, optional): Dropout rate. Defaults to 0.1.
-         vocab_size (int, optional): Size of vocabulary. Defaults to 100.
-         max_path_len (int, optional): Maximum path sequence length. Defaults to 64.
-         max_char_len (int, optional): Maximum character sequence length. Defaults to 38.
-         num_labels (int, optional): Number of classification labels. Defaults to 1.
-         problem_type (str, optional): Problem type ('regression' or 'single_label_classification'). Defaults to "regression".
-         pad_token_id (int, optional): Padding token ID. Defaults to 0.
-         cls_token_id (int, optional): CLS token ID. Defaults to 1.
-         sep_token_id (int, optional): SEP token ID. Defaults to 2.
-         mask_token_id (int, optional): MASK token ID. Defaults to 3.
-         unk_token_id (int, optional): Unknown token ID. Defaults to 4.
-         eos_token_id (int, optional): End-of-sequence token ID. Defaults to 5.
-     """
-
-     model_type = "swipe_cross_encoder"
-
-     def __init__(
-         self,
-         d_model: int = 256,
-         n_layers: int = 4,
-         n_heads: int = 4,
-         d_ff: int = 1024,
-         dropout: float = 0.1,
-         vocab_size: int = 100,
-         max_path_len: int = 64,
-         max_char_len: int = 38,
-         num_labels: int = 1,
-         problem_type: str = "regression",
-         pad_token_id: int = 0,
-         cls_token_id: int = 1,
-         sep_token_id: int = 2,
-         mask_token_id: int = 3,
-         unk_token_id: int = 4,
-         eos_token_id: int = 5,
-         **kwargs,
-     ):
-         super().__init__(
-             pad_token_id=pad_token_id, num_labels=num_labels, eos_token_id=eos_token_id, **kwargs
-         )
-
-         # Model architecture parameters
-         self.d_model = d_model
-         self.n_layers = n_layers
-         self.n_heads = n_heads
-         self.d_ff = d_ff
-         self.dropout = dropout
-
-         # Vocabulary and sequence length
-         self.vocab_size = vocab_size
-         self.max_path_len = max_path_len
-         self.max_char_len = max_char_len
-
-         # Classification parameters
-         self.problem_type = problem_type
+        self.predict_length = predict_length

         # Special tokens
         self.cls_token_id = cls_token_id
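
A minimal sketch of constructing the updated config with the new fields, assuming configuration_swipe.py is importable; values other than the new defaults are illustrative, not taken from this commit:

```python
from configuration_swipe import SwipeTransformerConfig

config = SwipeTransformerConfig(
    vocab_size=43,
    path_input_dim=6,      # (x, y, dx, dy, ds, log_dt)
    predict_char=True,
    predict_path=True,
    predict_length=True,
)
print(config.path_input_dim, config.predict_length)  # 6 True
```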
embeddings.py CHANGED
@@ -5,16 +5,26 @@ import torch.nn as nn


 class PathEmbedding(nn.Module):
-    """Embeds path coordinates (x, y, t) to d_model dimension."""
+    """Embeds path features (x, y, dx, dy, ds, log_dt) to d_model dimension."""

-    def __init__(self, d_model: int = 256):
+    def __init__(self, d_model: int = 256, input_dim: int = 6):
+        """
+        Initialize path embedding layer.
+
+        Args:
+            d_model: Output dimension
+            input_dim: Input feature dimension (default: 6 for x, y, dx, dy, ds, log_dt)
+        """
         super().__init__()
-        self.projection = nn.Linear(3, d_model)
+        self.projection = nn.Linear(input_dim, d_model)

     def forward(self, path_coords: torch.Tensor) -> torch.Tensor:
         """
+        Project path features to d_model dimension.
+
         Args:
-            path_coords: [batch, seq_len, 3] - (x, y, t) coordinates
+            path_coords: [batch, seq_len, input_dim] - path features
+                Default: (x, y, dx, dy, ds, log_dt) with input_dim=6

         Returns:
             [batch, seq_len, d_model] embeddings
@@ -90,12 +100,13 @@ class MixedEmbedding(nn.Module):
         max_char_len: int,
         d_model: int = 256,
         dropout: float = 0.1,
+        path_input_dim: int = 6,
     ):
         super().__init__()
         self.d_model = d_model

         # Content embeddings
-        self.path_embedding = PathEmbedding(d_model)
+        self.path_embedding = PathEmbedding(d_model, input_dim=path_input_dim)
         self.char_embedding = CharacterEmbedding(vocab_size, d_model, padding_idx=0)

         # Positional embeddings
@@ -120,7 +131,8 @@
         Create mixed sequence with embeddings.

         Args:
-            path_coords: [batch, path_len, 3] path coordinates
+            path_coords: [batch, path_len, path_input_dim] path features
+                Default: [batch, path_len, 6] for (x, y, dx, dy, ds, log_dt)
             char_tokens: [batch, char_len] character token IDs
             cls_token: [batch, 1] CLS token IDs
             sep_token: [batch, 1] SEP token IDs
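
A shape-only sketch of the widened path projection, assuming embeddings.py is importable; the tensor values are random placeholders:

```python
import torch
from embeddings import PathEmbedding

emb = PathEmbedding(d_model=256, input_dim=6)
path_feats = torch.randn(2, 64, 6)   # [batch, path_len, (x, y, dx, dy, ds, log_dt)]
out = emb(path_feats)
print(out.shape)                     # torch.Size([2, 64, 256])
```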
heads.py CHANGED
@@ -32,11 +32,11 @@ class CharacterPredictionHead(nn.Module):
 class PathPredictionHead(nn.Module):
     """Prediction head for masked path coordinates."""

-    def __init__(self, d_model: int):
+    def __init__(self, d_model: int, output_dim: int = 6):
         super().__init__()
         self.dense = nn.Linear(d_model, d_model)
         self.layer_norm = nn.LayerNorm(d_model)
-        self.decoder = nn.Linear(d_model, 3)  # Predict (x, y, t)
+        self.decoder = nn.Linear(d_model, output_dim)
         self.activation = nn.GELU()

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -45,55 +45,38 @@ class PathPredictionHead(nn.Module):
             hidden_states: [batch, seq_len, d_model]

         Returns:
-            [batch, seq_len, 3] coordinates in [0, 1] range
+            [batch, seq_len, output_dim] path features.
         """
         x = self.dense(hidden_states)
         x = self.activation(x)
         x = self.layer_norm(x)
-        coords = self.decoder(x)
-        coords = torch.sigmoid(coords)  # Ensure [0, 1] range
-        return coords
-
-
- class ClassificationHead(nn.Module):
-     """
-     Classification head for cross-encoder.
-
-     Follows SBERT architecture: Dense → GELU → LayerNorm → Linear(→1)
-     Outputs a single similarity score per input.
-     """
-
-     def __init__(self, d_model: int, num_labels: int = 1):
-         super().__init__()
-         self.dense = nn.Linear(d_model, d_model)
-         self.activation = nn.GELU()
-         self.norm = nn.LayerNorm(d_model)
-         self.classifier = nn.Linear(d_model, num_labels)
-
-     def forward(self, features: torch.Tensor) -> torch.Tensor:
-         """
-         Args:
-             features: [batch, d_model] - typically SEP token embeddings
-
-         Returns:
-             [batch, num_labels] similarity scores
-         """
-         x = self.dense(features)
-         x = self.activation(x)  # GELU
-         x = self.norm(x)  # LayerNorm
-         logits = self.classifier(x)  # [batch, 1] or [batch, num_labels]
-         return logits
+        features = self.decoder(x)
+
+        # Per-feature constraints:
+        #   - x, y are normalized to [0,1]
+        #   - dx, dy are signed deltas (roughly [-1,1])
+        #   - ds is non-negative
+        #   - log_dt is non-negative
+        if features.shape[-1] == 6:
+            x_y = torch.sigmoid(features[..., 0:2])
+            dx_dy = torch.tanh(features[..., 2:4])
+            ds = torch.nn.functional.softplus(features[..., 4:5])
+            log_dt = torch.nn.functional.softplus(features[..., 5:6])
+            return torch.cat([x_y, dx_dy, ds, log_dt], dim=-1)
+
+        # Fallback: unconstrained regression for other output dims.
+        return features


 class LengthPredictionHead(nn.Module):
-    """Predict sequence length (e.g., swipable character count) from CLS embedding."""
+    """Regress sequence length (e.g., swipable character count) from CLS embedding."""

-    def __init__(self, d_model: int, max_length: int):
+    def __init__(self, d_model: int):
         super().__init__()
         self.dense = nn.Linear(d_model, d_model)
         self.activation = nn.GELU()
         self.norm = nn.LayerNorm(d_model)
-        self.classifier = nn.Linear(d_model, max_length + 1)  # classes: 0..max_length
+        self.regressor = nn.Linear(d_model, 1)  # predict expected length directly

     def forward(self, cls_features: torch.Tensor) -> torch.Tensor:
         """
@@ -101,9 +84,9 @@ class LengthPredictionHead(nn.Module):
             cls_features: [batch, d_model] CLS embeddings

         Returns:
-            [batch, max_length+1] logits over lengths
+            [batch] predicted length
         """
         x = self.dense(cls_features)
         x = self.activation(x)
         x = self.norm(x)
-        return self.classifier(x)
+        return self.regressor(x).squeeze(-1)
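
The new PathPredictionHead squashes each feature group with a different activation. A minimal range check, assuming heads.py is importable; inputs are random placeholders:

```python
import torch
from heads import PathPredictionHead

head = PathPredictionHead(d_model=256, output_dim=6)
pred = head(torch.randn(2, 64, 256))              # [2, 64, 6]
x_y, dx_dy, ds, log_dt = pred[..., :2], pred[..., 2:4], pred[..., 4:5], pred[..., 5:6]
assert ((x_y >= 0) & (x_y <= 1)).all()            # sigmoid-constrained
assert ((dx_dy >= -1) & (dx_dy <= 1)).all()       # tanh-constrained
assert (ds >= 0).all() and (log_dt >= 0).all()    # softplus-constrained
```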
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b00a7a35dc99485501db127c4433afee95bd4f0c34d6c6db8ed6c69eec43404b
-size 348336548
+oid sha256:452880eddd71bf06c0c61750ae3f5ba65670c15aa97da8233df856ac74bd3e56
+size 348207344
modeling_swipe.py CHANGED
@@ -1,19 +1,13 @@
 """HuggingFace-compatible model classes for SwipeTransformer."""

 from dataclasses import dataclass
-from typing import Optional, Tuple

 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    BaseModelOutputWithPooling,
-    ModelOutput,
-    SequenceClassifierOutput,
-)
+from transformers.modeling_outputs import ModelOutput

-from .configuration_swipe import SwipeCrossEncoderConfig, SwipeTransformerConfig
+from .configuration_swipe import SwipeTransformerConfig


 @dataclass
@@ -24,27 +18,32 @@ class SwipeTransformerOutput(ModelOutput):
     Args:
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
             Language modeling loss (character prediction).
-        char_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, vocab_size)`):
-            Prediction scores of the character prediction head.
-        path_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 3)`, *optional*):
-            Prediction scores of the path prediction head (if enabled).
-        length_logits (`torch.FloatTensor` of shape `(batch_size, max_length+1)`, *optional*):
-            Prediction scores of the length prediction head (if enabled).
+        char_logits (`torch.FloatTensor` of shape `(batch_size, char_length, vocab_size)`):
+            Prediction scores of the character prediction head (text segment only).
+        path_logits (`torch.FloatTensor` of shape `(batch_size, path_length, path_input_dim)`, *optional*):
+            Prediction scores of the path prediction head (path segment only, if enabled).
+        length_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Predicted length from the length head (if enabled).
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
             Sequence of hidden-states at the output of the last layer of the model.
         pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
             SEP token embeddings for similarity/embedding tasks.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`.
+            When requested, this includes the input embeddings plus one entry per encoder layer.
+        attentions (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of attention tensors (one for each layer) of shape
+            `(batch_size, num_heads, sequence_length, sequence_length)`.
     """

-    loss: Optional[torch.FloatTensor] = None
-    char_logits: torch.FloatTensor = None
-    path_logits: Optional[torch.FloatTensor] = None
-    length_logits: Optional[torch.FloatTensor] = None
-    last_hidden_state: torch.FloatTensor = None
-    pooler_output: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    loss: torch.FloatTensor | None = None
+    char_logits: torch.FloatTensor | None = None
+    path_logits: torch.FloatTensor | None = None
+    length_logits: torch.FloatTensor | None = None
+    last_hidden_state: torch.FloatTensor | None = None
+    pooler_output: torch.FloatTensor | None = None
+    hidden_states: tuple[torch.FloatTensor] | None = None
+    attentions: tuple[torch.FloatTensor] | None = None


 class SwipeTransformerPreTrainedModel(PreTrainedModel):
@@ -98,6 +97,7 @@ class SwipeTransformerModel(SwipeTransformerPreTrainedModel):
             max_char_len=config.max_char_len,
             d_model=config.d_model,
             dropout=config.dropout,
+            path_input_dim=config.path_input_dim,
         )

         # Transformer encoder
@@ -117,21 +117,26 @@ class SwipeTransformerModel(SwipeTransformerPreTrainedModel):
         )

         # Prediction heads
-        self.char_head = CharacterPredictionHead(
-            d_model=config.d_model,
-            vocab_size=config.vocab_size,
+        self.char_head = (
+            CharacterPredictionHead(
+                d_model=config.d_model,
+                vocab_size=config.vocab_size,
+            )
+            if config.predict_char
+            else None
         )

         if config.predict_path:
-            self.path_head = PathPredictionHead(d_model=config.d_model)
+            self.path_head = PathPredictionHead(
+                d_model=config.d_model, output_dim=config.path_input_dim
+            )
         else:
             self.path_head = None

         # Length prediction head (predicts word length from path)
         # Max length is max_char_len (including EOS)
-        self.length_head = LengthPredictionHead(
-            d_model=config.d_model,
-            max_length=config.max_char_len,
+        self.length_head = (
+            LengthPredictionHead(d_model=config.d_model) if config.predict_length else None
         )

         # Initialize weights
@@ -139,35 +144,61 @@ class SwipeTransformerModel(SwipeTransformerPreTrainedModel):

     def forward(
         self,
-        path_coords: torch.Tensor,
         input_ids: torch.Tensor,
+        path_coords: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
-        labels: torch.Tensor | None = None,
+        labels: torch.Tensor | dict | None = None,
         return_dict: bool | None = None,
         output_hidden_states: bool | None = None,
+        output_attentions: bool | None = None,
+        **kwargs,
     ):
         """
         Forward pass of the model.

         Args:
-            path_coords (torch.Tensor): Path coordinates [batch, path_len, 3]
             input_ids (torch.Tensor): Character token IDs [batch, char_len]
+            path_coords (torch.Tensor): Path features [batch, path_len, path_input_dim]
+                Default: [batch, path_len, 6] for (x, y, dx, dy, ds, log_dt)
             attention_mask (torch.Tensor, optional): Attention mask [batch, seq_len]
-            labels (torch.Tensor, optional): Labels for loss calculation [batch, char_len]
+            labels (torch.Tensor or dict, optional): Labels for loss calculation
+                Can be tensor [batch, char_len] or dict with keys like char_labels, path_labels
             return_dict (bool, optional): Whether to return ModelOutput object
             output_hidden_states (bool, optional): Whether to output hidden states
+            output_attentions (bool, optional): Whether to output attention weights
+            **kwargs: Additional arguments (for compatibility)

         Returns:
             SwipeTransformerOutput or tuple: Model outputs with:
                 - loss: Optional loss value
-                - char_logits: Character prediction logits [batch, seq_len, vocab_size]
-                - path_logits: Path prediction logits [batch, seq_len, 3] (if predict_path=True)
-                - length_logits: Length prediction logits [batch, max_length]
+                - char_logits: Character prediction logits [batch, char_len, vocab_size] (if enabled)
+                - path_logits: Path prediction logits [batch, path_len, path_input_dim] (if enabled)
+                - length_logits: Length regression output [batch] (if enabled)
                 - last_hidden_state: Hidden states [batch, seq_len, d_model]
-                - pooler_output: SEP token embeddings [batch, d_model] for similarity/embedding tasks
-                - hidden_states: Tuple of hidden states (if output_hidden_states=True)
+                - pooler_output: SEP token embedding [batch, d_model] for similarity/embedding tasks
+                - hidden_states: Tuple of per-layer hidden states (if output_hidden_states=True)
+                - attentions: Tuple of per-layer attention weights (if output_attentions=True)
         """
+        # Validate required inputs
+        if input_ids is None or path_coords is None:
+            raise ValueError("Both input_ids and path_coords are required")
+
+        # Extract labels if dict (used by custom trainers)
+        if isinstance(labels, dict):
+            char_labels = labels.get("char_labels")
+            # Can handle other label types in the future (path_labels, etc.)
+        else:
+            char_labels = labels
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.config.output_attentions
+        )

         batch_size = path_coords.shape[0]
         device = path_coords.device
@@ -191,43 +222,138 @@ class SwipeTransformerModel(SwipeTransformerPreTrainedModel):
         else:
             src_key_padding_mask = None

-        # Encode (batch_first=True is set in TransformerEncoderLayer)
-        hidden_states = self.encoder(embeddings, src_key_padding_mask=src_key_padding_mask)
+        # Encode while optionally capturing attentions and per-layer hidden states.
+        attentions: tuple[torch.Tensor, ...] | None = None
+        hidden_states_by_layer: list[torch.Tensor] | None = [] if output_hidden_states else None
+
+        hooks = []
+        original_forwards: dict[int, callable] = {}
+        attentions_buffer: list[torch.Tensor | None] | None = None
+
+        def make_patched_forward(original_forward):
+            def patched_forward(
+                query,
+                key,
+                value,
+                key_padding_mask=None,
+                need_weights=True,
+                attn_mask=None,
+                average_attn_weights=False,
+                is_causal=False,
+            ):
+                return original_forward(
+                    query,
+                    key,
+                    value,
+                    key_padding_mask=key_padding_mask,
+                    need_weights=True,
+                    attn_mask=attn_mask,
+                    average_attn_weights=False,
+                    is_causal=is_causal,
+                )
+
+            return patched_forward
+
+        def make_hook(layer_idx: int):
+            def hook(_module: nn.Module, _input: tuple, output: tuple):
+                if (
+                    attentions_buffer is not None
+                    and isinstance(output, tuple)
+                    and len(output) > 1
+                    and output[1] is not None
+                ):
+                    attentions_buffer[layer_idx] = output[1]
+
+            return hook
+
+        if output_attentions:
+            attentions_buffer = [None] * len(self.encoder.layers)
+            for idx, layer in enumerate(self.encoder.layers):
+                attn_module = layer.self_attn
+                original_forwards[idx] = attn_module.forward
+                attn_module.forward = make_patched_forward(original_forwards[idx])
+                hooks.append(attn_module.register_forward_hook(make_hook(idx)))
+
+        try:
+            x = embeddings
+            for layer in self.encoder.layers:
+                x = layer(x, src_key_padding_mask=src_key_padding_mask)
+                if hidden_states_by_layer is not None:
+                    hidden_states_by_layer.append(x)
+            hidden_states = x
+
+            if attentions_buffer is not None:
+                if any(a is None for a in attentions_buffer):
+                    missing = [i for i, a in enumerate(attentions_buffer) if a is None]
+                    raise RuntimeError(
+                        f"Failed to capture attention weights for layers: {missing}."
+                    )
+                attentions = tuple(attentions_buffer)  # type: ignore[assignment]
+        finally:
+            for hook in hooks:
+                hook.remove()
+            for idx, layer in enumerate(self.encoder.layers):
+                if idx in original_forwards:
+                    layer.self_attn.forward = original_forwards[idx]
+
+        path_len = path_coords.shape[1]
+        char_len = input_ids.shape[1]

-        # Character prediction
-        char_logits = self.char_head(hidden_states)
+        # Character prediction (text segment only)
+        char_logits = None
+        if self.char_head is not None:
+            # Sequence is: [CLS] + path + [SEP] + chars
+            char_start = 1 + path_len + 1
+            char_hidden = hidden_states[:, char_start : char_start + char_len, :]
+            char_logits = self.char_head(char_hidden)

-        # Path prediction (if enabled)
+        # Path prediction (path segment only, if enabled)
         path_logits = None
         if self.path_head is not None:
-            path_logits = self.path_head(hidden_states)
+            path_hidden = hidden_states[:, 1 : 1 + path_len, :]
+            path_logits = self.path_head(path_hidden)

         # Length prediction from CLS token
         cls_hidden = hidden_states[:, 0, :]  # [batch, d_model] - CLS at position 0
-        length_logits = self.length_head(cls_hidden)  # [batch, max_length]
+        length_logits = self.length_head(cls_hidden) if self.length_head is not None else None

         # Extract SEP token embedding for pooler output (embeddings/similarity tasks)
         # SEP is at position 1 + path_len
-        path_len = path_coords.shape[1]
         sep_position = 1 + path_len
         pooler_output = hidden_states[:, sep_position, :]  # [batch, d_model]

-        # Compute loss if labels provided
+        # Compute loss if labels provided (masked-only; -100 = ignore)
         loss = None
-        if labels is not None:
-            loss_fct = nn.CrossEntropyLoss(ignore_index=self.config.pad_token_id)
-            # Extract character positions from hidden states
-            # Sequence is: [CLS] + path + [SEP] + chars
-            char_start = 1 + path_len + 1  # After [CLS], path, and [SEP]
-            char_hidden = hidden_states[:, char_start : char_start + labels.shape[1], :]
-            char_pred = self.char_head(char_hidden)
-            loss = loss_fct(char_pred.reshape(-1, self.config.vocab_size), labels.reshape(-1))
+        if char_labels is not None and self.char_head is not None:
+            # Predict only the text segment
+            char_pred = char_logits  # [B, char_len, V]
+            labels_flat = char_labels.reshape(-1)
+            mask = labels_flat != -100
+            if mask.any():
+                logits_flat = char_pred.reshape(-1, self.config.vocab_size)[mask]
+                labels_flat = labels_flat[mask]
+                loss = nn.functional.cross_entropy(logits_flat, labels_flat, reduction="mean")
+            else:
+                loss = torch.tensor(0.0, device=hidden_states.device)

         if not return_dict:
-            output = (hidden_states, char_logits, length_logits, pooler_output)
-            if path_logits is not None:
-                output = output + (path_logits,)
-            return ((loss,) + output) if loss is not None else output
+            hidden_tuple = None
+            if hidden_states_by_layer is not None:
+                hidden_tuple = (embeddings,) + tuple(hidden_states_by_layer)
+            output = (
+                char_logits,
+                path_logits,
+                length_logits,
+                hidden_states,
+                pooler_output,
+                hidden_tuple,
+                attentions,
+            )
+            return (loss,) + output if loss is not None else output
+
+        all_hidden_states = None
+        if hidden_states_by_layer is not None:
+            all_hidden_states = (embeddings,) + tuple(hidden_states_by_layer)

         return SwipeTransformerOutput(
             loss=loss,
@@ -236,281 +362,12 @@ class SwipeTransformerModel(SwipeTransformerPreTrainedModel):
             length_logits=length_logits,
             last_hidden_state=hidden_states,
             pooler_output=pooler_output,
-            hidden_states=(hidden_states,) if output_hidden_states else None,
-        )
-
-
- class SwipeCrossEncoderForSequenceClassification(SwipeTransformerPreTrainedModel):
-     """
-     HuggingFace-compatible cross-encoder for sequence classification.
-
-     This model is designed for similarity scoring between swipe paths and words.
-     It extracts the SEP token embedding and passes it through a classification head.
-
-     Args:
-         config (SwipeCrossEncoderConfig): Model configuration
-     """
-
-     config_class = SwipeCrossEncoderConfig
-     base_model_prefix = "swipe_cross_encoder"
-
-     def __init__(self, config: SwipeCrossEncoderConfig):
-         super().__init__(config)
-         self.config = config
-         self.num_labels = config.num_labels
-
-         # Import existing components
-         from .embeddings import MixedEmbedding
-         from .heads import ClassificationHead
-
-         # Embeddings
-         self.embeddings = MixedEmbedding(
-             vocab_size=config.vocab_size,
-             max_path_len=config.max_path_len,
-             max_char_len=config.max_char_len,
-             d_model=config.d_model,
-             dropout=config.dropout,
-         )
-
-         # Transformer encoder
-         encoder_layer = nn.TransformerEncoderLayer(
-             d_model=config.d_model,
-             nhead=config.n_heads,
-             dim_feedforward=config.d_ff,
-             dropout=config.dropout,
-             activation="gelu",
-             batch_first=True,
-             norm_first=True,  # Pre-LayerNorm
-         )
-         self.encoder = nn.TransformerEncoder(
-             encoder_layer,
-             num_layers=config.n_layers,
-             enable_nested_tensor=False,
-         )
-
-         # Classification head
-         self.classifier = ClassificationHead(
-             d_model=config.d_model,
-             num_labels=config.num_labels,
-         )
-
-         # Initialize weights
-         self.post_init()
-
-     def forward(
-         self,
-         path_coords: torch.Tensor,
-         input_ids: torch.Tensor,
-         attention_mask: torch.Tensor | None = None,
-         labels: torch.Tensor | None = None,
-         return_dict: bool | None = None,
-     ):
-         """
-         Forward pass for cross-encoder.
-
-         Args:
-             path_coords (torch.Tensor): Path coordinates [batch, path_len, 3]
-             input_ids (torch.Tensor): Character token IDs [batch, char_len]
-             attention_mask (torch.Tensor, optional): Attention mask [batch, seq_len]
-             labels (torch.Tensor, optional): Labels for loss calculation [batch, num_labels]
-             return_dict (bool, optional): Whether to return ModelOutput object
-
-         Returns:
-             SequenceClassifierOutput or tuple: Model outputs with logits and optional loss
-         """
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         batch_size = path_coords.shape[0]
-         device = path_coords.device
-
-         # Create [CLS] and [SEP] tokens
-         cls_token = torch.full(
-             (batch_size, 1), fill_value=self.config.cls_token_id, dtype=torch.long, device=device
-         )
-         sep_token = torch.full(
-             (batch_size, 1), fill_value=self.config.sep_token_id, dtype=torch.long, device=device
-         )
-
-         # Get embeddings
-         embeddings = self.embeddings(path_coords, input_ids, cls_token, sep_token)
-
-         # Prepare attention mask
-         if attention_mask is not None:
-             src_key_padding_mask = attention_mask == 0
-         else:
-             src_key_padding_mask = None
-
-         # Encode (batch_first=True is set in TransformerEncoderLayer)
-         hidden_states = self.encoder(embeddings, src_key_padding_mask=src_key_padding_mask)
-
-         # Extract SEP token embedding
-         # SEP is at position 1 + path_len
-         path_len = path_coords.shape[1]
-         sep_position = 1 + path_len
-         sep_embedding = hidden_states[:, sep_position, :]  # [batch, d_model]
-
-         # Classification
-         logits = self.classifier(sep_embedding)  # [batch, num_labels]
-
-         # Compute loss if labels provided
-         loss = None
-         if labels is not None:
-             if self.config.problem_type is None:
-                 if self.num_labels == 1:
-                     self.config.problem_type = "regression"
-                 else:
-                     self.config.problem_type = "single_label_classification"
-
-             if self.config.problem_type == "regression":
-                 loss_fct = nn.MSELoss()
-                 if self.num_labels == 1:
-                     loss = loss_fct(logits.squeeze(), labels.squeeze())
-                 else:
-                     loss = loss_fct(logits, labels)
-             elif self.config.problem_type == "single_label_classification":
-                 loss_fct = nn.CrossEntropyLoss()
-                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-
-         if not return_dict:
-             output = (logits,) + (hidden_states,)
-             return ((loss,) + output) if loss is not None else output
-
-         return SequenceClassifierOutput(
-             loss=loss,
-             logits=logits,
-             hidden_states=(hidden_states,),
-         )
-
-
- class SwipeModel(SwipeTransformerPreTrainedModel):
-     """
-     Base Swipe model for extracting embeddings.
-
-     .. deprecated::
-         This class is deprecated. Use SwipeTransformerModel instead, which now
-         includes pooler_output for embeddings alongside prediction heads.
-         SwipeTransformerModel provides both predictions AND embeddings in a single model.
-
-     This model returns the SEP token embedding, which can be used for:
-     - Vector databases
-     - Semantic search
-     - Similarity computation
-
-     The SEP token embedding represents the joint encoding of the path and text.
-
-     Usage (Deprecated - use SwipeTransformerModel instead):
-     ```python
-     from transformers import AutoModel
-
-     model = AutoModel.from_pretrained(
-         "your-username/swipe-model",
-         trust_remote_code=True
-     )
-
-     # Get embeddings
-     outputs = model(path_coords=paths, input_ids=tokens)
-     embeddings = outputs.pooler_output  # SEP token embeddings
-     ```
-
-     Args:
-         config (SwipeTransformerConfig or SwipeCrossEncoderConfig): Model configuration
-     """
-
-     def __init__(self, config):
-         super().__init__(config)
-         self.config = config
-
-         # Import existing components
-         from .embeddings import MixedEmbedding
-
-         # Embeddings
-         self.embeddings = MixedEmbedding(
-             vocab_size=config.vocab_size,
-             max_path_len=config.max_path_len,
-             max_char_len=config.max_char_len,
-             d_model=config.d_model,
-             dropout=config.dropout,
-         )
-
-         # Transformer encoder
-         encoder_layer = nn.TransformerEncoderLayer(
-             d_model=config.d_model,
-             nhead=config.n_heads,
-             dim_feedforward=config.d_ff,
-             dropout=config.dropout,
-             activation="gelu",
-             batch_first=True,
-             norm_first=True,  # Pre-LayerNorm
-         )
-         self.encoder = nn.TransformerEncoder(
-             encoder_layer,
-             num_layers=config.n_layers,
-             enable_nested_tensor=False,
-         )
-
-         # Initialize weights
-         self.post_init()
-
-     def forward(
-         self,
-         path_coords: torch.Tensor,
-         input_ids: torch.Tensor,
-         attention_mask: torch.Tensor | None = None,
-         return_dict: bool | None = None,
-         output_hidden_states: bool | None = None,
-     ):
-         """
-         Forward pass that returns embeddings.
-
-         Args:
-             path_coords (torch.Tensor): Path coordinates [batch, path_len, 3]
-             input_ids (torch.Tensor): Character token IDs [batch, char_len]
-             attention_mask (torch.Tensor, optional): Attention mask [batch, seq_len]
-             return_dict (bool, optional): Whether to return ModelOutput object
-             output_hidden_states (bool, optional): Whether to output all hidden states
-
-         Returns:
-             BaseModelOutputWithPooling with:
-                 - last_hidden_state: Full sequence hidden states [batch, seq_len, d_model]
-                 - pooler_output: SEP token embeddings [batch, d_model]
-                 - hidden_states: Tuple of hidden states (if output_hidden_states=True)
-         """
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         batch_size = path_coords.shape[0]
-         device = path_coords.device
-
-         # Create [CLS] and [SEP] tokens
-         cls_token = torch.full(
-             (batch_size, 1), fill_value=self.config.cls_token_id, dtype=torch.long, device=device
-         )
-         sep_token = torch.full(
-             (batch_size, 1), fill_value=self.config.sep_token_id, dtype=torch.long, device=device
-         )
-
-         # Get embeddings
-         embeddings = self.embeddings(path_coords, input_ids, cls_token, sep_token)
-
-         # Prepare attention mask
-         if attention_mask is not None:
-             src_key_padding_mask = attention_mask == 0
-         else:
-             src_key_padding_mask = None
-
-         # Encode (batch_first=True is set in TransformerEncoderLayer)
-         hidden_states = self.encoder(embeddings, src_key_padding_mask=src_key_padding_mask)
-
-         # Extract SEP token embedding (pooler output)
-         # SEP is at position 1 + path_len
-         path_len = path_coords.shape[1]
-         sep_position = 1 + path_len
-         pooler_output = hidden_states[:, sep_position, :]  # [batch, d_model]
-
-         if not return_dict:
-             return (hidden_states, pooler_output)
-
-         return BaseModelOutputWithPooling(
-             last_hidden_state=hidden_states,
-             pooler_output=pooler_output,
-             hidden_states=(hidden_states,) if output_hidden_states else None,
+            hidden_states=all_hidden_states,
+            attentions=attentions,
         )


+#
+# Legacy note:
+# `SwipeModel` (embeddings-only) has been removed; use `SwipeTransformerModel` and read
+# `outputs.pooler_output` for embeddings.
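
A minimal usage sketch for the updated forward signature (input_ids first, six-feature path_coords, optional labels dict, optional attention capture). The repository id and all tensor values are placeholders, not taken from this commit:

```python
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("dleemiller/swipe-model", trust_remote_code=True)

path_coords = torch.randn(1, 64, 6)                   # (x, y, dx, dy, ds, log_dt) features
input_ids = torch.randint(5, 43, (1, 38))             # character token IDs
labels = {"char_labels": torch.full((1, 38), -100)}   # -100 marks ignored positions

outputs = model(
    input_ids=input_ids,
    path_coords=path_coords,
    labels=labels,
    output_attentions=True,
)
print(outputs.pooler_output.shape)  # [1, d_model] SEP embedding for similarity tasks
print(len(outputs.attentions))      # one attention map per encoder layer
```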
preprocessing.py ADDED
@@ -0,0 +1,275 @@
+"""Shared preprocessing utilities for swipe path data.
+
+This module provides a single source of truth for path preprocessing,
+used by both the training dataset and the HuggingFace processor.
+"""
+
+import numpy as np
+
+
+def preprocess_raw_path_to_features(
+    data_points: list[dict],
+    max_len: int,
+    *,
+    resample_mode: str = "spatial",
+    dt_clamp_min_ms: float = 1.0,
+    dt_clamp_max_ms: float = 200.0,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Convert a raw `{"x","y","t"}` path to fixed-length engineered features.
+
+    This is the fast path used by training and the HuggingFace processor. It avoids
+    building an intermediate list-of-dicts representation by:
+    1) extracting x/y/t arrays once,
+    2) resampling x/y using spatial- or time-uniform interpolation,
+    3) recomputing dx/dy/ds and log_dt on the resampled trajectory.
+
+    Args:
+        data_points: Raw path as a list of dicts with keys: "x", "y", "t".
+        max_len: Target length.
+        resample_mode: "spatial" (arc-length) or "time" (cumulative dt).
+        dt_clamp_min_ms: Clamp for dt feature after resampling (first dt remains 0).
+        dt_clamp_max_ms: Clamp for dt feature after resampling.
+
+    Returns:
+        (features, mask) where:
+        - features: [max_len, 6] float32 array (x, y, dx, dy, ds, log_dt)
+        - mask: [max_len] int64 array (1 for valid; all-ones for non-empty paths)
+    """
+    num_points = len(data_points)
+    if num_points == 0:
+        return (
+            np.zeros((max_len, 6), dtype=np.float32),
+            np.zeros(max_len, dtype=np.int64),
+        )
+
+    x = np.fromiter((p["x"] for p in data_points), dtype=np.float64, count=num_points)
+    y = np.fromiter((p["y"] for p in data_points), dtype=np.float64, count=num_points)
+    t = np.fromiter((p["t"] for p in data_points), dtype=np.float64, count=num_points)
+
+    x = np.clip(x, 0.0, 1.0)
+    y = np.clip(y, 0.0, 1.0)
+
+    # Per-step deltas and axes for resampling
+    dx_in = np.concatenate([[0.0], np.diff(x)])
+    dy_in = np.concatenate([[0.0], np.diff(y)])
+    ds_in = np.hypot(dx_in, dy_in)
+    dt_raw_in = np.concatenate([[0.0], np.diff(t)])
+
+    s = np.cumsum(ds_in)
+    tau = np.cumsum(dt_raw_in)
+
+    if resample_mode not in {"spatial", "time"}:
+        raise ValueError(f"Unknown resample_mode={resample_mode!r} (use 'spatial' or 'time')")
+
+    eps = 1e-12
+    if resample_mode == "time" and tau[-1] > eps:
+        target_tau = np.linspace(0.0, float(tau[-1]), max_len, dtype=np.float64)
+        x_r = np.interp(target_tau, tau, x)
+        y_r = np.interp(target_tau, tau, y)
+        tau_r = target_tau
+    else:
+        # Spatial sampling (or fallback when time axis is degenerate).
+        if s[-1] <= eps:
+            original = np.arange(num_points, dtype=np.float64)
+            target = np.linspace(0, num_points - 1, max_len, dtype=np.float64)
+            x_r = np.interp(target, original, x)
+            y_r = np.interp(target, original, y)
+            tau_r = np.interp(target, original, tau)
+        else:
+            target_s = np.linspace(0.0, float(s[-1]), max_len, dtype=np.float64)
+            x_r = np.interp(target_s, s, x)
+            y_r = np.interp(target_s, s, y)
+            tau_r = np.interp(target_s, s, tau)
+
+    dx = np.concatenate([[0.0], np.diff(x_r)])
+    dy = np.concatenate([[0.0], np.diff(y_r)])
+    ds = np.hypot(dx, dy)
+    dt_raw_r = np.concatenate([[0.0], np.diff(tau_r)])
+    dt_feat = np.clip(dt_raw_r, dt_clamp_min_ms, dt_clamp_max_ms)
+    dt_feat[0] = 0.0
+    log_dt = np.log1p(np.maximum(0.0, dt_feat))
+
+    mask = np.ones(max_len, dtype=np.int64)
+    features = np.stack([x_r, y_r, dx, dy, ds, log_dt], axis=-1).astype(np.float32)
+    return features, mask
+
+
+def normalize_and_compute_features(
+    data_points: list[dict],
+    dt_clamp_min_ms: float = 1.0,
+    dt_clamp_max_ms: float = 200.0,
+) -> list[dict]:
+    """
+    Normalize coordinates and compute motion features.
+
+    Computes delta features (dx, dy, dt) and log-scaled time deltas.
+    First point has dx=dy=dt=0 by convention.
+
+    Args:
+        data_points: List of {"x", "y", "t"} dicts
+        dt_clamp_min_ms: Minimum dt in milliseconds (inclusive).
+        dt_clamp_max_ms: Maximum dt in milliseconds (inclusive).
+
+    Returns:
+        List of dicts with keys:
+        - x, y: normalized coordinates in [0, 1]
+        - t: raw timestamp from input (passed through)
+        - dx, dy: deltas in x/y
+        - ds: sqrt(dx^2 + dy^2)
+        - dt_raw: raw time delta (unclamped)
+        - dt: clamped time delta used for feature stability
+        - log_dt: log1p(dt)
+    """
+    if not data_points:
+        return []
+
+    num_points = len(data_points)
+    x = np.fromiter((p["x"] for p in data_points), dtype=np.float64, count=num_points)
+    y = np.fromiter((p["y"] for p in data_points), dtype=np.float64, count=num_points)
+    t = np.fromiter((p["t"] for p in data_points), dtype=np.float64, count=num_points)
+
+    x = np.clip(x, 0.0, 1.0)
+    y = np.clip(y, 0.0, 1.0)
+
+    dx = np.concatenate([[0.0], np.diff(x)])
+    dy = np.concatenate([[0.0], np.diff(y)])
+    ds = np.hypot(dx, dy)
+    dt_raw = np.concatenate([[0.0], np.diff(t)])
+
+    dt = np.clip(dt_raw, dt_clamp_min_ms, dt_clamp_max_ms)
+    dt[0] = 0.0
+    log_dt = np.log1p(np.maximum(0.0, dt))
+
+    out: list[dict] = []
+    for i in range(num_points):
+        out.append(
+            {
+                "x": float(x[i]),
+                "y": float(y[i]),
+                "t": float(t[i]),
+                "dx": float(dx[i]),
+                "dy": float(dy[i]),
+                "ds": float(ds[i]),
+                "dt_raw": float(dt_raw[i]),
+                "dt": float(dt[i]),
+                "log_dt": float(log_dt[i]),
+            }
+        )
+    return out
+
+
+def sample_path_points_with_features(
+    data_points: list[dict],
+    max_len: int,
+    *,
+    resample_mode: str = "spatial",
+    dt_clamp_min_ms: float = 1.0,
+    dt_clamp_max_ms: float = 200.0,
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Sample path points with motion features to fixed length using interpolation.
+
+    Always uses interpolation (no zero-padding) to preserve feature structure.
+    Paths shorter than max_len are upsampled; longer paths are downsampled.
+
+    Modes:
+    - resample_mode="spatial": sample approximately uniformly in arc length (distance).
+    - resample_mode="time": sample uniformly in time (dwell regions get more samples).
+
+    Args:
+        data_points: List of coordinate dicts. Expected keys: x, y and either:
+            - dx, dy (preferred), plus optional ds, dt, log_dt; or
+            - ds/log_dt/dt (ds can be derived from dx/dy; dt from log_dt).
+        max_len: Target length
+        resample_mode: "spatial" or "time"
+        dt_clamp_min_ms: Clamp for dt feature after resampling (first dt remains 0).
+        dt_clamp_max_ms: Clamp for dt feature after resampling.
+
+    Returns:
+        Tuple of (features, mask) where:
+        - features: [max_len, 6] array with (x, y, dx, dy, ds, log_dt)
+        - mask: [max_len] binary mask (all 1s since we always interpolate)
+    """
+    num_points = len(data_points)
+
+    if num_points == 0:
+        # Empty path - return zeros
+        return (
+            np.zeros((max_len, 6), dtype=np.float32),
+            np.zeros(max_len, dtype=np.int64),
+        )
+
+    # Extract base signals
+    x = np.fromiter((p["x"] for p in data_points), dtype=np.float64, count=num_points)
+    y = np.fromiter((p["y"] for p in data_points), dtype=np.float64, count=num_points)
+
+    # Prefer provided dx/dy, otherwise derive from x/y
+    if all("dx" in p for p in data_points) and all("dy" in p for p in data_points):
+        dx_in = np.fromiter((p["dx"] for p in data_points), dtype=np.float64, count=num_points)
+        dy_in = np.fromiter((p["dy"] for p in data_points), dtype=np.float64, count=num_points)
+    else:
+        dx_in = np.concatenate([[0.0], np.diff(x)])
+        dy_in = np.concatenate([[0.0], np.diff(y)])
+
+    # ds can be provided or derived from dx/dy
+    if all("ds" in p for p in data_points):
+        ds_in = np.fromiter((p["ds"] for p in data_points), dtype=np.float64, count=num_points)
+    else:
+        ds_in = np.sqrt(dx_in**2 + dy_in**2)
+
+    # Time axis for resampling: prefer dt_raw (unclamped) so "dwell" gets represented.
+    if all("dt_raw" in p for p in data_points):
+        dt_axis = np.fromiter(
+            (p["dt_raw"] for p in data_points), dtype=np.float64, count=num_points
+        )
+    elif all("dt" in p for p in data_points):
+        dt_axis = np.fromiter((p["dt"] for p in data_points), dtype=np.float64, count=num_points)
+    elif all("log_dt" in p for p in data_points):
+        log_dt_in_raw = np.fromiter(
+            (p["log_dt"] for p in data_points), dtype=np.float64, count=num_points
+        )
+        dt_axis = np.expm1(log_dt_in_raw)
+    else:
+        dt_axis = np.zeros(num_points, dtype=np.float64)
+
+    # Cumulative arc length (s) and cumulative time (tau) for resampling
+    s = np.cumsum(ds_in)
+    tau = np.cumsum(dt_axis)
+
+    if resample_mode not in {"spatial", "time"}:
+        raise ValueError(f"Unknown resample_mode={resample_mode!r} (use 'spatial' or 'time')")
+
+    eps = 1e-12
+
+    if resample_mode == "time" and tau[-1] > eps:
+        target_tau = np.linspace(0.0, float(tau[-1]), max_len, dtype=np.float64)
+        x_r = np.interp(target_tau, tau, x)
+        y_r = np.interp(target_tau, tau, y)
+        tau_r = target_tau
+    else:
+        # Spatial sampling (or fallback when time axis is degenerate).
+        # Handle degenerate paths (zero movement): fall back to index-based interpolation
+        if s[-1] <= eps:
+            original = np.arange(num_points, dtype=np.float64)
+            target = np.linspace(0, num_points - 1, max_len, dtype=np.float64)
+            x_r = np.interp(target, original, x)
+            y_r = np.interp(target, original, y)
+            tau_r = np.interp(target, original, tau)
+        else:
+            target_s = np.linspace(0.0, float(s[-1]), max_len, dtype=np.float64)
+            x_r = np.interp(target_s, s, x)
+            y_r = np.interp(target_s, s, y)
+            tau_r = np.interp(target_s, s, tau)
+
+    # Recompute deltas on the resampled path for consistency
+    dx = np.concatenate([[0.0], np.diff(x_r)])
+    dy = np.concatenate([[0.0], np.diff(y_r)])
+    ds = np.sqrt(dx**2 + dy**2)
+    dt_raw_r = np.concatenate([[0.0], np.diff(tau_r)])
+    dt_feat = np.clip(dt_raw_r, dt_clamp_min_ms, dt_clamp_max_ms)
+    dt_feat[0] = 0.0
+    log_dt = np.log1p(np.maximum(0.0, dt_feat))
+
+    mask = np.ones(max_len, dtype=np.int64)
+    features = np.stack([x_r, y_r, dx, dy, ds, log_dt], axis=-1).astype(np.float32)
+    return features, mask
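
A minimal sketch of the new preprocessing entry point; the three-point path below is made-up sample data:

```python
from preprocessing import preprocess_raw_path_to_features

raw_path = [
    {"x": 0.10, "y": 0.50, "t": 0.0},
    {"x": 0.40, "y": 0.52, "t": 35.0},
    {"x": 0.80, "y": 0.55, "t": 90.0},
]
features, mask = preprocess_raw_path_to_features(raw_path, max_len=64, resample_mode="spatial")
print(features.shape, features.dtype)  # (64, 6) float32 -> (x, y, dx, dy, ds, log_dt)
print(int(mask.sum()))                 # 64: every position is valid after interpolation
```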
processing_swipe.py CHANGED
@@ -1,9 +1,15 @@
1
  """Processor for handling multimodal swipe inputs (path + text)."""
2
 
 
 
 
 
3
  import numpy as np
4
  import torch
5
  from transformers import ProcessorMixin
6
 
 
 
7
 
8
  class SwipeProcessor(ProcessorMixin):
9
  """
@@ -21,10 +27,19 @@ class SwipeProcessor(ProcessorMixin):
21
  attributes = ["tokenizer"]
22
  tokenizer_class = "AutoTokenizer" # Will use auto_map from tokenizer_config.json
23
 
24
- def __init__(self, tokenizer=None, max_path_len: int = 64, max_char_len: int = 38):
 
 
 
 
 
 
 
25
  self.tokenizer = tokenizer
26
  self.max_path_len = max_path_len
27
  self.max_char_len = max_char_len
 
 
28
  # Attributes expected by newer transformers (not used for swipe models)
29
  self.chat_template = None
30
  self.audio_tokenizer = None
@@ -33,21 +48,36 @@ class SwipeProcessor(ProcessorMixin):
33
 
34
  def __call__(
35
  self,
36
- path_coords: list[list[list[float]]] | torch.Tensor | np.ndarray | None = None,
 
 
 
 
 
 
 
37
  text: str | list[str] | None = None,
38
  padding: bool | str = True,
39
  truncation: bool = True,
40
  max_length: int | None = None,
41
  return_tensors: str | None = "pt",
42
- **kwargs,
43
  ):
44
  """
45
  Process path coordinates and text into model inputs.
46
 
47
  Args:
48
- path_coords: List of paths or tensor [batch, path_len, 3]
49
- Each point is (x, y, time). Can be None if only processing text.
50
- text: String or list of strings to encode. Can be None if only processing paths.
 
 
 
 
 
 
 
 
51
  padding: Whether to pad sequences. Can be True/False or "max_length"
52
  truncation: Whether to truncate sequences
53
  max_length: Maximum sequence length for text (overrides max_char_len)
@@ -56,9 +86,10 @@ class SwipeProcessor(ProcessorMixin):
56
 
57
  Returns:
58
  Dictionary with:
59
- - path_coords: [batch, max_path_len, 3] (if path_coords provided)
 
60
  - input_ids: [batch, max_char_len] (if text provided)
61
- - attention_mask: [batch, total_seq_len]
62
  """
63
  if path_coords is None and text is None:
64
  raise ValueError("Must provide either path_coords or text (or both)")
@@ -67,24 +98,43 @@ class SwipeProcessor(ProcessorMixin):
67
  if path_coords is not None:
68
  # Handle path coordinates
69
  if isinstance(path_coords, (list, tuple)):
70
- # Check if it's a batch or single path
71
- if len(path_coords) > 0 and isinstance(path_coords[0][0], (list, tuple)):
72
- # Batch of paths [[path1], [path2], ...]
73
- path_coords = torch.tensor(path_coords, dtype=torch.float32)
74
  else:
75
- # Single path [[x,y,t], [x,y,t], ...]
76
- path_coords = torch.tensor([path_coords], dtype=torch.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  elif isinstance(path_coords, np.ndarray):
78
  path_coords = torch.from_numpy(path_coords).float()
79
  if path_coords.dim() == 2:
80
  # Single path, add batch dimension
81
  path_coords = path_coords.unsqueeze(0)
 
82
  elif isinstance(path_coords, torch.Tensor):
83
  if path_coords.dim() == 2:
84
  # Single path, add batch dimension
85
  path_coords = path_coords.unsqueeze(0)
86
-
87
- batch_size = path_coords.shape[0]
88
  elif text is not None:
89
  if isinstance(text, str):
90
  batch_size = 1
@@ -98,31 +148,120 @@ class SwipeProcessor(ProcessorMixin):
98
 
99
  # Process path coordinates
100
  if path_coords is not None:
101
- current_path_len = path_coords.shape[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- # Truncate if needed
104
- if truncation and current_path_len > self.max_path_len:
105
- path_coords = path_coords[:, : self.max_path_len, :]
106
- current_path_len = self.max_path_len
107
 
108
- # Pad if needed
109
- if padding and current_path_len < self.max_path_len:
110
- pad_len = self.max_path_len - current_path_len
111
- path_coords = torch.cat([path_coords, torch.zeros(batch_size, pad_len, 3)], dim=1)
112
 
113
- # Create path mask (1 = real data, 0 = padding)
114
- # Detect padding by checking for all-zero coordinates
115
- path_mask = torch.ones(batch_size, self.max_path_len, dtype=torch.long)
116
- # A point is padding if all its coordinates (x, y, t) are zero
117
- is_padding = (path_coords == 0).all(dim=-1) # [batch, path_len]
118
- path_mask[is_padding] = 0
119
 
120
  result["path_coords"] = path_coords
121
- # Store path_mask internally for attention_mask construction
122
- _path_mask = path_mask
123
  else:
124
  # No path coords provided, create empty/zero tensors
125
- path_coords = torch.zeros(batch_size, self.max_path_len, 3)
126
  _path_mask = torch.zeros(batch_size, self.max_path_len, dtype=torch.long)
127
  result["path_coords"] = path_coords
128
 
@@ -157,9 +296,9 @@ class SwipeProcessor(ProcessorMixin):
157
  # Truncate but preserve EOS at the end
158
  for i in range(len(encoded_raw["input_ids"])):
159
  if len(encoded_raw["input_ids"][i]) > text_max_length:
160
- encoded_raw["input_ids"][i] = (
161
- encoded_raw["input_ids"][i][: text_max_length - 1] + [eos_id]
162
- )
163
 
164
  # Pad sequences
165
  if padding:
@@ -264,104 +403,48 @@ class SwipeProcessor(ProcessorMixin):
264
  """
265
  return self.tokenizer.decode(token_ids, **kwargs)
266
 
267
- def normalize_coordinates(
268
- self, data_points: list[dict], canvas_width: float = None, canvas_height: float = None
269
- ) -> list[dict]:
270
- """
271
- Normalize swipe coordinates and timestamps.
272
-
273
- Args:
274
- data_points: List of dicts with 'x', 'y', 't' keys
275
- canvas_width: Canvas width (not used - kept for compatibility)
276
- canvas_height: Canvas height (not used - kept for compatibility)
277
-
278
- Returns:
279
- List of normalized coordinate dicts with x, y in [0,1] and t in [0,1]
280
-
281
- Note:
282
- For futo-org/swipe.futo.org dataset, x and y are already normalized to [0,1].
283
- This function clamps them to ensure they stay in bounds and normalizes timestamps.
284
- """
285
- if not data_points:
286
- return []
287
-
288
- # Extract timestamps for normalization
289
- timestamps = [p["t"] for p in data_points]
290
- t_min = min(timestamps)
291
- t_max = max(timestamps)
292
- t_range = t_max - t_min if t_max > t_min else 1.0
293
-
294
- normalized = []
295
- for point in data_points:
296
- # x and y are already normalized to [0,1] in the dataset
297
- # But sometimes they go slightly outside bounds, so clamp them
298
- x_norm = max(0.0, min(1.0, point["x"]))
299
- y_norm = max(0.0, min(1.0, point["y"]))
300
-
301
- # Normalize timestamp to [0, 1]
302
- t_norm = (point["t"] - t_min) / t_range
303
 
304
- normalized.append({"x": x_norm, "y": y_norm, "t": t_norm})
305
 
306
- return normalized
 
307
 
308
- def sample_path_points(self, data_points: list[dict], max_len: int = None) -> tuple:
309
  """
310
- Sample or pad path points to fixed length using linear interpolation.
311
-
312
- Args:
313
- data_points: List of coordinate dicts with 'x', 'y', 't' keys
314
- max_len: Target length (defaults to self.max_path_len if not specified)
315
-
316
- Returns:
317
- Tuple of (sampled_points, mask) where:
318
- - sampled_points: numpy array of shape [max_len, 3] with (x, y, t) coordinates
319
- - mask: numpy array of shape [max_len] indicating valid (1) vs padding (0) points
320
-
321
- Note:
322
- - If path has fewer points than max_len, it's zero-padded
323
- - If path has more points than max_len, it's downsampled using linear interpolation
324
- - If path has exactly max_len points, it's returned as-is
325
  """
326
- if max_len is None:
327
- max_len = self.max_path_len
328
-
329
- num_points = len(data_points)
330
-
331
- if num_points == max_len:
332
- points = data_points
333
- mask = [1] * max_len
334
- elif num_points < max_len:
335
- # Pad with zeros
336
- points = data_points + [{"x": 0.0, "y": 0.0, "t": 0.0}] * (max_len - num_points)
337
- mask = [1] * num_points + [0] * (max_len - num_points)
338
- else:
339
- # Downsample using linear interpolation
340
- # Extract coordinates as arrays
341
- x_coords = np.array([p["x"] for p in data_points])
342
- y_coords = np.array([p["y"] for p in data_points])
343
- t_coords = np.array([p["t"] for p in data_points])
344
-
345
- # Original indices (parameter for interpolation)
346
- original_indices = np.arange(num_points)
347
-
348
- # Target indices for interpolation (evenly spaced)
349
- target_indices = np.linspace(0, num_points - 1, max_len)
350
-
351
- # Interpolate each coordinate independently
352
- x_interp = np.interp(target_indices, original_indices, x_coords)
353
- y_interp = np.interp(target_indices, original_indices, y_coords)
354
- t_interp = np.interp(target_indices, original_indices, t_coords)
355
-
356
- # Reconstruct points
357
- points = [
358
- {"x": float(x), "y": float(y), "t": float(t)}
359
- for x, y, t in zip(x_interp, y_interp, t_interp, strict=True)
360
- ]
361
- mask = [1] * max_len
362
 
363
- # Convert to numpy arrays
364
- coords = np.array([[p["x"], p["y"], p["t"]] for p in points], dtype=np.float32)
365
- mask = np.array(mask, dtype=np.int64)
366
-
367
- return coords, mask
 
1
  """Processor for handling multimodal swipe inputs (path + text)."""
2
 
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
  import numpy as np
8
  import torch
9
  from transformers import ProcessorMixin
10
 
11
+ from .preprocessing import preprocess_raw_path_to_features
12
+
13
 
14
  class SwipeProcessor(ProcessorMixin):
15
  """
 
27
  attributes = ["tokenizer"]
28
  tokenizer_class = "AutoTokenizer" # Will use auto_map from tokenizer_config.json
29
 
30
+ def __init__(
31
+ self,
32
+ tokenizer=None,
33
+ max_path_len: int = 64,
34
+ max_char_len: int = 38,
35
+ path_input_dim: int = 6,
36
+ path_resample_mode: str = "time",
37
+ ):
38
  self.tokenizer = tokenizer
39
  self.max_path_len = max_path_len
40
  self.max_char_len = max_char_len
41
+ self.path_input_dim = path_input_dim
42
+ self.path_resample_mode = path_resample_mode
43
  # Attributes expected by newer transformers (not used for swipe models)
44
  self.chat_template = None
45
  self.audio_tokenizer = None
 
48
 
49
  def __call__(
50
  self,
51
+ path_coords: (
52
+ list[dict[str, float]]
53
+ | list[list[dict[str, float]]]
54
+ | list[list[list[float]]]
55
+ | torch.Tensor
56
+ | np.ndarray
57
+ | None
58
+ ) = None,
59
  text: str | list[str] | None = None,
60
  padding: bool | str = True,
61
  truncation: bool = True,
62
  max_length: int | None = None,
63
  return_tensors: str | None = "pt",
64
+ **kwargs: Any,
65
  ):
66
  """
67
  Process path coordinates and text into model inputs.
68
 
69
  Args:
70
+ path_coords:
71
+ Swipe paths in one of the supported formats:
72
+ - Raw path (single example): list of dicts like `{"x": ..., "y": ..., "t": ...}`
73
+ - Raw batch: list of raw paths
74
+ - Numeric arrays/tensors: `[batch, path_len, D]` or `[path_len, D]`
75
+ If `D==3` and `path_input_dim==6`, raw `(x,y,t)` triples are converted to engineered
76
+ `(x, y, dx, dy, ds, log_dt)` features and resampled to `max_path_len`.
77
+ If omitted, the processor emits a zero path with a zero path attention mask.
78
+ text:
79
+ String or list of strings to encode.
80
+ If omitted, the processor emits padded text tokens with a zero text attention mask.
81
  padding: Whether to pad sequences. Can be True/False or "max_length"
82
  truncation: Whether to truncate sequences
83
  max_length: Maximum sequence length for text (overrides max_char_len)
 
86
 
87
  Returns:
88
  Dictionary with:
89
+ - path_coords: [batch, max_path_len, path_input_dim] (if path_coords provided)
90
+ Default: [batch, max_path_len, 6] for (x, y, dx, dy, ds, log_dt)
91
  - input_ids: [batch, max_char_len] (if text provided)
92
+ - attention_mask: [batch, total_seq_len] (covers `[CLS] + path + [SEP] + text`)
93
  """
94
  if path_coords is None and text is None:
95
  raise ValueError("Must provide either path_coords or text (or both)")
 
98
  if path_coords is not None:
99
  # Handle path coordinates
100
  if isinstance(path_coords, (list, tuple)):
101
+ if len(path_coords) == 0:
102
+ batch_size = 1
103
  else:
104
+ first = path_coords[0]
105
+ # Raw single path: [{"x","y","t"}, ...]
106
+ if isinstance(first, dict):
107
+ batch_size = 1
108
+ # Raw batch of paths: [[{"x","y","t"}, ...], ...]
109
+ elif (
110
+ isinstance(first, (list, tuple))
111
+ and len(first) > 0
112
+ and isinstance(first[0], dict)
113
+ ):
114
+ batch_size = len(path_coords)
115
+ # Numeric batch: [[[...], ...], ...] where points are lists/tuples
116
+ elif (
117
+ isinstance(first, (list, tuple))
118
+ and len(first) > 0
119
+ and isinstance(first[0], (list, tuple))
120
+ ):
121
+ path_coords = torch.tensor(path_coords, dtype=torch.float32)
122
+ batch_size = path_coords.shape[0]
123
+ else:
124
+ # Numeric single path: [[...], [...], ...]
125
+ path_coords = torch.tensor([path_coords], dtype=torch.float32)
126
+ batch_size = path_coords.shape[0]
127
  elif isinstance(path_coords, np.ndarray):
128
  path_coords = torch.from_numpy(path_coords).float()
129
  if path_coords.dim() == 2:
130
  # Single path, add batch dimension
131
  path_coords = path_coords.unsqueeze(0)
132
+ batch_size = path_coords.shape[0]
133
  elif isinstance(path_coords, torch.Tensor):
134
  if path_coords.dim() == 2:
135
  # Single path, add batch dimension
136
  path_coords = path_coords.unsqueeze(0)
137
+ batch_size = path_coords.shape[0]
 
138
  elif text is not None:
139
  if isinstance(text, str):
140
  batch_size = 1
 
148
 
149
  # Process path coordinates
150
  if path_coords is not None:
151
+ # Check if path_coords is raw data (list of dicts) or already a tensor
152
+ if isinstance(path_coords, (list, tuple)) and len(path_coords) > 0:
153
+ first_elem = path_coords[0]
154
+
155
+ # Raw single path: [{"x","y","t"}, ...]
156
+ if isinstance(first_elem, dict) and "x" in first_elem:
157
+ path_feats, mask = preprocess_raw_path_to_features(
158
+ path_coords,
159
+ self.max_path_len,
160
+ resample_mode=self.path_resample_mode,
161
+ )
162
+ if return_tensors == "pt":
163
+ path_coords = torch.from_numpy(path_feats).float().unsqueeze(0)
164
+ _path_mask = torch.from_numpy(mask).long().unsqueeze(0)
165
+ else:
166
+ path_coords = np.expand_dims(path_feats, axis=0)
167
+ _path_mask = np.expand_dims(mask, axis=0)
168
+
169
+ # Raw batch of paths: [[{"x","y","t"}, ...], ...]
170
+ elif (
171
+ isinstance(first_elem, (list, tuple))
172
+ and len(first_elem) > 0
173
+ and isinstance(first_elem[0], dict)
174
+ and "x" in first_elem[0]
175
+ ):
176
+ processed_paths = []
177
+ path_masks = []
178
+ for path in path_coords:
179
+ path_feats, mask = preprocess_raw_path_to_features(
180
+ path,
181
+ self.max_path_len,
182
+ resample_mode=self.path_resample_mode,
183
+ )
184
+ processed_paths.append(path_feats)
185
+ path_masks.append(mask)
186
 
187
+ path_coords = np.stack(processed_paths) # [batch, max_path_len, 6]
188
+ _path_mask = np.stack(path_masks) # [batch, max_path_len]
189
 
190
+ if return_tensors == "pt":
191
+ path_coords = torch.from_numpy(path_coords).float()
192
+ _path_mask = torch.from_numpy(_path_mask).long()
193
+
194
+ else:
195
+ # Numeric list input; process as before
196
+ path_coords = torch.tensor(path_coords, dtype=torch.float32)
197
+ if path_coords.dim() == 2:
198
+ path_coords = path_coords.unsqueeze(0)
199
+
200
+ current_path_len = path_coords.shape[1]
201
+ if truncation and current_path_len > self.max_path_len:
202
+ path_coords = path_coords[:, : self.max_path_len, :]
203
+ if padding and current_path_len < self.max_path_len:
204
+ pad_len = self.max_path_len - current_path_len
205
+ pad_shape = (batch_size, pad_len, self.path_input_dim)
206
+ path_coords = torch.cat([path_coords, torch.zeros(pad_shape)], dim=1)
207
+
208
+ _path_mask = torch.ones(batch_size, self.max_path_len, dtype=torch.long)
209
+ is_padding = (path_coords == 0).all(dim=-1)
210
+ _path_mask[is_padding] = 0
211
+ elif isinstance(path_coords, np.ndarray):
212
+ path_coords = torch.from_numpy(path_coords).float()
213
+ if path_coords.dim() == 2:
214
+ path_coords = path_coords.unsqueeze(0)
215
+ # If user provided raw (x,y,t) triples but model expects engineered features,
216
+ # convert to motion features and resample.
217
+ if path_coords.shape[-1] == 3 and self.path_input_dim == 6:
218
+ processed_paths = []
219
+ path_masks = []
220
+ for path in path_coords.cpu().numpy():
221
+ raw = [{"x": float(p[0]), "y": float(p[1]), "t": float(p[2])} for p in path]
222
+ path_feats, mask = preprocess_raw_path_to_features(
223
+ raw,
224
+ self.max_path_len,
225
+ resample_mode=self.path_resample_mode,
226
+ )
227
+ processed_paths.append(path_feats)
228
+ path_masks.append(mask)
229
+
230
+ path_coords = torch.from_numpy(np.stack(processed_paths)).float()
231
+ _path_mask = torch.from_numpy(np.stack(path_masks)).long()
232
+ else:
233
+ _path_mask = torch.ones(
234
+ path_coords.shape[0], self.max_path_len, dtype=torch.long
235
+ )
236
+ elif isinstance(path_coords, torch.Tensor):
237
+ if path_coords.dim() == 2:
238
+ path_coords = path_coords.unsqueeze(0)
239
+ # If user provided raw (x,y,t) triples but model expects engineered features,
240
+ # convert to motion features and resample.
241
+ if path_coords.shape[-1] == 3 and self.path_input_dim == 6:
242
+ processed_paths = []
243
+ path_masks = []
244
+ for path in path_coords.detach().cpu().numpy():
245
+ raw = [{"x": float(p[0]), "y": float(p[1]), "t": float(p[2])} for p in path]
246
+ path_feats, mask = preprocess_raw_path_to_features(
247
+ raw,
248
+ self.max_path_len,
249
+ resample_mode=self.path_resample_mode,
250
+ )
251
+ processed_paths.append(path_feats)
252
+ path_masks.append(mask)
253
 
254
+ path_coords = torch.from_numpy(np.stack(processed_paths)).float()
255
+ _path_mask = torch.from_numpy(np.stack(path_masks)).long()
256
+ else:
257
+ _path_mask = torch.ones(
258
+ path_coords.shape[0], self.max_path_len, dtype=torch.long
259
+ )
260
 
261
  result["path_coords"] = path_coords
262
  else:
263
  # No path coords provided, create empty/zero tensors
264
+ path_coords = torch.zeros(batch_size, self.max_path_len, self.path_input_dim)
265
  _path_mask = torch.zeros(batch_size, self.max_path_len, dtype=torch.long)
266
  result["path_coords"] = path_coords
267
 
 
296
  # Truncate but preserve EOS at the end
297
  for i in range(len(encoded_raw["input_ids"])):
298
  if len(encoded_raw["input_ids"][i]) > text_max_length:
299
+ encoded_raw["input_ids"][i] = encoded_raw["input_ids"][i][
300
+ : text_max_length - 1
301
+ ] + [eos_id]
302
 
303
  # Pad sequences
304
  if padding:
 
403
  """
404
  return self.tokenizer.decode(token_ids, **kwargs)
405
 
406
+ def encode_path(self, path_coords, *, return_tensors: str | None = "pt", **kwargs: Any):
407
+ """Create model inputs from a swipe path only (no text)."""
408
+ return self(path_coords=path_coords, text=None, return_tensors=return_tensors, **kwargs)
409
 
410
+ def encode_text(self, text, *, return_tensors: str | None = "pt", **kwargs: Any):
411
+ """Create model inputs from text only (no path)."""
412
+ return self(path_coords=None, text=text, return_tensors=return_tensors, **kwargs)
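The encode_path / encode_text helpers above are thin wrappers around __call__. Continuing the sketch (same placeholder processor and raw_path as above):

# Path-only inputs: text tokens come back fully padded with a zero text attention mask.
path_inputs = processor.encode_path(raw_path)

# Text-only inputs: path_coords is a zero tensor with a zero path attention mask.
text_inputs = processor.encode_text(["hello", "world"])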
413
 
414
+ # Preprocessing methods are now imported from shared preprocessing module
415
+ # See src/swipealot/data/preprocessing.py for the implementation
416
 
417
+ def save_pretrained(
418
+ self,
419
+ save_directory,
420
+ push_to_hub=False,
421
+ **kwargs,
422
+ ):
423
  """
424
+ Save the processor to a directory, ensuring auto_map is included.
 
425
  """
426
+ # Call parent save_pretrained
427
+ result = super().save_pretrained(
428
+ save_directory,
429
+ push_to_hub=push_to_hub,
430
+ **kwargs,
431
+ )
432
+
433
+ # Add auto_map to processor_config.json for AutoProcessor compatibility
434
+ import json
435
+ from pathlib import Path
436
+
437
+ # Try both possible config file names
438
+ for config_name in ["preprocessor_config.json", "processor_config.json"]:
439
+ processor_config_path = Path(save_directory) / config_name
440
+ if processor_config_path.exists():
441
+ with open(processor_config_path) as f:
442
+ config = json.load(f)
443
+
444
+ config["auto_map"] = {"AutoProcessor": "processing_swipe.SwipeProcessor"}
445
+
446
+ with open(processor_config_path, "w") as f:
447
+ json.dump(config, f, indent=2)
448
+ break
 
449
 
450
+ return result
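The save_pretrained override above is what lets a saved processor be reloaded through AutoProcessor without importing this package directly. A round-trip sketch, assuming processing_swipe.py sits next to the saved config files (as it does in this repository):

from transformers import AutoProcessor

processor.save_pretrained("./swipe-processor-out")
# preprocessor_config.json (or processor_config.json) now carries the auto_map entry,
# so AutoProcessor can resolve processing_swipe.SwipeProcessor again:
reloaded = AutoProcessor.from_pretrained("./swipe-processor-out", trust_remote_code=True)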
processor_config.json CHANGED
@@ -1,6 +1,8 @@
1
  {
2
  "max_char_len": 48,
3
  "max_path_len": 128,
 
  "processor_class": "SwipeProcessor",
5
  "auto_map": {
6
  "AutoProcessor": "processing_swipe.SwipeProcessor"
 
1
  {
2
  "max_char_len": 48,
3
  "max_path_len": 128,
4
+ "path_input_dim": 6,
5
+ "path_resample_mode": "time",
6
  "processor_class": "SwipeProcessor",
7
  "auto_map": {
8
  "AutoProcessor": "processing_swipe.SwipeProcessor"
special_tokens_map.json CHANGED
@@ -1,8 +1,44 @@
1
  {
2
- "cls_token": "[CLS]",
3
- "eos_token": "[EOS]",
4
- "mask_token": "[MASK]",
5
- "pad_token": "[PAD]",
6
- "sep_token": "[SEP]",
7
- "unk_token": "[UNK]"
8
  }
 
1
  {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "[EOS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "[MASK]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "[PAD]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "sep_token": {
31
+ "content": "[SEP]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "unk_token": {
38
+ "content": "[UNK]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ }
44
  }
tokenization_swipe.py CHANGED
@@ -117,7 +117,8 @@ class SwipeTokenizer(PreTrainedTokenizer):
117
  Returns:
118
  int: Token ID
119
  """
120
- return self._tokenizer.char_to_id.get(token, self._tokenizer.unk_token_id)
 
121
 
122
  def _convert_id_to_token(self, index: int) -> str:
123
  """
@@ -154,6 +155,41 @@ class SwipeTokenizer(PreTrainedTokenizer):
154
  filtered = [t for t in tokens if t not in special_tokens]
155
  return "".join(filtered)
156
 
157
  def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple:
158
  """
159
  Save the tokenizer vocabulary to a directory.
 
117
  Returns:
118
  int: Token ID
119
  """
120
+ # Delegate to the core tokenizer to keep token/id mapping logic in one place.
121
+ return self._tokenizer.token_to_id(token)
122
 
123
  def _convert_id_to_token(self, index: int) -> str:
124
  """
 
155
  filtered = [t for t in tokens if t not in special_tokens]
156
  return "".join(filtered)
157
 
158
+ def save_pretrained(
159
+ self,
160
+ save_directory,
161
+ legacy_format=None,
162
+ filename_prefix=None,
163
+ push_to_hub=False,
164
+ **kwargs,
165
+ ):
166
+ """Save the tokenizer and write `auto_map` for `AutoTokenizer` loading."""
167
+ # Call parent save_pretrained
168
+ result = super().save_pretrained(
169
+ save_directory,
170
+ legacy_format=legacy_format,
171
+ filename_prefix=filename_prefix,
172
+ push_to_hub=push_to_hub,
173
+ **kwargs,
174
+ )
175
+
176
+ # Add auto_map to tokenizer_config.json for AutoTokenizer compatibility
177
+ from pathlib import Path
178
+
179
+ tokenizer_config_path = Path(save_directory) / "tokenizer_config.json"
180
+ if tokenizer_config_path.exists():
181
+ with open(tokenizer_config_path) as f:
182
+ config = json.load(f)
183
+
184
+ # For tokenizers, Transformers expects the 2-tuple form: [slow, fast].
185
+ # We only ship a slow tokenizer implementation, so fast is None.
186
+ config["auto_map"] = {"AutoTokenizer": ["tokenization_swipe.SwipeTokenizer", None]}
187
+
188
+ with open(tokenizer_config_path, "w") as f:
189
+ json.dump(config, f, indent=2)
190
+
191
+ return result
192
+
193
  def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple:
194
  """
195
  Save the tokenizer vocabulary to a directory.
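The two-element auto_map written above ([slow, fast], with no fast implementation) is what AutoTokenizer needs to resolve the custom class. A short loading sketch, with the checkpoint path again a placeholder:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/swipe-checkpoint", trust_remote_code=True)
ids = tok("hello")["input_ids"]  # character-level ids from SwipeTokenizer
print(tok.decode(ids))           # convert_tokens_to_string above filters out special tokens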
tokenizer.py CHANGED
@@ -48,16 +48,27 @@ class CharacterTokenizer:
48
  self.id_to_char = {idx: char for char, idx in self.char_to_id.items()}
49
  self.vocab_size = len(self.char_to_id)
50
 
 
51
  def encode(self, text: str) -> list[int]:
52
  """Encode text to token IDs (case-insensitive, punctuation -> [PUNC])."""
53
- unk_id = self.char_to_id[self.unk_token]
54
- punc_id = self.char_to_id[self.punc_token]
55
  tokens = []
56
  for char in text.lower():
57
- if char.isalpha() or char.isdigit():
58
- tokens.append(self.char_to_id.get(char, unk_id))
59
- else:
60
- tokens.append(punc_id)
61
  return tokens
62
 
63
  def decode(self, token_ids: list[int]) -> str:
 
48
  self.id_to_char = {idx: char for char, idx in self.char_to_id.items()}
49
  self.vocab_size = len(self.char_to_id)
50
 
51
+ def encode_char(self, char: str) -> int:
52
+ """Encode a single character to a token id (case-insensitive; punctuation -> [PUNC])."""
53
+ char = char.lower()
54
+ if char.isalpha() or char.isdigit():
55
+ return self.char_to_id.get(char, self.unk_token_id)
56
+ return self.punc_token_id
57
+
58
+ def token_to_id(self, token: str) -> int:
59
+ """Map a token string to its id (supports specials and single characters)."""
60
+ direct = self.char_to_id.get(token)
61
+ if direct is not None:
62
+ return direct
63
+ if len(token) == 1:
64
+ return self.encode_char(token)
65
+ return self.unk_token_id
66
+
67
  def encode(self, text: str) -> list[int]:
68
  """Encode text to token IDs (case-insensitive, punctuation -> [PUNC])."""
 
  tokens = []
70
  for char in text.lower():
71
+ tokens.append(self.encode_char(char))
72
  return tokens
73
 
74
  def decode(self, token_ids: list[int]) -> str:
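To make the refactored mapping concrete, a small behavioral sketch. It assumes core is a CharacterTokenizer built from this repo's vocabulary (construction is not shown in this diff), and the exact ids depend on that vocabulary:

core.encode_char("A")       # same id as "a": encoding is case-insensitive
core.encode_char("!")       # any non-alphanumeric character maps to the [PUNC] id
core.token_to_id("[MASK]")  # special tokens resolve directly through char_to_id
core.token_to_id("q")       # single characters fall back to encode_char
core.token_to_id("word")    # unknown multi-character strings return unk_token_id
core.encode("Hi!")          # == [core.encode_char("h"), core.encode_char("i"), core.punc_token_id]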
tokenizer_config.json CHANGED
@@ -49,6 +49,12 @@
49
  "special": true
50
  }
51
  },
52
  "clean_up_tokenization_spaces": false,
53
  "cls_token": "[CLS]",
54
  "eos_token": "[EOS]",
@@ -59,11 +65,5 @@
59
  "processor_class": "SwipeProcessor",
60
  "sep_token": "[SEP]",
61
  "tokenizer_class": "SwipeTokenizer",
62
- "unk_token": "[UNK]",
63
- "auto_map": {
64
- "AutoTokenizer": [
65
- "tokenization_swipe.SwipeTokenizer",
66
- null
67
- ]
68
- }
69
  }
 
49
  "special": true
50
  }
51
  },
52
+ "auto_map": {
53
+ "AutoTokenizer": [
54
+ "tokenization_swipe.SwipeTokenizer",
55
+ null
56
+ ]
57
+ },
58
  "clean_up_tokenization_spaces": false,
59
  "cls_token": "[CLS]",
60
  "eos_token": "[EOS]",
 
65
  "processor_class": "SwipeProcessor",
66
  "sep_token": "[SEP]",
67
  "tokenizer_class": "SwipeTokenizer",
68
+ "unk_token": "[UNK]"
69
  }