bowang0911 commited on
Commit
29fee2c
·
1 Parent(s): aedbcae

feat: add custom transformer layer and contextual pooling

Browse files
Files changed (2) hide show
  1. contextual_pooling.py +192 -0
  2. custom_st.py +199 -0
contextual_pooling.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom Pooling module for Late Chunking contextual embeddings.
3
+
4
+ This module extracts chunk-level embeddings from concatenated token embeddings.
5
+ Works together with custom_st.py to enable late chunking.
6
+
7
+ Usage:
8
+ Place this file alongside custom_st.py in your model directory.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import os
15
+ from typing import Any
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+
21
def mean_pooling_chunked(
    token_embeddings: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """
    Vectorized masked mean pooling over the per-chunk token dimension.

    Args:
        token_embeddings: (batch_size, num_chunks, chunk_max_seq_length, hidden_dim)
        attention_mask: (batch_size, num_chunks, chunk_max_seq_length)

    Returns:
        pooled: (batch_size, num_chunks, hidden_dim) — mean of the unmasked
        token embeddings within each chunk.
    """
    # Broadcast the mask over the hidden dimension: (b, c, s) -> (b, c, s, 1).
    mask = attention_mask.float().unsqueeze(-1)

    # Masked positions contribute zero to the per-chunk sums.
    totals = (token_embeddings * mask).sum(dim=2)

    # Clamp guards against division by zero for fully-masked chunks.
    counts = mask.sum(dim=2).clamp(min=1e-9)

    return totals / counts
44
+
45
+
46
class Pooling(nn.Module):
    """
    Pooling module for late chunking that extracts individual chunk embeddings.

    This module reads 'num_chunks_list' from features (set by custom_st.py
    Transformer) and extracts chunk embeddings by reshaping the concatenated
    token embeddings and mean pooling within each fixed-length chunk.

    Args:
        chunk_max_seq_length: Maximum sequence length for each chunk in tokens.
            Must match the value in custom_st.py Transformer.

    Config keys:
        - chunk_max_seq_length: The fixed token length per chunk

    Example:
        >>> pooling = Pooling(chunk_max_seq_length=128)
        >>> # Input: token embeddings from 3 chunks of 128 tokens each
        >>> features = {
        ...     'token_embeddings': torch.randn(1, 384, 768),  # batch=1, seq=384
        ...     'attention_mask': torch.ones(1, 384),
        ...     'num_chunks_list': [3],  # Set by Transformer.tokenize()
        ... }
        >>> output = pooling(features)
        >>> output['chunk_embeddings'][0].shape  # list: one tensor per document
        torch.Size([3, 768])
    """

    def __init__(self, chunk_max_seq_length: int = 128, **kwargs) -> None:
        super().__init__()
        if chunk_max_seq_length <= 0:
            raise ValueError("chunk_max_seq_length must be positive")
        self.config_keys = ["chunk_max_seq_length"]
        self.chunk_max_seq_length = chunk_max_seq_length

    def forward(
        self,
        features: dict[str, torch.Tensor],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """
        Forward pass that extracts chunk embeddings via mean pooling.

        Args:
            features: Dictionary with 'token_embeddings', 'attention_mask',
                and 'num_chunks_list'
            **kwargs: Additional arguments (ignored)

        Returns:
            The same features dict with 'chunk_embeddings' added: a list of
            length batch_size whose i-th entry is a (num_chunks_i, hidden_dim)
            tensor. A list (not a stacked tensor) is used because documents
            may contain different numbers of chunks.

        Raises:
            ValueError: If 'num_chunks_list' is missing, or a document's
                sequence length does not equal num_chunks * chunk_max_seq_length.
        """
        token_embeddings = features["token_embeddings"]
        attention_mask = features["attention_mask"]
        num_chunks_list = features.get("num_chunks_list")

        if num_chunks_list is None:
            raise ValueError(
                "num_chunks_list not found in features. "
                "Make sure you're using the custom_st.py Transformer module."
            )

        # Extract chunk embeddings per document; each document keeps its own
        # chunk count, so results are collected in a list.
        batch_chunk_embeddings = []
        for i, num_chunks in enumerate(num_chunks_list):
            pooled = self._extract_chunk_embeddings(
                token_embeddings[i : i + 1],  # keep batch dim for the reshape
                attention_mask[i : i + 1],
                num_chunks,
            )
            batch_chunk_embeddings.append(pooled[0])  # drop batch dim

        features["chunk_embeddings"] = batch_chunk_embeddings
        return features

    def _extract_chunk_embeddings(
        self,
        token_embeddings: torch.Tensor,
        attention_mask: torch.Tensor,
        num_chunks: int,
    ) -> torch.Tensor:
        """
        Extract chunk embeddings from concatenated token embeddings.

        Args:
            token_embeddings: (1, seq_len, hidden_dim)
            attention_mask: (1, seq_len)
            num_chunks: Number of chunks packed into the sequence

        Returns:
            chunk_embeddings: (1, num_chunks, hidden_dim)
        """
        batch_size, seq_len, hidden_dim = token_embeddings.shape

        # The Transformer pads every chunk to exactly chunk_max_seq_length,
        # so the concatenated length must match precisely.
        expected_seq_len = num_chunks * self.chunk_max_seq_length
        if seq_len != expected_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} does not match num_chunks * chunk_max_seq_length "
                f"({num_chunks} * {self.chunk_max_seq_length} = {expected_seq_len})"
            )

        # Reshape to (batch, num_chunks, chunk_max_seq_length, hidden) and
        # (batch, num_chunks, chunk_max_seq_length) so all chunks pool at once.
        emb = token_embeddings.view(
            batch_size, num_chunks, self.chunk_max_seq_length, hidden_dim
        )
        mask = attention_mask.view(
            batch_size, num_chunks, self.chunk_max_seq_length
        ).float().unsqueeze(-1)

        # Masked mean per chunk (same math as mean_pooling_chunked, inlined so
        # the module is self-contained); clamp avoids division by zero.
        summed = (emb * mask).sum(dim=2)
        counts = mask.sum(dim=2).clamp(min=1e-9)
        return summed / counts

    def get_sentence_embedding_dimension(self) -> int | None:
        """Return None: the embedding dimension comes from the transformer."""
        return None

    def get_config_dict(self) -> dict[str, Any]:
        """Return the config dictionary used for serialization."""
        return {key: getattr(self, key) for key in self.config_keys}

    def save(self, output_path: str, safe_serialization: bool = True) -> None:
        """
        Save the module configuration to <output_path>/config.json.

        Args:
            output_path: Directory to write into.
            safe_serialization: Accepted for interface compatibility with
                sentence-transformers; this module has no weights to serialize.
        """
        with open(
            os.path.join(output_path, "config.json"), "w", encoding="utf-8"
        ) as f:
            json.dump(self.get_config_dict(), f, indent=2)

    @staticmethod
    def load(input_path: str) -> "Pooling":
        """Load the module from a directory containing config.json."""
        config_path = os.path.join(input_path, "config.json")
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        return Pooling(**config)
custom_st.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom Sentence Transformer module for Late Chunking contextual embeddings.
3
+
4
+ This module ONLY accepts nested lists of chunks: list[list[str]]
5
+ All chunks are encoded together to preserve positional context (late chunking).
6
+
7
+ Usage:
8
+ Place this file alongside contextual_pooling.py in your model directory.
9
+ Load with: SentenceTransformer("model-path", trust_remote_code=True)
10
+
11
+ documents = [
12
+ ["chunk 1", "chunk 2", "chunk 3"],
13
+ ["chunk 1", "chunk 2"]
14
+ ]
15
+ embeddings = model.encode(documents)
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import os
22
+ from typing import Any, Dict, List, Optional
23
+
24
+ import torch
25
+ from torch import nn
26
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
27
+
28
+
29
class Transformer(nn.Module):
    """
    Custom Transformer wrapper for late chunking contextual embeddings.

    This class handles tokenization of nested list inputs (documents with
    chunks) and concatenates chunks with fixed chunk_max_seq_length so that
    all chunks of a document are encoded together (late chunking).

    Args:
        model_name_or_path: Hugging Face model name or path
        chunk_max_seq_length: Maximum sequence length for each chunk in tokens.
            Each chunk will be padded or truncated to exactly this many tokens.
        model_args: Keyword arguments passed to AutoModel.from_pretrained()
        tokenizer_args: Keyword arguments passed to AutoTokenizer.from_pretrained()
        config_args: Keyword arguments passed to AutoConfig.from_pretrained()
        cache_dir: Cache directory for Hugging Face models
        tokenizer_name_or_path: Tokenizer name or path (defaults to model_name_or_path)
    """

    def __init__(
        self,
        model_name_or_path: str,
        chunk_max_seq_length: int = 128,
        model_args: Optional[Dict[str, Any]] = None,
        tokenizer_args: Optional[Dict[str, Any]] = None,
        config_args: Optional[Dict[str, Any]] = None,
        cache_dir: Optional[str] = None,
        tokenizer_name_or_path: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__()

        self.config_keys = ["chunk_max_seq_length"]
        self.chunk_max_seq_length = chunk_max_seq_length

        model_args = model_args if model_args is not None else {}
        tokenizer_args = tokenizer_args if tokenizer_args is not None else {}
        config_args = config_args if config_args is not None else {}

        # Load config first so model_args can still override model kwargs.
        config = AutoConfig.from_pretrained(
            model_name_or_path, **config_args, cache_dir=cache_dir
        )
        self.auto_model = AutoModel.from_pretrained(
            model_name_or_path, config=config, cache_dir=cache_dir, **model_args
        )

        # Tokenizer may live at a different path than the model weights.
        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name_or_path if tokenizer_name_or_path else model_name_or_path,
            cache_dir=cache_dir,
            **tokenizer_args,
        )

    def forward(
        self, features: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]:
        """
        Forward pass through the transformer model.

        Args:
            features: Dictionary with 'input_ids' and 'attention_mask'

        Returns:
            The same dict with 'token_embeddings' (the last hidden state) added
        """
        output_states = self.auto_model(
            input_ids=features["input_ids"],
            attention_mask=features["attention_mask"],
            return_dict=True,
        )

        # Token-level embeddings; chunk pooling happens in the Pooling module.
        features["token_embeddings"] = output_states.last_hidden_state

        return features

    def tokenize(
        self, texts: List[List[str]], padding: bool = True
    ) -> Dict[str, torch.Tensor]:
        """
        Tokenize nested list of chunks for late chunking.

        Every chunk is padded/truncated to exactly chunk_max_seq_length tokens
        and the chunks of each document are concatenated into one sequence.

        Args:
            texts: list[list[str]] - list of documents, each document a list of
                chunk texts. All documents in one batch must have the same
                number of chunks (the result is a rectangular tensor).
            padding: Accepted for sentence-transformers interface compatibility;
                chunks are always padded to max_length regardless of this flag.

        Returns:
            Dictionary with 'input_ids', 'attention_mask' (both of shape
            (num_docs, num_chunks * chunk_max_seq_length)) and
            'num_chunks_list' (list of per-document chunk counts, consumed
            by the Pooling module)

        Raises:
            ValueError: On empty input, non-nested input, a document with no
                chunks, or documents with differing chunk counts.
        """
        if not texts:
            raise ValueError("Input texts cannot be empty")

        if not isinstance(texts[0], list):
            raise ValueError(
                "This model only accepts nested lists: list[list[str]]. "
                "Expected format: [['chunk1', 'chunk2'], ['chunk1']]"
            )

        for i, doc_chunks in enumerate(texts):
            if not doc_chunks:
                raise ValueError(
                    f"Document at index {i} has no chunks. "
                    "Each document must have at least one chunk."
                )

        # Ragged batches would previously fail inside torch.tensor() with a
        # cryptic error; fail fast with an actionable message instead.
        chunk_counts = [len(doc_chunks) for doc_chunks in texts]
        if len(set(chunk_counts)) > 1:
            raise ValueError(
                "All documents in a batch must have the same number of chunks "
                f"to form a rectangular tensor; got chunk counts {chunk_counts}. "
                "Encode documents with differing chunk counts in separate calls."
            )

        all_input_ids = []
        all_attention_masks = []
        num_chunks_list = []

        for doc_chunks in texts:
            # Tokenize each chunk to a fixed size, then concatenate so the
            # whole document is encoded as one contiguous sequence.
            chunk_input_ids: List[int] = []
            chunk_attention_masks: List[int] = []

            for chunk in doc_chunks:
                chunk_tokens = self.tokenizer(
                    chunk,
                    max_length=self.chunk_max_seq_length,
                    truncation=True,
                    padding="max_length",
                    return_tensors=None,
                )
                chunk_input_ids.extend(chunk_tokens["input_ids"])
                chunk_attention_masks.extend(chunk_tokens["attention_mask"])

            all_input_ids.append(chunk_input_ids)
            all_attention_masks.append(chunk_attention_masks)
            num_chunks_list.append(len(doc_chunks))

        return {
            "input_ids": torch.tensor(all_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(all_attention_masks, dtype=torch.long),
            "num_chunks_list": num_chunks_list,  # Consumed by Pooling
        }

    def get_word_embedding_dimension(self) -> int:
        """Return the hidden size of the underlying transformer."""
        return self.auto_model.config.hidden_size

    def save(self, output_path: str, safe_serialization: bool = True) -> None:
        """Save the model, tokenizer, and custom configuration."""
        self.auto_model.save_pretrained(
            output_path, safe_serialization=safe_serialization
        )
        self.tokenizer.save_pretrained(output_path)

        # Persist chunk_max_seq_length so load() can restore it.
        with open(
            os.path.join(output_path, "sentence_bert_config.json"), "w", encoding="utf-8"
        ) as f:
            json.dump(
                {key: getattr(self, key) for key in self.config_keys},
                f,
                indent=2,
            )

    @staticmethod
    def load(input_path: str) -> "Transformer":
        """Load the model from a directory, restoring the saved config."""
        config_path = os.path.join(input_path, "sentence_bert_config.json")
        if os.path.exists(config_path):
            with open(config_path, encoding="utf-8") as f:
                config = json.load(f)
        else:
            # Fall back to defaults when no custom config was saved.
            config = {}

        return Transformer(model_name_or_path=input_path, **config)