Sarthak commited on
Commit
4255a26
·
1 Parent(s): 473c3a0

chore: moved tokenlearn in as an internal package

Browse files
src/distiller/tokenlearn/__init__.py ADDED
File without changes
src/distiller/tokenlearn/featurize.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import logging
4
+ from collections.abc import Iterator
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ from datasets import load_dataset
9
+ from more_itertools import batched
10
+ from sentence_transformers import SentenceTransformer
11
+ from tqdm import tqdm
12
+ from transformers.tokenization_utils import PreTrainedTokenizer
13
+
14
+ _SAVE_EVERY = 32
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def featurize(
    dataset: Iterator[dict[str, str]],
    model: SentenceTransformer,
    output_dir: str,
    max_means: int,
    batch_size: int,
    text_key: str,
) -> None:
    """Featurize a stream of records and dump texts plus mean token embeddings to disk.

    Every ``_SAVE_EVERY`` batches the accumulated texts are written to
    ``feature_{i}.json`` and the matching mean embeddings to ``feature_{i}.npy``.
    Batches already present on disk are skipped, so interrupted runs can resume.

    :param dataset: Iterator over records; each record maps ``text_key`` to a string.
    :param model: Sentence transformer used to produce token embeddings.
    :param output_dir: Directory in which the feature files are written.
    :param max_means: Stop once roughly this many mean embeddings were produced.
    :param batch_size: Number of records encoded per call to ``model.encode``.
    :param text_key: Key under which each record stores its text.
    :raises ValueError: If the model has no embedding dimension or a record's
        text is not a string.
    """
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Resume support: find the highest batch index already dumped to disk.
    largest_batch = max([int(x.stem.split("_")[1]) for x in output_dir_path.glob("*.json")], default=0)
    if largest_batch:
        logger.info(f"Resuming from batch {largest_batch}, skipping previous batches.")

    texts: list[str] = []
    embeddings: list[np.ndarray] = []
    dim = model.get_sentence_embedding_dimension()
    if dim is None:
        msg = "Model has no sentence embedding dimension."
        raise ValueError(msg)

    tokenizer: PreTrainedTokenizer = model.tokenizer
    # Binding i in case the dataset is empty.
    i = 0
    for i, batch in tqdm(enumerate(batched(dataset, n=batch_size))):
        if i * batch_size >= max_means:
            logger.info(f"Reached maximum number of means: {max_means}")
            break
        if largest_batch and i <= largest_batch:
            # This batch was already featurized in a previous run.
            continue
        # Renamed from `batch` to avoid shadowing the loop variable.
        batch_texts = [record[text_key] for record in batch]

        if not all(isinstance(x, str) for x in batch_texts):
            msg = f"Detected non-string at batch: {i}"
            raise ValueError(msg)

        batch_embeddings = model.encode(batch_texts, output_value="token_embeddings")  # type: ignore # Annoying
        for text, embedding in zip(batch_texts, batch_embeddings, strict=False):
            texts.append(_truncate_text(tokenizer, text))
            # Drop the first/last (special) token positions and average the rest.
            embeddings.append(embedding[1:-1].mean(axis=0).cpu().numpy())
        if i and i % _SAVE_EVERY == 0:
            _save_chunk(output_dir_path, i, texts, embeddings)
            texts = []
            embeddings = []
    if texts:
        _save_chunk(output_dir_path, i, texts, embeddings)


def _save_chunk(output_dir_path: Path, i: int, texts: list[str], embeddings: list[np.ndarray]) -> None:
    """Write one chunk as feature_{i}.json (texts) and feature_{i}.npy (vectors)."""
    # Context manager closes the handle deterministically; the original leaked
    # the file object returned by open() inside json.dump(...).
    with open(output_dir_path / f"feature_{i}.json", "w") as f:
        json.dump(texts, f, indent=4)
    np.save(output_dir_path / f"feature_{i}.npy", embeddings)
71
+
72
+
73
def _truncate_text(tokenizer: PreTrainedTokenizer, text: str) -> str:
    """Clip ``text`` so it round-trips within the tokenizer's maximum length."""
    limit = tokenizer.model_max_length
    token_ids = tokenizer.encode(text, truncation=True, max_length=limit)
    return tokenizer.decode(token_ids, skip_special_tokens=True)
81
+
82
+
83
def main() -> None:
    """Main function to featurize texts using a sentence transformer."""
    parser = argparse.ArgumentParser(description="Featurize texts using a sentence transformer.")
    # Flags are declared in the same order as before so --help output is unchanged.
    parser.add_argument("--model-name", type=str, default="baai/bge-base-en-v1.5", help="The model name for distillation (e.g., 'baai/bge-base-en-v1.5').")
    parser.add_argument("--output-dir", type=str, default=None, help="Directory to save the featurized texts.")
    parser.add_argument("--dataset-path", type=str, default="allenai/c4", help="The dataset path or name (e.g. 'allenai/c4').")
    parser.add_argument("--dataset-name", type=str, default="en", help="The dataset configuration name (e.g., 'en' for C4).")
    parser.add_argument("--dataset-split", type=str, default="train", help="The dataset split (e.g., 'train', 'validation').")
    # store_false => args.no_streaming defaults to True (streaming on) and the
    # flag turns it off; it is forwarded directly as `streaming=`.
    parser.add_argument("--no-streaming", action="store_false", help="Disable streaming mode when loading the dataset.")
    parser.add_argument("--max-means", type=int, default=1000000, help="The maximum number of mean embeddings to generate.")
    parser.add_argument("--key", type=str, default="text", help="The key of the text field in the dataset to featurize (default: 'text').")
    parser.add_argument("--batch-size", type=int, default=32, help="Batch size to use for encoding the texts.")

    args = parser.parse_args()

    # Derive a default output directory from the model and dataset names.
    if args.output_dir is not None:
        output_dir = args.output_dir
    else:
        sanitized_model = args.model_name.replace("/", "_")
        sanitized_dataset = args.dataset_path.replace("/", "_")
        output_dir = f"{sanitized_model}_{sanitized_dataset}_featurized"

    encoder = SentenceTransformer(args.model_name)
    dataset = load_dataset(
        args.dataset_path,
        name=args.dataset_name,
        split=args.dataset_split,
        streaming=args.no_streaming,
    )

    featurize(iter(dataset), encoder, output_dir, args.max_means, args.batch_size, args.key)
158
+
159
+
160
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
src/distiller/tokenlearn/pretrain.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import TYPE_CHECKING
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ from torch.utils.data import DataLoader, Dataset
11
+ from tqdm import tqdm
12
+
13
+ from distiller.model2vec.distill.utils import select_optimal_device
14
+ from distiller.model2vec.model import StaticModel
15
+
16
+ if TYPE_CHECKING:
17
+ from tokenizers import Tokenizer
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class StaticModelFineTuner(nn.Module):
    def __init__(self, vectors: torch.Tensor, out_dim: int, pad_id: int) -> None:
        """
        Initialize from a model.

        :param vectors: The vectors to use.
        :param out_dim: The output dimension.
        :param pad_id: The padding id.
        """
        super().__init__()
        self.pad_id = pad_id
        norms = vectors.norm(dim=1)
        # Store unit-length vectors; the per-token norms become learnable weights.
        unit_vectors = vectors / norms[:, None]
        self.embeddings = nn.Embedding.from_pretrained(unit_vectors.clone(), freeze=False, padding_idx=pad_id)
        self.n_out = out_dim
        self.out_layer = nn.Linear(unit_vectors.shape[1], self.n_out)
        weights = torch.Tensor(norms)
        # Padding must never contribute to the weighted mean.
        weights[pad_id] = 0
        self.w = nn.Parameter(weights)

    def sub_forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Weighted mean-pool of the token embeddings, with padding masked out."""
        # Clamp out-of-range ids to 0 so indexing into the weight vector is safe.
        in_bounds = (input_ids >= 0) & (input_ids < self.w.shape[0])
        if not in_bounds.all():
            input_ids = torch.where(in_bounds, input_ids, 0)
        token_weights = self.w[input_ids]
        not_pad = (input_ids != self.pad_id).float()
        token_weights = token_weights * not_pad
        # Epsilon keeps the division finite for all-padding rows.
        n_tokens = not_pad.sum(1) + 1e-16
        # Same clamp against the embedding table bounds.
        emb_in_bounds = (input_ids >= 0) & (input_ids < self.embeddings.num_embeddings)
        if not emb_in_bounds.all():
            input_ids = torch.where(emb_in_bounds, input_ids, 0)
        token_vectors = self.embeddings(input_ids)
        # Batched dot product: sum of weight * embedding over the sequence axis.
        pooled = torch.bmm(token_weights[:, None, :], token_vectors).squeeze(1)
        # Divide by the non-padding count to simulate an actual mean.
        return pooled / n_tokens[:, None]

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Forward pass through the mean, and a classifier layer after."""
        pooled = self.sub_forward(x)
        return self.out_layer(pooled), pooled

    @property
    def device(self) -> torch.device:
        """Get the device of the model."""
        return self.embeddings.weight.device
73
+
74
+
75
+ class TextDataset(Dataset):
76
+ def __init__(self, texts: list[str], targets: torch.Tensor, tokenizer: Tokenizer) -> None:
77
+ """
78
+ Initialize the dataset.
79
+
80
+ :param texts: The texts to tokenize.
81
+ :param targets: The targets.
82
+ :param tokenizer: The tokenizer to use.
83
+ :raises ValueError: If the number of labels does not match the number of texts.
84
+ """
85
+ if len(targets) != len(texts):
86
+ msg = "Number of labels does not match number of texts."
87
+ raise ValueError(msg)
88
+ self.texts = [x[:20_000] for x in texts]
89
+ self.tokenized_texts: list[list[int]] = [
90
+ encoding.ids[:512] for encoding in tokenizer.encode_batch_fast(self.texts, add_special_tokens=False)
91
+ ]
92
+ self.targets = targets
93
+ self.tokenizer = tokenizer
94
+
95
+ def __len__(self) -> int:
96
+ """Return the length of the dataset."""
97
+ return len(self.tokenized_texts)
98
+
99
+ def __getitem__(self, index: int) -> tuple[list[int], torch.Tensor]:
100
+ """Gets an item."""
101
+ return self.tokenized_texts[index], self.targets[index]
102
+
103
+ @staticmethod
104
+ def collate_fn(batch: list[tuple[list[list[int]], int]]) -> tuple[torch.Tensor, torch.Tensor]:
105
+ """Collate function."""
106
+ texts, targets = zip(*batch, strict=False)
107
+
108
+ tensors = [torch.LongTensor(x).int() for x in texts]
109
+ padded = pad_sequence(tensors, batch_first=True, padding_value=0)
110
+
111
+ return padded, torch.stack(targets)
112
+
113
+ def to_dataloader(self, shuffle: bool, batch_size: int = 32) -> DataLoader:
114
+ """Convert the dataset to a DataLoader."""
115
+ return DataLoader(self, collate_fn=self.collate_fn, shuffle=shuffle, batch_size=batch_size)
116
+
117
+
118
def train_supervised(
    train_dataset: TextDataset,
    validation_dataset: TextDataset,
    model: StaticModel,
    patience: int | None = 5,
    device: str | None = None,
    batch_size: int = 256,
    lr: float = 1e-3,
) -> StaticModel:
    """
    Train a tokenlearn model.

    Trains a StaticModelFineTuner against MSE targets, evaluates on the
    validation set every 1000 steps (and at each epoch end), early-stops on
    validation loss, and rebuilds a StaticModel from the best checkpoint.

    :param train_dataset: The training dataset.
    :param validation_dataset: The validation dataset.
    :param model: The model to train.
    :param patience: The number of evaluations to wait before early stopping.
    :param device: The device to train on.
    :param batch_size: The batch size.
    :param lr: The learning rate.
    :return: The trained model.
    """
    device = select_optimal_device(device)
    train_dataloader = train_dataset.to_dataloader(shuffle=True, batch_size=batch_size)

    # Initialize the trainable wrapper around the static embeddings.
    trainable_model = StaticModelFineTuner(
        torch.from_numpy(model.embedding),
        out_dim=train_dataset.targets.shape[1],
        pad_id=model.tokenizer.token_to_id("[PAD]"),
    )
    trainable_model.to(device)

    # Separate parameters for model and linear layer
    model_params = [
        *list(trainable_model.embeddings.parameters()),
        trainable_model.w,
        *list(trainable_model.out_layer.parameters()),
    ]

    # Create optimizer with separate parameter groups
    optimizer = torch.optim.AdamW(params=model_params, lr=lr)

    lowest_loss = float("inf")
    # BUG FIX: state_dict() returns *live references* to the parameter tensors,
    # so the previous plain snapshot silently tracked every later update and the
    # final "restore best" was a no-op. Clone the tensors instead.
    param_dict = _clone_state(trainable_model)
    curr_patience = patience
    stop = False

    criterion = nn.MSELoss()

    try:
        for epoch in range(100_000):
            logger.info(f"Epoch {epoch}")
            trainable_model.train()

            # Track train loss separately
            train_losses = []
            barred_train = tqdm(train_dataloader, desc=f"Epoch {epoch:03d} [Train]")

            for idx, (x, y) in enumerate(barred_train):
                optimizer.zero_grad()
                x = x.to(trainable_model.device)
                y_hat, _ = trainable_model(x)
                # Separate loss components
                train_loss = criterion(y_hat, y.to(trainable_model.device)).mean()

                train_loss.backward()
                optimizer.step()
                train_losses.append(train_loss.item())

                barred_train.set_description_str(f"Train Loss: {np.mean(train_losses[-10:]):.3f}")

                # Evaluate every 1000 steps and at the end of the epoch
                if (idx > 0 and idx % 1000 == 0) or idx == len(train_dataloader) - 1:
                    trainable_model.eval()
                    with torch.no_grad():
                        validation_losses = []
                        barred_val = tqdm(
                            validation_dataset.to_dataloader(shuffle=False, batch_size=batch_size), desc="Validation"
                        )
                        for x_val, y_val in barred_val:
                            x_val = x_val.to(trainable_model.device)
                            y_hat_val, _ = trainable_model(x_val)
                            val_loss = criterion(y_hat_val, y_val.to(trainable_model.device)).mean()
                            validation_losses.append(val_loss.item())
                            barred_val.set_description_str(f"Validation Loss: {np.mean(validation_losses):.3f}")

                    validation_loss = np.mean(validation_losses)
                    # Early stopping logic based on validation loss
                    if patience is not None and curr_patience is not None:
                        if (lowest_loss - validation_loss) > 1e-4:
                            # Snapshot the best weights so far (cloned, see above).
                            param_dict = _clone_state(trainable_model)
                            curr_patience = patience
                            lowest_loss = validation_loss
                        else:
                            curr_patience -= 1
                            if curr_patience == 0:
                                stop = True
                                break
                        logger.info(f"Patience level: {patience - curr_patience}")
                        logger.info(f"Validation loss: {validation_loss:.3f}")
                        logger.info(f"Lowest loss: {lowest_loss:.3f}")

                    trainable_model.train()

            if stop:
                logger.info("Early stopping")
                break

    except KeyboardInterrupt:
        logger.info("Training interrupted")

    trainable_model.eval()
    # Restore the best checkpoint (now a real copy, not live references).
    trainable_model.load_state_dict(param_dict)

    # Move the embeddings to the device (GPU)
    embeddings_weight = trainable_model.embeddings.weight.to(device)

    # Perform the forward pass on GPU
    with torch.no_grad():
        vectors = trainable_model.sub_forward(torch.arange(len(embeddings_weight))[:, None].to(device)).cpu().numpy()

    return StaticModel(vectors=vectors, tokenizer=model.tokenizer, config=model.config)


def _clone_state(module: nn.Module) -> dict[str, torch.Tensor]:
    """Return a detached, cloned copy of a module's state dict (safe to keep as a checkpoint)."""
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}
src/distiller/tokenlearn/train.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import torch
7
+ from sklearn.decomposition import PCA
8
+
9
+ from distiller.model2vec.distill import distill
10
+ from distiller.model2vec.model import StaticModel
11
+ from distiller.tokenlearn.pretrain import TextDataset, train_supervised
12
+ from distiller.tokenlearn.utils import collect_means_and_texts, create_vocab
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ _MAX_N_VAL_SAMPLES = 10_000
19
+
20
+
21
def train_model(
    model_name: str,
    train_txt: list[str],
    train_vec: np.ndarray,
    device: str = "cpu",
    vocab_size: int | None = None,
    pca_dims: int = 256,
) -> StaticModel:
    """
    Train a tokenlearn model.

    :param model_name: The sentence transformer model name for distillation.
    :param train_txt: List of texts to train on.
    :param train_vec: List of vectors to train on.
    :param device: Device to run the training on.
    :param vocab_size: The vocabulary size to use (optional).
    :param pca_dims: Number of dimensions to reduce the target embeddings to using PCA.
        The model will use the same number of dimensions for the embeddings.
    :raises ValueError: If fewer than two training texts are provided.
    :return: The trained model.
    """
    if len(train_txt) < 2:
        msg = "Need at least two training texts to form a train/validation split."
        raise ValueError(msg)

    pca_for_targets = PCA(n_components=pca_dims)
    train_vec = pca_for_targets.fit_transform(train_vec)
    var = np.cumsum(pca_for_targets.explained_variance_ratio_)[-1]
    logger.info(f"Explained variance of target embeddings: {var:.2f}")

    # Split the data into training and validation sets.
    # We use a max of 10k samples as validation data.
    # BUG FIX: for fewer than 10 samples `len // 10` was 0 and the `[:-0]`
    # slice emptied the training set (validation got everything); keep at
    # least one validation sample so both splits are non-empty.
    val_samples = min(_MAX_N_VAL_SAMPLES, max(1, len(train_txt) // 10))
    train_txt, train_vec, val_txt, val_vec = (
        train_txt[:-val_samples],
        train_vec[:-val_samples],
        train_txt[-val_samples:],
        train_vec[-val_samples:],
    )

    if vocab_size:
        # Create a vocabulary if a vocab size is specified
        vocab = create_vocab(texts=train_txt, vocab_size=vocab_size)
        logger.info(f"Vocabulary created with {len(vocab)} tokens.")
    else:
        vocab = None
    model = distill(model_name=model_name, quantize_to="float32", vocabulary=vocab, pca_dims=pca_dims)
    train_data = TextDataset(train_txt, torch.from_numpy(train_vec), model.tokenizer)
    val_data = TextDataset(val_txt, torch.from_numpy(val_vec), model.tokenizer)

    # Train the model
    return train_supervised(train_dataset=train_data, validation_dataset=val_data, model=model, device=device)
68
+
69
+
70
def save_model(model: StaticModel, save_path: str) -> None:
    """
    Save the model to the specified path.

    :param model: The model to save.
    :param save_path: Path to save the model.
    """
    model.save_pretrained(save_path)
    # Consistency fix: use the module logger (the original called logging.info
    # on the root logger, bypassing this module's logger).
    logger.info(f"Model saved to {save_path}")
79
+
80
+
81
def main() -> None:
    """Main function to train and save a Model2Vec model using tokenlearn."""
    parser = argparse.ArgumentParser(description="Train a Model2Vec using tokenlearn.")
    # Flags are declared in the same order as before so --help output is unchanged.
    parser.add_argument("--model-name", type=str, default="baai/bge-base-en-v1.5", help="The model name for distillation (e.g., 'baai/bge-base-en-v1.5').")
    parser.add_argument("--data-path", type=str, default="data/fineweb_bgebase", help="Path to the directory containing the dataset.")
    parser.add_argument("--save-path", type=str, required=True, help="Path to save the trained model.")
    parser.add_argument("--device", type=str, default="cpu", help="Device to run the training on (e.g., 'cpu', 'cuda').")
    parser.add_argument("--vocab-size", type=int, default=56000, help="The vocabulary size to use for training.")
    parser.add_argument("--pca-dims", type=int, default=256, help="Number of dimensions to reduce the target embeddings to using PCA.")
    args = parser.parse_args()

    # Gather the featurized texts and their mean vectors from disk.
    json_paths = sorted(Path(args.data_path).glob("*.json"))
    train_txt, train_vec = collect_means_and_texts(json_paths)

    # Fit the static model on the collected features, then persist it.
    trained = train_model(
        args.model_name,
        train_txt,
        train_vec,
        device=args.device,
        vocab_size=args.vocab_size,
        pca_dims=args.pca_dims,
    )
    save_model(trained, args.save_path)
131
+
132
+
133
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
src/distiller/tokenlearn/utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from collections import Counter
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import regex
8
+ from tqdm import tqdm
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def create_vocab(texts: list[str], vocab_size: int = 56_000) -> list[str]:
    """
    Create a vocabulary from a list of texts.

    :param texts: The list of texts to create the vocabulary from.
    :param vocab_size: The size of the vocabulary. Defaults to 56,000, which is the vocab_size used for our 32M models.
    :return: The vocabulary.
    """
    # Local import: the stdlib `re` handles this pattern identically (both
    # engines are Unicode-aware for \w/\s on str), so the third-party
    # `regex` package is unnecessary here.
    import re

    # \w+ grabs word runs, [^\w\s]+ grabs punctuation runs.
    tokenizer_pattern = re.compile(r"\w+|[^\w\s]+")

    # Tokenize all texts (lowercased) and count frequencies incrementally to
    # avoid materializing one giant token list. Counter preserves first-seen
    # insertion order, so most_common tie-breaking matches the old behavior.
    token_counts: Counter[str] = Counter()
    for text in tqdm(texts, desc="Tokenizing texts"):
        token_counts.update(tokenizer_pattern.findall(text.lower()))

    # Get the most common tokens as the vocabulary
    return [word for word, _ in token_counts.most_common(vocab_size)]
33
+
34
+
35
def collect_means_and_texts(paths: list[Path]) -> tuple[list[str], np.ndarray]:
    """Collect means and texts from a list of paths.

    Each ``*.json`` file holds a list of texts and sits next to a ``*.npy``
    file with the matching mean vectors. Rows whose vector contains NaN are
    dropped from both collections.

    :param paths: Candidate paths; non-``.json`` entries are skipped.
    :return: The collected texts and their stacked mean vectors (an empty
        array when nothing could be loaded).
    """
    txts: list[str] = []
    vectors_list: list[np.ndarray] = []
    for items_path in tqdm(paths, desc="Collecting means and texts"):
        if not items_path.name.endswith(".json"):
            continue
        # The vectors live next to the JSON file under the same stem.
        # (The original derived this via a no-op str.replace("", "") chain;
        # with_suffix expresses the same mapping directly.)
        base_path = items_path.with_suffix("")
        vectors_path = items_path.with_suffix(".npy")
        try:
            with open(items_path) as f:
                items = json.load(f)
            vectors = np.load(vectors_path, allow_pickle=False)
        except (KeyError, FileNotFoundError, ValueError) as e:
            logger.info(f"Error loading data from {base_path}: {e}")
            continue

        # Filter out any NaN vectors before appending
        vectors = np.array(vectors)
        items = np.array(items)
        # NOTE(review): assumes `vectors` is 2-D (n_texts, dim); a 1-D array
        # would make axis=1 fail — confirm against the featurize output.
        non_nan_indices = ~np.isnan(vectors).any(axis=1)
        txts.extend(items[non_nan_indices].tolist())
        vectors_list.append(vectors[non_nan_indices])

    all_vectors = np.concatenate(vectors_list, axis=0) if vectors_list else np.array([])
    return txts, all_vectors
src/distiller/tokenlearn/version.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Version as a tuple for programmatic comparison, and the canonical dotted string.
__version_triple__ = (0, 2, 0)
__version__ = "{}.{}.{}".format(*__version_triple__)