ligeti committed
Commit 0572702 · verified · 1 Parent(s): c677647

Adding LCA tokenizer source code

Files changed (5)
  1. config_utils.py +769 -0
  2. general_utils.py +309 -0
  3. sequtils.py +980 -0
  4. tokenizer.py +363 -0
  5. tokenizer_config.json +6 -0
config_utils.py ADDED
@@ -0,0 +1,769 @@
+ # Config utils
+ import yaml
+ import pathlib
+ from os.path import join
+ import os
+ import numpy as np
+ import torch
+ import argparse
+ from multiprocessing import cpu_count
+ from transformers import TrainingArguments
+ from copy import deepcopy
+ import re
+ import sys
+
+ def add_hf_args_to_parser(parser):
+     # Create a temporary TrainingArguments to access default values and descriptions
+     hf_args = TrainingArguments(output_dir="/tmp")  # Dummy output_dir
+     # Iterate over all public attributes
+     for attr in dir(hf_args):
+         if not attr.startswith("_"):
+             default = getattr(hf_args, attr)
+             # More sophisticated handling based on attribute types could be added here
+             if isinstance(default, (int, float, str, bool)):
+                 help_str = f"Auto-generated help for {attr}"
+                 parser.add_argument(f"--{attr}", type=type(default), default=default, help=help_str)
+
+     return parser
+
+ class BaseConfig:
+     """Base class for managing and validating configurations."""
+
+     numpy_dtype_mapping = {1: np.int8,
+                            2: np.int16,
+                            8: np.int64,
+                            4: np.int32}
+
+     def __init__(self):
+         super().__init__()
+
+     def cast_to_expected_type(self, parameter_class: str, parameter_name: str, value: any) -> any:
+         """
+         Cast the given value to the expected type.
+
+         :param parameter_class: The class/category of the parameter.
+         :type parameter_class: str
+         :param parameter_name: The name of the parameter.
+         :type parameter_name: str
+         :param value: The value to be cast.
+         :type value: any
+         :return: Value cast to the expected type.
+         :rtype: any
+         :raises ValueError: If casting fails.
+         """
+         expected_type = self.parameters[parameter_class][parameter_name]['type']
+
+         if expected_type in ["integer", "int"]:
+             try:
+                 return int(value)
+             except ValueError:
+                 raise ValueError(f"Failed to cast value '{value}' to integer for parameter '{parameter_name}' in class '{parameter_class}'.")
+         elif expected_type == "float":
+             try:
+                 return float(value)
+             except ValueError:
+                 raise ValueError(f"Failed to cast value '{value}' to float for parameter '{parameter_name}' in class '{parameter_class}'.")
+         elif expected_type in ["string", "str"]:
+             return str(value)
+         elif expected_type in ["boolean", "bool"]:
+             if isinstance(value, bool):
+                 return value
+             elif str(value).lower() == "true":
+                 return True
+             elif str(value).lower() == "false":
+                 return False
+             else:
+                 raise ValueError(f"Failed to cast value '{value}' to boolean for parameter '{parameter_name}' in class '{parameter_class}'.")
+         elif expected_type == "type":
+             # For this type, we simply return the value without casting.
+             # It assumes the configuration provides valid Python types.
+             return value
+         elif expected_type == "list":
+             if isinstance(value, list):
+                 return value
+             else:
+                 raise ValueError(f"Failed to validate value '{value}' as a list for parameter '{parameter_name}' in class '{parameter_class}'.")
+         elif expected_type == "tuple":
+             if isinstance(value, tuple):
+                 return value
+             else:
+                 raise ValueError(f"Failed to validate value '{value}' as a tuple for parameter '{parameter_name}' in class '{parameter_class}'.")
+         elif expected_type == "set":
+             if isinstance(value, set):
+                 return value
+             else:
+                 raise ValueError(f"Failed to validate value '{value}' as a set for parameter '{parameter_name}' in class '{parameter_class}'.")
+         elif expected_type == "dict":
+             if isinstance(value, dict):
+                 return value
+             else:
+                 raise ValueError(f"Failed to validate value '{value}' as a dict for parameter '{parameter_name}' in class '{parameter_class}'.")
+         else:
+             raise ValueError(f"Unknown expected type '{expected_type}' for parameter '{parameter_name}' in class '{parameter_class}'.")
+
+     def get_parameter(self, parameter_class: str, parameter_name: str) -> any:
+         """
+         Retrieve the default value of a specified parameter.
+
+         :param parameter_class: The class/category of the parameter (e.g., 'segmentation').
+         :type parameter_class: str
+         :param parameter_name: The name of the parameter.
+         :type parameter_name: str
+         :return: Default value of the parameter, cast to the expected type.
+         :rtype: any
+         """
+         default_value = self.parameters[parameter_class][parameter_name]['default']
+         return self.cast_to_expected_type(parameter_class, parameter_name, default_value)
+
+     def validate_type(self, parameter_class: str, parameter_name: str, value: any) -> bool:
+         """
+         Validate the type of a given value against the expected type.
+
+         :param parameter_class: The class/category of the parameter.
+         :type parameter_class: str
+         :param parameter_name: The name of the parameter.
+         :type parameter_name: str
+         :param value: The value to be validated.
+         :type value: any
+         :return: True if the value is of the expected type, otherwise False.
+         :rtype: bool
+         """
+         expected_type = self.parameters[parameter_class][parameter_name]['type']
+
+         if expected_type == "integer" and not isinstance(value, int):
+             return False
+         elif expected_type == "float" and not isinstance(value, float):
+             return False
+         elif expected_type == "string" and not isinstance(value, str):
+             return False
+         else:
+             return True
+
+     def validate_value(self, parameter_class: str, parameter_name: str, value: any) -> bool:
+         """
+         Validate the value of a parameter against its constraints.
+
+         :param parameter_class: The class/category of the parameter.
+         :type parameter_class: str
+         :param parameter_name: The name of the parameter.
+         :type parameter_name: str
+         :param value: The value to be validated.
+         :type value: any
+         :return: True if the value meets the constraints, otherwise False.
+         :rtype: bool
+         """
+         constraints = self.parameters[parameter_class][parameter_name].get('constraints', {})
+
+         if 'options' in constraints and value not in constraints['options']:
+             return False
+         if 'min' in constraints and value < constraints['min']:
+             return False
+         if 'max' in constraints and value > constraints['max']:
+             return False
+         return True
+
+     def validate(self, parameter_class: str, parameter_name: str, value: any):
+         """
+         Validate both the type and value of a parameter.
+
+         :param parameter_class: The class/category of the parameter.
+         :type parameter_class: str
+         :param parameter_name: The name of the parameter.
+         :type parameter_name: str
+         :param value: The value to be validated.
+         :type value: any
+         :raises TypeError: If the value is not of the expected type.
+         :raises ValueError: If the value does not meet the parameter's constraints.
+         """
+         if not self.validate_type(parameter_class, parameter_name, value):
+             raise TypeError(f"Invalid type for {parameter_name} for parameter class '{parameter_class}'. Expected {self.parameters[parameter_class][parameter_name]['type']}.")
+
+         if not self.validate_value(parameter_class, parameter_name, value):
+             raise ValueError(f"Invalid value for {parameter_name} for parameter class '{parameter_class}'. Constraints: {self.parameters[parameter_class][parameter_name].get('constraints', {})}.")
+
+     def describe(self, parameter_class: str, parameter_name: str) -> str:
+         """
+         Retrieve the description of a parameter.
+
+         :param parameter_class: The class/category of the parameter.
+         :type parameter_class: str
+         :param parameter_name: The name of the parameter.
+         :type parameter_name: str
+         :return: Description of the parameter.
+         :rtype: str
+         """
+         return self.parameters[parameter_class][parameter_name]['description']
+
+     @staticmethod
+     def rename_non_unique_parameters(config: dict) -> tuple[dict, dict, dict]:
+         """
+         Rename parameters in the configuration to ensure uniqueness across different groups.
+
+         This method identifies parameters with the same name across different groups and renames them
+         by prefixing the group name. This is to prevent conflicts when parameters are used in a context
+         where the group name is not specified.
+
+         :param config: A dictionary where each key is a group name and each value is a dict
+                        of parameters for that group.
+         :type config: dict
+
+         :return: A tuple containing:
+                  - renamed_config: A dictionary with the same structure as the input, but with non-unique parameter
+                    names renamed. The structure is {group_name: {param_name: param_info}}.
+                  - cmd_argument2group_param: A dictionary mapping the new parameter names to their original group
+                    and parameter name. The structure is {new_param_name: [group_name, original_param_name]}.
+                  - group2param2cmdarg: A dictionary mapping each group to a dict that maps the original parameter
+                    names to the new parameter names. The structure is {group_name: {original_param_name: new_param_name}}.
+         :rtype: tuple[dict, dict, dict]
+         """
+
+         # Identify non-unique parameter names
+         param_counts = {}
+         for group_name, parameters in config.items():
+             for param_name in parameters.keys():
+                 param_counts[param_name] = param_counts.get(param_name, 0) + 1
+
+         non_unique_params = {param for param, count in param_counts.items() if count > 1}
+
+         cmd_argument2group_param = {}
+         group2param2cmdarg = {}
+         for group_name, parameters in config.items():
+             group2param2cmdarg[group_name] = {}
+             for param_name in parameters.keys():
+                 group2param2cmdarg[group_name][param_name] = param_name
+
+         # Rename only the non-unique parameters
+         renamed_config = {}
+         for group_name, parameters in config.items():
+             renamed_group = {}
+             for param_name, param_info in parameters.items():
+                 new_param_name = f"{group_name}_{param_name}" if param_name in non_unique_params else param_name
+                 cmd_argument2group_param[new_param_name] = [group_name, param_name]
+                 group2param2cmdarg[group_name][param_name] = new_param_name
+
+                 renamed_group[new_param_name] = param_info
+             renamed_config[group_name] = renamed_group
+         return renamed_config, cmd_argument2group_param, group2param2cmdarg
+
+     @staticmethod
+     def create_parser(config: dict) -> argparse.ArgumentParser:
+         """
+         Create and configure an argparse parser based on the given configuration.
+
+         This method sets up a command-line argument parser with arguments defined in the configuration.
+         Each top-level key in the configuration represents a group of related arguments.
+
+         :param config: A dictionary where each key is a group name and each value is a dict
+                        of parameters for that group. Each parameter's information should include
+                        its type, default value, and help description.
+         :type config: dict
+
+         :return: Configured argparse.ArgumentParser instance with arguments added as specified
+                  in the configuration.
+         :rtype: argparse.ArgumentParser
+
+         :raises ValueError: If an unknown or unsupported type is specified for a parameter.
+         """
+         parser = argparse.ArgumentParser(description="Command-line parser for project settings")
+         # Mapping of type strings to Python types
+         type_mapping = {
+             'integer': int,
+             'int': int,
+             'float': float,
+             'string': str,
+             'str': str,
+             'bool': bool,
+             'boolean': bool,
+             'list': list
+             # Complex types like 'dict' and 'type' are intentionally excluded
+         }
+
+         # List of types to handle as strings
+         handle_as_string = ['dict', 'type', 'list']
+         excluded_parameters = ['vocabmap', 'np_tokentype', 'pretraining_dataset_data', 'optim']
+
+         for group_name, parameters in config.items():
+             group = parser.add_argument_group(group_name)
+             for param_name, param_info in parameters.items():
+                 param_type_str = param_info['type']
+                 description = param_info['description']
+                 escaped_description = re.sub(r"([^%])%", r"\1%%", description)
+                 if param_name in excluded_parameters:
+                     continue
+                 if param_type_str in handle_as_string:
+                     # Handle these types as strings in argparse; conversion is done later in the program
+                     param_type = str
+                 elif param_type_str not in type_mapping:
+                     raise ValueError(f"Unknown or unsupported type '{param_type_str}' for parameter '{param_name}'")
+                 else:
+                     param_type = type_mapping[param_type_str]
+
+                 kwargs = {
+                     'type': param_type,
+                     'default': param_info['default'],
+                     'help': escaped_description
+                 }
+                 # Add constraints if they exist
+                 """
+                 if 'constraints' in param_info:
+                     constraints = param_info['constraints']
+                     if 'min' in constraints:
+                         kwargs['type'] = lambda x: eval(param_type_str)(x) if eval(param_type_str)(x) >= constraints['min'] else sys.exit(f"Value for {param_name} must be at least {constraints['min']}")
+                     if 'max' in constraints:
+                         kwargs['type'] = lambda x: eval(param_type_str)(x) if eval(param_type_str)(x) <= constraints['max'] else sys.exit(f"Value for {param_name} must be at most {constraints['max']}")
+                     if 'options' in constraints:
+                         kwargs['choices'] = constraints['options']
+                 """
+                 # Add argument to the group
+                 group.add_argument(f'--{param_name}', **kwargs)
+         # parser = add_hf_args_to_parser(parser)
+
+         return parser
+
+
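To make the renaming scheme above concrete, here is a minimal sketch; the toy group and parameter names are invented for illustration and are not taken from the packaged YAML configs:

# Illustrative only: 'kmer' occurs in two groups, so it is group-prefixed,
# while the unique 'coverage' keeps its name.
toy_config = {
    'segmentation': {'coverage': {'type': 'float'}},
    'tokenization': {'kmer': {'type': 'integer'}},
    'computation': {'kmer': {'type': 'integer'}},
}
renamed, arg2group, group2arg = BaseConfig.rename_non_unique_parameters(toy_config)
# renamed   == {'segmentation': {'coverage': {'type': 'float'}},
#               'tokenization': {'tokenization_kmer': {'type': 'integer'}},
#               'computation': {'computation_kmer': {'type': 'integer'}}}
# arg2group == {'coverage': ['segmentation', 'coverage'],
#               'tokenization_kmer': ['tokenization', 'kmer'],
#               'computation_kmer': ['computation', 'kmer']}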
+ class SeqConfig(BaseConfig):
+     """Class to manage and validate sequence processing configurations."""
+
+     def __init__(self):
+         super().__init__()
+         self.default_seq_config_file = self._get_default_sequence_processing_config_file()
+         with open(self.default_seq_config_file, 'r') as file:
+             self.parameters = yaml.safe_load(file)
+
+         # Some postprocessing steps
+         self.parameters['tokenization']['shift']['constraints']['max'] = self.parameters['tokenization']['kmer']['default'] - 1
+         # If someone updates the k-mer parameter, recalculating this constraint should be triggered.
+
+         self.get_and_set_segmentation_parameters()
+         self.get_and_set_tokenization_parameters()
+         self.get_and_set_computational_parameters()
+
+     def _get_default_sequence_processing_config_file(self) -> str:
+         """
+         Retrieve the default sequence processing configuration file.
+
+         :return: Path to the configuration file.
+         :rtype: str
+         """
+         current_path = pathlib.Path(__file__).parent
+         prokbert_seq_config_file = join(current_path, 'configs', 'sequence_processing.yaml')
+         self.current_path = current_path
+
+         try:
+             # Attempt to read the environment variable
+             prokbert_seq_config_file = os.environ['SEQ_CONFIG_FILE']
+         except KeyError:
+             # The environment variable is not set; fall back to the default path
+             pass
+         return prokbert_seq_config_file
+
+     def get_and_set_segmentation_parameters(self, parameters: dict = {}) -> dict:
+         """
+         Retrieve and validate the provided parameters for segmentation.
+
+         :param parameters: A dictionary of parameters to be validated.
+         :type parameters: dict
+         :return: A dictionary of validated segmentation parameters.
+         :rtype: dict
+         :raises ValueError: If an invalid segmentation parameter is provided.
+         """
+         segmentation_params = {k: self.get_parameter('segmentation', k) for k in self.parameters['segmentation']}
+
+         for param, param_value in parameters.items():
+             if param not in segmentation_params:
+                 raise ValueError(f"The provided {param} is an INVALID segmentation parameter! The valid parameters are: {list(segmentation_params.keys())}")
+             self.validate('segmentation', param, param_value)
+             segmentation_params[param] = param_value
+         self.segmentation_params = segmentation_params
+
+         return segmentation_params
+
+     def get_and_set_tokenization_parameters(self, parameters: dict = {}) -> dict:
+         # Update the other parameters if necessary, i.e. if the k-mer has been changed, then the shift
+         # is updated as well, and a parameter check is run at the end.
+
+         tokenization_params = {k: self.get_parameter('tokenization', k) for k in self.parameters['tokenization']}
+         for param, param_value in parameters.items():
+             if param not in tokenization_params:
+                 raise ValueError(f"The provided {param} is an INVALID tokenization parameter! The valid parameters are: {list(tokenization_params.keys())}")
+             self.validate('tokenization', param, param_value)
+             tokenization_params[param] = param_value
+
+         # Load and check the vocab file. It is assumed to be an ordered dictionary.
+         vocabfile = tokenization_params['vocabfile']
+         act_kmer = tokenization_params['kmer']
+         if vocabfile == 'auto':
+             vocabfile_path = join(self.current_path, 'data/prokbert_vocabs/', f'prokbert-base-dna{act_kmer}', 'vocab.txt')
+             tokenization_params['vocabfile'] = vocabfile_path
+         else:
+             vocabfile_path = vocabfile
+         with open(vocabfile_path) as vocabfile_in:
+             vocabmap = {line.strip(): i for i, line in enumerate(vocabfile_in)}
+         tokenization_params['vocabmap'] = vocabmap
+
+         self.tokenization_params = tokenization_params
+         return tokenization_params
+
+     def get_and_set_computational_parameters(self, parameters: dict = {}) -> dict:
+         """Read and validate the computational parameters."""
+
+         computational_params = {k: self.get_parameter('computation', k) for k in self.parameters['computation']}
+         core_count = cpu_count()
+
+         if computational_params['cpu_cores_for_segmentation'] == -1:
+             computational_params['cpu_cores_for_segmentation'] = core_count
+
+         if computational_params['cpu_cores_for_tokenization'] == -1:
+             computational_params['cpu_cores_for_tokenization'] = core_count
+
+         for param, param_value in parameters.items():
+             if param not in computational_params:
+                 raise ValueError(f"The provided {param} is an INVALID computation parameter! The valid parameters are: {list(computational_params.keys())}")
+             self.validate('computation', param, param_value)
+             computational_params[param] = param_value
+
+         np_tokentype = SeqConfig.numpy_dtype_mapping[computational_params['numpy_token_integer_prec_byte']]
+         computational_params['np_tokentype'] = np_tokentype
+         self.computational_params = computational_params
+         return computational_params
+
+     def get_maximum_segment_length_from_token_count_from_params(self):
+         """Calculate the maximum segment length from the token count."""
+         max_token_counts = self.tokenization_params['token_limit']
+         shift = self.tokenization_params['shift']
+         kmer = self.tokenization_params['kmer']
+         return self.get_maximum_segment_length_from_token_count(max_token_counts, shift, kmer)
+
+     def get_maximum_token_count_from_max_length_from_params(self):
+         """Calculate the maximum token count from the maximum segment length."""
+         max_segment_length = self.tokenization_params['max_segment_length']
+         shift = self.tokenization_params['shift']
+         kmer = self.tokenization_params['kmer']
+         max_token_count = self.get_maximum_token_count_from_max_length(max_segment_length, shift, kmer)
+
+         return max_token_count
+
+     def get_cmd_arg_parser(self) -> tuple[argparse.ArgumentParser, dict, dict]:
+         """
+         Create and return a command-line argument parser for ProkBERT configurations, along with mappings
+         between command-line arguments and configuration parameters.
+
+         This method combines sequence configuration parameters with training configuration parameters
+         and sets up a command-line argument parser using these combined settings. It ensures that parameter
+         names are unique across different groups by renaming any non-unique parameters.
+
+         :return: A tuple containing:
+                  - Configured argparse.ArgumentParser instance for handling ProkBERT configurations.
+                  - A dictionary mapping new command-line arguments to their original group and parameter name.
+                  - A dictionary mapping each group to a dict that maps the original parameter names
+                    to the new command-line argument names.
+         :rtype: tuple[argparse.ArgumentParser, dict, dict]
+
+         Note: The method assumes that the configuration parameters for training and sequence configuration
+         are available within the class.
+         """
+         combined_params = deepcopy(self.parameters)
+         combined_params['Sequence'] = {}
+         combined_params['Sequence']['fasta_file_dir'] = {'default': 'None',
+                                                          'description': 'Directory where the input fasta files are located for the pretraining',
+                                                          'type': 'string'}
+         combined_params['Sequence']['out'] = {'default': 'pretrain.h5',
+                                               'description': 'Output path',
+                                               'type': 'string'}
+
+         combined_params, cmd_argument2group_param, group2param2cmdarg = BaseConfig.rename_non_unique_parameters(combined_params)
+
+         parser = BaseConfig.create_parser(combined_params)
+         return parser, cmd_argument2group_param, group2param2cmdarg
+
+     @staticmethod
+     def get_maximum_segment_length_from_token_count(max_token_counts, shift, kmer):
+         """Calculates the maximum segment length that can be covered by the given token count."""
+         max_segment_length = (max_token_counts - 3) * shift + kmer
+         return max_segment_length
+
+     @staticmethod
+     def get_maximum_token_count_from_max_length(max_segment_length, shift, kmer):
+         """Calculates the token count needed to cover a segment of the given maximum length."""
+         max_token_count = int(np.ceil((max_segment_length - kmer) / shift + 3))
+         return max_token_count
+
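A quick worked example of the two static helpers above; the k-mer and shift values are illustrative, and the constant 3 presumably reserves room for special tokens:

length = SeqConfig.get_maximum_segment_length_from_token_count(512, shift=1, kmer=6)
# (512 - 3) * 1 + 6 = 515 nucleotides of sequence
tokens = SeqConfig.get_maximum_token_count_from_max_length(515, shift=1, kmer=6)
# ceil((515 - 6) / 1 + 3) = 512 tokens, consistent with the inverse above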
+ class ProkBERTConfig(BaseConfig):
+     """Class to manage and validate pretraining configurations."""
+
+     torch_dtype_mapping = {1: torch.uint8,
+                            2: torch.int16,
+                            8: torch.int64,
+                            4: torch.int32}
+
+     def __init__(self):
+         super().__init__()
+
+         self.default_pretrain_config_file = self._get_default_pretrain_config_file()
+         with open(self.default_pretrain_config_file, 'r') as file:
+             self.parameters = yaml.safe_load(file)
+
+         # Load and validate each parameter set
+         self.data_collator_params = self.get_set_parameters('data_collator')
+         self.model_params = self.get_set_parameters('model')
+         self.dataset_params = self.get_set_parameters('dataset')
+         self.pretraining_params = self.get_set_parameters('pretraining')
+         self.finetuning_params = self.get_set_parameters('finetuning')
+
+         # Getting the sequtils params as well
+         self.def_seq_config = SeqConfig()
+         self.segmentation_params = self.def_seq_config.get_and_set_segmentation_parameters(self.parameters['segmentation'])
+         self.tokenization_params = self.def_seq_config.get_and_set_tokenization_parameters(self.parameters['tokenization'])
+         self.computation_params = self.def_seq_config.get_and_set_computational_parameters(self.parameters['computation'])
+
+         self.default_torchtype = ProkBERTConfig.torch_dtype_mapping[self.computation_params['numpy_token_integer_prec_byte']]
+
+         hf_training_args = TrainingArguments("working_dir")
+         self.hf_training_args_dict = hf_training_args.to_dict()
+
+     def _get_default_pretrain_config_file(self) -> str:
+         """
+         Retrieve the default pretraining configuration file.
+
+         :return: Path to the configuration file.
+         :rtype: str
+         """
+         current_path = pathlib.Path(__file__).parent
+         pretrain_config_file = join(current_path, 'configs', 'pretraining.yaml')
+
+         try:
+             # Attempt to read the environment variable
+             pretrain_config_file = os.environ['PRETRAIN_CONFIG_FILE']
+         except KeyError:
+             # The environment variable is not set; fall back to the default path
+             pass
+         return pretrain_config_file
+
+     def get_set_parameters(self, parameter_class: str, parameters: dict = {}) -> dict:
+         """
+         Retrieve and validate the provided parameters for a given parameter class.
+
+         :param parameter_class: The class/category of the parameter (e.g., 'data_collator').
+         :type parameter_class: str
+         :param parameters: A dictionary of parameters to be validated.
+         :type parameters: dict
+         :return: A dictionary of validated parameters.
+         :rtype: dict
+         :raises ValueError: If an invalid parameter is provided.
+         """
+         class_params = {k: self.get_parameter(parameter_class, k) for k in self.parameters[parameter_class]}
+
+         # First, validate the class parameters as well
+         for param, param_value in class_params.items():
+             self.validate(parameter_class, param, param_value)
+
+         for param, param_value in parameters.items():
+             if param not in class_params and (parameter_class != 'pretraining'):
+                 raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! The valid parameters are: {list(class_params.keys())}")
+             else:
+                 if parameter_class == 'pretraining' or parameter_class == 'finetuning':
+                     if param in self.hf_training_args_dict or param in class_params:
+                         if param in class_params:
+                             self.validate(parameter_class, param, param_value)
+                         class_params[param] = param_value
+                     else:
+                         raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! In addition, it is not a valid training argument.")
+                 else:
+                     self.validate(parameter_class, param, param_value)
+                     class_params[param] = param_value
+
+         return class_params
+
+     def get_and_set_model_parameters(self, parameters: dict = {}) -> dict:
+         """Setting the model parameters."""
+
+         # Here we include the additional training arguments available for the trainer
+         self.model_params = self.get_set_parameters('model', parameters)
+
+         return self.model_params
+
+     def get_and_set_dataset_parameters(self, parameters: dict = {}) -> dict:
+         """Setting the dataset parameters."""
+
+         self.dataset_params = self.get_set_parameters('dataset', parameters)
+
+         return self.dataset_params
+
+     def get_and_set_pretraining_parameters(self, parameters: dict = {}) -> dict:
+         """Setting the pretraining parameters."""
+         self.pretraining_params = self.get_set_parameters('pretraining', parameters)
+
+         return self.pretraining_params
+
+     def get_and_set_datacollator_parameters(self, parameters: dict = {}) -> dict:
+         """Setting the data collator parameters."""
+         self.data_collator_params = self.get_set_parameters('data_collator', parameters)
+         return self.data_collator_params
+
+     def get_and_set_segmentation_parameters(self, parameters: dict = {}) -> dict:
+         self.segmentation_params = self.def_seq_config.get_and_set_segmentation_parameters(parameters)
+         return self.segmentation_params
+
+     def get_and_set_tokenization_parameters(self, parameters: dict = {}) -> dict:
+         self.tokenization_params = self.def_seq_config.get_and_set_tokenization_parameters(parameters)
+         return self.tokenization_params
+
+     def get_and_set_computation_params(self, parameters: dict = {}) -> dict:
+         self.computation_params = self.def_seq_config.get_and_set_computational_parameters(parameters)
+         return self.computation_params
+
+     def get_and_set_finetuning_parameters(self, parameters: dict = {}) -> dict:
+         """Setting the finetuning parameters."""
+
+         # Here we include the additional training arguments available for the trainer
+         self.finetuning_params = self.get_set_parameters('finetuning', parameters)
+
+         return self.finetuning_params
+
+     def get_inference_parameters(self):
+         # Instantiate TrainingArguments to access default values
+         hf_defaults = TrainingArguments(output_dir="/tmp")  # Dummy output_dir for initialization
+
+         return {
+             'inference': {
+                 'fastain': {
+                     'default': None,
+                     'type': 'str',
+                     'description': 'Path to the input data for inference.'
+                 },
+                 'out': {
+                     'default': None,
+                     'type': 'str',
+                     'description': 'Output path for the inference results.'
+                 },
+                 'per_device_eval_batch_size': {
+                     'default': hf_defaults.per_device_eval_batch_size,
+                     'type': 'int',
+                     'description': 'Batch size per device during evaluation.'
+                 },
+                 'ddp_backend': {
+                     'default': hf_defaults.ddp_backend,
+                     'type': 'str',
+                     'description': 'The backend to use for distributed training.'
+                 },
+                 'dataloader_drop_last': {
+                     'default': hf_defaults.dataloader_drop_last,
+                     'type': 'bool',
+                     'description': 'Drop the last incomplete batch if it is not divisible by the batch size.'
+                 },
+                 'torch_compile': {
+                     'default': getattr(hf_defaults, 'torch_compile', False),  # Fallback for compatibility
+                     'type': 'bool',
+                     'description': 'Whether to use TorchScript’s JIT compilation to accelerate training.'
+                 },
+                 'torch_compile_mode': {
+                     'default': getattr(hf_defaults, 'torch_compile_mode', 'eager'),  # Fallback for compatibility
+                     'type': 'str',
+                     'description': 'The JIT mode to use for compiling PyTorch operations.'
+                 }
+             }
+         }
+
+     def get_cmd_arg_parser(self, keyset=[]) -> tuple[argparse.ArgumentParser, dict, dict]:
+         """
+         Create and return a command-line argument parser for ProkBERT configurations, along with mappings
+         between command-line arguments and configuration parameters.
+
+         This method combines sequence configuration parameters with training configuration parameters
+         and sets up a command-line argument parser using these combined settings. It ensures that parameter
+         names are unique across different groups by renaming any non-unique parameters.
+
+         :return: A tuple containing:
+                  - Configured argparse.ArgumentParser instance for handling ProkBERT configurations.
+                  - A dictionary mapping new command-line arguments to their original group and parameter name.
+                  - A dictionary mapping each group to a dict that maps the original parameter names
+                    to the new command-line argument names.
+         :rtype: tuple[argparse.ArgumentParser, dict, dict]
+
+         Note: The method assumes that the configuration parameters for training and sequence configuration
+         are available within the class.
+         """
+         if len(keyset) == 0:
+             training_conf_keysets = ['data_collator', 'model', 'dataset', 'pretraining', 'finetuning']
+         else:
+             training_conf_keysets = keyset
+
+         inference_params = self.get_inference_parameters()
+         seq_config = deepcopy(self.def_seq_config.parameters)
+         default_other_config = deepcopy(self.parameters)
+         combined_params = {}
+         for k, v in seq_config.items():
+             combined_params[k] = v
+         for k in training_conf_keysets:
+             combined_params[k] = default_other_config[k]
+         combined_params.update(inference_params)
+         combined_params, cmd_argument2group_param, group2param2cmdarg = BaseConfig.rename_non_unique_parameters(combined_params)
+         parser = BaseConfig.create_parser(combined_params)
+
+         return parser, cmd_argument2group_param, group2param2cmdarg
+
+
+ def get_user_provided_args(args, parser):
+     """
+     Extract arguments provided by the user from the parsed arguments.
+
+     Args:
+         args (argparse.Namespace): Parsed command-line arguments.
+         parser (argparse.ArgumentParser): The argument parser instance.
+
+     Returns:
+         dict: A dictionary of user-provided arguments and their values.
+     """
+     user_provided_args = {}
+     for action in parser._actions:
+         arg_name = action.dest
+         default_value = action.default
+         user_value = getattr(args, arg_name, None)
+         if user_value != default_value:
+             user_provided_args[arg_name] = user_value
+
+     return user_provided_args
+
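A minimal usage sketch for the classes above, assuming the packaged YAML config files (sequence_processing.yaml, pretraining.yaml) are in place; the override values are illustrative:

seq_cfg = SeqConfig()
tok_params = seq_cfg.get_and_set_tokenization_parameters({'kmer': 6, 'shift': 1})
parser, arg2group, group2arg = seq_cfg.get_cmd_arg_parser()
args = parser.parse_args([])                    # all defaults
overrides = get_user_provided_args(args, parser)  # empty dict when only defaults are used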
general_utils.py ADDED
@@ -0,0 +1,309 @@
+ # coding=utf-8
+ """ Library for general utils, such as dataframe properties checking,
+ creating directories, checking files, etc.
+ """
+
+ import pandas as pd
+ import os
+ import numpy as np
+ import subprocess
+ import shutil
+ from typing import Iterator
+
+
+ def check_expected_columns(df: pd.DataFrame, expected_columns: list) -> bool:
+     """Checks if a DataFrame contains the expected columns.
+
+     :param df: The input DataFrame to be checked.
+     :type df: pd.DataFrame
+     :param expected_columns: A list of columns that are expected to be present in the DataFrame.
+     :type expected_columns: list
+     :returns: True if all expected columns are present in the DataFrame, False otherwise.
+     :rtype: bool
+     :raises ValueError: If any of the expected columns are not present in the DataFrame.
+
+     Examples
+     --------
+     >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+     >>> check_expected_columns(df, ['A', 'B'])
+     True
+
+     >>> check_expected_columns(df, ['A', 'C'])
+     ValueError: The following columns are missing: ['C']
+     """
+
+     missing_columns = [col for col in expected_columns if col not in df.columns]
+
+     if missing_columns:
+         raise ValueError(f"The following columns are missing: {missing_columns}")
+
+     return True
+
+
+ def is_valid_primary_key(df: pd.DataFrame, column_name: str) -> bool:
+     """Checks if a specified column in a DataFrame can serve as a valid primary key.
+
+     :param df: The input DataFrame to be checked.
+     :type df: pd.DataFrame
+     :param column_name: The name of the column to check.
+     :type column_name: str
+     :returns: True if the column can serve as a valid primary key, False otherwise.
+     :rtype: bool
+     :raises ValueError: If the specified column does not exist in the DataFrame.
+
+     Examples
+     --------
+     >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+     >>> is_valid_primary_key(df, 'A')
+     True
+
+     >>> df = pd.DataFrame({'A': [1, 2, 2], 'B': [4, 5, 6]})
+     >>> is_valid_primary_key(df, 'A')
+     False
+     """
+
+     if column_name not in df.columns:
+         raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
+
+     # Check for NaN values
+     if df[column_name].isnull().any():
+         return False
+
+     # Check for unique values
+     if not df[column_name].is_unique:
+         return False
+
+     return True
+
+
+ def get_non_empty_files(start_path: str, extensions: tuple = ('.fasta', '.fna')) -> Iterator[str]:
+     """Generator that yields non-empty files from a specified directory and its subdirectories based on the given extensions.
+
+     :param start_path: The path to the directory from which to start the search.
+     :type start_path: str
+     :param extensions: A tuple of file extensions to look for (default is ('.fasta', '.fna')).
+                        The function also automatically checks for compressed versions with '.gz'.
+     :type extensions: tuple
+     :returns: Yields filenames that match the specified extensions and are non-empty.
+     :rtype: Iterator[str]
+     """
+
+     for dirpath, _, filenames in os.walk(start_path):
+         for filename in filenames:
+             filepath = os.path.join(dirpath, filename)
+             if any(filename.endswith(ext) or filename.endswith(ext + '.gz') for ext in extensions) and os.path.getsize(filepath) > 0:
+                 yield filename
+
+
+ def truncate_zero_columns(arr: np.ndarray) -> np.ndarray:
+     """Truncate all trailing columns composed entirely of zeros in a given 2D numpy array.
+
+     :param arr: Input 2D numpy array.
+     :type arr: np.ndarray
+     :returns: A new array with trailing zero columns removed.
+     :rtype: np.ndarray
+     """
+
+     # Iterate over columns from the end
+     for idx in range(arr.shape[1] - 1, -1, -1):
+         if np.any(arr[:, idx]):
+             return arr[:, :(idx + 1)]
+     return np.empty((arr.shape[0], 0))
+
+
+ def create_directory_for_filepath(filepath: str) -> None:
+     """Given a file path, creates the underlying directory structure if it doesn't already exist.
+
+     :param filepath: The path to the file for which the directory structure should be created.
+     :type filepath: str
+     :raises ValueError: If the provided path is empty or None.
+     :raises OSError: If there's an error creating the directory structure.
+     """
+
+     if not filepath:
+         raise ValueError("The provided filepath is empty or None.")
+
+     directory = os.path.dirname(filepath)
+
+     if directory and not os.path.exists(directory):
+         try:
+             os.makedirs(directory)
+             print(f"Directory structure {directory} created successfully.")
+         except OSError as e:
+             raise OSError(f"Error creating directory structure {directory}. Error: {e}")
+
+ # Example usage:
+ # create_directory_for_filepath("/path/to/directory/that/might/not/exist/filename.txt")
+
+
+ def check_file_exists(file_path: str) -> bool:
+     """Checks if the provided file path exists.
+
+     :param file_path: Path to the file.
+     :type file_path: str
+     :returns: True if the file exists, raises ValueError otherwise.
+     :rtype: bool
+     """
+     if os.path.exists(file_path):
+         return True
+     else:
+         raise ValueError(f"The provided file path '{file_path}' does not exist.")
+
+
+ def count_gpus(method="clinfo"):
+     """
+     Count the number of available GPUs using the specified method.
+
+     This function counts the number of NVIDIA and AMD GPUs using the chosen method. By default, it uses the 'clinfo'
+     method for AMD GPUs.
+
+     :param method: The method to use for GPU counting. Choose between 'clinfo' (default) and 'rocm'.
+     :type method: str, optional
+
+     :return: The total number of GPUs detected.
+     :rtype: int
+
+     :raises ValueError: If an unknown method is provided.
+     :raises Exception: If an error occurs while querying AMD GPUs using the specified method.
+
+     .. note::
+         - The 'clinfo' method queries AMD GPUs by running the 'clinfo' command.
+         - The 'rocm' method queries AMD GPUs by running the 'rocm-smi --list' command.
+     """
+     import torch
+
+     # Count NVIDIA GPUs
+     nvidia_gpu_count = torch.cuda.device_count()
+
+     # Count AMD GPUs
+     amd_gpu_count = 0
+     try:
+         if method == "clinfo":
+             clinfo_output = subprocess.check_output('clinfo').decode('utf-8')
+             amd_gpu_count = clinfo_output.lower().count('device type: gpu')
+         elif method == "rocm":
+             rocm_output = subprocess.check_output('rocm-smi --list', shell=True).decode('utf-8')
+             amd_gpu_count = len(rocm_output.strip().split('\n'))
+         else:
+             raise ValueError("Unknown method provided. Choose between 'clinfo' and 'rocm'.")
+     except Exception as e:
+         print(f"Error querying AMD GPUs using method '{method}': {e}")
+
+     total_gpus = nvidia_gpu_count + amd_gpu_count
+
+     return total_gpus
+
+
+ def create_hard_links(source_directory: str, target_directory: str, blacklist: list = []) -> str:
+     """Creates hard links for all files from the source directory to the target directory.
+
+     :param source_directory: The directory containing the original files.
+     :type source_directory: str
+     :param target_directory: The directory where hard links will be created.
+     :type target_directory: str
+     :param blacklist: List of filenames to exclude from creating hard links.
+     :type blacklist: list
+     :returns: A confirmation message.
+     :rtype: str
+     """
+
+     # Ensure the provided directories exist
+     if not os.path.exists(source_directory):
+         raise ValueError(f"The source directory '{source_directory}' does not exist.")
+     if not os.path.exists(target_directory):
+         os.makedirs(target_directory)
+
+     # Iterate through the files in the source directory
+     for filename in os.listdir(source_directory):
+         source_file_path = os.path.join(source_directory, filename)
+         target_file_path = os.path.join(target_directory, filename)
+
+         # Check for files to skip
+         if (filename.startswith('.') or
+                 filename.startswith('_') or
+                 os.path.isdir(source_file_path) or
+                 filename in blacklist):
+             continue
+
+         # Create a hard link
+         os.link(source_file_path, target_file_path)
+
+     return f"Hard links created in {target_directory} from {source_directory}."
+
+ # Example usage
+ # create_hard_links("/path/to/source_directory", "/path/to/target_directory", blacklist=["file_to_skip.txt"])
+
+
+ def create_selected_hard_links(source_directory: str, target_directory: str, filenames: list) -> str:
+     """Creates hard links for the specified files from the source directory to the target directory.
+
+     :param source_directory: The directory containing the original files.
+     :type source_directory: str
+     :param target_directory: The directory where hard links will be created.
+     :type target_directory: str
+     :param filenames: List of filenames for which hard links should be created.
+     :type filenames: list
+     :returns: A confirmation message.
+     :rtype: str
+     """
+
+     # Ensure the provided directories exist
+     if not os.path.exists(source_directory):
+         raise ValueError(f"The source directory '{source_directory}' does not exist.")
+     if not os.path.exists(target_directory):
+         os.makedirs(target_directory)
+
+     # Iterate through the specified filenames
+     for filename in filenames:
+         source_file_path = os.path.join(source_directory, filename)
+         target_file_path = os.path.join(target_directory, filename)
+
+         # Ensure the file exists in the source directory
+         if not os.path.isfile(source_file_path):
+             print(f"Warning: {filename} does not exist in the source directory. Skipping.")
+             continue
+
+         # Create a hard link
+         try:
+             os.link(source_file_path, target_file_path)
+         except FileExistsError:
+             print(f'The target hard link {target_file_path} exists. Skipping...')
+
+     return f"Hard links for specified files created in {target_directory} from {source_directory}."
+
+
+ def remove_hidden_files(directory: str) -> None:
+     """Removes all files recursively in a folder that start with '.' or '_'.
+
+     :param directory: The directory from which hidden files should be removed.
+     :type directory: str
+     :returns: None
+     """
+
+     # Ensure the directory exists
+     if not os.path.exists(directory):
+         raise ValueError(f"The directory '{directory}' does not exist.")
+
+     # Use os.walk to iterate through all subdirectories and files
+     for dirpath, dirnames, filenames in os.walk(directory, topdown=False):
+
+         # Filter out directories starting with '.' or '_'
+         dirnames[:] = [d for d in dirnames if not d.startswith('.') and not d.startswith('_')]
+
+         # Remove files starting with '.' or '_'
+         for filename in filenames:
+             if filename.startswith('.') or filename.startswith('_'):
+                 file_path = os.path.join(dirpath, filename)
+                 os.remove(file_path)
+                 print(f"Removed: {file_path}")
+
+     print(f"All hidden files removed from {directory}.")
sequtils.py ADDED
@@ -0,0 +1,980 @@
1
+
2
+ import logging
3
+
4
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
5
+ # coding=utf-8
6
+
7
+ """ Library for sequence processing """
8
+
9
+
10
+ import os
11
+ import sys
12
+ import pandas as pd
13
+ from multiprocessing import Pool
14
+ import multiprocessing
15
+ from os.path import join, isfile, splitext
16
+ from os import listdir
17
+ import random
18
+ from Bio import SeqIO
19
+ import numpy as np
20
+ import math
21
+ import gzip
22
+ from mimetypes import guess_type
23
+ from functools import partial
24
+ import operator
25
+ import pathlib
26
+ #from typing import Dict, List, Type, Tuple
27
+ from itertools import product
28
+ from typing import List, Union, Dict, Any, Optional, Tuple, Type, Set
29
+ from .general_utils import *
30
+ from Bio.Seq import Seq
31
+ from Bio.SeqRecord import SeqRecord
32
+ from scipy.ndimage import convolve1d
33
+ import h5py
34
+
35
+ def load_contigs(
36
+ fasta_files_list: Union[List[str], str],
37
+ adding_reverse_complement: bool = True,
38
+ IsAddHeader: bool = False,
39
+ AsDataFrame: bool = False,
40
+ to_uppercase: bool = False,
41
+ is_add_sequence_id: bool = False
42
+ ) -> Union[List[Union[str, List[str]]], pd.DataFrame]:
43
+ """
44
+ Loads contigs from a list of FASTA files.
45
+
46
+ :param fasta_files_list: List of paths to FASTA files or a single file path. Compressed (gz) FASTA files are accepted.
47
+ :type fasta_files_list: Union[List[str], str]
48
+ :param adding_reverse_complement: If True, adds the reverse complement of each sequence. Defaults to True.
49
+ :type adding_reverse_complement: bool
50
+ :param IsAddHeader: If True, includes the FASTA ID and description in the output. Defaults to False.
51
+ :type IsAddHeader: bool
52
+ :param AsDataFrame: If True, returns the sequences as a pandas DataFrame. Defaults to False.
53
+ :type AsDataFrame: bool
54
+ :param to_uppercase: If True, converts sequences to uppercase. Defaults to False.
55
+ :type to_uppercase: bool
56
+ :param is_add_sequence_id: If True, adds a unique integer sequence ID to each sequence. Defaults to False.
57
+ :type is_add_sequence_id: bool
58
+ :return: The loaded sequences. Each sequence is represented as a string if IsAddHeader is False, or as a list
59
+ [sequence_id, fasta_id, description, source_file, sequence, orientation] if IsAddHeader is True and is_add_sequence_id is True.
60
+ If AsDataFrame is True, the sequences are returned as a DataFrame.
61
+ :rtype: Union[List[Union[str, List[str]]], pd.DataFrame]
62
+
63
+ Example:
64
+ >>> fasta_files = ['path/to/file1.fasta', 'path/to/file2.fasta.gz']
65
+ >>> load_contigs(fasta_files, adding_reverse_complement=False, IsAddHeader=True, AsDataFrame=True, to_uppercase=True, is_add_sequence_id=True)
66
+ # Returns a DataFrame with the sequences from the specified FASTA files, all in uppercase, with unique sequence IDs.
67
+ """
68
+
69
+ logging.info('Loading sequence data into memory!')
70
+ if isinstance(fasta_files_list, str):
71
+ logging.info('Since the fasta_files_list is a string, not a list, we convert it to a list.')
72
+ fasta_files_list = [fasta_files_list]
73
+
74
+ sequences = []
75
+ sequence_id = 0
76
+ df_cols = ['sequence_id', 'fasta_id', 'description', 'source_file', 'sequence', 'orientation'] if (IsAddHeader and is_add_sequence_id) else ['fasta_id', 'description', 'source_file', 'sequence', 'orientation'] if IsAddHeader else ['sequence']
77
+ for act_assembly in fasta_files_list:
78
+ # Determine the file encoding based on the file extension
79
+ encoding = guess_type(act_assembly)[1]
80
+ _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
81
+ with _open(act_assembly) as f_assembly:
82
+ # Parse the fasta file
83
+ contigs = list(SeqIO.parse(f_assembly, "fasta"))
84
+ for contig in contigs:
85
+ act_seq = str(contig.seq)[:] if not to_uppercase else str(contig.seq).upper()[:]
86
+ act_header = str(contig.id)
87
+ act_description = str(contig.description)
88
+ if adding_reverse_complement:
89
+ # Compute the reverse complement of the sequence
90
+ act_reverse_complement = str(contig.seq.reverse_complement()) if not to_uppercase else str(contig.seq.reverse_complement()).upper()
91
+
92
+ if IsAddHeader:
93
+ # Include sequence ID (if applicable), fasta ID, description, source file, sequence, and orientation in the output
94
+ entry = [sequence_id] if is_add_sequence_id else []
95
+ entry.extend([act_header, act_description, act_assembly, act_seq, 'forward'])
96
+ sequences.append(entry)
97
+ if adding_reverse_complement:
98
+ entry = [sequence_id + 1] if is_add_sequence_id else []
99
+ entry.extend([act_header, act_description, act_assembly, act_reverse_complement, 'reverse'])
100
+ sequences.append(entry)
101
+ if is_add_sequence_id:
102
+ sequence_id += 2
103
+ else:
104
+ sequence_id+=1
105
+ else:
106
+ # Only include the sequence in the output
107
+ sequences.append(act_seq)
108
+ if adding_reverse_complement:
109
+ sequences.append(act_reverse_complement)
110
+
111
+ if AsDataFrame:
112
+ # Convert the sequences to a DataFrame
113
+ sequences = pd.DataFrame(sequences, columns=df_cols)
114
+ return sequences
115
+
116
+
117
+ def segment_sequence_contiguous(
118
+ sequence: str,
119
+ params: Dict[str, Any],
120
+ sequence_id: Optional[Any] = np.nan
121
+ ) -> List[Dict[str, Any]]:
122
+ """
123
+ Creates end-to-end, disjoint segments of a sequence without overlaps.
124
+
125
+ Segments smaller than the predefined minimum length will be discarded.
126
+ This function returns a list of segments along with their positions in the original sequence.
127
+
128
+ :param sequence: The input nucleotide sequence to be segmented.
129
+ :type sequence: str
130
+ :param params: Dictionary containing the segmentation parameters. Must include 'min_length' and 'max_length' keys
131
+ specifying the minimum and maximum lengths of the segments, respectively. Can contain other parameters.
132
+ :type params: Dict[str, Any]
133
+ :param sequence_id: An identifier for the sequence, optional. Defaults to NaN.
134
+ :type sequence_id: Optional[Any]
135
+ :return: A list of dictionaries, each representing a segment. Each dictionary contains the segment's sequence,
136
+ start position, end position, and sequence ID.
137
+ :rtype: List[Dict[str, Any]]
138
+
139
+ Example:
140
+ >>> params = {'min_length': 0, 'max_length': 100}
141
+ >>> segment_sequence_contiguous('ATCGATCGA', params)
142
+ [{'segment': 'ATCGATCGA', 'segment_start': 0, 'segment_end': 9, 'sequence_id': np.nan}]
143
+ """
144
+
145
+ # Extract segmentation parameters
146
+ min_segment_len = params['min_length']
147
+ max_segment_len = params['max_length']
148
+
149
+ # Ensure the sequence is treated as a string
150
+ if isinstance(sequence, str):
151
+ act_seq = sequence
152
+ L = len(sequence)
153
+
154
+ segments = []
155
+ for i in range(0, L, max_segment_len):
156
+ act_start_pos = i
157
+ act_end_pos = min(i + max_segment_len, L)
158
+ act_segment = sequence[act_start_pos:act_end_pos]
159
+
160
+
161
+
162
+ # Add segment to the list if it's longer than the minimum length
163
+ if len(act_segment) >= min_segment_len:
164
+ new_record = {
165
+ 'segment': act_segment,
166
+ 'segment_start': act_start_pos,
167
+ 'segment_end': act_end_pos,
168
+ 'sequence_id': sequence_id
169
+ }
170
+ segments.append(new_record)
171
+
172
+ return segments
173
+
174
+
175
+
176
+ def segment_sequences_random(
177
+ sequences: Union[pd.DataFrame, List[str]],
178
+ params: Dict[str, Union[int, float, str, Dict, List, Tuple]]
179
+ ) -> List[Dict[str, Union[int, str]]]:
180
+ """
181
+ Randomly segments the input sequences.
182
+
183
+ This function accepts either a list of sequences or a DataFrame containing sequences.
184
+ If a DataFrame is provided, it's assumed to have preprocessed sequences with "sequence" and "sequence_id" columns,
185
+ where "sequence_id" is a valid primary key. The function returns a list of dictionaries,
186
+ each containing details of a segment including its sequence, start position, end position,
187
+ associated sequence ID, and a segment ID (not generated in this function).
188
+
189
+ :param sequences: A DataFrame containing sequences with "sequence" and "sequence_id" columns or a list of sequences.
190
+ :type sequences: Union[pd.DataFrame, List[str]]
191
+ :param params: Dictionary containing segmentation parameters such as 'coverage', 'min_length', and 'max_length'.
192
+ :type params: Dict[str, Union[int, float, str, Dict, List, Tuple]]
193
+ :return: A list of dictionaries with each containing details of a segment.
194
+ :rtype: List[Dict[str, Union[int, str]]]
195
+
196
+ Notes:
197
+ - The actual number of segments may differ from the expected number due to random sampling and sequences
198
+ being shorter than the specified segment size.
199
+ - Segment IDs are not generated by this function.
200
+ """
201
+
202
+ # Calculate sequence lengths and cumulative sum of lengths
203
+ sequences['seq_lengths'] = sequences.apply(lambda x: len(x['sequence']), axis=1)
204
+ sequences['length_cum_sum'] = sequences['seq_lengths'].cumsum()
205
+ Lseqs = sum(sequences['seq_lengths'])
206
+
207
+ # Calculate the number of segments to sample based on expected coverage.
208
+ # Note: The actual number might be biased if many sequences are "short" compared to the segment sizes.
209
+ N_segments = int(np.ceil(params['coverage'] * Lseqs / params['max_length']))
210
+ logging.info(f'Sampling {N_segments} segments from {len(sequences)} sequences.')
211
+
212
+ # Generate random starting coordinates for segments
213
+ start_coords = list(np.sort(np.int64(np.random.uniform(0, sequences['length_cum_sum'].max(), N_segments))))
214
+ segmentdb = []
215
+
216
+ for sid, act_sampling_coord in enumerate(start_coords):
217
+
218
+ diff = act_sampling_coord - sequences['length_cum_sum']
219
+
220
+ # Find the sequence in which the current segment starts
221
+ for i in range(len(sequences['length_cum_sum'])):
222
+ if diff[i] < 0:
223
+ break
224
+
225
+ act_sequence_id = sequences['sequence_id'].iloc[i]
226
+ rel_coord = act_sampling_coord - sequences['length_cum_sum'].iloc[i] + sequences['seq_lengths'].iloc[i]
227
+
228
+ segment_end = min(rel_coord + params['max_length'], sequences['seq_lengths'].iloc[i])
229
+
230
+ # Skip the segment if it's shorter than the minimum segment length
231
+ if segment_end - rel_coord < params['min_length']:
232
+ pred_segment = sequences['sequence'].iloc[i][rel_coord:segment_end]
233
+ minimum_len = params['min_length']
234
+ logging.info(f'Segment too short, skipping! Sampled segment: {pred_segment}, segment end coordinate: {segment_end}, relative coordinate: {rel_coord}, minimum length: {minimum_len}')
235
+ continue
236
+
237
+ new_segment = sequences['sequence'].iloc[i][rel_coord:segment_end]
238
+ new_record = {
239
+ 'sequence_id': act_sequence_id,
240
+ 'segment_start': rel_coord,
241
+ 'segment_end': segment_end,
242
+ 'segment': new_segment,
243
+ 'segment_id': str(sid)
244
+ }
245
+
246
+ segmentdb.append(new_record)
247
+
248
+ return segmentdb
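+
+ # Illustrative usage sketch (added for documentation; output is random by design).
+ # The expected segment count is roughly coverage * total_length / max_length:
+ # >>> df = pd.DataFrame({'sequence_id': [0, 1], 'sequence': ['ATCG' * 50, 'GGTA' * 50]})
+ # >>> params = {'coverage': 2.0, 'min_length': 10, 'max_length': 50}
+ # >>> segs = segment_sequences_random(df, params)
+ # >>> len(segs) <= 16 # ceil(2.0 * 400 / 50), minus any too-short samples
+ # True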
249
+
250
+ def segment_sequences(
251
+ sequences: Union[List[str], pd.DataFrame],
252
+ params: Dict[str, Union[int, float, str, ]],
253
+ AsDataFrame: bool = False
254
+ ) -> Union[List[str], pd.DataFrame]:
255
+ """
256
+ Segments sequences based on the provided parameters.
257
+
258
+ This function assumes that the sequence is quality controlled and preprocessed, i.e., it is a valid nucleotide sequence.
259
+ If sequences are provided as a DataFrame, then it is assumed that there is a "sequence_id" and
260
+ a "sequence" attribute. The "sequence_id" should be a valid primary key.
261
+ If the output is requested as a DataFrame, then the IDs are added as well.
262
+
263
+ :param sequences: A list of sequences or a DataFrame containing sequences.
264
+ If a DataFrame, it must have "sequence_id" and "sequence" attributes.
265
+ :type sequences: Union[List[str], pd.DataFrame]
266
+ :param params: Dictionary containing the segmentation parameters.
267
+ - 'type' (str): The type of segmentation ('contiguous' or 'random').
268
+ - 'min_length' (int): Minimum length of a segment.
269
+ - 'max_length' (int): Maximum length of a segment.
270
+ - 'coverage' (float): Coverage percentage for random segmentation.
271
+ :type params: Dict[str, Union[int, float, str, Dict[str, int], List[int], Tuple[int, int]]]
272
+ :param AsDataFrame: If True, the output will be a DataFrame. If False, it will be a list. Defaults to False.
273
+ :type AsDataFrame: bool
274
+ :return: List of segmented sequences or a DataFrame with segmented sequences and their corresponding information based on the `AsDataFrame` parameter.
275
+ :rtype: Union[List[str], pd.DataFrame]
276
+ :raises ValueError: If the provided sequences DataFrame does not have the required attributes.
277
+ :raises ValueError: If the "sequence_id" column is not a valid primary key.
278
+
279
+ Examples:
280
+ >>> segment_sequences(['AATCAATTTTATTT', 'AGCCGATTCAATTGCATTATTT'], {'type': 'contiguous', 'min_length': 1, 'max_length': 1000, 'coverage': 1.0})
281
+ """
282
+
283
+ segmentation_type = params['type']
284
+
285
+ # Check that the expected primary key and sequence attributes are present
286
+ expected_attributes = ['sequence_id', 'sequence']
287
+ return_cols = ['segment_id', 'sequence_id', 'segment_start', 'segment_end', 'segment']
288
+
289
+ if isinstance(sequences, list):
290
+ logging.info('Sequences provided as a list; ignoring ids and tracking information.')
291
+ IsSequenceId = None
292
+ IsSeqList = True
293
+ elif isinstance(sequences, pd.DataFrame):
295
+ logging.info('Checking input DataFrame!')
296
+ check_expected_columns(sequences, expected_attributes)
297
+ logging.info('Checking input sequence_id is valid primary key in the DataFrame')
298
+ is_valid_primary_key(sequences, 'sequence_id')
299
+ IsSequenceId = True
300
+ IsSeqList = False
301
+
302
+ segments = []
303
+ if segmentation_type == 'contiguous':
304
+ if IsSeqList:
305
+ if IsSequenceId:
306
+ for act_seq_id, seq in enumerate(sequences):
307
+ act_segments = segment_sequence_contiguous(seq, params, act_seq_id)
308
+ segments.extend(act_segments)
309
+ else:
310
+ for seq in sequences:
311
+ act_segments = segment_sequence_contiguous(seq, params)
312
+ segments.extend(act_segments)
313
+ else:
314
+ for _, rec in sequences.iterrows():
315
+ act_seq = rec['sequence']
316
+ act_seq_id = rec['sequence_id']
317
+ act_segments = segment_sequence_contiguous(act_seq, params, act_seq_id)
318
+ segments.extend(act_segments)
319
+
320
+ elif segmentation_type == 'random':
321
+ if IsSeqList:
322
+ sequence_df = pd.DataFrame(sequences,
323
+ columns=['sequence'])
324
+ sequence_df['sequence_id'] = list(range(len(sequences)))
325
+ segments = segment_sequences_random(sequence_df, params)
326
+
327
+ else:
328
+ segments = segment_sequences_random(sequences, params)
329
+ if AsDataFrame:
330
+ #logging.info('Creating a DataFrame from the segments. ')
331
+ segment_db = pd.DataFrame(segments)
332
+ segment_ids = list(range(len(segment_db)))
333
+ segment_db['segment_id'] = segment_ids
334
+ segment_db = segment_db[return_cols]
335
+
336
+ else:
337
+ segment_db = [seg['segment'] for seg in segments]
338
+ return segment_db
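+
+ # Illustrative usage sketch (added for documentation).
+ # Contiguous segmentation of a plain list returns a plain list of segment strings:
+ # >>> params = {'type': 'contiguous', 'min_length': 1, 'max_length': 8, 'coverage': 1.0}
+ # >>> segment_sequences(['AATCAATTTTATTT'], params)
+ # ['AATCAATT', 'TTATTT']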
339
+
340
+ def lca_kmer_tokenize_segment(segment: str, offset: int, params: Dict[str, Dict[str, int] | int | float]) -> List[str]:
341
+ # Calculate the k-mer tokenization of a segment for a single offset value.
342
+ shift = params['shift']
343
+ max_segment_length = params['max_segment_length']
344
+ max_unknown_token_proportion = params['max_unknown_token_proportion']
345
+ kmer = params['kmer']
346
+ token_limit = params['token_limit']
347
+ vocabmap = params['vocabmap']
348
+ add_special_token = params['add_special_token']
349
+ if len(segment) > max_segment_length:
350
+ raise ValueError(f'The segment ({len(segment)}) is longer than the maximum allowed segment length ({max_segment_length}).')
351
+
352
+ kmers = [segment[i:i + kmer] for i in range(offset, len(segment) - kmer + 1, shift)]
353
+
354
+ return kmers
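+
+ # Illustrative usage sketch (added for documentation): the offset selects which frame
+ # of the shifted k-mer decomposition is returned.
+ # >>> p = {'shift': 2, 'max_segment_length': 512, 'max_unknown_token_proportion': 0.2,
+ # ... 'kmer': 6, 'token_limit': 512, 'vocabmap': {}, 'add_special_token': True}
+ # >>> lca_kmer_tokenize_segment('ATCGATCGAT', 0, p)
+ # ['ATCGAT', 'CGATCG', 'ATCGAT']
+ # >>> lca_kmer_tokenize_segment('ATCGATCGAT', 1, p)
+ # ['TCGATC', 'GATCGA']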
355
+
356
+
357
+
358
+
359
+
360
+ def lca_tokenize_segment(segment: str, params: Dict[str, Dict[str, int] | int | float]) -> Tuple[List[List[int]], List[List[str]]]:
361
+ """
362
+ Tokenizes a single segment using Local Context Aware (LCA) tokenization.
363
+ The segment is first split into k-mers with specified shifts and then tokenized into token vectors.
364
+
365
+ :param segment: The input nucleotide sequence segment to be tokenized.
366
+ :type segment: str
367
+ :param params: Dictionary containing the tokenization parameters.
368
+ - 'shift' (int): The k-mer shift parameter.
369
+ - 'max_segment_length' (int): Maximum allowable segment length.
370
+ - 'max_unknown_token_proportion' (float): Maximum allowable proportion of unknown tokens in a segment.
371
+ - 'kmer' (int): Size of the k-mer.
372
+ - 'token_limit' (int): Maximum number of tokens allowed in the tokenized output.
373
+ - 'vocabmap' (dict[str, int]): Dictionary mapping k-mers to their respective token values.
374
+ :type params: dict
375
+ :returns: A tuple containing:
376
+ - list[list[int]]: List of tokenized segments (each segment as a list of integers).
377
+ - list[list[str]]: List of k-merized segments with different shifts (each segment as a list of strings).
378
+ :rtype: Tuple[List[List[int]], List[List[str]]]
379
+ :raises ValueError: If the segment length exceeds the `max_segment_length`.
380
+
381
+ Examples:
382
+ >>> vocabmap_example = {"[CLS]": 2, "[SEP]": 3, "[UNK]": 0, "TCTTT": 4, "CTTTG": 5, "TTTGC": 6, "TTGCT": 7}
383
+ >>> segment_example = 'TCTTTGCT'
384
+ >>> params_example = {'shift': 1, 'max_segment_length': 512, 'max_unknown_token_proportion': 0.2, 'kmer': 5, 'token_limit': 10, 'vocabmap': vocabmap_example}
385
+ >>> lca_tokenize_segment(segment_example, params_example)
386
+ ([[2, 4, 5, 6, 7, 3]], [['TCTTT', 'CTTTG', 'TTTGC', 'TTGCT']])
387
+ """
388
+
389
+
390
+ #logging.info('Tokenizing a segment')
391
+ shift = params['shift']
392
+ max_segment_length = params['max_segment_length']
393
+ max_unknown_token_proportion = params['max_unknown_token_proportion']
394
+ kmer = params['kmer']
395
+ token_limit = params['token_limit']
396
+ vocabmap = params['vocabmap']
397
+ add_special_token = params['add_special_token']
398
+ if len(segment) > max_segment_length:
399
+ raise ValueError(f'The segment ({len(segment)}) is longer than the maximum allowed segment length ({max_segment_length}).')
400
+
401
+ kmers_offset = []
402
+ # For every possible offset we should get a k-mer vector.
404
+ # If the segment is too short or empty the result may be degenerate, so ensure segments are validated upstream.
404
+ for offset in range(shift):
405
+ kmers = [segment[i:i + kmer] for i in range(offset, len(segment) - kmer + 1, shift)]
406
+ kmers_offset.append(kmers)
407
+ # Mapping the k-mers into numbers
408
+ tokenized_segments = tokenize_kmerized_segment_list(kmers_offset, vocabmap, token_limit, max_unknown_token_proportion, add_special_token)
409
+ return tokenized_segments, kmers_offset
410
+
411
+
412
+
413
+ def tokenize_kmerized_segment_list(kmerized_segments: List[List[str]],
414
+ vocabmap: Dict[str, int],
415
+ token_limit: int,
416
+ max_unknown_token_proportion: float,
417
+ add_special_tokens: bool = True) -> List[List[int]]:
418
+ """Tokenizes or vectorizes a list of k-merized segments into a list of token vectors. If the expected number of
419
+ tokens in a segment exceeds the maximum allowed tokens (`token_limit`), the function raises an error. For segments
420
+ where unknown k-mers exceed the proportion set by `max_unknown_token_proportion`, the output is a special token
421
+ sequence indicating an empty sentence.
422
+
423
+ :param kmerized_segments: List containing k-merized segments.
424
+ :type kmerized_segments: List[List[str]]
425
+ :param vocabmap: Dictionary that maps k-mers to their respective token values.
426
+ :type vocabmap: Dict[str, int]
427
+ :param token_limit: Maximum number of tokens allowed in the tokenized output.
428
+ :type token_limit: int
429
+ :param max_unknown_token_proportion: Maximum allowable proportion of unknown tokens in a segment.
430
+ :type max_unknown_token_proportion: float
431
+ :param add_special_tokens: Whether to add special tokens (`[CLS]` and `[SEP]`) to the tokenized segments.
432
+ :type add_special_tokens: bool, optional (default=True)
433
+ :returns: List containing tokenized segments.
434
+ :rtype: List[List[int]]
435
+ :raises ValueError: If the expected number of tokens in a segment exceeds `token_limit`.
436
+
437
+ Examples
438
+ --------
439
+
440
+ >>> vocabmap_example = {"[CLS]": 2, "[SEP]": 3, "[UNK]": 0, "TCTTTG": 4, "CTTTGC": 5, "TTTGCT": 6, "TTGCTA": 7}
441
+ >>> kmerized_segment_example = [['TCTTTG', 'CTTTGC', 'TTTGCT', 'TTGCTA']]
442
+ >>> tokenize_kmerized_segment_list(kmerized_segment_example, vocabmap_example, 10, 0.2)
443
+ [[2, 4, 5, 6, 7, 3]]
444
+ """
445
+
446
+ tokenized_segments = []
447
+ if add_special_tokens:
448
+ empty_sentence = [2, 3]
449
+ else:
450
+ empty_sentence = []
451
+
452
+ for act_kmer_list in kmerized_segments:
453
+ if add_special_tokens:
454
+ tokenized_kmerized_segment = [vocabmap['[CLS]']]
455
+ else:
456
+ tokenized_kmerized_segment = []
457
+ unkcount = 0
458
+ L_kmerized_segment = len(act_kmer_list)
459
+ unkw_tsh_count = int(L_kmerized_segment * max_unknown_token_proportion)
460
+ if len(act_kmer_list) + 2 > token_limit:
461
+ raise ValueError(f'The expected number of tokens in the segment ({L_kmerized_segment + 2}) is larger than the maximum allowed number of tokens ({token_limit}).')
462
+ if L_kmerized_segment == 0:
463
+ logging.info('Encountered an empty sentence')
464
+ tokenized_kmerized_segment = empty_sentence
465
+ tokenized_segments.append(empty_sentence)
466
+ continue
467
+ for kmer in act_kmer_list:
468
+ try:
469
+ tokenized_kmerized_segment.append(vocabmap[kmer.upper()])
470
+ except KeyError:
471
+ tokenized_kmerized_segment.append(vocabmap['[UNK]'])
472
+ unkcount += 1
473
+ if unkcount > unkw_tsh_count:
474
+ tokenized_segments.append(empty_sentence)
475
+ continue
476
+ if add_special_tokens:
477
+ tokenized_kmerized_segment.append(vocabmap['[SEP]'])
478
+ tokenized_segments.append(tokenized_kmerized_segment)
479
+
480
+ return tokenized_segments
481
+
482
+ def process_batch_tokenize_segments_with_ids(
483
+ segments: List[str],
484
+ segment_ids: List[Any],
485
+ tokenization_params: Dict[str, Any],
486
+ np_token_type: type = np.uint16
487
+ ) -> Dict[Any, List[np.ndarray]]:
488
+ """
489
+ Tokenizes a batch of segments and associates them with their provided IDs.
490
+
491
+ This function generates vector representations for a collection of segments, assuming the segments
492
+ have undergone quality control. The result is a dictionary where the keys are segment IDs, and the values
493
+ are lists of potential vector representations for the segment, with each list element corresponding to
494
+ a specific shift.
495
+
496
+ The vector representations are converted to numpy arrays. The output is not a 2D rectangular array but
497
+ a dictionary mapping each segment ID to its tokenized representations.
498
+
499
+ :param segments: A list of preprocessed and validated segments.
500
+ :type segments: List[str]
501
+ :param segment_ids: A list of segment IDs corresponding to each segment in `segments`.
502
+ :type segment_ids: List[Any]
503
+ :param tokenization_params: A dictionary containing tokenization parameters.
504
+ :type tokenization_params: Dict[str, Any]
505
+ :param np_token_type: Numpy data type for the tokenized segments. Defaults to np.uint16.
506
+ :type np_token_type: type, optional
507
+ :return: A dictionary with segment IDs as keys and lists of numpy arrays representing tokenized segments as values.
508
+ :rtype: Dict[Any, List[np.ndarray]]
509
+
510
+ Example:
511
+ >>> segments = ['ACTG', 'TGCA']
512
+ >>> segment_ids = [1, 2]
513
+ >>> tokenization_params = {'max_segment_length': 50, ...}
514
+ >>> tokenized_segments = process_batch_tokenize_segments_with_ids(
515
+ segments, segment_ids, tokenization_params
516
+ )
517
+ """
518
+ tokenized_segments_with_ids = {}
519
+ for i, segment in enumerate(segments):
520
+ act_id = segment_ids[i]
521
+ tokenized_segments_with_ids[act_id] = []
522
+ max_segment_length = tokenization_params['max_segment_length']
523
+ if len(segment) > max_segment_length:
524
+ raise ValueError(f'The segment is longer ({len(segment)}) than the maximum allowed segment length ({max_segment_length}).')
525
+
526
+ tokenized_segment, _ = lca_tokenize_segment(segment, tokenization_params)
527
+ tokenized_segment = [np.array(act_segment, dtype=np_token_type) for act_segment in tokenized_segment]
528
+ tokenized_segments_with_ids[act_id] = tokenized_segment
529
+ return tokenized_segments_with_ids
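+
+ # Illustrative usage sketch (added for documentation), reusing the toy vocabmap from
+ # the lca_tokenize_segment docstring above:
+ # >>> vm = {"[CLS]": 2, "[SEP]": 3, "[UNK]": 0, "TCTTT": 4, "CTTTG": 5, "TTTGC": 6, "TTGCT": 7}
+ # >>> tp = {'shift': 1, 'max_segment_length': 512, 'max_unknown_token_proportion': 0.2,
+ # ... 'kmer': 5, 'token_limit': 10, 'vocabmap': vm, 'add_special_token': True}
+ # >>> process_batch_tokenize_segments_with_ids(['TCTTTGCT'], ['seg_0'], tp)
+ # {'seg_0': [array([2, 4, 5, 6, 7, 3], dtype=uint16)]}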
530
+
531
+ def batch_tokenize_segments_with_ids(
532
+ segment_data: Union[Tuple[List[str], List[Any]], pd.DataFrame],
533
+ tokenization_params: Dict[str, Any],
534
+ num_cores: int = 1,
535
+ batch_size: int = 10000,
536
+ np_token_type: type = np.uint16
537
+ ) -> Dict[Any, List[np.ndarray]]:
538
+ """
539
+ Parallel tokenization of segments with associated IDs.
540
+
541
+ This function splits the input data into batches and uses multiprocessing to tokenize
542
+ the segments in parallel. It supports both list/tuple inputs and pandas DataFrames.
543
+
544
+ :param segment_data: Either a tuple/list containing two elements (segments, segment_ids),
545
+ or a pandas DataFrame with 'segment' and 'segment_id' columns.
546
+ :type segment_data: Union[Tuple[List[str], List[Any]], pd.DataFrame]
547
+ :param tokenization_params: Dictionary containing tokenization parameters.
548
+ :type tokenization_params: Dict[str, Any]
549
+ :param num_cores: Number of CPU cores to use for parallel processing. Defaults to 1.
550
+ :type num_cores: int, optional
551
+ :param batch_size: Number of segments to process in each batch. Defaults to 10,000.
552
+ :type batch_size: int, optional
553
+ :param np_token_type: Numpy data type for the tokenized segments. Defaults to np.uint16.
554
+ :type np_token_type: type, optional
555
+ :return: A dictionary where keys are segment IDs and values are lists of numpy arrays representing tokenized segments.
556
+ :rtype: Dict[Any, List[np.ndarray]]
557
+ :raises ValueError: If the input data is neither a tuple/list nor a pandas DataFrame.
558
+
559
+ Example:
560
+ >>> segments = ['ACTG', 'TGCA']
561
+ >>> segment_ids = [1, 2]
562
+ >>> tokenization_params = {'max_segment_length': 50, ...}
563
+ >>> tokenized_data = batch_tokenize_segments_with_ids(
564
+ (segments, segment_ids),
565
+ tokenization_params,
566
+ num_cores=4,
567
+ batch_size=1000
568
+ )
569
+ """
570
+ if isinstance(segment_data, tuple) or isinstance(segment_data, list):
571
+ segments = segment_data[0]
572
+ segment_ids = segment_data[1]
573
+ elif isinstance(segment_data, pd.DataFrame):
574
+ segments = list(segment_data['segment'])
575
+ segment_ids = list(segment_data['segment_id'])
576
+ else:
577
+ raise ValueError(f'The input should be either pandas DataFrame or a tuple instead of {type(segment_data)}')
578
+
579
+ Ndata = len(segments)
580
+ batch_intervals = [(i, min(i + batch_size, Ndata)) for i in range(0, Ndata, batch_size)]
581
+ params = [
582
+ (segments[interval[0]:interval[1]],
583
+ segment_ids[interval[0]:interval[1]],
584
+ tokenization_params,
585
+ np_token_type)
586
+ for interval in batch_intervals
587
+ ]
588
+ with Pool(processes=num_cores) as pool:
589
+ result_list = pool.starmap(process_batch_tokenize_segments_with_ids, params)
590
+
591
+ tokenized_sets = {}
592
+ for d in result_list:
593
+ tokenized_sets.update(d)
594
+
595
+ return tokenized_sets
596
+
597
+
598
+ def get_rectangular_array_from_tokenized_dataset(tokenized_segments_data: Dict[int, List[np.ndarray]], shift: int, max_token_count: int, truncate_zeros: bool = True, randomize: bool = True, numpy_dtype: Type = np.uint16) -> Tuple[np.ndarray, pd.DataFrame]:
599
+ """Create a rectangular numpy array that can be used as input to a Language Model (LM) from tokenized segment data.
600
+
601
+ :param tokenized_segments_data: A dictionary where keys are segment ids and values are lists of possible LCA tokenized vectors.
602
+ :type tokenized_segments_data: Dict[int, List[np.ndarray]]
603
+
604
+ :param shift: Number of LCA offsets.
605
+ :type shift: int
606
+
607
+ :param max_token_count: Maximum allowed token count in the output numpy array.
608
+ :type max_token_count: int
609
+
610
+ :param truncate_zeros: If True, truncate columns from the end of the numpy array that only contain zeros. (default=True)
611
+ :type truncate_zeros: bool, optional
612
+
613
+ :param randomize: If True, randomize the order of the rows in the output numpy array. (default=True)
614
+ :type randomize: bool, optional
615
+
616
+ :param numpy_dtype: Data type of the values in the output numpy array. (default=np.uint16)
617
+ :type numpy_dtype: Type, optional
618
+
619
+ :returns: A rectangular numpy array suitable for input to an LM.
620
+ :rtype: np.ndarray
621
+
622
+ :returns: A dataframe that describes which row in the numpy array corresponds to which segment and its LCA offset.
623
+ Columns are: ['torch_id', 'segment_id', 'offset']
624
+ :rtype: pd.DataFrame
625
+
626
+ """
627
+
628
+
629
+ expected_length = len(tokenized_segments_data) * shift
630
+ X = np.full((expected_length, max_token_count), 0, dtype=numpy_dtype)
631
+ torch_db = []
632
+ torch_id = 0
633
+ for segment_id, tokenized_vectors in tokenized_segments_data.items():
634
+ for offset in range(shift):
635
+ segment_vector = tokenized_vectors[offset]
636
+ X[torch_id, 0:segment_vector.shape[0]] = segment_vector
637
+ torch_db.append([torch_id, segment_id, offset])
638
+ torch_id += 1
639
+ torch_tokenized_segment_db = pd.DataFrame(torch_db,
640
+ columns = ['torch_id', 'segment_id', 'offset'])
641
+
642
+ if randomize:
643
+ logging.info('Doing randomization!')
644
+ perm = np.random.permutation(expected_length)
645
+ X = X[perm, :]
646
+ torch_tokenized_segment_db.rename({'torch_id': 'original_torch_id'}, axis=1, inplace=True)
647
+ torch_tokenized_segment_db = torch_tokenized_segment_db.iloc[perm, :].reset_index(drop=True).reset_index().rename({'index': 'torch_id'}, axis=1)
648
+
649
+ if truncate_zeros:
650
+ logging.info('Truncating trailing all-zero columns')
651
+ X = truncate_zero_columns(X)
652
+ return X, torch_tokenized_segment_db
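+
+ # Illustrative usage sketch (added for documentation; assumes truncate_zero_columns,
+ # defined elsewhere in this module, drops trailing all-zero columns):
+ # >>> data = {0: [np.array([2, 4, 3], dtype=np.uint16)]}
+ # >>> X, db = get_rectangular_array_from_tokenized_dataset(data, shift=1,
+ # ... max_token_count=5, randomize=False)
+ # >>> X
+ # array([[2, 4, 3]], dtype=uint16)
+ # >>> list(db.columns)
+ # ['torch_id', 'segment_id', 'offset']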
653
+
654
+
655
+ def pretty_print_overlapping_sequence(segment, segment_kmers, tokenizer_params):
656
+ """
657
+ Format the sequence for pretty printing with overlapping k-mers.
658
+
659
+ :param segment: DNA sequence.
660
+ :type segment: str
661
+
662
+ :param segment_kmers: List of k-mers in the segment.
663
+ :type segment_kmers: list
664
+
665
+ :param tokenizer_params: Dictionary containing tokenization parameters.
666
+ :type tokenizer_params: dict
667
+
668
+ :return: List of formatted strings representing the sequence with overlapping k-mers.
669
+ :rtype: list
670
+ """
671
+
672
+ shift = tokenizer_params['shift']
673
+ k = tokenizer_params['kmer']
674
+ sep_c = 2
675
+ lines = []
676
+ base_offset = len(str(int((k + 3) / shift))) + 3
677
+ first_line = ' ' * base_offset + segment
678
+ lines.append(first_line)
679
+ nr_lines = int(np.ceil((k + sep_c) / shift))
680
+ logging.info('Number of lines to cover the sequence: {0}'.format(nr_lines))
681
+
682
+ for line_id in range(nr_lines):
683
+
684
+ line_mers = [k_mer for j, k_mer in enumerate(segment_kmers) if j % nr_lines == line_id]
685
+ act_line = str(line_id) + '. ' + ' ' * (line_id * shift) + (' ' * sep_c).join(line_mers)
686
+ lines.append(act_line)
687
+ lines = '\n'.join(lines)
688
+ return lines
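+
+ # Illustrative usage sketch (added for documentation): each numbered row holds every
+ # nr_lines-th k-mer, indented by its offset so the k-mers line up under the sequence.
+ # >>> seg = 'TCTTTGCT'
+ # >>> kmers = ['TCTTT', 'CTTTG', 'TTTGC', 'TTGCT']
+ # >>> print(pretty_print_overlapping_sequence(seg, kmers, {'shift': 1, 'kmer': 5}))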
689
+
690
+
691
+ def generate_kmers(abc: Set[str], k: int) -> List[str]:
692
+ """
693
+ Generates all possible k-mers from a given alphabet.
694
+
695
+ :param abc: The alphabet.
696
+ :type abc: Set[str]
697
+ :param k: Length of the k-mers.
698
+ :type k: int
699
+ :return: List of all possible k-mers.
700
+ :rtype: List[str]
701
+ """
702
+ return [''.join(p) for p in product(abc, repeat=k)]
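+
+ # Illustrative usage sketch (added for documentation; ordering follows set iteration order):
+ # >>> sorted(generate_kmers({'A', 'C'}, 2))
+ # ['AA', 'AC', 'CA', 'CC']
+ # >>> len(generate_kmers({'A', 'C', 'G', 'T'}, 6))
+ # 4096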
703
+
704
+ def save_to_hdf(X: np.ndarray, hdf_file_path: str, database: pd.DataFrame = None, compression: bool = False, pd_chunksize: int = 10_000_000) -> None:
705
+ """Save a numpy array and an optional pandas DataFrame to an HDF5 file.
706
+
707
+ :param X: 2D numpy array to be saved.
708
+ :type X: np.ndarray
709
+ :param hdf_file_path: Path to the HDF5 file.
710
+ :type hdf_file_path: str
711
+ :param database: Pandas DataFrame to be saved. Defaults to None.
712
+ :type database: pd.DataFrame
713
+ :param compression: Whether to apply compression. Defaults to False.
714
+ :type compression: bool
715
+ :param pd_chunksize: Number of rows per chunk for saving the DataFrame. Defaults to 10,000,000.
716
+ :type pd_chunksize: int
717
+ :raises ValueError: If the provided numpy array is not 2D.
718
+ :raises OSError: If there's an error creating the directory structure or removing an existing HDF5 file.
719
+ Example:
720
+
721
+ >>> import numpy as np
722
+ >>> import pandas as pd
723
+ >>> array = np.random.random((100, 100))
724
+ >>> df = pd.DataFrame({'A': range(1, 101), 'B': range(101, 201)})
725
+ >>> save_to_hdf(array, "sample.hdf5", database=df, compression=True)
726
+ """
727
+
728
+ # Check if X is a 2D numpy array
729
+ if len(X.shape) != 2:
730
+ raise ValueError("The provided numpy array is not 2D.")
731
+
732
+ # If HDF5 file exists, attempt to delete it
733
+ if os.path.exists(hdf_file_path):
734
+ try:
735
+ os.remove(hdf_file_path)
736
+ logging.info(f"Existing HDF5 file {hdf_file_path} removed successfully.")
737
+ except Exception as e:
738
+ raise OSError(f"Error removing existing HDF5 file {hdf_file_path}. Error: {e}")
739
+
740
+ # Create directory structure for HDF5 file
741
+ create_directory_for_filepath(hdf_file_path)
742
+
743
+ # Save the numpy array to HDF5
744
+ with h5py.File(hdf_file_path, 'w') as hdf:
745
+ try:
746
+ grp = hdf.create_group("training_data")
747
+ except ValueError:
748
+ # The group already exists: remove it and recreate it so `grp` is always bound
+ del hdf['training_data']
+ grp = hdf.create_group("training_data")
749
+
750
+ if compression:
751
+ grp.create_dataset("X", data=X, compression="lzf", chunks=True)
752
+ else:
753
+ grp.create_dataset("X", data=X, chunks=True)
754
+
755
+ logging.info(f"Numpy array saved to {hdf_file_path} successfully.")
756
+
757
+ # Save the pandas DataFrame to HDF5, if provided
758
+ if database is not None:
759
+ logging.info("Adding database into the HDF5 file!")
760
+ num_chunks = int(np.ceil(len(database) / pd_chunksize))
761
+ logging.info(f'Number of chunks: {num_chunks}')
762
+ chunk_grouping = np.arange(len(database)) // pd_chunksize
763
+ chunkseqs = database.groupby(chunk_grouping)
764
+ for i, (_, chunk) in enumerate(chunkseqs):
765
+ logging.info(f'Writing database chunk {i} into {hdf_file_path}')
766
+ if compression:
767
+ chunk.to_hdf(hdf_file_path, f'database_{i}', format='table', data_columns=True, mode='a', complib='lzo')
768
+ else:
769
+ chunk.to_hdf(hdf_file_path, f'database_{i}', format='table', data_columns=True, mode='a')
770
+
771
+ logging.info('Database addition finished!')
772
+
773
+
774
+
775
+ def dataframe_to_seqrecords(
776
+ df: pd.DataFrame,
777
+ fastaidcol: str = 'test_fastaid',
778
+ sequencecol: str = 'sequence'
779
+ ) -> List[SeqRecord]:
780
+ """
781
+ Convert a DataFrame with sequence information into a list of SeqRecord objects.
782
+
783
+ :param df: DataFrame containing at least two columns: one for sequence IDs and one for sequences.
784
+ :type df: pd.DataFrame
785
+ :param fastaidcol: Name of the column in `df` that contains sequence IDs. Defaults to 'test_fastaid'.
786
+ :type fastaidcol: str, optional
787
+ :param sequencecol: Name of the column in `df` that contains nucleotide sequences. Defaults to 'sequence'.
788
+ :type sequencecol: str, optional
789
+ :return: A list of SeqRecord objects constructed from the DataFrame.
790
+ :rtype: List[SeqRecord]
791
+
792
+ Example:
793
+ >>> import pandas as pd
794
+ >>> data = {'test_fastaid': ['seq1', 'seq2'], 'sequence': ['ATCG', 'GGTA']}
795
+ >>> df = pd.DataFrame(data)
796
+ >>> seq_records = dataframe_to_seqrecords(df)
797
+ >>> seq_records[0].id
798
+ 'seq1'
799
+ """
800
+ seq_records = []
801
+ for _, row in df.iterrows():
802
+ seq = Seq(row[sequencecol])
803
+ record = SeqRecord(seq, id=str(row[fastaidcol]), description="")
804
+ seq_records.append(record)
805
+ return seq_records
806
+
807
+
808
+ def write_seqrecords_to_fasta(
809
+ seq_records: List[SeqRecord],
810
+ file_name: str
811
+ ) -> None:
812
+ """
813
+ Write a list of SeqRecord objects to a FASTA file.
814
+
815
+ :param seq_records: List of SeqRecord objects to be written to file.
816
+ :type seq_records: List[SeqRecord]
817
+ :param file_name: Name or path of the file to write the FASTA records.
818
+ :type file_name: str
819
+ :return: None
820
+ :rtype: None
821
+
822
+ Example:
823
+ >>> from Bio.Seq import Seq
824
+ >>> from Bio.SeqRecord import SeqRecord
825
+ >>> seq_records = [SeqRecord(Seq('ATCG'), id='seq1'), SeqRecord(Seq('GGTA'), id='seq2')]
826
+ >>> write_seqrecords_to_fasta(seq_records, 'output.fasta')
827
+ """
828
+ SeqIO.write(seq_records, file_name, "fasta")
829
+
830
+
831
+ def dump_records_to_files(
832
+ seq_records: List[SeqRecord],
833
+ folder_path: str
834
+ ) -> None:
835
+ """
836
+ Write each SeqRecord to a separate FASTA file in the specified folder.
837
+
838
+ :param seq_records: List of SeqRecord objects to be written individually.
839
+ :type seq_records: List[SeqRecord]
840
+ :param folder_path: Path to the folder where the files should be saved.
841
+ The folder will be created if it does not exist.
842
+ :type folder_path: str
843
+ :return: None
844
+ :rtype: None
845
+
846
+ Example:
847
+ >>> from Bio.Seq import Seq
848
+ >>> from Bio.SeqRecord import SeqRecord
849
+ >>> seq_records = [SeqRecord(Seq('ATCG'), id='seq1'), SeqRecord(Seq('GGTA'), id='seq2')]
850
+ >>> dump_records_to_files(seq_records, 'sequences_folder')
851
+ """
852
+ # Ensure the folder exists
853
+ os.makedirs(folder_path, exist_ok=True)
854
+
855
+ for record in seq_records:
856
+ file_path = os.path.join(folder_path, f"{record.id}.fasta")
857
+ SeqIO.write(record, file_path, "fasta")
858
+
859
+
860
+ def split_seqrecords_to_fasta_chunks(
861
+ seq_records: List[SeqRecord],
862
+ output_folder: str,
863
+ chunk_size_mb: int = 10
864
+ ) -> None:
865
+ """
866
+ Splits a list of SeqRecord objects into multiple FASTA files, each less than a specified size in MB.
867
+
868
+ :param seq_records: List of SeqRecord objects to be split into chunks.
869
+ :type seq_records: List[SeqRecord]
870
+ :param output_folder: The output folder where the FASTA files will be saved.
871
+ :type output_folder: str
872
+ :param chunk_size_mb: Maximum size of each FASTA file in megabytes. Defaults to 10 MB.
873
+ :type chunk_size_mb: int, optional
874
+ :return: None
875
+ :rtype: None
876
+
877
+ Example:
878
+ >>> seq_records = [...] # A list of SeqRecord objects
879
+ >>> split_seqrecords_to_fasta_chunks(seq_records, 'output_chunks', chunk_size_mb=5)
880
+
881
+ Notes:
882
+ - The last chunk may be smaller than the specified `chunk_size_mb`.
883
+ - The function approximates the size of each record for chunking.
884
+ """
885
+ # Ensure output folder exists
886
+ os.makedirs(output_folder, exist_ok=True)
887
+
888
+ current_chunk = []
889
+ current_chunk_size = 0 # in bytes
890
+ chunk_id = 1 # Identifier for chunks/files
891
+ for record in seq_records:
892
+ # Approximate size of the record in bytes
893
+ record_size = len(str(record.seq)) + len(record.id) + 2 # Adding buffer for '>' and '\n'
894
+
895
+ # Check if adding this record exceeds the chunk size
896
+ if current_chunk_size + record_size > chunk_size_mb * 1024 * 1024:
897
+ file_path = os.path.join(output_folder, f"chunk_{chunk_id}.fasta")
898
+ SeqIO.write(current_chunk, file_path, "fasta")
899
+ current_chunk = []
900
+ current_chunk_size = 0
901
+ chunk_id += 1
902
+
903
+ current_chunk.append(record)
904
+ current_chunk_size += record_size
905
+
906
+ # Write any remaining records to the last chunk
907
+ if current_chunk:
908
+ file_path = os.path.join(output_folder, f"chunk_{chunk_id}.fasta")
909
+ SeqIO.write(current_chunk, file_path, "fasta")
910
+
911
+
912
+ def filter_short_sequences(
913
+ seq_records: List[SeqRecord],
914
+ length_threshold: int
915
+ ) -> List[SeqRecord]:
916
+ """
917
+ Filters out SeqRecord objects with sequences shorter than a specified threshold.
918
+
919
+ :param seq_records: List of SeqRecord objects.
920
+ :type seq_records: List[SeqRecord]
921
+ :param length_threshold: The minimum length of sequences to be retained.
922
+ :type length_threshold: int
923
+ :return: A list of SeqRecord objects that meet or exceed the length threshold.
924
+ :rtype: List[SeqRecord]
925
+
926
+ Example:
927
+ >>> from Bio.Seq import Seq
928
+ >>> from Bio.SeqRecord import SeqRecord
929
+ >>> records = [
930
+ ... SeqRecord(Seq('ATCG'), id='seq1'),
931
+ ... SeqRecord(Seq('AT'), id='seq2')
932
+ ... ]
933
+ >>> filtered_records = filter_short_sequences(records, 3)
934
+ >>> len(filtered_records)
935
+ 1
936
+ >>> filtered_records[0].id
937
+ 'seq1'
938
+ """
939
+ filtered_records = [record for record in seq_records if len(record.seq) >= length_threshold]
940
+ return filtered_records
941
+
942
+
943
+
944
+ def get_token_counts_for_segment(Lseg, kmer, shift, offset):
945
+ nr_tokens = int((Lseg - kmer) / shift + 1)
946
+ return nr_tokens
947
+
948
+ def get_seq_coordinates(token_pos, kmer, shift, offset):
949
+ seq_start = int(token_pos * shift + offset)
950
+ seq_end = int(token_pos * shift + kmer + offset)
951
+ return seq_start, seq_end
952
+
953
+ def get_token_coordinates(seq_pos, kmer, shift, offset, Lseg):
954
+
955
+ nrtokens = get_token_counts_for_segment(Lseg, kmer, shift, offset)
956
+
957
+ token_pos_end = int((seq_pos + offset - kmer) / shift)
958
+ token_pos_start = int((seq_pos + offset) / shift)
959
+
960
+ if token_pos_end < 0:
961
+ token_pos_end = 0
962
+ if token_pos_start >= nrtokens:
963
+ token_pos_start = nrtokens - 1
964
+
965
+ return token_pos_start, token_pos_end
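+
+ # Worked example (added for documentation) for the three coordinate helpers above,
+ # using a 100 bp segment with 6-mers, shift 2, offset 0:
+ # >>> get_token_counts_for_segment(100, 6, 2, 0)
+ # 48
+ # >>> get_seq_coordinates(10, 6, 2, 0) # token 10 spans bases [20, 26)
+ # (20, 26)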
966
+
967
+ def sliding_window_average(arr, window_size=6):
968
+ # Create a window for averaging
969
+ window = np.ones(window_size) / window_size
970
+ # Use 'valid' mode to slide the window over the array without padding
971
+ result = np.convolve(arr, window, mode='valid')
972
+ return result
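+
+ # Illustrative usage sketch (added for documentation): a moving average that shortens
+ # the array by window_size - 1 because of 'valid' convolution.
+ # >>> sliding_window_average(np.array([1., 2., 3., 4., 5., 6.]), window_size=3)
+ # array([2., 3., 4., 5.])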
973
+
974
+ def convolve_expression_array(expression_array, window_size=6, step=2):
975
+ # Define the averaging window
976
+ window = np.ones(window_size) / window_size
977
+ # Apply convolution along each row (axis=1)
978
+ convolved_array = convolve1d(expression_array, window, axis=1, mode='reflect')
979
+ # Downsample by step size
980
+ return convolved_array[:, ::step]
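+
+ # Illustrative usage sketch (added for documentation; assumes scipy.ndimage's convolve1d
+ # is imported at the top of this module):
+ # >>> arr = np.arange(20, dtype=float).reshape(2, 10)
+ # >>> convolve_expression_array(arr, window_size=2, step=2).shape
+ # (2, 5)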
tokenizer.py ADDED
@@ -0,0 +1,363 @@
1
+ import collections
2
+ import os
3
+ import json
4
+ from copy import deepcopy
5
+ from typing import List, Optional, Tuple, Dict
6
+ from transformers import PreTrainedTokenizer
7
+ from transformers.utils.hub import cached_file
8
+
9
+ from .config_utils import SeqConfig
10
+ from .sequtils import generate_kmers, lca_kmer_tokenize_segment
11
+
12
+ # Define the names of the vocabulary files
13
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
14
+
15
+ # Define the mapping for pretrained vocabulary files
16
+ PRETRAINED_VOCAB_FILES_MAP = {
17
+ "vocab_file": {
18
+ "lca-mini-k6s1": "lca-base-dna6/vocab.txt",
19
+ "lca-mini-k6s2": "lca-base-dna6/vocab.txt",
20
+ "lca-mini-k1s1": "lca-base-dna1/vocab.txt",
21
+ }
22
+ }
23
+
24
+ # Define positional embedding sizes for pretrained models
25
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
26
+ "lca-mini-k6s1": 1024,
27
+ "lca-mini-k1s1": 1024,
28
+ "lca-mini-k6s2": 2048,
29
+ }
30
+
31
+ # Define initial configuration for pretrained models
32
+ PRETRAINED_INIT_CONFIGURATION = {
33
+ "lca-mini-k6s1": {"do_upper_case": True},
34
+ "lca-mini-k1s1": {"do_upper_case": True},
35
+ "lca-mini-k6s2": {"do_upper_case": True},
36
+ }
37
+
38
+ # Utility function to load vocabulary from a file
39
+ def load_vocab(vocab_file):
40
+ """Loads a vocabulary file into a dictionary."""
41
+ vocab = collections.OrderedDict()
42
+ with open(vocab_file, "r", encoding="utf-8") as reader:
43
+ tokens = reader.readlines()
44
+ for index, token in enumerate(tokens):
45
+ vocab[token.rstrip("\n")] = index
46
+ return vocab
47
+
48
+ class LCATokenizer(PreTrainedTokenizer):
49
+ """
50
+ Custom tokenizer for LCA (Local Context Aware) tasks.
51
+ Handles specific tokenization processes, including k-mer tokenization with configurable shifts.
52
+
53
+ Attributes:
54
+ vocab_files_names (dict): Mapping of vocabulary file names.
55
+ pretrained_vocab_files_map (dict): Mapping of pretrained vocabulary files.
56
+ pretrained_init_configuration (dict): Initial configuration for pretrained models.
57
+ max_model_input_sizes (dict): Maximum input sizes for pretrained models.
58
+ """
59
+
60
+ vocab_files_names = VOCAB_FILES_NAMES
61
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
62
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
63
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
64
+
65
+ nucleotide_abc = {"A", "T", "C", "G"}
66
+ extended_nucleotide_abc = {"A", "T", "C", "G", "*"}
67
+ sequence_unk_token = 'N'
68
+
69
+ default_unk_token = "[UNK]"
70
+ default_sep_token = "[SEP]"
71
+ default_pad_token = "[PAD]"
72
+ default_cls_token = "[CLS]"
73
+ default_mask_token = "[MASK]"
74
+
75
+ def __init__(
76
+ self,
77
+ config: Optional[Dict] = None,
78
+ operation_space: str = "kmer",
79
+ **kwargs,
80
+ ):
81
+ """
82
+ Initializes the LCATokenizer with configuration and operation space.
83
+
84
+ Args:
85
+ config (dict, optional): Tokenization parameters like k-mer size and shift; defaults to the SeqConfig defaults.
86
+ operation_space (str): Defines operation mode ('kmer' or 'sequence').
87
+ kwargs: Additional arguments for PreTrainedTokenizer.
88
+ """
89
+ self.defconfig = SeqConfig()
90
+ config = self.defconfig.get_and_set_tokenization_parameters(config or {})
91
+ self.config = config
92
+ self.operation_space = operation_space
93
+
94
+ # Set default tokens
95
+ kwargs.setdefault("cls_token", self.default_cls_token)
96
+ kwargs.setdefault("unk_token", self.default_unk_token)
97
+ kwargs.setdefault("sep_token", self.default_sep_token)
98
+ kwargs.setdefault("pad_token", self.default_pad_token)
99
+ kwargs.setdefault("mask_token", self.default_mask_token)
100
+
101
+ # Load vocabulary
102
+ vocab_file = self.config["vocabfile"]
103
+ self.vocab = self.config["vocabmap"]
104
+ self.id2token = {v: k for k, v in self.vocab.items()}
105
+ self.max_len = self.config["max_segment_length"]
106
+
107
+ super().__init__(**kwargs)
108
+
109
+ # Handle extended vocabulary for sequence mode
110
+ if self.operation_space == 'sequence':
111
+ token_extension = sorted(list(set(generate_kmers(LCATokenizer.extended_nucleotide_abc, self.config['kmer'])) - \
112
+ set(generate_kmers(LCATokenizer.nucleotide_abc, self.config['kmer'])) ))
113
+ self.extended_vocab = deepcopy(self.vocab)
114
+ for token in token_extension:
115
+ self.extended_vocab[token] = 4
116
+
117
+ self.unk_token = LCATokenizer.sequence_unk_token * self.config['shift']
118
+ self.mask_token = '*'
119
+ self.extended_vocab[self.mask_token] = self.vocab['[MASK]']
120
+
121
+ full_unk = 'N' * self.config['kmer']
122
+ self.vocab[full_unk] = 1
123
+ self.id2token[1] = full_unk
124
+ self.full_unk_token = full_unk
125
+
126
+ else:
127
+ self.extended_vocab = self.vocab
128
+ self.unk_token = '[UNK]'
129
+
130
+ self.unknown_token_id = self.vocab['[UNK]']
131
+ self.sep_token = '[SEP]'
132
+ self.cls_token = '[CLS]'
133
+ self.pad_token = '[PAD]'
134
+ self.mask_token = '[MASK]'
135
+ self.special_tokens = list(self.special_tokens_map.values())
136
+
137
+
138
+
139
+ def _tokenize(self, text, **kwargs):
140
+ """
141
+ Tokenizes the input text using LCA tokenization with an optional offset.
142
+
143
+ Args:
144
+ text (str): The input DNA sequence to tokenize.
145
+ kwargs: Additional arguments, including:
146
+ - offset (int): The starting position for tokenization. Default is 0.
147
+
148
+ Returns:
149
+ List[str]: A list of tokens generated from the input text.
150
+ """
151
+ offset = kwargs.get("offset", 0)
152
+ #if offset < 0 or offset >= self.config.get("shift", 1):
153
+ # raise ValueError(f"Invalid offset: {offset}. Must be between 0 and {self.config['shift'] - 1}.")
154
+
155
+ return lca_kmer_tokenize_segment(text, offset, self.config)
156
+
157
+ def _convert_token_to_id(self, token: str) -> int:
158
+ """
159
+ Converts a token to its corresponding ID using the vocabulary.
160
+
161
+ Args:
162
+ token (str): The token to convert.
163
+
164
+ Returns:
165
+ int: Token ID, or the unknown token ID if the token is not in the vocabulary.
166
+ """
167
+ return self.extended_vocab.get(token, self.unknown_token_id)
168
+
169
+ def _convert_id_to_token(self, index: int) -> str:
170
+ """
171
+ Converts an ID to its corresponding token using the vocabulary.
172
+
173
+ Args:
174
+ index (int): The ID to convert.
175
+
176
+ Returns:
177
+ str: Corresponding token, or the unknown token if the ID is not in the vocabulary.
178
+ """
179
+
180
+
181
+ return self.id2token.get(index, self.unk_token)
182
+
183
+ def __len__(self) -> int:
184
+ """
185
+ Returns the length of the tokenizer's vocabulary.
186
+
187
+ The length is simply the number of entries in the vocabulary
188
+ mapping; no offset is applied.
189
+
190
+ :return: The size of the vocabulary.
191
+ :rtype: int
192
+ """
193
+ return len(self.vocab)
194
+
195
+
196
+
197
+ def tokenize(self, text: str, **kwargs) -> List[str]:
198
+ """
199
+ Tokenizes the input text using LCA tokenization.
200
+
201
+ Args:
202
+ text (str): The input DNA sequence to tokenize.
203
+ kwargs: Additional arguments, including:
204
+ - offset (int): The starting position for tokenization. Default is 0.
205
+
206
+ Returns:
207
+ List[str]: A list of tokens generated from the input text.
208
+ """
209
+ return self._tokenize(text, **kwargs)
210
+
211
+ def encode(self, text: str, **kwargs) -> List[int]:
212
+ """
213
+ Extends the base `encode` method to support an `offset` parameter for custom tokenization logic.
214
+
215
+ Args:
216
+ text (str): Input text (DNA sequence).
217
+ offset (int): Offset parameter for the LCA tokenization. Defaults to 0.
218
+ kwargs: Additional arguments passed to the base `encode` method.
219
+
220
+ Returns:
221
+ List[int]: Encoded token IDs.
222
+ """
223
+ # Inject the offset into kwargs for the tokenizer
224
+ kwargs.setdefault("offset", 0)
226
+ return super().encode(text, **kwargs)
227
+
228
+ def build_inputs_with_special_tokens(
229
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
230
+ ) -> List[int]:
231
+ """
232
+ Builds inputs by adding special tokens to a sequence or pair of sequences.
233
+
234
+ Args:
235
+ token_ids_0 (List[int]): List of token IDs for the first sequence.
236
+ token_ids_1 (List[int], optional): List of token IDs for the second sequence.
237
+
238
+ Returns:
239
+ List[int]: Input IDs with special tokens.
240
+ """
241
+ if token_ids_1 is None:
242
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
243
+
244
+ input_ids = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]
245
+ #token_type_ids = [0 for i in range(len(input_ids))]
246
+ return input_ids
247
+
248
+ def create_token_type_ids_from_sequences(
249
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
250
+ ) -> List[int]:
251
+ """
252
+ Create the token type IDs corresponding to the sequences passed. [What are token type
253
+ IDs?](../glossary#token-type-ids)
254
+
255
+ Should be overridden in a subclass if the model has a special way of building those.
256
+
257
+ Args:
258
+ token_ids_0 (`List[int]`): The first tokenized sequence.
259
+ token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
260
+
261
+ Returns:
262
+ `List[int]`: The token type ids.
263
+ """
264
+ if token_ids_1 is None:
265
+ return (len(token_ids_0) + 2) * [0]
266
+ return [0] * len(token_ids_0) + [1] * len(token_ids_1)
267
+
268
+ def batch_encode_plus(self, *args, **kwargs):
269
+ """
270
+ Extends the base `batch_encode_plus` method to add custom functionality if needed.
271
+
272
+ Args:
273
+ *args: Positional arguments passed to the base method.
274
+ **kwargs: Keyword arguments passed to the base method.
275
+
276
+ Returns:
277
+ dict: A dictionary containing the results of batch encoding.
278
+ """
279
+ # Call the parent method to handle the batch encoding
281
+ act_outputs = super().batch_encode_plus(*args, **kwargs)
282
+ return act_outputs
283
+
284
+
285
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
286
+ """
287
+ Saves the tokenizer's vocabulary to a file.
288
+
289
+ Args:
290
+ save_directory (str): Directory to save the vocabulary file.
291
+ filename_prefix (str, optional): Prefix for the filename. Default is None.
292
+
293
+ Returns:
294
+ Tuple[str]: Path to the saved vocabulary file.
295
+ """
296
+ if filename_prefix is None:
297
+ filename_prefix = ""
298
+ vocab_file_path = os.path.join(save_directory, filename_prefix + "vocab.txt")
299
+ with open(vocab_file_path, "w") as f:
300
+ for token in self.vocab:
301
+ f.write(token + "\n")
302
+ return (vocab_file_path,)
303
+
304
+ def save_pretrained(self, save_directory: str, **kwargs):
305
+ """
306
+ Saves the tokenizer configuration and vocabulary to a directory.
307
+
308
+ Args:
309
+ save_directory (str): Directory to save the tokenizer files.
310
+ """
311
+ if not os.path.exists(save_directory):
312
+ os.makedirs(save_directory)
313
+ super().save_pretrained(save_directory, **kwargs)
314
+
315
+ tokenizer_config_path = os.path.join(save_directory, "tokenizer_config.json")
316
+ if os.path.exists(tokenizer_config_path):
317
+ with open(tokenizer_config_path, "r") as f:
318
+ tokenizer_config = json.load(f)
319
+ else:
320
+ tokenizer_config = {}
321
+
322
+ tokenizer_config.update({
323
+ "kmer": self.config.get("kmer", 6),
324
+ "shift": self.config.get("shift", 1),
325
+ })
326
+
327
+ with open(tokenizer_config_path, "w") as f:
328
+ json.dump(tokenizer_config, f, indent=2)
329
+
330
+ @classmethod
331
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
332
+ """
333
+ Loads a tokenizer from the pretrained model directory or Hugging Face Hub.
334
+
335
+ Args:
336
+ pretrained_model_name_or_path (str): Path or model name on Hugging Face Hub.
337
+ kwargs: Additional arguments for initialization.
338
+
339
+ Returns:
340
+ LCATokenizer: The loaded tokenizer instance.
341
+ """
342
345
+ resolved_tokenizer_config_file = cached_file(
346
+ pretrained_model_name_or_path, filename="tokenizer_config.json"
347
+ )
348
+
349
+ with open(resolved_tokenizer_config_file, "r") as f:
350
+ tokenizer_config = json.load(f)
351
+
352
+ kmer = tokenizer_config.pop("kmer", 6)
353
+ shift = tokenizer_config.pop("shift", 1)
354
+ base_tokenization_config = {'kmer': kmer, 'shift': shift}
355
+ defconfig = SeqConfig()
356
+ config = defconfig.get_and_set_tokenization_parameters(base_tokenization_config)
357
+
358
+ tokenizer = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
359
+ tokenizer.config = config
360
+
361
+ return tokenizer
362
+
363
+
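+
+ # Illustrative usage sketch (added for documentation; assumes a SeqConfig that supplies
+ # a default vocabulary for the requested k-mer size):
+ # >>> tok = LCATokenizer(config={'kmer': 6, 'shift': 1})
+ # >>> tok.tokenize('TCTTTGCTAAG')
+ # ['TCTTTG', 'CTTTGC', 'TTTGCT', 'TTGCTA', 'TGCTAA', 'GCTAAG']
+ # >>> ids = tok.encode('TCTTTGCTAAG') # [CLS] + one id per 6-mer + [SEP]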
tokenizer_config.json CHANGED
@@ -1,4 +1,10 @@
1
  {
2
  "clean_up_tokenization_spaces": true,
3
  "cls_token": "[CLS]",
4
  "mask_token": "[MASK]",
 
1
  {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenizer.LCATokenizer",
5
+ null
6
+ ]
7
+ },
8
  "clean_up_tokenization_spaces": true,
9
  "cls_token": "[CLS]",
10
  "mask_token": "[MASK]",