Update my_model/fine_tuner/fine_tuning_data_handler.py
my_model/fine_tuner/fine_tuning_data_handler.py
CHANGED
@@ -1,11 +1,10 @@
+from typing import Tuple, List
 from my_model.utilities.gen_utilities import is_pycharm
 import seaborn as sns
 from transformers import AutoTokenizer
 from datasets import Dataset, load_dataset
 import my_model.config.fine_tuning_config as config
 from my_model.LLAMA2.LLAMA2_model import Llama2ModelManager
-from typing import Tuple
-
 
 
 class FinetuningDataHandler:
@@ -13,7 +12,7 @@ class FinetuningDataHandler:
     A class dedicated to handling data for fine-tuning language models. It manages loading,
     inspecting, preparing, and splitting the dataset, specifically designed to filter out
     data samples exceeding a specified token count limit. This is crucial for models with
-    token count constraints and it helps control the level of GPU RAM
+    token count constraints, and it helps control GPU RAM usage based on the number of tokens,
     ensuring efficient and effective model fine-tuning.
 
     Attributes:
@@ -22,14 +21,15 @@ class FinetuningDataHandler:
         max_token_count (int): Maximum allowable token count per data sample.
 
     Methods:
-        load_llm_tokenizer
-        load_dataset
-        plot_tokens_count_distribution
-        filter_dataset_by_indices
-        get_token_counts
-        prepare_dataset
-
-
+        load_llm_tokenizer: Loads the LLM tokenizer and adds special tokens, if not already loaded.
+        load_dataset: Loads the dataset from a specified file path.
+        plot_tokens_count_distribution: Plots the distribution of token counts in the dataset.
+        filter_dataset_by_indices: Filters the dataset based on valid indices, removing samples exceeding token limits.
+        get_token_counts: Calculates token counts for each sample in the dataset.
+        prepare_dataset: Tokenizes and filters the dataset, preparing it for training. Also visualizes token count
+            distribution before and after filtering.
+        split_dataset_for_train_eval: Divides the dataset into training and evaluation sets.
+        inspect_prepare_split_data: Coordinates the data preparation and splitting process for fine-tuning.
     """
 
     def __init__(self, tokenizer: AutoTokenizer = None, dataset_file: str = config.DATASET_FILE) -> None:
@@ -37,17 +37,21 @@ class FinetuningDataHandler:
         Initializes the FinetuningDataHandler class.
 
         Args:
-            tokenizer (AutoTokenizer): Tokenizer to use for tokenizing the dataset.
-            dataset_file (str): Path to the dataset file.
+            tokenizer (AutoTokenizer, optional): Tokenizer to use for tokenizing the dataset. Defaults to None.
+            dataset_file (str): Path to the dataset file. Defaults to config.DATASET_FILE.
         """
+
         self.tokenizer = tokenizer  # The tokenizer used for processing the dataset.
         self.dataset_file = dataset_file  # Path to the fine-tuning dataset file.
-        self.max_token_count = config.MAX_TOKEN_COUNT  # Max token count for filtering.
+        self.max_token_count = config.MAX_TOKEN_COUNT  # Max token count for filtering, set to 1,024.
 
-    def load_llm_tokenizer(self):
+    def load_llm_tokenizer(self) -> None:
         """
         Loads the LLM tokenizer and adds special tokens, if not already loaded.
         If the tokenizer is already loaded, this method does nothing.
+
+        Returns:
+            None
         """
 
         if self.tokenizer is None:
@@ -63,21 +67,26 @@
         Returns:
             Dataset: The loaded dataset, ready for processing.
         """
+
         return load_dataset('csv', data_files=self.dataset_file)
 
-    def plot_tokens_count_distribution(self, token_counts:
+    def plot_tokens_count_distribution(self, token_counts: List[int], title: str = "Token Count Distribution") -> None:
         """
         Plots the distribution of token counts in the dataset for visualization purposes.
 
         Args:
-            token_counts (
+            token_counts (List[int]): List of token counts, each count representing the number of tokens in a dataset
+                sample.
             title (str): Title for the plot, highlighting the nature of the distribution.
+
+        Returns:
+            None
         """
 
         if is_pycharm():  # Ensuring compatibility with PyCharm's environment for interactive plots.
-            import matplotlib
+            import matplotlib  # The import is kept here intentionally.
             matplotlib.use('TkAgg')  # Set the backend to 'TkAgg'
-            import matplotlib.pyplot as plt
+            import matplotlib.pyplot as plt  # The import is kept here intentionally.
         sns.set_style("whitegrid")
         plt.figure(figsize=(15, 6))
         plt.hist(token_counts, bins=50, color='#3498db', edgecolor='black')
@@ -89,21 +98,21 @@
         plt.tight_layout()
         plt.show()
 
-    def filter_dataset_by_indices(self, dataset: Dataset, valid_indices:
+    def filter_dataset_by_indices(self, dataset: Dataset, valid_indices: List[int]) -> Dataset:
         """
         Filters the dataset based on a list of valid indices. This method is used to exclude
         data samples that have a token count exceeding the specified maximum token count.
 
         Args:
             dataset (Dataset): The dataset to be filtered.
-            valid_indices (
+            valid_indices (List[int]): Indices of samples with token counts within the limit.
 
         Returns:
             Dataset: Filtered dataset containing only samples with valid indices.
         """
         return dataset['train'].select(valid_indices)  # Select only samples with valid indices based on token count.
 
-    def get_token_counts(self, dataset):
+    def get_token_counts(self, dataset: Dataset) -> List[int]:
         """
         Calculates and returns the token counts for each sample in the dataset.
         This function assumes the dataset has a 'train' split and a 'text' field.
@@ -131,6 +140,7 @@
         Returns:
             Tuple[Dataset, Dataset]: The training and evaluation datasets, post-filtering.
         """
+
         dataset = self.load_dataset()
         self.load_llm_tokenizer()
 
@@ -148,7 +158,7 @@
 
         return self.split_dataset_for_train_eval(filtered_dataset)  # Split the dataset into training and evaluation.
 
-    def split_dataset_for_train_eval(self, dataset) -> Tuple[Dataset, Dataset]:
+    def split_dataset_for_train_eval(self, dataset: Dataset) -> Tuple[Dataset, Dataset]:
         """
         Splits the dataset into training and evaluation datasets.
 
@@ -156,27 +166,29 @@
             dataset (Dataset): The dataset to split.
 
         Returns:
-
+            Tuple[Dataset, Dataset]: The split training and evaluation datasets.
         """
+
         split_data = dataset.train_test_split(test_size=config.TEST_SIZE, shuffle=True, seed=config.SEED)
         train_data, eval_data = split_data['train'], split_data['test']
         return train_data, eval_data
 
-    def inspect_prepare_split_data(self) ->
+    def inspect_prepare_split_data(self) -> Tuple[Dataset, Dataset]:
         """
         Orchestrates the process of inspecting, preparing, and splitting the dataset for fine-tuning.
 
         Returns:
-
+            Tuple[Dataset, Dataset]: The prepared training and evaluation datasets.
         """
+
         return self.prepare_dataset()
 
 
 # Example usage
 if __name__ == "__main__":
-
-    #
-    #data_handler = FinetuningDataHandler()
-    #fine_tuning_data_train, fine_tuning_data_eval = data_handler.inspect_prepare_split_data()
-    #print(fine_tuning_data_train, fine_tuning_data_eval)
+
+    # Please uncomment the lines below to test the data prep.
+    # data_handler = FinetuningDataHandler()
+    # fine_tuning_data_train, fine_tuning_data_eval = data_handler.inspect_prepare_split_data()
+    # print(fine_tuning_data_train, fine_tuning_data_eval)
     pass
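
Note: the bodies of get_token_counts and prepare_dataset fall outside the hunks shown above. As a hedged illustration only, a token-count filter consistent with the docstrings (a 'train' split, a 'text' field, and a MAX_TOKEN_COUNT limit) could look like the sketch below; the names filter_long_samples and the 1,024 limit are assumptions, and the repo's actual implementation may differ.

from typing import List

from datasets import DatasetDict
from transformers import AutoTokenizer

MAX_TOKEN_COUNT = 1024  # Assumed limit, per the "set to 1,024" comment in __init__.

def get_token_counts(dataset: DatasetDict, tokenizer: AutoTokenizer) -> List[int]:
    # Tokenize each 'text' sample in the 'train' split and record its length.
    return [len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["train"]]

def filter_long_samples(dataset: DatasetDict, token_counts: List[int]):
    # Keep only indices within the limit, mirroring filter_dataset_by_indices above.
    valid_indices = [i for i, n in enumerate(token_counts) if n <= MAX_TOKEN_COUNT]
    return dataset["train"].select(valid_indices)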
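For reference, split_dataset_for_train_eval relies on datasets.Dataset.train_test_split. A minimal standalone sketch of that call follows; the values 0.1 and 42 stand in for config.TEST_SIZE and config.SEED, which are not shown in this diff.

from datasets import Dataset

# Toy dataset standing in for the loaded fine-tuning data.
data = Dataset.from_dict({"text": [f"sample {i}" for i in range(100)]})

# Shuffled, seeded split, as in split_dataset_for_train_eval.
split = data.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data, eval_data = split["train"], split["test"]
print(len(train_data), len(eval_data))  # 90 10

Seeding the shuffle keeps the train/eval partition reproducible across runs, which matters when comparing fine-tuning experiments.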