Update README.md
README.md
The **qqp_kz** model is a paraphrasing tool tailored for the Kazakh language.
### Data Preprocessing

The dataset used for training the qqp_kz model undergoes rigorous preprocessing to ensure compatibility and optimal performance:

```python
# Import the necessary classes from the transformers library.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize the tokenizer for this model. The tokenizer converts input text
# into a format that the model understands.
tokenizer = AutoTokenizer.from_pretrained("CCRss/tokenizer_t5_kz")

# Preprocess one example: tokenize the source and target texts and pad or
# truncate both to a fixed length for consistent model input.
def preprocess_data(example):
    # Extract the source and target texts from the example.
    source = example["src"]
    target = example["trg"]

    # Tokenize the source text with padding and truncation to a fixed length.
    source_inputs = tokenizer(source, padding="max_length", truncation=True, max_length=128)

    # Tokenize the target text with padding and truncation to a fixed length.
    target_inputs = tokenizer(target, padding="max_length", truncation=True, max_length=128)

    # Use the source encoding as the model input and the target token ids as the
    # labels; assigning the keys explicitly keeps the target encoding from
    # overwriting the source's input_ids and attention_mask.
    return {
        "input_ids": source_inputs["input_ids"],
        "attention_mask": source_inputs["attention_mask"],
        "labels": target_inputs["input_ids"],
    }

# Apply the preprocessing function to the dataset, transforming all text data
# into a tokenized format suitable for the Seq2Seq model.
encoded_dataset = dataset.map(preprocess_data)
# Set the dataset format to PyTorch tensors for compatibility with the training framework.
encoded_dataset.set_format("torch")
```
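The snippet above maps `preprocess_data` over a `dataset` object that the excerpt never defines. A minimal sketch of how it might be loaded with the Hugging Face `datasets` library, assuming a corpus with `src`/`trg` columns and a `valid` split; the dataset identifier below is a placeholder, not the actual name:

```python
from datasets import load_dataset

# Placeholder identifier: substitute the real Kazakh paraphrase corpus.
# The "src"/"trg" columns and "valid" split are assumed from the code above.
dataset = load_dataset("your-org/kazakh-paraphrase-corpus")
```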
### Training

The model is trained with the following configuration:

```python
# Import the training classes from the transformers library.
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Name of the pretrained model to fine-tune for Seq2Seq paraphrasing.
name_of_model = "humarin/chatgpt_paraphraser_on_T5_base"
# Load the model from the pretrained weights.
model = AutoModelForSeq2SeqLM.from_pretrained(name_of_model)

# Set up the training arguments: batch size, gradient accumulation, learning rate,
# number of epochs, directories for results and logs, and the evaluation strategy.
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=21,
    gradient_accumulation_steps=3,
    # ... remaining arguments elided in this excerpt ...
    evaluation_strategy="steps"
)

# Initialize the trainer with the model, training arguments, and the datasets
# for training and evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # ... remaining arguments elided in this excerpt ...
    eval_dataset=encoded_dataset['valid']
)

# Start the training run.
trainer.train()
```
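Once training finishes, the fine-tuned weights live only in memory and in the Trainer's checkpoint directory. A short sketch of persisting them explicitly; the output path is illustrative:

```python
# Save the fine-tuned model and its tokenizer to a local directory (illustrative path).
trainer.save_model("./qqp_kz_finetuned")
tokenizer.save_pretrained("./qqp_kz_finetuned")
```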
### Usage
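A minimal inference sketch, assuming the published checkpoint id is `CCRss/qqp_kz` (an assumption based on the model name; the generation parameters are illustrative as well):

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint id assumed from the model name; adjust if the hub id differs.
tokenizer = AutoTokenizer.from_pretrained("CCRss/qqp_kz")
model = AutoModelForSeq2SeqLM.from_pretrained("CCRss/qqp_kz")

text = "Сіз қалайсыз?"  # Kazakh for "How are you?"
inputs = tokenizer(text, return_tensors="pt")

# Beam search with several returned candidates; settings are illustrative.
outputs = model.generate(**inputs, max_length=128, num_beams=5, num_return_sequences=3)
for ids in outputs:
    print(tokenizer.decode(ids, skip_special_tokens=True))
```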