Fix: input_ids sequence length problem
Browse files
__pycache__/helper_functions.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/helper_functions.cpython-310.pyc and b/__pycache__/helper_functions.cpython-310.pyc differ
|
|
|
helper_functions.py
CHANGED
|
@@ -112,7 +112,7 @@ def transform_single_text(
|
|
| 112 |
tokens = tokenize_whole_text(text, tokenizer)
|
| 113 |
input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
|
| 114 |
add_special_tokens_at_beginning_and_end(input_id_chunks, mask_chunks)
|
| 115 |
-
|
| 116 |
input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)
|
| 117 |
return input_ids, attention_mask
|
| 118 |
|
|
@@ -158,19 +158,16 @@ def add_special_tokens_at_beginning_and_end(input_id_chunks: list[Tensor], mask_
|
|
| 158 |
mask_chunks[i] = torch.cat([Tensor([1]), mask_chunks[i], Tensor([1])])
|
| 159 |
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
"""Adds padding tokens at the end to make sure that all chunks have exactly chunk_size tokens."""
|
| 164 |
-
pad_token_id = 0 # Assuming this is defined somewhere in your code
|
| 165 |
for i in range(len(input_id_chunks)):
|
| 166 |
# get required padding length
|
| 167 |
-
pad_len = chunk_size +2 - input_id_chunks[i].shape[0]
|
| 168 |
# check if tensor length satisfies required chunk size
|
| 169 |
if pad_len > 0:
|
| 170 |
# if padding length is more than 0, we must add padding
|
| 171 |
-
input_id_chunks[i] = torch.cat([input_id_chunks[i],
|
| 172 |
-
mask_chunks[i] = torch.cat([mask_chunks[i],
|
| 173 |
-
|
| 174 |
|
| 175 |
|
| 176 |
def stack_tokens_from_all_chunks(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
|
|
@@ -191,6 +188,13 @@ def split_overlapping(tensor: Tensor, chunk_size: int, stride: int, minimal_chun
|
|
| 191 |
|
| 192 |
## Voice part
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
def transform_for_inference_text(text: str,
|
| 195 |
tokenizer: PreTrainedTokenizerBase,
|
| 196 |
chunk_size: int,
|
|
@@ -204,7 +208,7 @@ def transform_for_inference_text(text: str,
|
|
| 204 |
input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
|
| 205 |
add_special_tokens_at_beginning_and_end_inference(input_id_chunks, mask_chunks)
|
| 206 |
add_padding_tokens_inference(input_id_chunks, mask_chunks, chunk_size)
|
| 207 |
-
input_ids, attention_mask =
|
| 208 |
return {"input_ids": input_ids, "attention_mask": attention_mask}
|
| 209 |
|
| 210 |
def add_special_tokens_at_beginning_and_end_inference(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> None:
|
|
|
|
| 112 |
tokens = tokenize_whole_text(text, tokenizer)
|
| 113 |
input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
|
| 114 |
add_special_tokens_at_beginning_and_end(input_id_chunks, mask_chunks)
|
| 115 |
+
add_padding_tokens(input_id_chunks, mask_chunks , chunk_size)
|
| 116 |
input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)
|
| 117 |
return input_ids, attention_mask
|
| 118 |
|
|
|
|
| 158 |
mask_chunks[i] = torch.cat([Tensor([1]), mask_chunks[i], Tensor([1])])
|
| 159 |
|
| 160 |
|
| 161 |
+
def add_padding_tokens(input_id_chunks: list[Tensor], mask_chunks: list[Tensor] , chunk_size) -> None:
|
| 162 |
+
"""Adds padding tokens (token id = 0) at the end to make sure that all chunks have exactly 512 tokens."""
|
|
|
|
|
|
|
| 163 |
for i in range(len(input_id_chunks)):
|
| 164 |
# get required padding length
|
| 165 |
+
pad_len = chunk_size + 2 - input_id_chunks[i].shape[0]
|
| 166 |
# check if tensor length satisfies required chunk size
|
| 167 |
if pad_len > 0:
|
| 168 |
# if padding length is more than 0, we must add padding
|
| 169 |
+
input_id_chunks[i] = torch.cat([input_id_chunks[i], Tensor([0] * pad_len)])
|
| 170 |
+
mask_chunks[i] = torch.cat([mask_chunks[i], Tensor([0] * pad_len)])
|
|
|
|
| 171 |
|
| 172 |
|
| 173 |
def stack_tokens_from_all_chunks(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
|
|
|
|
| 188 |
|
| 189 |
## Voice part
|
| 190 |
|
| 191 |
+
def stack_tokens_from_all_chunks_for_inference(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
|
| 192 |
+
"""Reshapes data to a form compatible with BERT model input."""
|
| 193 |
+
input_ids = torch.stack(input_id_chunks)
|
| 194 |
+
attention_mask = torch.stack(mask_chunks)
|
| 195 |
+
|
| 196 |
+
return input_ids.long(), attention_mask.int()
|
| 197 |
+
|
| 198 |
def transform_for_inference_text(text: str,
|
| 199 |
tokenizer: PreTrainedTokenizerBase,
|
| 200 |
chunk_size: int,
|
|
|
|
| 208 |
input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
|
| 209 |
add_special_tokens_at_beginning_and_end_inference(input_id_chunks, mask_chunks)
|
| 210 |
add_padding_tokens_inference(input_id_chunks, mask_chunks, chunk_size)
|
| 211 |
+
input_ids, attention_mask = stack_tokens_from_all_chunks_for_inference(input_id_chunks, mask_chunks)
|
| 212 |
return {"input_ids": input_ids, "attention_mask": attention_mask}
|
| 213 |
|
| 214 |
def add_special_tokens_at_beginning_and_end_inference(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> None:
|