| | |
| |
|
| | """ Tasks data utility.""" |
| |
|
| | import re |
| | import numpy as np |
| |
|
| |
|
def clean_text(text):
    """Flatten text to one line, squeeze whitespace and attach stray dots.

    Newlines become spaces, every run of whitespace is collapsed to a single
    space, and a dot surrounded by spaces (" . ") is rewritten as ". ".
    """
    flattened = text.replace("\n", " ")
    flattened = re.sub(r'\s+', ' ', flattened)

    # A single replace pass does not rescan its own output, so chained
    # patterns such as " . . " need repeated passes; the count is fixed
    # at three.
    remaining_passes = 3
    while remaining_passes:
        flattened = flattened.replace(' . ', '. ')
        remaining_passes -= 1

    return flattened
| |
|
| |
|
def build_sample(ids, types, paddings, label, unique_id):
    """Pack one example into the dictionary consumed by the batch producer.

    The three sequences are copied into int64 numpy arrays; the label and
    the unique id are coerced to plain Python ints.
    """
    # Convert the sequences in argument order; each gets its own array.
    text, token_types, padding_mask = (
        np.array(sequence, dtype=np.int64)
        for sequence in (ids, types, paddings)
    )

    return {
        'text': text,
        'types': token_types,
        'padding_mask': padding_mask,
        'label': int(label),
        'uid': int(unique_id),
    }
| |
|
| |
|
def build_tokens_types_paddings_from_text(text_a, text_b,
                                          tokenizer, max_seq_length):
    """Tokenize one or two texts and build ids, types and paddings.

    `text_b` may be None for single-segment input. The trimming and padding
    are delegated to `build_tokens_types_paddings_from_ids`, using the
    tokenizer's `cls`, `sep` and `pad` ids.
    """
    first_ids = tokenizer.tokenize(text_a)
    # The second segment is optional; None is passed through untouched.
    second_ids = tokenizer.tokenize(text_b) if text_b is not None else None

    return build_tokens_types_paddings_from_ids(
        first_ids, second_ids, max_seq_length,
        tokenizer.cls, tokenizer.sep, tokenizer.pad)
| |
|
| |
|
def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
                                         cls_id, sep_id, pad_id):
    """Build token ids, token types and a padding mask for one example.

    The sequence is laid out as ``[CLS] A [SEP]`` for a single segment and
    ``[CLS] A [SEP] B [SEP]`` for a pair, then trimmed and padded so the
    result normally has exactly ``max_seq_length`` entries.

    Args:
        text_a_ids: token ids of the first segment.
        text_b_ids: token ids of the second segment, or None.
        max_seq_length: target length of each returned list.
        cls_id: id placed at the start of the sequence.
        sep_id: id placed after each segment.
        pad_id: id used to fill the token ids up to ``max_seq_length``.

    Returns:
        A tuple ``(ids, types, paddings)`` of lists. ``types`` holds 0 for
        the first segment (including [CLS] and its [SEP]) and 1 for the
        second segment; ``paddings`` holds 1 for real tokens and 0 for
        padded positions.
    """
    has_text_b = text_b_ids is not None

    # [CLS] A [SEP], all of token type 0.
    len_text_a = len(text_a_ids)
    ids = [cls_id]
    ids.extend(text_a_ids)
    ids.append(sep_id)
    types = [0] * (len_text_a + 2)
    paddings = [1] * (len_text_a + 2)

    # Optional second segment, token type 1. Its closing [SEP] is added
    # after trimming so that it is never cut off.
    if has_text_b:
        len_text_b = len(text_b_ids)
        ids.extend(text_b_ids)
        types.extend([1] * len_text_b)
        paddings.extend([1] * len_text_b)

    # Trim to leave room for exactly one closing [SEP].
    trimmed = len(ids) >= max_seq_length
    if trimmed:
        keep = max_seq_length - 1
        ids = ids[:keep]
        types = types[:keep]
        paddings = paddings[:keep]

    # Closing [SEP]: always for a pair, and for a single segment only when
    # trimming removed (or may have removed) the one appended above.
    if has_text_b or trimmed:
        ids.append(sep_id)
        types.append(1 if has_text_b else 0)
        paddings.append(1)

    # Pad up to max_seq_length.
    padding_length = max_seq_length - len(ids)
    if padding_length > 0:
        ids.extend([pad_id] * padding_length)
        # Token types are segment indices (0 or 1), not vocabulary ids, so
        # padded positions get type 0 rather than pad_id; a pad_id outside
        # the type vocabulary would otherwise yield invalid type indices.
        types.extend([0] * padding_length)
        paddings.extend([0] * padding_length)

    return ids, types, paddings
| |
|