| import re | |
# Matches a custom sequence span such as "[START_DNA]ACGT[END_DNA]" so that the
# characters between the two tags can be split into individual tokens.
# Groups: 1 = start tag, 2 = sequence type, 3 = sequence body (non-greedy),
# 4 = end tag. The "\2" backreference forces the end tag to name the same type
# as the start tag. re.DOTALL is not set, so a span whose body contains a
# newline does not match.
CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")
# Marker token used to implement the custom sequence tokenization. It is added
# at the corpus cleaning step and removed in pretokenization. Digits are mixed
# in to lower the chance that the marker occurs in the corpus, and they are
# written as f-string interpolations so that the final token never appears
# literally in this source file, in case the file ever ends up in the training
# data.
SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"
def _insert_split_marker(m: re.Match) -> str:
    """
    Insert the split marker around every character of a matched sequence.

    Intended as the replacement callback for a regex whose match has exactly
    four groups: start token, sequence type, sequence body and end token
    (for example ``[START_DNA]``, ``DNA``, ``ACGT``, ``[END_DNA]``).

    Parameters
    ----------
    m : re.Match
        Match object with the four groups described above.

    Returns
    -------
    str
        The start token, then the sequence body with ``SPLIT_MARKER`` placed
        before each character (newlines included), then one more
        ``SPLIT_MARKER`` and the end token.
    """
    # Group 2 (the sequence type) is already embedded in both tags.
    start_token, _, sequence, end_token = m.groups()
    # A plain join prefixes every character with the marker and, unlike a
    # regex replacement template, never interprets the marker's characters.
    sequence = "".join(f"{SPLIT_MARKER}{char}" for char in sequence)
    return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"
def escape_custom_split_sequence(text: str) -> str:
    """
    Applies custom splitting to the text for GALILEO's tokenization.

    Every span matched by ``CUSTOM_SEQ_RE`` (a ``[START_X]...[END_X]`` pair)
    is rewritten by ``_insert_split_marker``; text outside such spans is
    returned unchanged.

    Parameters
    ----------
    text : str
        Input text to split.

    Returns
    -------
    str
        The text with the split token added inside each matched span.
    """
    return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)