from typing import List, Optional, Union
import re
import torch
from PIL import Image
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_utils import (
ImageInput,
make_list_of_images,
valid_images,
to_numpy_array,
)
from transformers.utils import TensorType
# Ordered (regex pattern, replacement) pairs. clean_special_tokens feeds each
# pair to re.sub in sequence, so the position of an entry in this list matters.
rules = [
    # The more specific `<|sn|>` forms are listed before the bare marker so
    # that they are matched first.
    (r'-<\|sn\|>', ''),    # hyphen + marker: both are dropped
    (r' <\|sn\|>', ' '),   # space + marker: reduced to the single space
    (r'<\|sn\|>', ' '),    # any remaining marker: becomes a space
    # Unknown-token placeholder is removed outright.
    (r'<\|unk\|>', ''),
    # NOTE(review): the next two patterns are empty strings; substituting an
    # empty match with an empty replacement leaves the text unchanged.
    # Confirm whether a token literal was lost from these entries.
    (r'', ''),
    (r'', ''),
    # `\uffff` is interpreted by the regex engine as the code point U+FFFF.
    (r'\uffff', ''),
    # Runs of four or more underscores / dots are shortened to exactly three.
    (r'_{4,}', '___'),
    (r'\.{4,}', '...'),
]
def clean_special_tokens(text):
text = text.replace(' ', '').replace('Ġ', ' ').replace('Ċ', '\n').replace(
'<|bos|>', '').replace('<|eos|>', '').replace('<|pad|>', '')
for rule in rules:
text = re.sub(rule[0], rule[1], text)
text = text.replace('