| import re |
| from typing import List |
|
|
| from .operators import FieldOperator |
|
|
|
|
class Split(FieldOperator):
    """Break a string value into a list of pieces around a literal separator."""

    # Literal separator handed to ``str.split``.
    by: str

    def process_value(self, value: str) -> List[str]:
        """Return the pieces of ``value`` found between occurrences of ``self.by``.

        Adjacent separators produce empty strings in the result, and an
        empty separator makes ``str.split`` raise ``ValueError``.
        """
        separator = self.by
        pieces = value.split(separator)
        return pieces
|
|
|
|
class RegexSplit(FieldOperator):
    """Break a string value into a list of pieces around regex matches."""

    # Regular-expression pattern whose matches act as delimiters.
    by: str

    def process_value(self, value: str) -> List[str]:
        """Return ``value`` split at every match of the pattern ``self.by``.

        As with ``re.split``, text matched by capturing groups in the
        pattern is included in the returned list.
        """
        delimiter = re.compile(self.by)
        return delimiter.split(value)
|
|
|
|
class TokensSplit(FieldOperator):
    """Tokenize a string value with a ``transformers`` tokenizer."""

    # Identifier passed to ``AutoTokenizer.from_pretrained``.
    model: str
    # Packages this operator needs; presumably checked by the base class
    # before use -- confirm against FieldOperator.
    _requirements_list = ["transformers"]

    def prepare(self):
        """Run the base preparation, then load the tokenizer for ``self.model``."""
        super().prepare()
        # Imported here rather than at module level so that importing this
        # module does not itself require ``transformers``.
        from transformers import AutoTokenizer

        loaded_tokenizer = AutoTokenizer.from_pretrained(self.model)
        self.tokenizer = loaded_tokenizer

    def process_value(self, value: str) -> List[str]:
        """Return the tokens the loaded tokenizer produces for ``value``."""
        tokens = self.tokenizer.tokenize(value)
        return tokens
|
|
|
|
class Join(FieldOperator):
    """Concatenate a list of strings into one string with a separator."""

    # Text placed between consecutive items.
    by: str

    def process_value(self, value: List[str]) -> str:
        """Return the items of ``value`` glued together with ``self.by``."""
        glue = self.by
        joined = glue.join(value)
        return joined
|
|
|
|
class Strip(FieldOperator):
    """Remove leading and trailing whitespace from a string value."""

    def process_value(self, value: str) -> str:
        """Return ``value`` with surrounding whitespace removed."""
        trimmed = value.strip()
        return trimmed
|
|
|
|
class Replace(FieldOperator):
    """Substitute every occurrence of one substring with another."""

    # Substring to search for.
    old: str
    # Text inserted in place of each occurrence of ``old``.
    new: str

    def process_value(self, value: str) -> str:
        """Return ``value`` with all occurrences of ``self.old`` replaced by ``self.new``."""
        target, substitute = self.old, self.new
        return value.replace(target, substitute)
|
|