| | from langchain.text_splitter import CharacterTextSplitter |
| | import re |
| | from typing import List |
| |
|
| |
|
class AliTextSplitter(CharacterTextSplitter):
    """Text splitter that delegates segmentation to a ModelScope pipeline.

    The text is passed to the ``document-segmentation`` pipeline loaded with
    the ``damo/nlp_bert_document-segmentation_chinese-base`` model on CPU,
    and the ``"text"`` field of the pipeline result is split into chunks.
    """

    def __init__(self, pdf: bool = False, **kwargs):
        """Create the splitter.

        Args:
            pdf: When true, whitespace in the input is normalized before
                segmentation (see ``split_text``).
            **kwargs: Forwarded unchanged to ``CharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self.pdf = pdf
        # Built lazily on first use so constructing a splitter stays cheap.
        self._pipeline = None

    def _get_pipeline(self):
        """Return the segmentation pipeline, building it on first use."""
        # getattr tolerates instances created without running __init__.
        cached = getattr(self, "_pipeline", None)
        if cached is not None:
            return cached

        # Imported here so modelscope is only required when splitting.
        try:
            from modelscope.pipelines import pipeline
        except ImportError as err:
            raise ImportError(
                "AliTextSplitter requires the `modelscope` package; "
                "install it with `pip install modelscope`."
            ) from err

        self._pipeline = pipeline(
            task="document-segmentation",
            model="damo/nlp_bert_document-segmentation_chinese-base",
            device="cpu",
        )
        return self._pipeline

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into segments using the segmentation model.

        Args:
            text: The document text to segment.

        Returns:
            The non-empty pieces obtained by splitting the pipeline's
            ``"text"`` output on ``"\\n\\t"``; an empty list for empty input.

        Raises:
            ImportError: If ``modelscope`` is not installed.
        """
        if not text:
            # Nothing to segment; avoid loading the model for empty input.
            return []

        if self.pdf:
            # Collapse runs of three or more newlines to one first, so each
            # such run becomes a single space below rather than several.
            text = re.sub(r"\n{3,}", r"\n", text)
            # Replace every whitespace character (newlines included) with a
            # space. No newline survives this step, so no further blank-line
            # removal is needed afterwards.
            text = re.sub(r"\s", " ", text)

        result = self._get_pipeline()(documents=text)
        # NOTE(review): presumably the model marks segment boundaries with
        # "\n\t" in its output text - confirm against the model card.
        return [piece for piece in result["text"].split("\n\t") if piece]
| |
|