| """Split text to sentences. |
| |
| Use sentence_splitter if supported, |
| else use polyglot.text.Text |
| |
| !apt install libicu-dev |
| !pip install pyicu pycld2 Morfessor |
| !pip install polyglot sentence_splitter |
| """ |
| from typing import List, Optional |
|
|
| from tqdm.auto import tqdm |
| from polyglot.detect.base import logger as polyglot_logger |
| from polyglot.text import Detector, Text |
| from sentence_splitter import split_text_into_sentences |
|
|
| from logzero import logger |
|
|
| |
# Raise the threshold of polyglot's detector logger to ERROR so that
# lower-severity messages from it are not emitted.
polyglot_logger.setLevel("ERROR")

# Language codes for which seg_text uses sentence_splitter;
# any other language is handed to polyglot.text.Text instead.
LANG_S = [
    "ca",  # Catalan
    "cs",  # Czech
    "da",  # Danish
    "nl",  # Dutch
    "en",  # English
    "fi",  # Finnish
    "fr",  # French
    "de",  # German
    "el",  # Greek
    "hu",  # Hungarian
    "is",  # Icelandic
    "it",  # Italian
    "lv",  # Latvian
    "lt",  # Lithuanian
    "no",  # Norwegian
    "pl",  # Polish
    "pt",  # Portuguese
    "ro",  # Romanian
    "ru",  # Russian
    "sk",  # Slovak
    "sl",  # Slovenian
    "es",  # Spanish
    "sv",  # Swedish
    "tr",  # Turkish
]
|
|
|
|
def seg_text(
    text: str,
    lang: Optional[str] = None,
    qmode: bool = False,
    maxlines: int = 1000
) -> List[str]:
    """Split text into a list of sentences.

    When ``lang`` is one of the codes in ``LANG_S`` (and ``qmode`` is off),
    the text is broken into lines, blank lines are dropped, and each
    remaining line is split with
    ``sentence_splitter.split_text_into_sentences``. Otherwise the whole
    text is handed to ``polyglot.text.Text`` and its ``sentences`` are used.

    Args:
        text: the text to split.
        lang: language code. If None, it is detected with
            ``polyglot.text.Detector``; if detection raises, a warning is
            logged and "en" is used.
        qmode: if True, skip ``split_text_into_sentences`` and always use
            polyglot. Default False. Vectors for all books are based on
            qmode=False; qmode=True is for quick test purposes only.
        maxlines: threshold for turning on the tqdm progress bar
            (default 1000). The bar is shown only when the text has more
            than ``maxlines`` lines and ``maxlines`` is greater than 1, so
            a value of 1 or less, or a very large number, turns it off.

    Returns:
        The sentences, in order of appearance, as a list of strings.
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            # Best effort: detection failures must not abort the split.
            logger.warning("polyglot.text.Detector exc: %s, setting to 'en'", exc)
            lang = "en"

    # Quick mode, or a language sentence_splitter is not used for:
    # let polyglot segment the text in one go.
    if qmode or lang not in LANG_S:
        return [elm.string for elm in Text(text, lang).sentences]

    paragraphs = text.splitlines()

    # Wrap the lines in a progress bar only for long inputs.
    show_progress = len(paragraphs) > maxlines > 1
    source = tqdm(paragraphs) if show_progress else paragraphs

    return [
        sentence
        for para in source
        if para.strip()  # skip blank / whitespace-only lines
        for sentence in split_text_into_sentences(para, lang)
    ]
|
|