Subsystem_OS_Command_Access

Sleeping

Subsystem_OS_Command_Access / gradiobee /seg_text.py

freemt

Update altair

a4a35d8 about 4 years ago

3.12 kB

	"""Split text to sentences.

	Use sentence_splitter if supported,
	else use polyglot.text.Text

	!apt install libicu-dev
	!pip install pyicu pycld2 Morfessor
	!pip install polyglot sentence_splitter
	"""
	# pylint: disable=

	from typing import List, Optional, Union

	import re
	from tqdm.auto import tqdm
	from polyglot.detect.base import logger as polyglot_logger
	from polyglot.text import Detector, Text
	from sentence_splitter import split_text_into_sentences

	from logzero import logger

	# Turn off polyglot.text.Detector warnings: raise the threshold of
	# polyglot's detect logger to ERROR.
	polyglot_logger.setLevel("ERROR")


	# fmt: off
	# Language codes for which _seg_text uses sentence_splitter; text in any
	# other language is split with polyglot.text.Text instead.
	LANG_S = [
	    "ca", "cs", "da", "nl", "en", "fi",
	    "fr", "de", "el", "hu", "is", "it",
	    "lv", "lt", "no", "pl", "pt", "ro",
	    "ru", "sk", "sl", "es", "sv", "tr",
	]


	def _seg_text(
	text: str,
	lang: Optional[str] = None,
	# qmode: bool = False,
	maxlines: int = 1000
	) -> List[str]:
	# fmt: on
	"""Split text to sentences.

	Use sentence_splitter if supported,
	else use polyglot.text.Text.sentences
	Blank lines will be removed.

	qmode: quick mode, skip split_text_into_sentences if True, default False
	vectors for all books are based on qmode=False.
	qmode=True is for quick test purpose only

	maxlines (default 1000), threshold for turn on tqdm progressbar
	set to <1 or a large number to turn it off
	"""
	if lang is None:
	try:
	lang = Detector(text).language.code
	except Exception as exc:
	logger.info("text[:30]: %s", text[:30])
	logger.warning(
	"polyglot.text.Detector exc: %s, setting to 'en'",
	exc
	)
	lang = "en"

	# if not qmode and lang in LANG_S:
	if lang in LANG_S:
	_ = []
	lines = text.splitlines()
	# if maxlines > 1 and len(lines) > maxlines:
	if len(lines) > maxlines > 1:
	for para in tqdm(lines):
	if para.strip():
	_.extend(split_text_into_sentences(para, lang))
	else:
	for para in lines:
	if para.strip():
	_.extend(split_text_into_sentences(para, lang))
	return _

	# return split_text_into_sentences(text, lang)

	# empty "" text or blank to avoid Exception
	if not text.strip():
	return []

	return [elm.string for elm in Text(text, lang).sentences]


	# fmt: off
	def seg_text(
	    lst: Union[str, List[str]],
	    lang: Optional[str] = None,
	    maxlines: int = 1000,
	    extra: Optional[str] = None,
	) -> List[str]:
	    # fmt: on
	    """Split one text or a list of texts into sentences.

	    Arguments:
	        lst: a single text or a list of texts.
	        lang: language code passed to _seg_text for every text.
	        maxlines: passed to _seg_text unchanged.
	        extra: optional regex pattern; when truthy, a newline is
	            inserted after every match in each text before the text
	            is split.
	    Returns:
	        the pieces from all texts, concatenated in input order.
	    """
	    texts = [lst] if isinstance(lst, str) else lst

	    if extra:
	        # Wrap the pattern in a group so the match itself is kept and a
	        # line break is appended right after it.
	        pattern = rf"({extra})"
	        texts = [re.sub(pattern, r"\1\n", text) for text in texts]

	    return [
	        piece
	        for text in texts
	        for piece in _seg_text(text, lang=lang, maxlines=maxlines)
	    ]