# Albin Thörn Cleland
# Clean initial commit with LFS
# 19b8775
"""
Client for accessing Stanford CoreNLP in Python
"""
import atexit
import contextlib
import enum
import io
import os
import re
import requests
import logging
import json
import shlex
import socket
import subprocess
import time
import sys
import uuid
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse
from stanza.protobuf import Document, parseFromDelimitedString, writeToDelimitedString, to_text
__author__ = 'arunchaganty, kelvinguu, vzhong, wmonroe4'

logger = logging.getLogger('stanza')

# pattern tmp props files should follow; clean_props_file only deletes files
# matching this pattern, so user-supplied props files are never removed.
# NOTE: raw string and escaped dot so that '.' matches only a literal period.
SERVER_PROPS_TMP_FILE_PATTERN = re.compile(r'corenlp_server-(.*)\.props')

# languages with CoreNLP support, as full names and ISO shorthands
CORENLP_LANGS = ['ar', 'arabic', 'chinese', 'zh', 'english', 'en', 'french', 'fr', 'de', 'german', 'hu', 'hungarian',
                 'it', 'italian', 'es', 'spanish']

# map shorthands to full language names
LANGUAGE_SHORTHANDS_TO_FULL = {
    "ar": "arabic",
    "zh": "chinese",
    "en": "english",
    "fr": "french",
    "de": "german",
    "hu": "hungarian",
    "it": "italian",
    "es": "spanish"
}


def is_corenlp_lang(props_str):
    """Check if a string references a CoreNLP supported language.

    Case-insensitive; safely returns False for None or empty input.
    """
    return bool(props_str) and props_str.lower() in CORENLP_LANGS
# output formats the CoreNLP server can produce
CORENLP_OUTPUT_VALS = ["conll", "conllu", "json", "serialized", "text", "xml", "inlinexml"]


def validate_corenlp_props(properties=None, annotators=None, output_format=None):
    """Do basic checks to validate CoreNLP properties.

    :param properties: optional dict of request/server properties; only the
        'outputFormat' entry is checked here
    :param annotators: accepted for interface symmetry; not validated here
    :param output_format: optional explicit outputFormat value
    :raises ValueError: if an outputFormat value is not supported by CoreNLP
    """
    if output_format and output_format.lower() not in CORENLP_OUTPUT_VALS:
        raise ValueError(f"{output_format} not a valid CoreNLP outputFormat value! Choose from: {CORENLP_OUTPUT_VALS}")
    # isinstance (not type ==) so dict subclasses are validated too
    if isinstance(properties, dict):
        if "outputFormat" in properties and properties["outputFormat"].lower() not in CORENLP_OUTPUT_VALS:
            raise ValueError(f"{properties['outputFormat']} not a valid CoreNLP outputFormat value! Choose from: "
                             f"{CORENLP_OUTPUT_VALS}")
class AnnotationException(Exception):
    """Raised when communicating with the CoreNLP server fails."""
class TimeoutException(AnnotationException):
    """Raised when a request to the CoreNLP server times out."""
class ShouldRetryException(Exception):
    """Raised for transient failures where the service should retry the request."""
class PermanentlyFailedException(Exception):
    """Raised for failures where the service should NOT retry the request."""
class StartServer(enum.Enum):
    """How a CoreNLPClient should obtain its server."""
    DONT_START = 0    # connect to an externally managed server; never launch one
    FORCE_START = 1   # always launch a server; fail if the port is already bound
    TRY_START = 2     # launch a server, but reuse an existing one on the port
def clean_props_file(props_file):
    """Remove a temporary server props file created by this module, if present.

    Only files whose basename matches SERVER_PROPS_TMP_FILE_PATTERN are
    removed, so a user-supplied props file is never deleted.

    :param props_file: path to the props file, or None/'' to do nothing
    """
    if props_file:
        if os.path.isfile(props_file) and SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(props_file)):
            # the file may vanish between the isfile check and the removal
            # (e.g. another atexit handler registered for the same path)
            with contextlib.suppress(FileNotFoundError):
                os.remove(props_file)
class RobustService(object):
    """Service wrapper that resuscitates its subprocess if it is not available.

    Base class for CoreNLPClient: it owns the server subprocess (if any),
    pings the HTTP endpoint to check liveness, and restarts the process
    when a request finds it dead.
    """

    # seconds ensure_alive() waits for the service to answer /ping
    CHECK_ALIVE_TIMEOUT = 120

    def __init__(self, start_cmd, stop_cmd, endpoint, stdout=None,
                 stderr=None, be_quiet=False, host=None, port=None, ignore_binding_error=False):
        # commands are stored pre-split for subprocess; None means
        # "externally managed service, never launch or stop a process"
        self.start_cmd = start_cmd and shlex.split(start_cmd)
        self.stop_cmd = stop_cmd and shlex.split(stop_cmd)
        self.endpoint = endpoint
        self.stdout = stdout
        self.stderr = stderr
        # subprocess.Popen handle once started; None when not running or external
        self.server = None
        # set True once a liveness check has passed
        self.is_active = False
        self.be_quiet = be_quiet
        self.host = host
        self.port = port
        # when True, a port that is already bound is treated as an
        # existing server to reuse instead of an error
        self.ignore_binding_error = ignore_binding_error
        atexit.register(self.atexit_kill)

    def is_alive(self):
        """Return True if the service responds to /ping.

        :raises ShouldRetryException: if the endpoint refuses the connection
            (e.g. the server is still starting up) so the caller may retry
        """
        try:
            # if we own the subprocess and it has exited, it cannot be alive
            if not self.ignore_binding_error and self.server is not None and self.server.poll() is not None:
                return False
            return requests.get(self.endpoint + "/ping").ok
        except requests.exceptions.ConnectionError as e:
            raise ShouldRetryException(e)

    def start(self):
        """Launch the service subprocess (no-op when externally managed)."""
        if self.start_cmd:
            if self.host and self.port:
                # probe the port first so we can give a clear error (or
                # reuse an existing server) instead of an opaque Java failure
                with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
                    try:
                        sock.bind((self.host, self.port))
                    except socket.error as e:
                        if self.ignore_binding_error:
                            logger.info(f"Connecting to existing CoreNLP server at {self.host}:{self.port}")
                            self.server = None
                            return
                        else:
                            raise PermanentlyFailedException("Error: unable to start the CoreNLP server on port %d "
                                "(possibly something is already running there)" % self.port) from e
            if self.be_quiet:
                # Issue #26: subprocess.DEVNULL isn't supported in python 2.7.
                if hasattr(subprocess, 'DEVNULL'):
                    stderr = subprocess.DEVNULL
                else:
                    stderr = open(os.devnull, 'w')
                stdout = stderr
            else:
                stdout = self.stdout
                stderr = self.stderr
            logger.info(f"Starting server with command: {' '.join(self.start_cmd)}")
            try:
                self.server = subprocess.Popen(self.start_cmd,
                                               stderr=stderr,
                                               stdout=stdout)
            except FileNotFoundError as e:
                raise FileNotFoundError("When trying to run CoreNLP, a FileNotFoundError occurred, which frequently means Java was not installed or was not in the classpath.") from e

    def atexit_kill(self):
        # make some kind of effort to stop the service (such as a
        # CoreNLP server) at the end of the program. not waiting so
        # that the python script exiting isn't delayed
        if self.server and self.server.poll() is None:
            self.server.terminate()

    def stop(self):
        """Shut the service down: terminate (then kill) the subprocess, run stop_cmd."""
        if self.server:
            self.server.terminate()
            try:
                self.server.wait(5)
            except subprocess.TimeoutExpired:
                # Resorting to more aggressive measures...
                self.server.kill()
                try:
                    self.server.wait(5)
                except subprocess.TimeoutExpired:
                    # oh well
                    pass
            self.server = None
        if self.stop_cmd:
            subprocess.run(self.stop_cmd, check=True)
        self.is_active = False

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, _, __, ___):
        self.stop()

    def ensure_alive(self):
        """Block until the service answers /ping, (re)starting it if needed.

        :raises PermanentlyFailedException: after CHECK_ALIVE_TIMEOUT seconds
            without a successful ping
        """
        # Check if the service is active and alive
        if self.is_active:
            try:
                if self.is_alive():
                    return
                else:
                    self.stop()
            except ShouldRetryException:
                pass
        # If not, try to start up the service.
        if self.server is None:
            self.start()
        # Wait for the service to start up.
        start_time = time.time()
        while True:
            try:
                if self.is_alive():
                    break
            except ShouldRetryException:
                pass
            if time.time() - start_time < self.CHECK_ALIVE_TIMEOUT:
                time.sleep(1)
            else:
                raise PermanentlyFailedException("Timed out waiting for service to come alive.")
        # At this point we are guaranteed that the service is alive.
        self.is_active = True
def resolve_classpath(classpath=None):
    """
    Returns the classpath to use for corenlp.

    Prefers to use the given classpath parameter, if available. If
    not, uses the CORENLP_HOME environment variable. Resolves $CLASSPATH
    (the exact string) in either the classpath parameter or $CORENLP_HOME.

    :param classpath: explicit classpath string, the literal '$CLASSPATH'
        to use the CLASSPATH environment variable, or None to fall back
        to CORENLP_HOME (default: ~/stanza_corenlp)
    :return: a classpath string suitable for ``java -cp``
    :raises FileNotFoundError: if the CORENLP_HOME fallback directory does not exist
    """
    if classpath == '$CLASSPATH' or (classpath is None and os.getenv("CORENLP_HOME", None) == '$CLASSPATH'):
        # use the JVM classpath verbatim; it may be a multi-entry list,
        # so no existence check or jar wildcard is applied to it
        classpath = os.getenv("CLASSPATH")
    elif classpath is None:
        classpath = os.getenv("CORENLP_HOME", os.path.join(str(Path.home()), 'stanza_corenlp'))
        # the existence check and the "*" jar wildcard only make sense for
        # a single CoreNLP installation directory, so they are scoped to
        # this fallback branch — an explicitly passed classpath is
        # returned unchanged
        if not os.path.exists(classpath):
            raise FileNotFoundError("Please install CoreNLP by running `stanza.install_corenlp()`. If you have installed it, please define "
                                    "$CORENLP_HOME to be location of your CoreNLP distribution or pass in a classpath parameter. "
                                    "$CORENLP_HOME={}".format(os.getenv("CORENLP_HOME")))
        classpath = os.path.join(classpath, "*")
    return classpath
class CoreNLPClient(RobustService):
    """A client to the Stanford CoreNLP server.

    Can launch a CoreNLP server subprocess (the default) or connect to an
    already-running server at ``endpoint``.
    """

    # defaults used when this client launches its own server
    DEFAULT_ENDPOINT = "http://localhost:9000"
    DEFAULT_TIMEOUT = 60000              # milliseconds
    DEFAULT_THREADS = 5
    DEFAULT_OUTPUT_FORMAT = "serialized"  # protobuf-serialized Document
    DEFAULT_MEMORY = "5G"                # Java heap size (-Xmx)
    DEFAULT_MAX_CHAR_LENGTH = 100000
    def __init__(self, start_server=StartServer.FORCE_START,
                 endpoint=DEFAULT_ENDPOINT,
                 timeout=DEFAULT_TIMEOUT,
                 threads=DEFAULT_THREADS,
                 annotators=None,
                 pretokenized=False,
                 output_format=None,
                 properties=None,
                 stdout=None,
                 stderr=None,
                 memory=DEFAULT_MEMORY,
                 be_quiet=False,
                 max_char_length=DEFAULT_MAX_CHAR_LENGTH,
                 preload=True,
                 classpath=None,
                 **kwargs):
        """Create a client, optionally launching a CoreNLP server subprocess.

        :param start_server: a StartServer value (booleans accepted but deprecated)
        :param endpoint: http://host:port of the server; must be localhost
            when this client starts the server itself
        :param timeout: request/server timeout in milliseconds
        :param threads: number of server threads
        :param annotators: list or comma-separated string of default annotators
        :param pretokenized: pass -preTokenized to the server
        :param output_format: default outputFormat for annotate() calls
        :param properties: server properties: file path, language name, or dict
        :param stdout: passed to the server subprocess
        :param stderr: passed to the server subprocess
        :param memory: Java heap size for the server (-Xmx value)
        :param be_quiet: suppress server output
        :param max_char_length: server -maxCharLength setting
        :param preload: True to preload default annotators, or a
            list / comma-separated string of annotators to preload
        :param classpath: CoreNLP classpath; see resolve_classpath
        :param kwargs: extra server flags (ssl, strict, status_port, uriContext,
            key, username, password, blockList, server_id)
        """
        # whether or not server should be started by client
        self.start_server = start_server
        # filled in below when this client launches its own server
        self.server_props_path = None
        self.server_start_time = None
        self.server_host = None
        self.server_port = None
        self.server_classpath = None
        # validate properties
        validate_corenlp_props(properties=properties, annotators=annotators, output_format=output_format)
        # set up client defaults
        self.properties = properties
        self.annotators = annotators
        self.pretokenized = pretokenized
        self.output_format = output_format
        self._setup_client_defaults()
        # translate deprecated boolean start_server values to enum members
        if isinstance(start_server, bool):
            warning_msg = f"Setting 'start_server' to a boolean value when constructing {self.__class__.__name__} is deprecated and will stop" + \
                          " to function in a future version of stanza. Please consider switching to using a value from stanza.server.StartServer."
            logger.warning(warning_msg)
            start_server = StartServer.FORCE_START if start_server is True else StartServer.DONT_START
        # start the server
        if start_server is StartServer.FORCE_START or start_server is StartServer.TRY_START:
            # record info for server start
            self.server_start_time = datetime.now()
            # set up default properties for server
            self._setup_server_defaults()
            host, port = urlparse(endpoint).netloc.split(":")
            port = int(port)
            assert host == "localhost", "If starting a server, endpoint must be localhost"
            classpath = resolve_classpath(classpath)
            # assemble the java launch command; flags are appended below
            start_cmd = f"java -Xmx{memory} -cp '{classpath}' edu.stanford.nlp.pipeline.StanfordCoreNLPServer " \
                        f"-port {port} -timeout {timeout} -threads {threads} -maxCharLength {max_char_length} " \
                        f"-quiet {be_quiet} "
            self.server_classpath = classpath
            self.server_host = host
            self.server_port = port
            # pass the props file (user path, language props, or generated tmp file)
            if self.server_props_path is not None:
                start_cmd += f" -serverProperties {self.server_props_path}"
            # possibly set pretokenized
            if self.pretokenized:
                start_cmd += f" -preTokenized"
            # set annotators for server default
            if self.annotators is not None:
                annotators_str = self.annotators if type(annotators) == str else ",".join(annotators)
                start_cmd += f" -annotators {annotators_str}"
            # specify what to preload, if anything
            if preload:
                if type(preload) == bool:
                    # -preload flag means to preload all default annotators
                    start_cmd += " -preload"
                elif type(preload) == list:
                    # turn list into comma separated list string, only preload these annotators
                    start_cmd += f" -preload {','.join(preload)}"
                elif type(preload) == str:
                    # comma separated list of annotators
                    start_cmd += f" -preload {preload}"
            # set outputFormat for server default
            # if no output format requested by user, set to serialized
            start_cmd += f" -outputFormat {self.output_format}"
            # additional options for server:
            # - server_id
            # - ssl
            # - status_port
            # - uriContext
            # - strict
            # - key
            # - username
            # - password
            # - blockList
            # value-less flags
            for kw in ['ssl', 'strict']:
                if kwargs.get(kw) is not None:
                    start_cmd += f" -{kw}"
            # flags that take a value
            for kw in ['status_port', 'uriContext', 'key', 'username', 'password', 'blockList', 'server_id']:
                if kwargs.get(kw) is not None:
                    start_cmd += f" -{kw} {kwargs.get(kw)}"
            stop_cmd = None
        else:
            # not starting a server: RobustService only talks to the endpoint
            start_cmd = stop_cmd = None
            host = port = None
        super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint,
                                            stdout, stderr, be_quiet,
                                            host=host, port=port, ignore_binding_error=(start_server == StartServer.TRY_START))
        self.timeout = timeout
def _setup_client_defaults(self):
"""
Do some processing of annotators and output_format specified for the client.
If interacting with an externally started server, these will be defaults for annotate() calls.
:return: None
"""
# normalize annotators to str
if self.annotators is not None:
self.annotators = self.annotators if type(self.annotators) == str else ",".join(self.annotators)
# handle case where no output format is specified
if self.output_format is None:
if type(self.properties) == dict and 'outputFormat' in self.properties:
self.output_format = self.properties['outputFormat']
else:
self.output_format = CoreNLPClient.DEFAULT_OUTPUT_FORMAT
    def _setup_server_defaults(self):
        """
        Set up the default properties for the server.

        The properties argument can take on one of 3 value types

        1. File path on system or in CLASSPATH (e.g. /path/to/server.props or StanfordCoreNLP-french.properties
        2. Name of a Stanford CoreNLP supported language (e.g. french or fr)
        3. Python dictionary (properties written to tmp file for Java server, erased at end)

        In addition, an annotators list and output_format can be specified directly with arguments. These
        will overwrite any settings in the specified properties.

        If no properties are specified, the standard Stanford CoreNLP English server will be launched. The outputFormat
        will be set to 'serialized' and use the ProtobufAnnotationSerializer.
        """
        # ensure properties is str or dict; anything else is discarded with a warning
        if self.properties is None or (not isinstance(self.properties, str) and not isinstance(self.properties, dict)):
            if self.properties is not None:
                logger.warning('properties passed invalid value (not a str or dict), setting properties = {}')
            self.properties = {}
        # check if properties is a string, pass on to server which can handle
        if isinstance(self.properties, str):
            # try to translate to Stanford CoreNLP language name, or assume properties is a path
            if is_corenlp_lang(self.properties):
                # expand a shorthand like "fr" to the full name "french"
                # NOTE(review): the membership test lowercases but the dict
                # lookup does not, so an upper-case shorthand like "FR" is
                # passed through unexpanded — confirm whether intended
                if self.properties.lower() in LANGUAGE_SHORTHANDS_TO_FULL:
                    self.properties = LANGUAGE_SHORTHANDS_TO_FULL[self.properties]
                logger.info(
                    f"Using CoreNLP default properties for: {self.properties}. Make sure to have "
                    f"{self.properties} models jar (available for download here: "
                    f"https://stanfordnlp.github.io/CoreNLP/) in CLASSPATH")
            else:
                # not a language name: treat it as a props file path (which
                # may also live in the Java CLASSPATH rather than on disk)
                if not os.path.isfile(self.properties):
                    logger.warning(f"{self.properties} does not correspond to a file path. Make sure this file is in "
                                   f"your CLASSPATH.")
            self.server_props_path = self.properties
        elif isinstance(self.properties, dict):
            # make a copy
            server_start_properties = dict(self.properties)
            # client-level annotators/output_format override the dict's values
            if self.annotators is not None:
                server_start_properties['annotators'] = self.annotators
            if self.output_format is not None and isinstance(self.output_format, str):
                server_start_properties['outputFormat'] = self.output_format
            # write desired server start properties to tmp file
            # set up to erase on exit
            tmp_path = write_corenlp_props(server_start_properties)
            logger.info(f"Writing properties to tmp file: {tmp_path}")
            atexit.register(clean_props_file, tmp_path)
            self.server_props_path = tmp_path
def _request(self, buf, properties, reset_default=False, **kwargs):
"""
Send a request to the CoreNLP server.
:param (str | bytes) buf: data to be sent with the request
:param (dict) properties: properties that the server expects
:return: request result
"""
if self.start_server is not StartServer.DONT_START:
self.ensure_alive()
try:
input_format = properties.get("inputFormat", "text")
if input_format == "text":
ctype = "text/plain; charset=utf-8"
elif input_format == "serialized":
ctype = "application/x-protobuf"
else:
raise ValueError("Unrecognized inputFormat " + input_format)
# handle auth
if 'username' in kwargs and 'password' in kwargs:
kwargs['auth'] = requests.auth.HTTPBasicAuth(kwargs['username'], kwargs['password'])
kwargs.pop('username')
kwargs.pop('password')
r = requests.post(self.endpoint,
params={'properties': str(properties), 'resetDefault': str(reset_default).lower()},
data=buf, headers={'content-type': ctype},
timeout=(self.timeout*2)/1000, **kwargs)
r.raise_for_status()
return r
except requests.exceptions.Timeout as e:
raise TimeoutException("Timeout requesting to CoreNLPServer. Maybe server is unavailable or your document is too long")
except requests.exceptions.RequestException as e:
if e.response is not None and e.response.text is not None:
raise AnnotationException(e.response.text) from e
elif e.args:
raise AnnotationException(e.args[0]) from e
raise AnnotationException() from e
    def annotate(self, text, annotators=None, output_format=None, properties=None, reset_default=None, **kwargs):
        """
        Send an annotation request to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: list of annotators to use
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) reset_default: don't use server defaults

        Precedence for settings:
        1. annotators and output_format args
        2. Values from properties dict
        3. Client defaults self.annotators and self.output_format (set during client construction)
        4. Server defaults

        Additional request parameters (apart from CoreNLP pipeline properties) such as 'username' and 'password'
        can be specified with the kwargs.

        :return: request result; a Document for 'serialized', a dict for
            'json', a str for text-like formats, otherwise the raw response
        """
        # validate request properties
        validate_corenlp_props(properties=properties, annotators=annotators, output_format=output_format)
        # set request properties
        request_properties = {}
        # start with client defaults (precedence 3)
        if self.annotators is not None:
            request_properties['annotators'] = self.annotators
        if self.output_format is not None:
            request_properties['outputFormat'] = self.output_format
        # add values from properties arg (precedence 2)
        # handle str case: a language name selects that language's pipeline
        if type(properties) == str:
            if is_corenlp_lang(properties):
                properties = {'pipelineLanguage': properties.lower()}
                # switching pipeline language implies resetting server
                # defaults unless the caller said otherwise
                if reset_default is None:
                    reset_default = True
            else:
                raise ValueError(f"Unrecognized properties keyword {properties}")
        if type(properties) == dict:
            request_properties.update(properties)
        # if annotators list is specified, override with that (precedence 1)
        # also can use the annotators field the object was created with
        if annotators is not None and (type(annotators) == str or type(annotators) == list):
            request_properties['annotators'] = annotators if type(annotators) == str else ",".join(annotators)
        # if output format is specified, override with that
        if output_format is not None and type(output_format) == str:
            request_properties['outputFormat'] = output_format
        # make the request
        # if not explicitly set (and not the pipelineLanguage case above),
        # reset_default defaults to False
        if reset_default is None:
            reset_default = False
        r = self._request(text.encode('utf-8'), request_properties, reset_default, **kwargs)
        # unpack the response according to the effective output format;
        # outputFormat is always present because _setup_client_defaults
        # guarantees a client-level default
        if request_properties["outputFormat"] == "json":
            return r.json()
        elif request_properties["outputFormat"] == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, r.content)
            return doc
        elif request_properties["outputFormat"] in ["text", "conllu", "conll", "xml"]:
            return r.text
        else:
            return r
def update(self, doc, annotators=None, properties=None):
if properties is None:
properties = {}
properties.update({
'inputFormat': 'serialized',
'outputFormat': 'serialized',
'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
})
if annotators:
properties['annotators'] = annotators if type(annotators) == str else ",".join(annotators)
with io.BytesIO() as stream:
writeToDelimitedString(doc, stream)
msg = stream.getvalue()
r = self._request(msg, properties)
doc = Document()
parseFromDelimitedString(doc, r.content)
return doc
def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
# this is required for some reason
matches = self.__regex('/tokensregex', text, pattern, filter, annotators, properties)
if to_words:
matches = regex_matches_to_indexed_words(matches)
return matches
def semgrex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
matches = self.__regex('/semgrex', text, pattern, filter, annotators, properties)
if to_words:
matches = regex_matches_to_indexed_words(matches)
return matches
def fill_tree_proto(self, tree, proto_tree):
if tree.label:
proto_tree.value = tree.label
for child in tree.children:
proto_child = proto_tree.child.add()
self.fill_tree_proto(child, proto_child)
def tregex(self, text=None, pattern=None, filter=False, annotators=None, properties=None, trees=None):
# parse is not included by default in some of the pipelines,
# so we may need to manually override the annotators
# to include parse in order for tregex to do anything
if annotators is None and self.annotators is not None:
assert isinstance(self.annotators, str)
pieces = self.annotators.split(",")
if "parse" not in pieces:
annotators = self.annotators + ",parse"
else:
annotators = "tokenize,ssplit,pos,parse"
if pattern is None:
raise ValueError("Cannot have None as a pattern for tregex")
# TODO: we could also allow for passing in a complete document,
# along with the original text, so that the spans returns are more accurate
if trees is not None:
if properties is None:
properties = {}
properties['inputFormat'] = 'serialized'
if 'serializer' not in properties:
properties['serializer'] = 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
doc = Document()
full_text = []
for tree_idx, tree in enumerate(trees):
sentence = doc.sentence.add()
sentence.sentenceIndex = tree_idx
sentence.tokenOffsetBegin = len(full_text)
leaves = tree.leaf_labels()
full_text.extend(leaves)
sentence.tokenOffsetEnd = len(full_text)
self.fill_tree_proto(tree, sentence.parseTree)
for word in leaves:
token = sentence.token.add()
# the other side uses both value and word, weirdly enough
token.value = word
token.word = word
# without the actual tokenization, at least we can
# stop the words from running together
token.after = " "
doc.text = " ".join(full_text)
with io.BytesIO() as stream:
writeToDelimitedString(doc, stream)
text = stream.getvalue()
return self.__regex('/tregex', text, pattern, filter, annotators, properties)
    def __regex(self, path, text, pattern, filter, annotators=None, properties=None):
        """
        Send a regex-related request to the CoreNLP server.

        :param (str | unicode) path: the path for the regex endpoint
            ('/tokensregex', '/semgrex' or '/tregex')
        :param text: raw text (or serialized document bytes) for the
            CoreNLPServer to apply the regex to
        :param (str | unicode) pattern: regex pattern
        :param (bool) filter: option to filter sentences that contain matches, if false returns matches
        :param annotators: list or comma-separated string of annotators to run
        :param properties: additional request properties; defaults (text
            input, protobuf serializer) are applied only when this is None
        :return: request result, parsed from the server's JSON response
        """
        if self.start_server is not StartServer.DONT_START:
            self.ensure_alive()
        if properties is None:
            # default properties apply only when the caller supplied none —
            # e.g. tregex may pass inputFormat 'serialized', which must not
            # be clobbered here
            properties = {}
            properties.update({
                'inputFormat': 'text',
                'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        if annotators:
            properties['annotators'] = ",".join(annotators) if isinstance(annotators, list) else annotators
        # force output for regex requests to be json
        properties['outputFormat'] = 'json'
        # if the server is trying to send back character offsets, it
        # should send back codepoints counts as well in case the text
        # has extra wide characters
        properties['tokenize.codepoint'] = 'true'
        try:
            # Error occurs unless put properties in params
            input_format = properties.get("inputFormat", "text")
            if input_format == "text":
                ctype = "text/plain; charset=utf-8"
            elif input_format == "serialized":
                ctype = "application/x-protobuf"
            else:
                raise ValueError("Unrecognized inputFormat " + input_format)
            # change request method from `get` to `post` as required by CoreNLP
            r = requests.post(
                self.endpoint + path, params={
                    'pattern': pattern,
                    'filter': filter,
                    'properties': str(properties)
                },
                data=text.encode('utf-8') if isinstance(text, str) else text,
                headers={'content-type': ctype},
                # requests wants seconds; self.timeout is in milliseconds
                timeout=(self.timeout*2)/1000,
            )
            r.raise_for_status()
            # default to utf-8 when the server omits the charset
            if r.encoding is None:
                r.encoding = "utf-8"
            return json.loads(r.text)
        except requests.HTTPError as e:
            if r.text.startswith("Timeout"):
                raise TimeoutException(r.text)
            else:
                raise AnnotationException(r.text)
        except json.JSONDecodeError:
            raise AnnotationException(r.text)
def scenegraph(self, text, properties=None):
"""
Send a request to the server which processes the text using SceneGraph
This will require a new CoreNLP release, 4.5.5 or later
"""
# since we're using requests ourself,
# check if the server has started or not
if self.start_server is not StartServer.DONT_START:
self.ensure_alive()
if properties is None:
properties = {}
# the only thing the scenegraph knows how to use is text
properties['inputFormat'] = 'text'
ctype = "text/plain; charset=utf-8"
# the json output format is much more useful
properties['outputFormat'] = 'json'
try:
r = requests.post(
self.endpoint + "/scenegraph",
params={
'properties': str(properties)
},
data=text.encode('utf-8') if isinstance(text, str) else text,
headers={'content-type': ctype},
timeout=(self.timeout*2)/1000,
)
r.raise_for_status()
if r.encoding is None:
r.encoding = "utf-8"
return json.loads(r.text)
except requests.HTTPError as e:
if r.text.startswith("Timeout"):
raise TimeoutException(r.text)
else:
raise AnnotationException(r.text)
except json.JSONDecodeError:
raise AnnotationException(r.text)
def read_corenlp_props(props_path):
    """Read a Stanford CoreNLP properties file into a dict.

    Blank lines and lines starting with '#' are skipped. Keys are stripped;
    values are taken verbatim after the first '=' (whitespace preserved).
    """
    props_dict = {}
    with open(props_path) as props_file:
        for entry_line in props_file.read().split('\n'):
            # skip blanks and comments
            if not entry_line.strip() or entry_line.startswith('#'):
                continue
            key, _, value = entry_line.partition('=')
            props_dict[key.strip()] = value
    return props_dict
def write_corenlp_props(props_dict, file_path=None):
    """Write a Stanford CoreNLP properties dict to a file.

    :param props_dict: property name -> value; list values are written as
        comma-separated strings
    :param file_path: target path; when None, a name matching
        SERVER_PROPS_TMP_FILE_PATTERN is generated so clean_props_file can
        later recognize and delete the file
    :return: the path written to
    :raises ValueError: if a caller-supplied file_path does not match the
        expected temp-props pattern
    """
    if file_path is None:
        file_path = f"corenlp_server-{uuid.uuid4().hex[:16]}.props"
    # confirm tmp file path matches pattern; raise a real exception instead
    # of using assert, which would be stripped under `python -O`
    if not SERVER_PROPS_TMP_FILE_PATTERN.match(file_path):
        raise ValueError(f"{file_path} does not match the expected props file pattern "
                         f"{SERVER_PROPS_TMP_FILE_PATTERN.pattern}")
    with open(file_path, 'w') as props_file:
        for k, v in props_dict.items():
            writeable_v = ",".join(v) if isinstance(v, list) else v
            # trailing blank line between entries for readability
            props_file.write(f'{k} = {writeable_v}\n\n')
    return file_path
def regex_matches_to_indexed_words(matches):
    """
    Transforms tokensregex and semgrex matches to indexed words.

    :param matches: unprocessed regex matches
    :return: flat array of indexed words, each tagged with its sentence index
    """
    words = []
    for sent_idx, sentence in enumerate(matches['sentences']):
        for key, match in sentence.items():
            # 'length' is bookkeeping, not a match entry
            if key == 'length':
                continue
            words.append(dict(match, sentence=sent_idx))
    return words
__all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"]