stanza-digphil / stanza /server /client.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

32.8 kB

	"""
	Client for accessing Stanford CoreNLP in Python
	"""

	import atexit
	import contextlib
	import enum
	import io
	import os
	import re
	import requests
	import logging
	import json
	import shlex
	import socket
	import subprocess
	import time
	import sys
	import uuid

	from datetime import datetime
	from pathlib import Path
	from urllib.parse import urlparse

	from stanza.protobuf import Document, parseFromDelimitedString, writeToDelimitedString, to_text
	__author__ = 'arunchaganty, kelvinguu, vzhong, wmonroe4'

	logger = logging.getLogger('stanza')

	# pattern tmp props file should follow
	SERVER_PROPS_TMP_FILE_PATTERN = re.compile('corenlp_server-(.*).props')

	# Check if str is CoreNLP supported language
	CORENLP_LANGS = ['ar', 'arabic', 'chinese', 'zh', 'english', 'en', 'french', 'fr', 'de', 'german', 'hu', 'hungarian',
	'it', 'italian', 'es', 'spanish']

	# map shorthands to full language names
	LANGUAGE_SHORTHANDS_TO_FULL = {
	"ar": "arabic",
	"zh": "chinese",
	"en": "english",
	"fr": "french",
	"de": "german",
	"hu": "hungarian",
	"it": "italian",
	"es": "spanish"
	}


	def is_corenlp_lang(props_str):
	""" Check if a string references a CoreNLP language """
	return props_str.lower() in CORENLP_LANGS


	# Validate CoreNLP properties
	CORENLP_OUTPUT_VALS = ["conll", "conllu", "json", "serialized", "text", "xml", "inlinexml"]


	def validate_corenlp_props(properties=None, annotators=None, output_format=None):
	""" Do basic checks to validate CoreNLP properties """
	if output_format and output_format.lower() not in CORENLP_OUTPUT_VALS:
	raise ValueError(f"{output_format} not a valid CoreNLP outputFormat value! Choose from: {CORENLP_OUTPUT_VALS}")
	if type(properties) == dict:
	if "outputFormat" in properties and properties["outputFormat"].lower() not in CORENLP_OUTPUT_VALS:
	raise ValueError(f"{properties['outputFormat']} not a valid CoreNLP outputFormat value! Choose from: "
	f"{CORENLP_OUTPUT_VALS}")


	class AnnotationException(Exception):
	""" Exception raised when there was an error communicating with the CoreNLP server. """
	pass


	class TimeoutException(AnnotationException):
	""" Exception raised when the CoreNLP server timed out. """
	pass


	class ShouldRetryException(Exception):
	""" Exception raised if the service should retry the request. """
	pass


	class PermanentlyFailedException(Exception):
	""" Exception raised if the service should NOT retry the request. """
	pass

	class StartServer(enum.Enum):
	DONT_START = 0
	FORCE_START = 1
	TRY_START = 2


	def clean_props_file(props_file):
	# check if there is a temp server props file to remove and remove it
	if props_file:
	if os.path.isfile(props_file) and SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(props_file)):
	os.remove(props_file)


	class RobustService(object):
	""" Service that resuscitates itself if it is not available. """
	CHECK_ALIVE_TIMEOUT = 120

	def __init__(self, start_cmd, stop_cmd, endpoint, stdout=None,
	stderr=None, be_quiet=False, host=None, port=None, ignore_binding_error=False):
	self.start_cmd = start_cmd and shlex.split(start_cmd)
	self.stop_cmd = stop_cmd and shlex.split(stop_cmd)
	self.endpoint = endpoint
	self.stdout = stdout
	self.stderr = stderr

	self.server = None
	self.is_active = False
	self.be_quiet = be_quiet
	self.host = host
	self.port = port
	self.ignore_binding_error = ignore_binding_error
	atexit.register(self.atexit_kill)

	def is_alive(self):
	try:
	if not self.ignore_binding_error and self.server is not None and self.server.poll() is not None:
	return False
	return requests.get(self.endpoint + "/ping").ok
	except requests.exceptions.ConnectionError as e:
	raise ShouldRetryException(e)

	def start(self):
	if self.start_cmd:
	if self.host and self.port:
	with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
	try:
	sock.bind((self.host, self.port))
	except socket.error as e:
	if self.ignore_binding_error:
	logger.info(f"Connecting to existing CoreNLP server at {self.host}:{self.port}")
	self.server = None
	return
	else:
	raise PermanentlyFailedException("Error: unable to start the CoreNLP server on port %d "
	"(possibly something is already running there)" % self.port) from e
	if self.be_quiet:
	# Issue #26: subprocess.DEVNULL isn't supported in python 2.7.
	if hasattr(subprocess, 'DEVNULL'):
	stderr = subprocess.DEVNULL
	else:
	stderr = open(os.devnull, 'w')
	stdout = stderr
	else:
	stdout = self.stdout
	stderr = self.stderr
	logger.info(f"Starting server with command: {' '.join(self.start_cmd)}")
	try:
	self.server = subprocess.Popen(self.start_cmd,
	stderr=stderr,
	stdout=stdout)
	except FileNotFoundError as e:
	raise FileNotFoundError("When trying to run CoreNLP, a FileNotFoundError occurred, which frequently means Java was not installed or was not in the classpath.") from e

	def atexit_kill(self):
	# make some kind of effort to stop the service (such as a
	# CoreNLP server) at the end of the program. not waiting so
	# that the python script exiting isn't delayed
	if self.server and self.server.poll() is None:
	self.server.terminate()

	def stop(self):
	if self.server:
	self.server.terminate()
	try:
	self.server.wait(5)
	except subprocess.TimeoutExpired:
	# Resorting to more aggressive measures...
	self.server.kill()
	try:
	self.server.wait(5)
	except subprocess.TimeoutExpired:
	# oh well
	pass
	self.server = None
	if self.stop_cmd:
	subprocess.run(self.stop_cmd, check=True)
	self.is_active = False

	def __enter__(self):
	self.start()
	return self

	def __exit__(self, _, __, ___):
	self.stop()

	def ensure_alive(self):
	# Check if the service is active and alive
	if self.is_active:
	try:
	if self.is_alive():
	return
	else:
	self.stop()
	except ShouldRetryException:
	pass

	# If not, try to start up the service.
	if self.server is None:
	self.start()

	# Wait for the service to start up.
	start_time = time.time()
	while True:
	try:
	if self.is_alive():
	break
	except ShouldRetryException:
	pass

	if time.time() - start_time < self.CHECK_ALIVE_TIMEOUT:
	time.sleep(1)
	else:
	raise PermanentlyFailedException("Timed out waiting for service to come alive.")

	# At this point we are guaranteed that the service is alive.
	self.is_active = True


	def resolve_classpath(classpath=None):
	"""
	Returns the classpath to use for corenlp.

	Prefers to use the given classpath parameter, if available. If
	not, uses the CORENLP_HOME environment variable. Resolves $CLASSPATH
	(the exact string) in either the classpath parameter or $CORENLP_HOME.
	"""
	if classpath == '$CLASSPATH' or (classpath is None and os.getenv("CORENLP_HOME", None) == '$CLASSPATH'):
	classpath = os.getenv("CLASSPATH")
	elif classpath is None:
	classpath = os.getenv("CORENLP_HOME", os.path.join(str(Path.home()), 'stanza_corenlp'))

	if not os.path.exists(classpath):
	raise FileNotFoundError("Please install CoreNLP by running `stanza.install_corenlp()`. If you have installed it, please define "
	"$CORENLP_HOME to be location of your CoreNLP distribution or pass in a classpath parameter. "
	"$CORENLP_HOME={}".format(os.getenv("CORENLP_HOME")))
	classpath = os.path.join(classpath, "*")
	return classpath


	class CoreNLPClient(RobustService):
	""" A client to the Stanford CoreNLP server. """

	DEFAULT_ENDPOINT = "http://localhost:9000"
	DEFAULT_TIMEOUT = 60000
	DEFAULT_THREADS = 5
	DEFAULT_OUTPUT_FORMAT = "serialized"
	DEFAULT_MEMORY = "5G"
	DEFAULT_MAX_CHAR_LENGTH = 100000

	def __init__(self, start_server=StartServer.FORCE_START,
	endpoint=DEFAULT_ENDPOINT,
	timeout=DEFAULT_TIMEOUT,
	threads=DEFAULT_THREADS,
	annotators=None,
	pretokenized=False,
	output_format=None,
	properties=None,
	stdout=None,
	stderr=None,
	memory=DEFAULT_MEMORY,
	be_quiet=False,
	max_char_length=DEFAULT_MAX_CHAR_LENGTH,
	preload=True,
	classpath=None,
	**kwargs):

	# whether or not server should be started by client
	self.start_server = start_server
	self.server_props_path = None
	self.server_start_time = None
	self.server_host = None
	self.server_port = None
	self.server_classpath = None
	# validate properties
	validate_corenlp_props(properties=properties, annotators=annotators, output_format=output_format)
	# set up client defaults
	self.properties = properties
	self.annotators = annotators
	self.pretokenized = pretokenized
	self.output_format = output_format
	self._setup_client_defaults()
	# start the server
	if isinstance(start_server, bool):
	warning_msg = f"Setting 'start_server' to a boolean value when constructing {self.__class__.__name__} is deprecated and will stop" + \
	" to function in a future version of stanza. Please consider switching to using a value from stanza.server.StartServer."
	logger.warning(warning_msg)
	start_server = StartServer.FORCE_START if start_server is True else StartServer.DONT_START

	# start the server
	if start_server is StartServer.FORCE_START or start_server is StartServer.TRY_START:
	# record info for server start
	self.server_start_time = datetime.now()
	# set up default properties for server
	self._setup_server_defaults()
	host, port = urlparse(endpoint).netloc.split(":")
	port = int(port)
	assert host == "localhost", "If starting a server, endpoint must be localhost"
	classpath = resolve_classpath(classpath)
	start_cmd = f"java -Xmx{memory} -cp '{classpath}' edu.stanford.nlp.pipeline.StanfordCoreNLPServer " \
	f"-port {port} -timeout {timeout} -threads {threads} -maxCharLength {max_char_length} " \
	f"-quiet {be_quiet} "

	self.server_classpath = classpath
	self.server_host = host
	self.server_port = port

	# set up server defaults
	if self.server_props_path is not None:
	start_cmd += f" -serverProperties {self.server_props_path}"

	# possibly set pretokenized
	if self.pretokenized:
	start_cmd += f" -preTokenized"

	# set annotators for server default
	if self.annotators is not None:
	annotators_str = self.annotators if type(annotators) == str else ",".join(annotators)
	start_cmd += f" -annotators {annotators_str}"

	# specify what to preload, if anything
	if preload:
	if type(preload) == bool:
	# -preload flag means to preload all default annotators
	start_cmd += " -preload"
	elif type(preload) == list:
	# turn list into comma separated list string, only preload these annotators
	start_cmd += f" -preload {','.join(preload)}"
	elif type(preload) == str:
	# comma separated list of annotators
	start_cmd += f" -preload {preload}"

	# set outputFormat for server default
	# if no output format requested by user, set to serialized
	start_cmd += f" -outputFormat {self.output_format}"

	# additional options for server:
	# - server_id
	# - ssl
	# - status_port
	# - uriContext
	# - strict
	# - key
	# - username
	# - password
	# - blockList
	for kw in ['ssl', 'strict']:
	if kwargs.get(kw) is not None:
	start_cmd += f" -{kw}"
	for kw in ['status_port', 'uriContext', 'key', 'username', 'password', 'blockList', 'server_id']:
	if kwargs.get(kw) is not None:
	start_cmd += f" -{kw} {kwargs.get(kw)}"
	stop_cmd = None
	else:
	start_cmd = stop_cmd = None
	host = port = None

	super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint,
	stdout, stderr, be_quiet, host=host, port=port, ignore_binding_error=(start_server == StartServer.TRY_START))

	self.timeout = timeout

	def _setup_client_defaults(self):
	"""
	Do some processing of annotators and output_format specified for the client.
	If interacting with an externally started server, these will be defaults for annotate() calls.
	:return: None
	"""
	# normalize annotators to str
	if self.annotators is not None:
	self.annotators = self.annotators if type(self.annotators) == str else ",".join(self.annotators)

	# handle case where no output format is specified
	if self.output_format is None:
	if type(self.properties) == dict and 'outputFormat' in self.properties:
	self.output_format = self.properties['outputFormat']
	else:
	self.output_format = CoreNLPClient.DEFAULT_OUTPUT_FORMAT

	def _setup_server_defaults(self):
	"""
	Set up the default properties for the server.

	The properties argument can take on one of 3 value types

	1. File path on system or in CLASSPATH (e.g. /path/to/server.props or StanfordCoreNLP-french.properties
	2. Name of a Stanford CoreNLP supported language (e.g. french or fr)
	3. Python dictionary (properties written to tmp file for Java server, erased at end)

	In addition, an annotators list and output_format can be specified directly with arguments. These
	will overwrite any settings in the specified properties.

	If no properties are specified, the standard Stanford CoreNLP English server will be launched. The outputFormat
	will be set to 'serialized' and use the ProtobufAnnotationSerializer.
	"""

	# ensure properties is str or dict
	if self.properties is None or (not isinstance(self.properties, str) and not isinstance(self.properties, dict)):
	if self.properties is not None:
	logger.warning('properties passed invalid value (not a str or dict), setting properties = {}')
	self.properties = {}
	# check if properties is a string, pass on to server which can handle
	if isinstance(self.properties, str):
	# try to translate to Stanford CoreNLP language name, or assume properties is a path
	if is_corenlp_lang(self.properties):
	if self.properties.lower() in LANGUAGE_SHORTHANDS_TO_FULL:
	self.properties = LANGUAGE_SHORTHANDS_TO_FULL[self.properties]
	logger.info(
	f"Using CoreNLP default properties for: {self.properties}. Make sure to have "
	f"{self.properties} models jar (available for download here: "
	f"https://stanfordnlp.github.io/CoreNLP/) in CLASSPATH")
	else:
	if not os.path.isfile(self.properties):
	logger.warning(f"{self.properties} does not correspond to a file path. Make sure this file is in "
	f"your CLASSPATH.")
	self.server_props_path = self.properties
	elif isinstance(self.properties, dict):
	# make a copy
	server_start_properties = dict(self.properties)
	if self.annotators is not None:
	server_start_properties['annotators'] = self.annotators
	if self.output_format is not None and isinstance(self.output_format, str):
	server_start_properties['outputFormat'] = self.output_format
	# write desired server start properties to tmp file
	# set up to erase on exit
	tmp_path = write_corenlp_props(server_start_properties)
	logger.info(f"Writing properties to tmp file: {tmp_path}")
	atexit.register(clean_props_file, tmp_path)
	self.server_props_path = tmp_path

	def _request(self, buf, properties, reset_default=False, **kwargs):
	"""
	Send a request to the CoreNLP server.

	:param (str \| bytes) buf: data to be sent with the request
	:param (dict) properties: properties that the server expects
	:return: request result
	"""
	if self.start_server is not StartServer.DONT_START:
	self.ensure_alive()

	try:
	input_format = properties.get("inputFormat", "text")
	if input_format == "text":
	ctype = "text/plain; charset=utf-8"
	elif input_format == "serialized":
	ctype = "application/x-protobuf"
	else:
	raise ValueError("Unrecognized inputFormat " + input_format)
	# handle auth
	if 'username' in kwargs and 'password' in kwargs:
	kwargs['auth'] = requests.auth.HTTPBasicAuth(kwargs['username'], kwargs['password'])
	kwargs.pop('username')
	kwargs.pop('password')
	r = requests.post(self.endpoint,
	params={'properties': str(properties), 'resetDefault': str(reset_default).lower()},
	data=buf, headers={'content-type': ctype},
	timeout=(self.timeout2)/1000, *kwargs)
	r.raise_for_status()
	return r
	except requests.exceptions.Timeout as e:
	raise TimeoutException("Timeout requesting to CoreNLPServer. Maybe server is unavailable or your document is too long")
	except requests.exceptions.RequestException as e:
	if e.response is not None and e.response.text is not None:
	raise AnnotationException(e.response.text) from e
	elif e.args:
	raise AnnotationException(e.args[0]) from e
	raise AnnotationException() from e

	def annotate(self, text, annotators=None, output_format=None, properties=None, reset_default=None, **kwargs):
	"""
	Send a request to the CoreNLP server.

	:param (str \| unicode) text: raw text for the CoreNLPServer to parse
	:param (list \| string) annotators: list of annotators to use
	:param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
	:param (dict) properties: additional request properties (written on top of defaults)
	:param (bool) reset_default: don't use server defaults

	Precedence for settings:

	1. annotators and output_format args
	2. Values from properties dict
	3. Client defaults self.annotators and self.output_format (set during client construction)
	4. Server defaults

	Additional request parameters (apart from CoreNLP pipeline properties) such as 'username' and 'password'
	can be specified with the kwargs.

	:return: request result
	"""

	# validate request properties
	validate_corenlp_props(properties=properties, annotators=annotators, output_format=output_format)
	# set request properties
	request_properties = {}

	# start with client defaults
	if self.annotators is not None:
	request_properties['annotators'] = self.annotators
	if self.output_format is not None:
	request_properties['outputFormat'] = self.output_format

	# add values from properties arg
	# handle str case
	if type(properties) == str:
	if is_corenlp_lang(properties):
	properties = {'pipelineLanguage': properties.lower()}
	if reset_default is None:
	reset_default = True
	else:
	raise ValueError(f"Unrecognized properties keyword {properties}")

	if type(properties) == dict:
	request_properties.update(properties)

	# if annotators list is specified, override with that
	# also can use the annotators field the object was created with
	if annotators is not None and (type(annotators) == str or type(annotators) == list):
	request_properties['annotators'] = annotators if type(annotators) == str else ",".join(annotators)

	# if output format is specified, override with that
	if output_format is not None and type(output_format) == str:
	request_properties['outputFormat'] = output_format

	# make the request
	# if not explicitly set or the case of pipelineLanguage, reset_default should be None
	if reset_default is None:
	reset_default = False
	r = self._request(text.encode('utf-8'), request_properties, reset_default, **kwargs)
	if request_properties["outputFormat"] == "json":
	return r.json()
	elif request_properties["outputFormat"] == "serialized":
	doc = Document()
	parseFromDelimitedString(doc, r.content)
	return doc
	elif request_properties["outputFormat"] in ["text", "conllu", "conll", "xml"]:
	return r.text
	else:
	return r

	def update(self, doc, annotators=None, properties=None):
	if properties is None:
	properties = {}
	properties.update({
	'inputFormat': 'serialized',
	'outputFormat': 'serialized',
	'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
	})
	if annotators:
	properties['annotators'] = annotators if type(annotators) == str else ",".join(annotators)
	with io.BytesIO() as stream:
	writeToDelimitedString(doc, stream)
	msg = stream.getvalue()

	r = self._request(msg, properties)
	doc = Document()
	parseFromDelimitedString(doc, r.content)
	return doc

	def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
	# this is required for some reason
	matches = self.__regex('/tokensregex', text, pattern, filter, annotators, properties)
	if to_words:
	matches = regex_matches_to_indexed_words(matches)
	return matches

	def semgrex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
	matches = self.__regex('/semgrex', text, pattern, filter, annotators, properties)
	if to_words:
	matches = regex_matches_to_indexed_words(matches)
	return matches

	def fill_tree_proto(self, tree, proto_tree):
	if tree.label:
	proto_tree.value = tree.label
	for child in tree.children:
	proto_child = proto_tree.child.add()
	self.fill_tree_proto(child, proto_child)

	def tregex(self, text=None, pattern=None, filter=False, annotators=None, properties=None, trees=None):
	# parse is not included by default in some of the pipelines,
	# so we may need to manually override the annotators
	# to include parse in order for tregex to do anything
	if annotators is None and self.annotators is not None:
	assert isinstance(self.annotators, str)
	pieces = self.annotators.split(",")
	if "parse" not in pieces:
	annotators = self.annotators + ",parse"
	else:
	annotators = "tokenize,ssplit,pos,parse"
	if pattern is None:
	raise ValueError("Cannot have None as a pattern for tregex")

	# TODO: we could also allow for passing in a complete document,
	# along with the original text, so that the spans returns are more accurate
	if trees is not None:
	if properties is None:
	properties = {}
	properties['inputFormat'] = 'serialized'
	if 'serializer' not in properties:
	properties['serializer'] = 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
	doc = Document()
	full_text = []
	for tree_idx, tree in enumerate(trees):
	sentence = doc.sentence.add()
	sentence.sentenceIndex = tree_idx
	sentence.tokenOffsetBegin = len(full_text)
	leaves = tree.leaf_labels()
	full_text.extend(leaves)
	sentence.tokenOffsetEnd = len(full_text)
	self.fill_tree_proto(tree, sentence.parseTree)
	for word in leaves:
	token = sentence.token.add()
	# the other side uses both value and word, weirdly enough
	token.value = word
	token.word = word
	# without the actual tokenization, at least we can
	# stop the words from running together
	token.after = " "
	doc.text = " ".join(full_text)
	with io.BytesIO() as stream:
	writeToDelimitedString(doc, stream)
	text = stream.getvalue()

	return self.__regex('/tregex', text, pattern, filter, annotators, properties)

	def __regex(self, path, text, pattern, filter, annotators=None, properties=None):
	"""
	Send a regex-related request to the CoreNLP server.

	:param (str \| unicode) path: the path for the regex endpoint
	:param text: raw text for the CoreNLPServer to apply the regex
	:param (str \| unicode) pattern: regex pattern
	:param (bool) filter: option to filter sentences that contain matches, if false returns matches
	:param properties: option to filter sentences that contain matches, if false returns matches
	:return: request result
	"""
	if self.start_server is not StartServer.DONT_START:
	self.ensure_alive()
	if properties is None:
	properties = {}
	properties.update({
	'inputFormat': 'text',
	'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
	})
	if annotators:
	properties['annotators'] = ",".join(annotators) if isinstance(annotators, list) else annotators

	# force output for regex requests to be json
	properties['outputFormat'] = 'json'
	# if the server is trying to send back character offsets, it
	# should send back codepoints counts as well in case the text
	# has extra wide characters
	properties['tokenize.codepoint'] = 'true'

	try:
	# Error occurs unless put properties in params
	input_format = properties.get("inputFormat", "text")
	if input_format == "text":
	ctype = "text/plain; charset=utf-8"
	elif input_format == "serialized":
	ctype = "application/x-protobuf"
	else:
	raise ValueError("Unrecognized inputFormat " + input_format)
	# change request method from `get` to `post` as required by CoreNLP
	r = requests.post(
	self.endpoint + path, params={
	'pattern': pattern,
	'filter': filter,
	'properties': str(properties)
	},
	data=text.encode('utf-8') if isinstance(text, str) else text,
	headers={'content-type': ctype},
	timeout=(self.timeout*2)/1000,
	)
	r.raise_for_status()
	if r.encoding is None:
	r.encoding = "utf-8"
	return json.loads(r.text)
	except requests.HTTPError as e:
	if r.text.startswith("Timeout"):
	raise TimeoutException(r.text)
	else:
	raise AnnotationException(r.text)
	except json.JSONDecodeError:
	raise AnnotationException(r.text)


	def scenegraph(self, text, properties=None):
	"""
	Send a request to the server which processes the text using SceneGraph

	This will require a new CoreNLP release, 4.5.5 or later
	"""
	# since we're using requests ourself,
	# check if the server has started or not
	if self.start_server is not StartServer.DONT_START:
	self.ensure_alive()

	if properties is None:
	properties = {}
	# the only thing the scenegraph knows how to use is text
	properties['inputFormat'] = 'text'
	ctype = "text/plain; charset=utf-8"
	# the json output format is much more useful
	properties['outputFormat'] = 'json'
	try:
	r = requests.post(
	self.endpoint + "/scenegraph",
	params={
	'properties': str(properties)
	},
	data=text.encode('utf-8') if isinstance(text, str) else text,
	headers={'content-type': ctype},
	timeout=(self.timeout*2)/1000,
	)
	r.raise_for_status()
	if r.encoding is None:
	r.encoding = "utf-8"
	return json.loads(r.text)
	except requests.HTTPError as e:
	if r.text.startswith("Timeout"):
	raise TimeoutException(r.text)
	else:
	raise AnnotationException(r.text)
	except json.JSONDecodeError:
	raise AnnotationException(r.text)


	def read_corenlp_props(props_path):
	""" Read a Stanford CoreNLP properties file into a dict """
	props_dict = {}
	with open(props_path) as props_file:
	entry_lines = [entry_line for entry_line in props_file.read().split('\n')
	if entry_line.strip() and not entry_line.startswith('#')]
	for entry_line in entry_lines:
	k = entry_line.split('=')[0]
	k_len = len(k+"=")
	v = entry_line[k_len:]
	props_dict[k.strip()] = v
	return props_dict


	def write_corenlp_props(props_dict, file_path=None):
	""" Write a Stanford CoreNLP properties dict to a file """
	if file_path is None:
	file_path = f"corenlp_server-{uuid.uuid4().hex[:16]}.props"
	# confirm tmp file path matches pattern
	assert SERVER_PROPS_TMP_FILE_PATTERN.match(file_path)
	with open(file_path, 'w') as props_file:
	for k, v in props_dict.items():
	if isinstance(v, list):
	writeable_v = ",".join(v)
	else:
	writeable_v = v
	props_file.write(f'{k} = {writeable_v}\n\n')
	return file_path


	def regex_matches_to_indexed_words(matches):
	"""
	Transforms tokensregex and semgrex matches to indexed words.
	:param matches: unprocessed regex matches
	:return: flat array of indexed words
	"""
	words = [dict(v, **dict([('sentence', i)]))
	for i, s in enumerate(matches['sentences'])
	for k, v in s.items() if k != 'length']
	return words


	__all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"]