stanza-digphil / stanza /utils /datasets /sentiment /process_usage_german.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

2.55 kB

	"""
	USAGE is produced by the same people as SCARE.

	USAGE has a German and English part. This script parses the German part.
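	Run the script as
	process_usage_german.py in_directory out_directory short_name

	Here, in_directory should be where USAGE was unpacked. It will have the
	documents, files, etc subdirectories. The converted data is written to
	out_directory (created if needed) as <short_name>.train.json.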

	https://www.romanklinger.de/usagecorpus/
	"""

	import csv
	import glob
	import os
	import sys

	import stanza

	from stanza.models.classifiers.data import SentimentDatum
	import stanza.utils.datasets.sentiment.process_utils as process_utils

	# Maps the corpus' textual polarity labels to the integer classes stored in
	# each SentimentDatum.  'unknown' is deliberately absent: such rows are skipped.
	_SENTIMENT_CLASSES = {'negative': 0, 'neutral': 1, 'positive': 2}


	def _read_rows(csv_filename):
	    """Return every row of one tab-separated annotation file as a list of strings."""
	    # Decode explicitly as UTF-8 so German umlauts do not depend on the locale;
	    # a mis-decoded snippet would also trip the length check in main().
	    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
	    with open(csv_filename, newline='', encoding='utf-8') as fin:
	        return list(csv.reader(fin, delimiter='\t', quotechar=None))


	def main(in_directory, out_directory, short_name):
	    """
	    Convert the German USAGE annotation files into a single sentiment training file.

	    Every file matching files/de*csv under in_directory is read as a
	    tab-separated table.  Columns 2, 3, 4 and 6 of each row are used as
	    begin, end, snippet and sentiment label.  Rows labeled 'unknown' are
	    skipped; the other snippets are tokenized with the German stanza
	    tokenizer and collected as SentimentDatum items (0 = negative,
	    1 = neutral, 2 = positive).

	    in_directory: directory which contains the "files" subdirectory
	    out_directory: directory to write to; created if it does not exist
	    short_name: prefix of the output file, <short_name>.train.json

	    Raises ValueError if a snippet's length is not end - begin, or if a
	    sentiment label is not one of unknown/positive/neutral/negative.
	    """
	    os.makedirs(out_directory, exist_ok=True)
	    nlp = stanza.Pipeline('de', processors='tokenize')

	    snippets = []
	    # glob returns files in arbitrary order; sort so the output is reproducible
	    csv_files = sorted(glob.glob(os.path.join(in_directory, "files/de*csv")))
	    for csv_filename in csv_files:
	        # the file is fully read and closed before the tokenizer is run
	        for index, line in enumerate(_read_rows(csv_filename)):
	            begin, end, snippet, sentiment = [line[i] for i in [2, 3, 4, 6]]
	            begin = int(begin)
	            end = int(end)
	            # sanity check: the recorded span must be exactly as long as the snippet
	            if len(snippet) != end - begin:
	                raise ValueError("Error found in {} line {}. Expected {} got {}".format(csv_filename, index, (end-begin), len(snippet)))

	            label = sentiment.lower()
	            if label == 'unknown':
	                continue
	            if label not in _SENTIMENT_CLASSES:
	                raise ValueError("Tell John he screwed up and this is why he can't have Mox Opal: {}".format(sentiment))

	            doc = nlp(snippet)
	            # flatten the tokens of all sentences into one list of words
	            text = [token.text for sentence in doc.sentences for token in sentence.tokens]
	            snippets.append(SentimentDatum(_SENTIMENT_CLASSES[label], text))

	    print("Total snippets found for USAGE: %d" % len(snippets))

	    process_utils.write_list(os.path.join(out_directory, "%s.train.json" % short_name), snippets)

	if __name__ == '__main__':
	    # Three positional arguments are required: where USAGE was unpacked,
	    # where to write the result, and the prefix of the output file name.
	    # Exit with a usage message rather than a bare IndexError when any is
	    # missing.  Extra arguments are ignored, as before.
	    if len(sys.argv) < 4:
	        sys.exit("Usage: {} in_directory out_directory short_name".format(sys.argv[0]))

	    in_directory, out_directory, short_name = sys.argv[1:4]

	    main(in_directory, out_directory, short_name)