"""
USAGE is produced by the same people as SCARE.

USAGE has a German and English part.  This script parses the German part.
Run the script as
  process_usage_german.py in_directory out_directory short_name

Here, in_directory should be where USAGE was unpacked.  It will have the
documents, files, etc subdirectories.  The converted snippets are written
to out_directory/short_name.train.json

https://www.romanklinger.de/usagecorpus/
"""

import csv
import glob
import os
import sys

import stanza

from stanza.models.classifiers.data import SentimentDatum
import stanza.utils.datasets.sentiment.process_utils as process_utils

# Snippets with fewer tokens than this are counted as "short" in the summary
SHORT_SNIPPET_TOKENS = 4

# Maps the lowercased label found in the csv files to the integer
# which is passed to SentimentDatum
SENTIMENT_LABELS = {
    'negative': 0,
    'neutral': 1,
    'positive': 2,
}


def _parse_row(csv_filename, index, row):
    """
    Extract the snippet and its sentiment class from one row of a csv file.

    Columns 2 and 3 are read as the integers begin and end, column 4 is the
    snippet, and column 6 is the sentiment label.  The snippet is required
    to be exactly end - begin characters long.

    csv_filename and index are only used to build error messages.

    Returns (snippet, sentiment), where sentiment is the integer class for
    the label, or None if the label is 'unknown'.

    Raises ValueError if the row has too few columns, if the snippet length
    does not match the offsets, or if the label is not recognized.
    """
    if len(row) < 7:
        raise ValueError("Error found in {} line {}.  Expected at least 7 columns, got {}".format(csv_filename, index, len(row)))

    begin = int(row[2])
    end = int(row[3])
    snippet = row[4]
    label = row[6]
    if len(snippet) != end - begin:
        raise ValueError("Error found in {} line {}.  Expected {} got {}".format(csv_filename, index, (end-begin), len(snippet)))

    normalized = label.lower()
    if normalized == 'unknown':
        return snippet, None
    if normalized not in SENTIMENT_LABELS:
        raise ValueError("Tell John he screwed up and this is why he can't have Mox Opal: {}".format(label))
    return snippet, SENTIMENT_LABELS[normalized]


def main(in_directory, out_directory, short_name):
    """
    Convert the German csv files of USAGE into a single sentiment json file.

    Reads every file matching files/de*csv under in_directory, tokenizes
    each snippet with the German stanza tokenizer, and writes the resulting
    SentimentDatum list to out_directory/short_name.train.json

    Rows labeled 'unknown' are skipped.  out_directory is created if needed.

    Raises ValueError if a row is malformed (see _parse_row)
    """
    os.makedirs(out_directory, exist_ok=True)
    nlp = stanza.Pipeline('de', processors='tokenize')

    num_short_items = 0
    snippets = []
    # glob returns files in arbitrary order; sort so that the order of the
    # snippets in the output file is the same on every run
    csv_files = sorted(glob.glob(os.path.join(in_directory, "files/de*csv")))
    for csv_filename in csv_files:
        # The text is German, so do not rely on the platform default encoding.
        # NOTE(review): assumes the corpus files are UTF-8 - confirm
        with open(csv_filename, newline='', encoding='utf-8') as fin:
            cin = csv.reader(fin, delimiter='\t', quotechar=None)
            for index, line in enumerate(cin):
                # csv.reader yields an empty list for a blank line
                if not line:
                    continue
                snippet, sentiment = _parse_row(csv_filename, index, line)
                if sentiment is None:
                    continue

                doc = nlp(snippet)
                text = [token.text for sentence in doc.sentences for token in sentence.tokens]
                if len(text) < SHORT_SNIPPET_TOKENS:
                    num_short_items = num_short_items + 1
                snippets.append(SentimentDatum(sentiment, text))

    print("Total snippets found for USAGE: %d" % len(snippets))
    print("Snippets with fewer than %d tokens: %d" % (SHORT_SNIPPET_TOKENS, num_short_items))

    process_utils.write_list(os.path.join(out_directory, "%s.train.json" % short_name), snippets)


if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.exit("Usage: {} in_directory out_directory short_name".format(sys.argv[0]))

    in_directory = sys.argv[1]
    out_directory = sys.argv[2]
    short_name = sys.argv[3]

    main(in_directory, out_directory, short_name)