stanza-digphil / stanza /utils /datasets /sentiment /process_MELD.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

2.41 kB

	"""
	MELD is a dataset of Friends (the TV show) utterances.

	The ratings include judgment based on the visuals, so it might be
	harder than expected to directly extract from the text. However, it
	should broaden the scope of the model and doesn't seem to hurt
	performance.

	https://github.com/SenticNet/MELD/tree/master/data/MELD

	https://github.com/SenticNet/MELD

	https://arxiv.org/pdf/1810.02508.pdf

	Files in the MELD repo are csv, with quotes in "..." if they contained commas themselves.

	Accordingly, we use the csv module to read the files and output them in the format
	<class> <sentence>

	Run using

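	python3 process_MELD.py <in_directory> <out_directory> <short_name>

	where <in_directory> contains train_sent_emo.csv, dev_sent_emo.csv, and test_sent_emo.csv,
	and the results are written to <out_directory>/<short_name>.<split>.json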

	"""

	import csv
	import os
	import sys

	from stanza.models.classifiers.data import SentimentDatum
	import stanza.utils.datasets.sentiment.process_utils as process_utils

	def get_phrases(in_filename):
	    """
	    Get the phrases from a single CSV filename

	    The first row is treated as a header and skipped.  In every other
	    row, column 4 holds the sentiment name and column 1 the utterance.
	    Sentiment names are mapped negative -> '0', neutral -> '1',
	    positive -> '2'.  Blank lines in the file are ignored.

	    Returns a list of SentimentDatum(sentiment, utterance) in file order.

	    Raises ValueError if a row has any other sentiment name.
	    """
	    sentiment_ids = {'negative': '0', 'neutral': '1', 'positive': '2'}

	    phrases = []
	    # newline='' lets the csv module handle line breaks inside quoted fields itself
	    with open(in_filename, newline='', encoding='windows-1252') as fin:
	        cin = csv.reader(fin, delimiter=',', quotechar='"')
	        # skip the header row; the default keeps an empty file from raising StopIteration
	        next(cin, None)
	        for line in cin:
	            if not line:
	                # csv.reader yields [] for a blank line (e.g. trailing newlines),
	                # which would otherwise fail on line[4] with an IndexError
	                continue
	            sentiment = sentiment_ids.get(line[4])
	            if sentiment is None:
	                raise ValueError("Unknown sentiment: {}".format(line[4]))
	            # NOTE(review): the stray "Â" is presumably an artifact of the
	            # windows-1252 decoding above - confirm against the raw data
	            utterance = line[1].replace("Â", "")
	            phrases.append(SentimentDatum(sentiment, utterance))
	    return phrases

	def get_tokenized_phrases(split, in_directory):
	    """
	    Read one MELD split and return its phrases after tokenization

	    split in train,dev,test.  The file read is
	    <in_directory>/<split>_sent_emo.csv, and the phrases from it are
	    passed through process_utils.get_ptb_tokenized_phrases.

	    The number of phrases found is printed before returning them.
	    """
	    csv_name = "%s_sent_emo.csv" % split
	    raw_phrases = get_phrases(os.path.join(in_directory, csv_name))
	    tokenized = process_utils.get_ptb_tokenized_phrases(raw_phrases)

	    summary = "Found {} phrases in MELD {}".format(len(tokenized), split)
	    print(summary)
	    return tokenized

	def main(in_directory, out_directory, short_name):
	    """
	    Convert the train, dev, and test splits found in in_directory

	    Each split is written with process_utils.write_list to
	    <out_directory>/<short_name>.<split>.json
	    out_directory is created first if it does not exist yet.
	    """
	    os.makedirs(out_directory, exist_ok=True)
	    for split in ("train", "dev", "test"):
	        split_phrases = get_tokenized_phrases(split, in_directory)
	        out_name = "%s.%s.json" % (short_name, split)
	        out_path = os.path.join(out_directory, out_name)
	        process_utils.write_list(out_path, split_phrases)

	if __name__ == '__main__':
	    # Positional command line arguments, in order:
	    #   input directory, output directory, short name used in the output filenames
	    cli_args = sys.argv
	    in_directory, out_directory, short_name = cli_args[1], cli_args[2], cli_args[3]
	    main(in_directory, out_directory, short_name)