| """ | |
| MELD is a dataset of Friends (the TV show) utterances. | |
| The ratings include judgment based on the visuals, so it might be | |
| harder than expected to directly extract from the text. However, it | |
| should broaden the scope of the model and doesn't seem to hurt | |
| performance. | |
| https://github.com/SenticNet/MELD/tree/master/data/MELD | |
| https://github.com/SenticNet/MELD | |
| https://arxiv.org/pdf/1810.02508.pdf | |
| Files in the MELD repo are csv, with quotes in "..." if they contained commas themselves. | |
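| Accordingly, we use the csv module to read the files and write each split to | |
| <out_directory>/<short_name>.<split>.json | |
| Run using | |
| python3 convert_MELD.py <in_directory> <out_directory> <short_name> | |
| where <in_directory> holds train_sent_emo.csv, dev_sent_emo.csv, and test_sent_emo.csv | |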
| """ | |
| import csv | |
| import os | |
| import sys | |
| from stanza.models.classifiers.data import SentimentDatum | |
| import stanza.utils.datasets.sentiment.process_utils as process_utils | |
def get_phrases(in_filename):
    """
    Get the phrases from a single CSV filename

    The first row is treated as a header and skipped.  For every other
    row, column 4 is read as the sentiment label and column 1 as the
    utterance.  Labels are mapped negative -> '0', neutral -> '1',
    positive -> '2'.

    Blank rows (which csv.reader reports as empty lists) are skipped
    instead of failing with an IndexError.

    Returns a list of SentimentDatum(sentiment, utterance).
    Raises ValueError if a row has any other sentiment label.
    """
    sentiment_ids = {'negative': '0', 'neutral': '1', 'positive': '2'}

    phrases = []
    with open(in_filename, newline='', encoding='windows-1252') as fin:
        cin = csv.reader(fin, delimiter=',', quotechar='"')
        # Skip the header row; the default keeps a completely empty file from raising
        next(cin, None)
        for line in cin:
            if not line:
                # blank line in the file, nothing to convert
                continue
            label = line[4]
            if label not in sentiment_ids:
                raise ValueError("Unknown sentiment: {}".format(label))
            # Drop stray "Â" characters from the utterance
            # (presumably an artifact of reading the file as windows-1252 - TODO confirm)
            utterance = line[1].replace("Â", "")
            phrases.append(SentimentDatum(sentiment_ids[label], utterance))
    return phrases
def get_tokenized_phrases(split, in_directory):
    """
    Read <in_directory>/<split>_sent_emo.csv and return its phrases after
    passing them through process_utils.get_ptb_tokenized_phrases

    split in train,dev,test

    Also prints how many phrases were found for the split.
    """
    csv_name = "%s_sent_emo.csv" % split
    raw_phrases = get_phrases(os.path.join(in_directory, csv_name))
    tokenized = process_utils.get_ptb_tokenized_phrases(raw_phrases)
    print("Found {} phrases in MELD {}".format(len(tokenized), split))
    return tokenized
def main(in_directory, out_directory, short_name):
    """
    Convert the train, dev, and test splits found in in_directory

    out_directory is created if it does not exist.  Each split is handed to
    process_utils.write_list with the path <out_directory>/<short_name>.<split>.json
    """
    os.makedirs(out_directory, exist_ok=True)
    for split in ("train", "dev", "test"):
        tokenized = get_tokenized_phrases(split, in_directory)
        out_name = "%s.%s.json" % (short_name, split)
        process_utils.write_list(os.path.join(out_directory, out_name), tokenized)
if __name__ == '__main__':
    # Three positional command line arguments, in order:
    # input directory, output directory, short name used in the output filenames
    main(in_directory=sys.argv[1],
         out_directory=sys.argv[2],
         short_name=sys.argv[3])