import csv
import glob
import json
import os
import tempfile

from collections import namedtuple

from tqdm import tqdm

import stanza
from stanza.models.classifiers.data import SentimentDatum

Split = namedtuple('Split', ['filename', 'weight'])

SHARDS = ("train", "dev", "test")

def write_list(out_filename, dataset):
    """
    Write a list of items to the given output file

    Expected: list(SentimentDatum)
    """
    formatted_dataset = [line._asdict() for line in dataset]
    # Rather than write the dataset at once, we write one line at a time
    # Using `indent` puts each word on a separate line, which is rather noisy,
    # but not formatting at all makes one long line out of an entire dataset,
    # which is impossible to read
    #json.dump(formatted_dataset, fout, indent=2, ensure_ascii=False)

    with open(out_filename, 'w', encoding="utf-8") as fout:
        fout.write("[\n")
        for idx, line in enumerate(formatted_dataset):
            fout.write("  ")
            json.dump(line, fout, ensure_ascii=False)
            if idx < len(formatted_dataset) - 1:
                fout.write(",")
            fout.write("\n")
        fout.write("]\n")

def write_dataset(dataset, out_directory, dataset_name):
    """
    Write train, dev, test as .json files for a given dataset

    dataset: an iterable of three lists of SentimentDatum, in train/dev/test order
    """
    for shard, phrases in zip(SHARDS, dataset):
        output_file = os.path.join(out_directory, "%s.%s.json" % (dataset_name, shard))
        write_list(output_file, phrases)

def write_splits(out_directory, snippets, splits):
    """
    Write the given list of items to the split files in the specified output directory
    """
    total_weight = sum(split.weight for split in splits)
    divs = []
    subtotal = 0.0
    for split in splits:
        divs.append(int(len(snippets) * subtotal / total_weight))
        subtotal = subtotal + split.weight
    # the final boundary is exactly the full length, so rounding in the
    # arithmetic above can never drop the last few snippets
    divs.append(len(snippets))

    for i, split in enumerate(splits):
        filename = os.path.join(out_directory, split.filename)
        print("Writing {}:{} to {}".format(divs[i], divs[i+1], filename))
        write_list(filename, snippets[divs[i]:divs[i+1]])
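
# As a sketch of the arithmetic above: with 10 snippets and splits
# weighted (8, 1, 1), total_weight is 10 and the boundaries come out
# as divs = [0, 8, 9, 10], so the three files receive snippets[0:8],
# snippets[8:9], and snippets[9:10] respectively.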

def clean_tokenized_tweet(line):
    """
    Strip retweet markers, @mentions, hashtag signs, and URLs from a tokenized tweet
    """
    line = list(line)
    if not line:
        return line
    if len(line) > 3 and line[0] == 'RT' and line[1][0] == '@' and line[2] == ':':
        # "RT @user :" where the mention is a single token
        line = line[3:]
    elif len(line) > 4 and line[0] == 'RT' and line[1] == '@' and line[3] == ':':
        # "RT @ user :" where the @ was split into its own token
        line = line[4:]
    elif line[0][0] == '@':
        # tweet starts by mentioning @user
        line = line[1:]
    for i in range(len(line)):
        # keep mention and hashtag words, but drop the @ / # markers
        if line[i] and (line[i][0] == '@' or line[i][0] == '#'):
            line[i] = line[i][1:]
    line = [x for x in line if x and not x.startswith("http:") and not x.startswith("https:")]
    return line
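
# For instance (hypothetical tweet), the retweet
#   ['RT', '@user', ':', 'Great', '#day', 'https://t.co/x']
# is cleaned to
#   ['Great', 'day']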

def get_ptb_tokenized_phrases(dataset):
    """
    Use the PTB tokenizer to retokenize the phrases

    Requires Stanford CoreNLP (edu.stanford.nlp.process.PTBTokenizer) on the java classpath.

    Not clear which is better, "Nov." or "Nov ."
    strictAcronym=true makes it do the latter
    tokenizePerLine=true should make it only pay attention to one line at a time

    Phrases will be returned as lists of words rather than one string
    """
    with tempfile.TemporaryDirectory() as tempdir:
        phrase_filename = os.path.join(tempdir, "phrases.txt")
        with open(phrase_filename, "w", encoding="utf-8") as fout:
            for item in dataset:
                # extra newlines are so the tokenizer treats the lines
                # as separate sentences
                fout.write("%s\n\n\n" % (item.text))
        tok_filename = os.path.join(tempdir, "tokenized.txt")
        result = os.system('java edu.stanford.nlp.process.PTBTokenizer -options "strictAcronym=true,tokenizePerLine=true" -preserveLines %s > %s' % (phrase_filename, tok_filename))
        if result != 0:
            raise RuntimeError("PTBTokenizer failed - check that Stanford CoreNLP is on the java classpath")
        with open(tok_filename, encoding="utf-8") as fin:
            tokenized = fin.readlines()

    tokenized = [x.strip() for x in tokenized]
    tokenized = [x for x in tokenized if x]
    # zip() would silently truncate a mismatch, so check the counts first
    if len(tokenized) != len(dataset):
        raise ValueError("Expected %d tokenized phrases, got %d" % (len(dataset), len(tokenized)))
    phrases = [SentimentDatum(x.sentiment, y.split()) for x, y in zip(dataset, tokenized)]
    return phrases

def process_datum(nlp, text, mapping, sentiment):
    """
    Tokenize one text with the given pipeline and convert its raw sentiment label

    Returns a SentimentDatum with the cleaned, tokenized text
    """
    doc = nlp(text.strip())

    converted_sentiment = mapping.get(sentiment, None)
    if converted_sentiment is None:
        raise ValueError("Sentiment value {} not found in the mapping".format(sentiment))

    text = []
    for sentence in doc.sentences:
        text.extend(token.text for token in sentence.tokens)
    text = clean_tokenized_tweet(text)
    return SentimentDatum(converted_sentiment, text)
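
# As a sketch (hypothetical labels): with mapping = {'negative': 0, 'positive': 2}
# and an English tokenizer pipeline,
#   process_datum(nlp, "RT @user : loving this #movie", mapping, 'positive')
# would return something like SentimentDatum(2, ['loving', 'this', 'movie'])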

def read_snippets(csv_filename, sentiment_column, text_column, tokenizer_language, mapping, delimiter='\t', quotechar=None, skip_first_line=False, nlp=None, encoding="utf-8"):
    """
    Read in a single CSV file and return a list of SentimentDatums
    """
    if nlp is None:
        nlp = stanza.Pipeline(tokenizer_language, processors='tokenize')

    with open(csv_filename, newline='', encoding=encoding) as fin:
        if skip_first_line:
            next(fin)
        cin = csv.reader(fin, delimiter=delimiter, quotechar=quotechar)
        lines = list(cin)

    # Read in the data and parse it
    snippets = []
    for idx, line in enumerate(tqdm(lines)):
        try:
            if isinstance(sentiment_column, int):
                sentiment = line[sentiment_column].lower()
            else:
                sentiment = tuple([line[x] for x in sentiment_column])
        except IndexError as e:
            raise IndexError("Columns {} did not exist at line {}: {}".format(sentiment_column, idx, line)) from e
        text = line[text_column]
        try:
            datum = process_datum(nlp, text, mapping, sentiment)
        except ValueError as e:
            raise ValueError("Error at line {} of {}: {}".format(idx, csv_filename, e)) from e
        snippets.append(datum)
    return snippets
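
# A minimal end-to-end sketch (hypothetical paths, columns, and labels -
# the real values depend on the dataset being converted):
#
#   mapping = {'negative': "0", 'neutral': "1", 'positive': "2"}
#   snippets = read_snippets("extern_data/sentiment/example.tsv",
#                            sentiment_column=0, text_column=1,
#                            tokenizer_language="en", mapping=mapping,
#                            skip_first_line=True)
#   random.shuffle(snippets)    # shuffle before splitting (needs `import random`)
#   write_splits("data/sentiment", snippets,
#                splits=[Split("example.train.json", 0.8),
#                        Split("example.dev.json", 0.1),
#                        Split("example.test.json", 0.1)])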