update distinct
Browse files- distinct.py +7 -7
distinct.py
CHANGED
|
@@ -146,27 +146,27 @@ class distinct(evaluate.Measurement):
|
|
| 146 |
# Running totals of all tokens / n-grams seen across every prediction.
total_tokens = []
total_tokens_2grams = []
total_tokens_3grams = []

for prediction in predictions:
    if tokenizer == "white_space":
        tokens = prediction.split(" ")
        # Materialize with list(): nltk's ngrams() returns a lazy generator,
        # so the original code exhausted it in set(...) below and then
        # extended the totals with an already-empty iterator, leaving
        # total_tokens_*grams empty and crashing the final division.
        # pad_left=True is also required for left_pad_symbol to take effect.
        tokens_2grams = list(ngrams(prediction.split(" "), 2, pad_left=True, left_pad_symbol='<s>'))
        tokens_3grams = list(ngrams(prediction.split(" "), 3, pad_left=True, left_pad_symbol='<s>'))
    else:
        try:
            tokens = list(tokenizer.tokenize(prediction))
            tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
            tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
        except Exception as e:
            raise e

    # Fold this prediction into the distinct sets and the running totals.
    # (distinct_tokens* are initialized above this visible span.)
    distinct_tokens = distinct_tokens | set(tokens)
    distinct_tokens_2grams = distinct_tokens_2grams | set(tokens_2grams)
    distinct_tokens_3grams = distinct_tokens_3grams | set(tokens_3grams)
    total_tokens.extend(tokens)
    total_tokens_2grams.extend(tokens_2grams)
    total_tokens_3grams.extend(tokens_3grams)

# Distinct-n = (# unique n-grams) / (# total n-grams) over all predictions.
Distinct_1 = len(distinct_tokens) / len(total_tokens)
Distinct_2 = len(distinct_tokens_2grams) / len(total_tokens_2grams)
Distinct_3 = len(distinct_tokens_3grams) / len(total_tokens_3grams)
| 146 |
# Accumulators for every token / n-gram produced across the predictions.
total_tokens = []
total_tokens_2grams = []
total_tokens_3grams = []

for prediction in predictions:
    # Tokenize either by naive whitespace splitting or via the supplied
    # tokenizer object; n-grams are left-padded with the '<s>' symbol.
    if tokenizer == "white_space":
        tokens = prediction.split(" ")
        tokens_2grams = list(ngrams(prediction.split(" "), 2, pad_left=True, left_pad_symbol='<s>'))
        tokens_3grams = list(ngrams(prediction.split(" "), 3, pad_left=True, left_pad_symbol='<s>'))
    else:
        try:
            tokens = list(tokenizer.tokenize(prediction))
            tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
            tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
        except Exception as e:
            raise e

    # Merge into the distinct-vocabulary sets and grow the totals.
    # (distinct_tokens* come from initialization above this visible span.)
    distinct_tokens = distinct_tokens.union(tokens)
    distinct_tokens_2grams = distinct_tokens_2grams.union(tokens_2grams)
    distinct_tokens_3grams = distinct_tokens_3grams.union(tokens_3grams)
    total_tokens += tokens
    total_tokens_2grams += list(tokens_2grams)
    total_tokens_3grams += list(tokens_3grams)

# Distinct-n ratio: unique n-grams divided by total n-grams emitted.
Distinct_1 = len(distinct_tokens) / len(total_tokens)
Distinct_2 = len(distinct_tokens_2grams) / len(total_tokens_2grams)
Distinct_3 = len(distinct_tokens_3grams) / len(total_tokens_3grams)