Update TrueCaser
Browse files
- TrueCaser.py +8 -1
TrueCaser.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import math
|
| 2 |
import pickle
|
|
|
|
| 3 |
import string
|
| 4 |
|
| 5 |
from nltk.tokenize import word_tokenize
|
|
@@ -79,10 +80,16 @@ class TrueCaser(object):
|
|
| 79 |
def first_token_case(raw):
    """Return ``raw`` with sentence-initial casing applied.

    Delegates to ``str.capitalize``: the first character is capitalized and
    every remaining character is lower-cased.

    Args:
        raw: The token text to re-case.

    Returns:
        The capitalized form of ``raw``.
    """
    # NOTE(review): there is no ``self`` parameter; presumably a
    # ``@staticmethod`` decorator sits just above this hunk -- confirm.
    capitalized = raw.capitalize()
    return capitalized
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
|
| 83 |
tokens = word_tokenize(sentence)
|
| 84 |
tokens_true_case = self.get_true_case_from_tokens(tokens, out_of_vocabulary_token_option)
|
| 85 |
-
|
|
|
|
|
|
|
| 86 |
|
| 87 |
def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="title"):
|
| 88 |
tokens_true_case = []
|
|
|
|
| 1 |
import math
|
| 2 |
import pickle
|
| 3 |
+
import re
|
| 4 |
import string
|
| 5 |
|
| 6 |
from nltk.tokenize import word_tokenize
|
|
|
|
| 80 |
def first_token_case(raw):
    """Return ``raw`` with sentence-initial casing applied.

    Delegates to ``str.capitalize``: the first character is capitalized and
    every remaining character is lower-cased.

    Args:
        raw: The token text to re-case.

    Returns:
        The capitalized form of ``raw``.
    """
    # NOTE(review): there is no ``self`` parameter; presumably a
    # ``@staticmethod`` decorator sits just above this hunk -- confirm.
    capitalized = raw.capitalize()
    return capitalized
|
| 82 |
|
| 83 |
+
@staticmethod
def upper_replacement(match):
    """Build the replacement text for a single regex match.

    Intended as the ``repl`` callable for ``re.sub``: whatever was matched is
    replaced by a period, one space, and the upper-cased final character of
    the match.

    Args:
        match: The ``re.Match`` object for the span being replaced.

    Returns:
        ``'. '`` followed by the last matched character in upper case.
    """
    matched_text = match.group(0)
    # Only the trailing character survives; everything before it collapses
    # into the fixed ". " prefix.
    final_char = matched_text[-1]
    return '. ' + final_char.upper()
|
| 86 |
+
|
| 87 |
def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
    """Return ``sentence`` re-cased and joined back into one string.

    The sentence is split with ``word_tokenize``, the tokens are re-cased by
    ``get_true_case_from_tokens`` and joined again through
    ``self.detknzr.detokenize``. Afterwards every " . x" sequence in the
    joined text is rewritten via ``upper_replacement`` to ". X": the space in
    front of the period is dropped and the character after it is upper-cased.

    Args:
        sentence: The text to re-case.
        out_of_vocabulary_token_option: Forwarded unchanged to
            ``get_true_case_from_tokens``.

    Returns:
        The detokenized, re-cased text.
    """
    cased_tokens = self.get_true_case_from_tokens(
        word_tokenize(sentence), out_of_vocabulary_token_option
    )
    detokenized = self.detknzr.detokenize(cased_tokens)
    # Periods still standing alone between two spaces after detokenizing are
    # glued to the preceding word and the next character is capitalized.
    return re.sub(r' \. .', self.upper_replacement, detokenized)
|
| 93 |
|
| 94 |
def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="title"):
|
| 95 |
tokens_true_case = []
|