Update utils.py
Browse files
utils.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
-
import jdk
|
| 2 |
-
jdk.install('11', jre=True)
|
| 3 |
from imports import *
|
| 4 |
import unicodedata
|
| 5 |
-
rdrsegmenter = VnCoreNLP("./vncorenlp_segmenter/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
|
| 6 |
dict_map = {
|
| 7 |
"òa": "oà",
|
| 8 |
"Òa": "Oà",
|
|
@@ -59,7 +56,8 @@ def replace_all(text, dict_map=dict_map):
|
|
| 59 |
def normalize(text, segment=True):
|
| 60 |
text = replace_all(text, dict_map)
|
| 61 |
if segment:
|
| 62 |
-
text =
|
|
|
|
| 63 |
return text
|
| 64 |
def text_preprocess(document):
|
| 65 |
punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
|
|
@@ -85,7 +83,8 @@ def text_preprocess(document):
|
|
| 85 |
document = re.sub(" ", " ", document)
|
| 86 |
document = re.sub(" ", " ", document)
|
| 87 |
try:
|
| 88 |
-
document =
|
|
|
|
| 89 |
except:
|
| 90 |
pass
|
| 91 |
return document.lower()
|
|
|
|
|
|
|
|
|
|
| 1 |
from imports import *
|
| 2 |
import unicodedata
|
|
|
|
| 3 |
dict_map = {
|
| 4 |
"òa": "oà",
|
| 5 |
"Òa": "Oà",
|
|
|
|
| 56 |
def normalize(text, segment=True):
    """Apply the ``dict_map`` replacements to *text* and optionally word-segment it.

    Args:
        text: Input string to normalize.
        segment: When true, the replaced text is split on ".", every piece is
            passed through ``underthesea.word_tokenize(..., format="text")``,
            and the pieces are joined back together with ". ".

    Returns:
        The processed string.
    """
    text = replace_all(text, dict_map)
    if segment:
        # NOTE(review): `underthesea` is not imported explicitly in the visible
        # part of the file; presumably it arrives via `from imports import *`
        # -- confirm.
        pieces = text.split(".")
        text = ". ".join(
            [underthesea.word_tokenize(piece, format="text") for piece in pieces]
        )
    return text
|
| 62 |
def text_preprocess(document):
|
| 63 |
punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
|
|
|
|
| 83 |
document = re.sub(" ", " ", document)
|
| 84 |
document = re.sub(" ", " ", document)
|
| 85 |
try:
    # Word-segment each "."-delimited piece on its own, then join the pieces
    # back together with ". ".
    # The split result is kept under a separate name so that `document` is
    # still a string if tokenization fails and the error is swallowed below;
    # the code that follows calls `document.lower()`.
    # NOTE(review): `underthesea` is presumably provided by
    # `from imports import *` -- confirm.
    sentences = document.split(".")
    document = ". ".join(
        [underthesea.word_tokenize(sentence, format="text") for sentence in sentences]
    )
except Exception:
    # Best effort: on any tokenizer failure keep the unsegmented text.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    pass
|
| 90 |
return document.lower()
|