Spaces:

nam194
/

Review_company_analysis_and_Resume_parsing

Sleeping

nam194 committed on Jun 18, 2023

Commit

4d38a62

1 Parent(s): 916197c

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -1,8 +1,5 @@
-import jdk
-jdk.install('11', jre=True)
 from imports import *
 import unicodedata
-rdrsegmenter = VnCoreNLP("./vncorenlp_segmenter/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
 dict_map = {
     "òa": "oà",
     "Òa": "Oà",
@@ -59,7 +56,8 @@ def replace_all(text, dict_map=dict_map):
 def normalize(text, segment=True):
     text = replace_all(text, dict_map)
     if segment:
-        text = ' '.join([' '.join(sent) for sent in rdrsegmenter.tokenize(text)])
     return text
 def text_preprocess(document):
     punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
@@ -85,7 +83,8 @@ def text_preprocess(document):
     document = re.sub("   ", " ", document)
     document = re.sub("  ", " ", document)
     try:
-        document = ' '.join(rdrsegmenter.tokenize(document)[0])
     except:
         pass
     return document.lower()

 from imports import *
 import unicodedata
 dict_map = {
     "òa": "oà",
     "Òa": "Oà",
 def normalize(text, segment=True):
     """Apply the ``dict_map`` replacements to *text*, then optionally word-segment it.

     Args:
         text: Input string to normalize.
         segment: When true, the text is split on ``"."``, each piece is passed
             through ``underthesea.word_tokenize(..., format="text")``, and the
             pieces are re-joined with ``". "``.

     Returns:
         The normalized (and, if requested, segmented) string.
     """
     text = replace_all(text, dict_map)
     if segment:
         # NOTE(review): `underthesea` is not imported in the visible part of this
         # file; presumably it is provided by `from imports import *` - confirm.
         sentences = text.split(".")
         # Fixed: the original expression had a stray ")" after the loop variable,
         # which closed the join() call before the list and made the whole module
         # fail to import with a SyntaxError.
         text = ". ".join(
             [underthesea.word_tokenize(sentence, format="text") for sentence in sentences]
         )
     return text
 def text_preprocess(document):
     punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
     document = re.sub("   ", " ", document)
     document = re.sub("  ", " ", document)
     try:
+        document = document.split(".")
+        document = ". ".join([underthesea.word_tokenize(i, format="text") for i in document)])
     except:
         pass
     return document.lower()