|
|
|
|
|
|
|
|
import re, json
|
|
|
import nltk
|
|
|
|
|
|
|
|
|
from core.DefaultPackages import openFile, saveFile
|
|
|
from nltk.corpus import stopwords
|
|
|
from nltk.corpus.reader.api import wordpunct_tokenize
|
|
|
from nltk.tokenize import word_tokenize
|
|
|
|
|
|
from wordsegment import load, segment
|
|
|
class cleanGenText():
    """Collection of text-cleaning helpers: punctuation, URL and HTML-tag
    removal, whitespace normalization, stop-word filtering, splitting of
    stuck-together words (via wordsegment), and DOI scrubbing."""

    def __init__(self):
        # Pre-load the wordsegment unigram/bigram data once so that
        # splitStickWords() does not pay the load cost per call.
        load()

    def removePunct(self, text, KeepPeriod=False):
        """Strip punctuation characters from *text*.

        KeepPeriod: when True, '.' characters are preserved.
        Returns the cleaned string (word chars and whitespace kept).
        """
        # \w = word characters, \s = whitespace; everything else is punctuation.
        punctuation = r'[^\w\s\.]' if KeepPeriod else r'[^\w\s]'
        return re.sub(punctuation, '', text)

    def removeURL(self, text):
        """Remove http(s)://... and www.... URLs from *text*."""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub('', text)

    def removeHTMLTag(self, text):
        """Remove HTML/XML tags such as <b> or </div> from *text*."""
        return re.sub(r'<.*?>', '', text)

    def removeTabWhiteSpaceNewLine(self, text):
        """Delete newlines and tabs and trim leading/trailing whitespace.

        Interior single spaces are kept; use removeExtraSpaceBetweenWords()
        to collapse runs of spaces.
        """
        # BUG FIX: the original computed text.replace("\n\n", "") and then
        # discarded it by re-reading `text`; a single "\n" pass covers both.
        cleanText = text.replace("\n", "")
        cleanText = cleanText.replace("\t", "")
        return cleanText.strip()

    def removeExtraSpaceBetweenWords(self, text):
        """Collapse every whitespace run to a single space and trim."""
        return re.sub(r'\s+', ' ', text).strip()

    def removeStopWords(self, text):
        """Tokenize *text* and return the list of non-stop-word tokens.

        Requires the NLTK 'stopwords' and 'punkt' data to be installed.
        Matching is case-insensitive; original token casing is preserved.
        """
        stopWords = set(stopwords.words('english'))
        return [word for word in word_tokenize(text)
                if word.lower() not in stopWords]

    def removeLowercaseBetweenUppercase(self, segment):
        """Collect runs of capitalized words from *segment*.

        Consecutive capitalized words are merged into one entry, e.g.
        "Hello World foo Bar" -> ["Hello World", "Bar"]; lowercase words
        are dropped.  Returns the list of merged capitalized runs.
        """
        outputUp = []
        segment = self.removeTabWhiteSpaceNewLine(segment)
        segments = segment.split(" ")
        for w, word in enumerate(segments):
            cleanWord = self.removeTabWhiteSpaceNewLine(self.removePunct(word))
            # BUG FIX: punctuation-only tokens clean to "" and made the
            # original raise IndexError on cleanWord[0]; skip them instead.
            if not cleanWord or not cleanWord[0].isupper():
                continue
            prevWord = segments[w - 1] if w > 0 else ""
            # Extend the current run only when the *raw* previous token also
            # starts uppercase (same test as the original) and a run exists.
            if prevWord and prevWord[0].isupper() and outputUp:
                outputUp[-1] += " " + cleanWord
            else:
                outputUp.append(cleanWord)
        return outputUp

    def textPreprocessing(self, text, keepPeriod=False):
        """Run the standard cleaning pipeline over *text*.

        keepPeriod: forwarded to removePunct() as KeepPeriod.
        Returns (cleanText, filteredWord): the cleaned string and the list
        of its non-stop-word tokens.
        """
        # BUG FIX: URLs and HTML tags must be removed *before* punctuation;
        # stripping '/', ':' and '<>' first left the URL and tag patterns
        # unable to match anything, making those steps no-ops.
        cleanText = self.removeURL(text)
        cleanText = self.removeHTMLTag(cleanText)
        cleanText = self.removePunct(cleanText, KeepPeriod=keepPeriod)
        cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
        filteredWord = self.removeStopWords(cleanText)
        return cleanText, filteredWord

    def splitStickWords(self, word):
        """Split a concatenated token (e.g. "helloworld") into words using
        the wordsegment model and return them space-joined.

        NOTE: wordsegment lowercases its output; the original case-restoring
        logic was already commented out and has been removed.
        """
        return " ".join(segment(word))

    def removeDOI(self, word, doiLink=None):
        """Blank out DOI references.

        If the literal substring "DOI" appears in *word*, the whole token is
        discarded (original intent of word.replace(word, "")).  If *doiLink*
        is given, its punctuation-free form is also removed from the
        segmented word.  Returns the cleaned string.
        """
        if "DOI" in word:
            word = ""
        if doiLink is not None:
            w = self.splitStickWords(word)
            cleanDOI = self.removePunct(doiLink)
            if cleanDOI in w:
                word = w.replace(cleanDOI, "")
        return word