Spaces:
Running
Running
Update NER/word2Vec/word2vec.py
Browse files- NER/word2Vec/word2vec.py +102 -541
NER/word2Vec/word2vec.py
CHANGED
|
@@ -1,374 +1,3 @@
|
|
| 1 |
-
<<<<<<< HEAD
|
| 2 |
-
'''WORD TO VECTOR'''
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import json
|
| 5 |
-
import gensim
|
| 6 |
-
import spacy
|
| 7 |
-
from DefaultPackages import openFile, saveFile
|
| 8 |
-
from NER import cleanText
|
| 9 |
-
from gensim.models.keyedvectors import KeyedVectors
|
| 10 |
-
from gensim.test.utils import common_texts
|
| 11 |
-
from gensim.models.word2vec import Word2Vec
|
| 12 |
-
from gensim.scripts.glove2word2vec import glove2word2vec
|
| 13 |
-
from gensim.test.utils import datapath, get_tmpfile
|
| 14 |
-
import sys
|
| 15 |
-
import subprocess
|
| 16 |
-
# can try multiprocessing to run quicker
|
| 17 |
-
import multiprocessing
|
| 18 |
-
import copy
|
| 19 |
-
sys.setrecursionlimit(1000)
|
| 20 |
-
# creat folder word2Vec
|
| 21 |
-
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
|
| 22 |
-
# create word2vec model
|
| 23 |
-
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
|
| 24 |
-
'''Some notes for this model
|
| 25 |
-
sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
|
| 26 |
-
a similar word to the word we are finding, so can we try to preprocess text so that
|
| 27 |
-
we make the corpus more effective and only contains the important words. Then when we
|
| 28 |
-
train the model, the important words will be seen as important. Or
|
| 29 |
-
when we already have the similar list of words, we can remove the words in there
|
| 30 |
-
that are stopwords/unnecessary words.'''
|
| 31 |
-
### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
|
| 32 |
-
class word2Vec():
|
| 33 |
-
def __init__(self, nameFile=None, modelName=None):
|
| 34 |
-
self.nameFile = nameFile
|
| 35 |
-
self.modelName = modelName
|
| 36 |
-
def spacy_similarity(self, word):
|
| 37 |
-
# when use word2vec, try medium or large is better
|
| 38 |
-
# maybe try odc similarity?
|
| 39 |
-
nlp = spacy.load("en_core_web_lg")
|
| 40 |
-
doc = nlp(word)
|
| 41 |
-
for token1 in doc:
|
| 42 |
-
for token2 in doc:
|
| 43 |
-
print(token1.text, token2.text, token1.similarity(token2))
|
| 44 |
-
pass
|
| 45 |
-
# clean text before transform to corpus
|
| 46 |
-
def cleanTextBeforeCorpus(self,oriText, doi=None):
|
| 47 |
-
cl = cleanText.cleanGenText()
|
| 48 |
-
#cl = cleanGenText()
|
| 49 |
-
output = ""
|
| 50 |
-
alreadyRemoveDoi = False
|
| 51 |
-
for word in oriText.split(" "):
|
| 52 |
-
# remove DOI
|
| 53 |
-
if doi != None and doi in oriText:
|
| 54 |
-
if alreadyRemoveDoi == False:
|
| 55 |
-
newWord = cl.removeDOI(word,doi)
|
| 56 |
-
if len(newWord) > 0 and newWord != word:
|
| 57 |
-
alreadyRemoveDoi = True
|
| 58 |
-
word = newWord
|
| 59 |
-
# remove punctuation
|
| 60 |
-
# split the sticked words
|
| 61 |
-
#word = cl.splitStickWords(word)
|
| 62 |
-
# remove punctuation
|
| 63 |
-
word = cl.removePunct(word,True)
|
| 64 |
-
# remove URL
|
| 65 |
-
word = cl.removeURL(word)
|
| 66 |
-
# remove HTMLTag
|
| 67 |
-
word = cl.removeHTMLTag(word)
|
| 68 |
-
# remove tab, white space, newline
|
| 69 |
-
word = cl.removeTabWhiteSpaceNewLine(word)
|
| 70 |
-
# optional: remove stopwords
|
| 71 |
-
#word = cl.removeStopWords(word)
|
| 72 |
-
if len(word)>0:
|
| 73 |
-
output += word + " "
|
| 74 |
-
return output
|
| 75 |
-
def cleanAllTextBeforeCorpus(self, allText, doi=None):
|
| 76 |
-
cleanOutput = ""
|
| 77 |
-
remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
|
| 78 |
-
if len(allText) > 0:
|
| 79 |
-
corpusText = allText
|
| 80 |
-
for pos in range(len(corpusText.split("\n\n"))):
|
| 81 |
-
if len(corpusText.split("\n\n")[pos]) > 0:
|
| 82 |
-
lines = corpusText.split("\n\n")[pos]
|
| 83 |
-
for line in lines.split("\n"):
|
| 84 |
-
if remove in line: line = line.replace(remove, "")
|
| 85 |
-
clean_text = self.cleanTextBeforeCorpus(line, doi)
|
| 86 |
-
cleanOutput += clean_text + "\n"
|
| 87 |
-
cleanOutput += "\n\n"
|
| 88 |
-
return cleanOutput
|
| 89 |
-
def tableTransformToCorpusText(self, df, excelFile=None):
|
| 90 |
-
# PDF, Excel, WordDoc
|
| 91 |
-
#cl = cleanText.cleanGenText()
|
| 92 |
-
corpus = {}
|
| 93 |
-
# PDF or df
|
| 94 |
-
if excelFile == None:
|
| 95 |
-
if len(df) > 0:
|
| 96 |
-
try:
|
| 97 |
-
for i in range(len(df)):
|
| 98 |
-
# each new dimension/page is considered to be a sentence which ends with the period.
|
| 99 |
-
# each new line is a new list, and each new df is a new corpus
|
| 100 |
-
outputDF = []
|
| 101 |
-
text = df[i].values.tolist()
|
| 102 |
-
if len(text) > 0:
|
| 103 |
-
outputRowDF = self.helperRowTableToCorpus(text)
|
| 104 |
-
#outputColDF = self.helperColTableToCorpus(text)
|
| 105 |
-
outputDF.extend(outputRowDF)
|
| 106 |
-
#outputDF.extend(outputColDF)
|
| 107 |
-
if len(outputDF) > 0:
|
| 108 |
-
corpus["corpus" + str(i)] = outputDF
|
| 109 |
-
except:
|
| 110 |
-
outputDF = []
|
| 111 |
-
text = df.values.tolist()
|
| 112 |
-
if len(text) > 0:
|
| 113 |
-
outputRowDF = self.helperRowTableToCorpus(text)
|
| 114 |
-
#outputColDF = self.helperColTableToCorpus(text)
|
| 115 |
-
outputDF.extend(outputRowDF)
|
| 116 |
-
#outputDF.extend(outputColDF)
|
| 117 |
-
if len(outputDF) > 0:
|
| 118 |
-
corpus["corpus0"] = outputDF
|
| 119 |
-
else:
|
| 120 |
-
try:
|
| 121 |
-
df = pd.ExcelFile(excelFile)
|
| 122 |
-
except:
|
| 123 |
-
if filepath.endswith('.xls'):
|
| 124 |
-
df = pd.read_excel(filepath, engine='xlrd')
|
| 125 |
-
else:
|
| 126 |
-
df = pd.read_excel(filepath, engine='openpyxl')
|
| 127 |
-
sheetNames = df.sheet_names
|
| 128 |
-
output = []
|
| 129 |
-
if len(sheetNames) > 0:
|
| 130 |
-
for s in range(len(sheetNames)):
|
| 131 |
-
outputDF = []
|
| 132 |
-
with pd.ExcelFile(excelFile) as xls:
|
| 133 |
-
data = pd.read_excel(xls, sheetNames[s])
|
| 134 |
-
if sheetNames[s] != 'Evaluation Warning':
|
| 135 |
-
text = data.values.tolist()
|
| 136 |
-
if len(text) > 0:
|
| 137 |
-
outputRowDF = self.helperRowTableToCorpus(text)
|
| 138 |
-
#outputColDF = self.helperColTableToCorpus(text)
|
| 139 |
-
outputDF.extend(outputRowDF)
|
| 140 |
-
#outputDF.extend(outputColDF)
|
| 141 |
-
if len(outputDF) > 0:
|
| 142 |
-
corpus["corpus" + str(s)] = outputDF
|
| 143 |
-
return corpus
|
| 144 |
-
def helperRowTableToCorpus(self, textList):
|
| 145 |
-
#cl = cleanGenText()
|
| 146 |
-
cl = cleanText.cleanGenText()
|
| 147 |
-
stopWords = ["NaN","Unnamed:","nan"]
|
| 148 |
-
outputDF = []
|
| 149 |
-
for line in textList:
|
| 150 |
-
outputLine = []
|
| 151 |
-
for words in line:
|
| 152 |
-
words = str(words)
|
| 153 |
-
if len(words) > 0:
|
| 154 |
-
for word in words.split(" "):
|
| 155 |
-
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
| 156 |
-
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
| 157 |
-
#word = cl.splitStickWords(word)
|
| 158 |
-
word = cl.removePunct(word)
|
| 159 |
-
word = " ".join(cl.removeStopWords(word))
|
| 160 |
-
word = cl.removeTabWhiteSpaceNewLine(word)
|
| 161 |
-
if len(word) > 1:
|
| 162 |
-
if len(word.split(" ")) > 1:
|
| 163 |
-
for x in word.split(" "):
|
| 164 |
-
if len(x) > 1 and x.isnumeric()==False:
|
| 165 |
-
outputLine.append(x.lower())
|
| 166 |
-
else:
|
| 167 |
-
if word.isnumeric() == False:
|
| 168 |
-
outputLine.append(word.lower())
|
| 169 |
-
if len(outputLine) > 0:
|
| 170 |
-
outputDF.append(outputLine)
|
| 171 |
-
return outputDF
|
| 172 |
-
def helperColTableToCorpus(self, dfList):
|
| 173 |
-
#cl = cleanGenText()
|
| 174 |
-
cl = cleanText.cleanGenText()
|
| 175 |
-
stopWords = ["NaN","Unnamed:","nan"]
|
| 176 |
-
outputDF = []
|
| 177 |
-
# use the first length line as the column ref
|
| 178 |
-
for pos in range(len(dfList[0])):
|
| 179 |
-
outputLine = []
|
| 180 |
-
for line in dfList:
|
| 181 |
-
if pos < len(line):
|
| 182 |
-
words = line[pos]
|
| 183 |
-
words = str(words)
|
| 184 |
-
else: words = ""
|
| 185 |
-
if len(words) > 0:
|
| 186 |
-
for word in words.split(" "):
|
| 187 |
-
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
| 188 |
-
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
| 189 |
-
#word = cl.splitStickWords(word)
|
| 190 |
-
word = cl.removePunct(word)
|
| 191 |
-
word = " ".join(cl.removeStopWords(word))
|
| 192 |
-
word = cl.removeTabWhiteSpaceNewLine(word)
|
| 193 |
-
if len(word) > 1:
|
| 194 |
-
if len(word.split(" ")) > 1:
|
| 195 |
-
for x in word.split(" "):
|
| 196 |
-
if len(x) > 1 and x.isnumeric()==False:
|
| 197 |
-
outputLine.append(x.lower())
|
| 198 |
-
else:
|
| 199 |
-
if word.isnumeric() == False:
|
| 200 |
-
outputLine.append(word.lower())
|
| 201 |
-
if len(outputLine) > 0:
|
| 202 |
-
outputDF.append(outputLine)
|
| 203 |
-
return outputDF
|
| 204 |
-
# create a corpus
|
| 205 |
-
def createCorpusText(self, corpusText):
|
| 206 |
-
'''ex: "Tom is cat. Jerry is mouse."
|
| 207 |
-
corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
|
| 208 |
-
# the output should be like this:
|
| 209 |
-
'''texts = {
|
| 210 |
-
"Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
|
| 211 |
-
"Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
|
| 212 |
-
}
|
| 213 |
-
'''
|
| 214 |
-
# separate paragraph
|
| 215 |
-
'''Ex: Cat is an animal. Tom is cat.
|
| 216 |
-
|
| 217 |
-
Mouse is an animal.
|
| 218 |
-
Jerry is mouse.'''
|
| 219 |
-
texts = {}
|
| 220 |
-
cl = cleanText.cleanGenText()
|
| 221 |
-
#cl = cleanGenText()
|
| 222 |
-
for pos in range(len(corpusText.split("\n\n"))):
|
| 223 |
-
if len(corpusText.split("\n\n")[pos]) > 0:
|
| 224 |
-
texts["Paragraph "+str(pos)] = []
|
| 225 |
-
lines = corpusText.split("\n\n")[pos]
|
| 226 |
-
for line in lines.split("\n"):
|
| 227 |
-
for l in line.split("."):
|
| 228 |
-
if len(l) > 0:
|
| 229 |
-
cl.removeTabWhiteSpaceNewLine(l)
|
| 230 |
-
l = l.lower()
|
| 231 |
-
newL = []
|
| 232 |
-
for word in l.split(" "):
|
| 233 |
-
if len(word) > 0:
|
| 234 |
-
word = cl.removeStopWords(word)
|
| 235 |
-
for w in word:
|
| 236 |
-
if len(w) > 0 and w.isnumeric()==False:
|
| 237 |
-
newL.append(w)
|
| 238 |
-
if len(newL)>0:
|
| 239 |
-
texts["Paragraph "+str(pos)].append(newL)
|
| 240 |
-
if len(texts["Paragraph "+str(pos)]) == 0:
|
| 241 |
-
del texts["Paragraph "+str(pos)]
|
| 242 |
-
return texts
|
| 243 |
-
def selectParaForWC(self,corpus):
|
| 244 |
-
''' corpus should be in the format:
|
| 245 |
-
corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
|
| 246 |
-
corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
|
| 247 |
-
corSize = len(corpus)
|
| 248 |
-
# less than 2000
|
| 249 |
-
if 0 < corSize < 2000:
|
| 250 |
-
window=3.5
|
| 251 |
-
vector_size=75
|
| 252 |
-
sample=1e-3
|
| 253 |
-
negative=10
|
| 254 |
-
epochs=10
|
| 255 |
-
sg=1
|
| 256 |
-
# 2000 - 100000
|
| 257 |
-
elif 2000 <= corSize < 100000:
|
| 258 |
-
window=3.5
|
| 259 |
-
vector_size=75
|
| 260 |
-
sample=1e-5
|
| 261 |
-
negative=10
|
| 262 |
-
epochs=10
|
| 263 |
-
sg=1
|
| 264 |
-
elif 100000 <=corSize < 1000000:
|
| 265 |
-
window=7.5
|
| 266 |
-
vector_size=150
|
| 267 |
-
sample=1e-5
|
| 268 |
-
negative=10
|
| 269 |
-
epochs=6
|
| 270 |
-
sg=0
|
| 271 |
-
return window, vector_size, sample, negative, epochs, sg
|
| 272 |
-
def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
|
| 273 |
-
vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
|
| 274 |
-
# if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
|
| 275 |
-
jsonFile = ""
|
| 276 |
-
jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
|
| 277 |
-
cores = multiprocessing.cpu_count()
|
| 278 |
-
combinedCorpus = []
|
| 279 |
-
window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
|
| 280 |
-
if len(jsonFile) > 0:
|
| 281 |
-
for key in jsonFile:
|
| 282 |
-
combinedCorpus.extend(jsonFile[key])
|
| 283 |
-
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
|
| 284 |
-
# # min_count=1 ensures all words are included
|
| 285 |
-
'''w2vModel = Word2Vec(
|
| 286 |
-
min_count=1,
|
| 287 |
-
window=window,
|
| 288 |
-
vector_size=vector_size,
|
| 289 |
-
sample=sample,
|
| 290 |
-
alpha=0.03,
|
| 291 |
-
min_alpha=0.0007,
|
| 292 |
-
negative=negative,
|
| 293 |
-
workers=cores-1,
|
| 294 |
-
epochs = epochs,
|
| 295 |
-
sg=sg)'''
|
| 296 |
-
#w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
|
| 297 |
-
accept = False
|
| 298 |
-
while not accept:
|
| 299 |
-
if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
|
| 300 |
-
try:
|
| 301 |
-
w2vModel = Word2Vec(
|
| 302 |
-
min_count=1,
|
| 303 |
-
window=window,
|
| 304 |
-
vector_size=vector_size,
|
| 305 |
-
sample=sample,
|
| 306 |
-
alpha=0.03,
|
| 307 |
-
min_alpha=0.0007,
|
| 308 |
-
negative=negative,
|
| 309 |
-
workers=cores-1,
|
| 310 |
-
epochs = epochs,
|
| 311 |
-
sg=sg)
|
| 312 |
-
w2vModel.build_vocab(combinedCorpus)
|
| 313 |
-
w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
|
| 314 |
-
accept = True
|
| 315 |
-
except:
|
| 316 |
-
for key in jsonFile:
|
| 317 |
-
combinedCorpus.extend(jsonFile[key])
|
| 318 |
-
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
|
| 319 |
-
print("next is " + str(len(combinedCorpus)))
|
| 320 |
-
else:
|
| 321 |
-
print("no parameter to train")
|
| 322 |
-
break
|
| 323 |
-
#w2vModel.build_vocab(combinedCorpus)
|
| 324 |
-
#w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
|
| 325 |
-
#w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
|
| 326 |
-
#w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
|
| 327 |
-
w2vModel.save(saveFolder+"/"+modelName+".model")
|
| 328 |
-
w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
|
| 329 |
-
print("done w2v")
|
| 330 |
-
else: print("no corpus to train")
|
| 331 |
-
#return combinedCorpus
|
| 332 |
-
def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
|
| 333 |
-
# might not be a meaningful keyword
|
| 334 |
-
#stopWords = ["show"]
|
| 335 |
-
# same word but just plural nouns, tense
|
| 336 |
-
simWords = [word+"s",word+"es",word+"ing",word+"ed"]
|
| 337 |
-
model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
|
| 338 |
-
results = model.most_similar(positive=[word],topn=n)
|
| 339 |
-
#removeIndex = []
|
| 340 |
-
#currN = copy.deepcopy(n)
|
| 341 |
-
'''for r in range(len(results)):
|
| 342 |
-
if len(results[r][0]) < 2:
|
| 343 |
-
removeIndex.append(results[r])
|
| 344 |
-
# remove the same word but just plural and singular noun and lower than the cos_thres
|
| 345 |
-
elif results[r][0] == word:
|
| 346 |
-
removeIndex.append(results[r])
|
| 347 |
-
elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
|
| 348 |
-
removeIndex.append(results[r])
|
| 349 |
-
for rem in removeIndex:
|
| 350 |
-
results.remove(rem)
|
| 351 |
-
while len(results)!=n and len(results) != 0:
|
| 352 |
-
moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
|
| 353 |
-
if moreNewResult not in results and len(moreNewResult[0])>1:
|
| 354 |
-
if moreNewResult[0] not in stopWords and results[0] != word:
|
| 355 |
-
results.append(moreNewResult)
|
| 356 |
-
currN +=1'''
|
| 357 |
-
return results
|
| 358 |
-
# adding our model into spacy
|
| 359 |
-
# this deals with command line; but instead of using it, we write python script to run command line
|
| 360 |
-
def loadWordVec(self,modelName,wordVec):
|
| 361 |
-
# modelName is the name you want to save into spacy
|
| 362 |
-
# wordVec is the trained word2vec in txt format
|
| 363 |
-
subprocess.run([sys.executable,
|
| 364 |
-
"-m",
|
| 365 |
-
"spacy",
|
| 366 |
-
"init-model",
|
| 367 |
-
"en",
|
| 368 |
-
modelName, # this modelName comes from the saved modelName of function trainWord2Vec
|
| 369 |
-
"--vectors-loc",
|
| 370 |
-
wordVec])
|
| 371 |
-
=======
|
| 372 |
'''WORD TO VECTOR'''
|
| 373 |
import pandas as pd
|
| 374 |
import json
|
|
@@ -381,11 +10,8 @@ from gensim.test.utils import common_texts
|
|
| 381 |
from gensim.models.word2vec import Word2Vec
|
| 382 |
from gensim.scripts.glove2word2vec import glove2word2vec
|
| 383 |
from gensim.test.utils import datapath, get_tmpfile
|
| 384 |
-
from gensim.models import Phrases
|
| 385 |
-
from gensim.models.phrases import Phraser
|
| 386 |
import sys
|
| 387 |
import subprocess
|
| 388 |
-
import os
|
| 389 |
# can try multiprocessing to run quicker
|
| 390 |
import multiprocessing
|
| 391 |
import copy
|
|
@@ -406,19 +32,18 @@ class word2Vec():
|
|
| 406 |
def __init__(self, nameFile=None, modelName=None):
|
| 407 |
self.nameFile = nameFile
|
| 408 |
self.modelName = modelName
|
| 409 |
-
#self.nlp = spacy.load("en_core_web_lg")
|
| 410 |
-
self.cl = cleanText.cleanGenText()
|
| 411 |
def spacy_similarity(self, word):
|
| 412 |
# when use word2vec, try medium or large is better
|
| 413 |
# maybe try odc similarity?
|
| 414 |
-
|
|
|
|
| 415 |
for token1 in doc:
|
| 416 |
for token2 in doc:
|
| 417 |
print(token1.text, token2.text, token1.similarity(token2))
|
| 418 |
pass
|
| 419 |
# clean text before transform to corpus
|
| 420 |
def cleanTextBeforeCorpus(self,oriText, doi=None):
|
| 421 |
-
|
| 422 |
#cl = cleanGenText()
|
| 423 |
output = ""
|
| 424 |
alreadyRemoveDoi = False
|
|
@@ -426,7 +51,7 @@ class word2Vec():
|
|
| 426 |
# remove DOI
|
| 427 |
if doi != None and doi in oriText:
|
| 428 |
if alreadyRemoveDoi == False:
|
| 429 |
-
newWord =
|
| 430 |
if len(newWord) > 0 and newWord != word:
|
| 431 |
alreadyRemoveDoi = True
|
| 432 |
word = newWord
|
|
@@ -434,13 +59,13 @@ class word2Vec():
|
|
| 434 |
# split the sticked words
|
| 435 |
#word = cl.splitStickWords(word)
|
| 436 |
# remove punctuation
|
| 437 |
-
word =
|
| 438 |
# remove URL
|
| 439 |
-
word =
|
| 440 |
# remove HTMLTag
|
| 441 |
-
word =
|
| 442 |
# remove tab, white space, newline
|
| 443 |
-
word =
|
| 444 |
# optional: remove stopwords
|
| 445 |
#word = cl.removeStopWords(word)
|
| 446 |
if len(word)>0:
|
|
@@ -450,18 +75,16 @@ class word2Vec():
|
|
| 450 |
cleanOutput = ""
|
| 451 |
remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
|
| 452 |
if len(allText) > 0:
|
| 453 |
-
corpusText = allText
|
| 454 |
-
for pos in range(len(corpusText)):
|
| 455 |
-
|
| 456 |
-
|
| 457 |
for line in lines.split("\n"):
|
| 458 |
if remove in line: line = line.replace(remove, "")
|
| 459 |
clean_text = self.cleanTextBeforeCorpus(line, doi)
|
| 460 |
cleanOutput += clean_text + "\n"
|
| 461 |
cleanOutput += "\n\n"
|
| 462 |
return cleanOutput
|
| 463 |
-
import urllib.parse, requests
|
| 464 |
-
|
| 465 |
def tableTransformToCorpusText(self, df, excelFile=None):
|
| 466 |
# PDF, Excel, WordDoc
|
| 467 |
#cl = cleanText.cleanGenText()
|
|
@@ -496,10 +119,10 @@ class word2Vec():
|
|
| 496 |
try:
|
| 497 |
df = pd.ExcelFile(excelFile)
|
| 498 |
except:
|
| 499 |
-
if
|
| 500 |
-
df = pd.read_excel(
|
| 501 |
else:
|
| 502 |
-
df = pd.read_excel(
|
| 503 |
sheetNames = df.sheet_names
|
| 504 |
output = []
|
| 505 |
if len(sheetNames) > 0:
|
|
@@ -519,7 +142,7 @@ class word2Vec():
|
|
| 519 |
return corpus
|
| 520 |
def helperRowTableToCorpus(self, textList):
|
| 521 |
#cl = cleanGenText()
|
| 522 |
-
|
| 523 |
stopWords = ["NaN","Unnamed:","nan"]
|
| 524 |
outputDF = []
|
| 525 |
for line in textList:
|
|
@@ -531,9 +154,9 @@ class word2Vec():
|
|
| 531 |
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
| 532 |
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
| 533 |
#word = cl.splitStickWords(word)
|
| 534 |
-
word =
|
| 535 |
-
word = " ".join(
|
| 536 |
-
word =
|
| 537 |
if len(word) > 1:
|
| 538 |
if len(word.split(" ")) > 1:
|
| 539 |
for x in word.split(" "):
|
|
@@ -547,7 +170,7 @@ class word2Vec():
|
|
| 547 |
return outputDF
|
| 548 |
def helperColTableToCorpus(self, dfList):
|
| 549 |
#cl = cleanGenText()
|
| 550 |
-
|
| 551 |
stopWords = ["NaN","Unnamed:","nan"]
|
| 552 |
outputDF = []
|
| 553 |
# use the first length line as the column ref
|
|
@@ -563,9 +186,9 @@ class word2Vec():
|
|
| 563 |
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
| 564 |
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
| 565 |
#word = cl.splitStickWords(word)
|
| 566 |
-
word =
|
| 567 |
-
word = " ".join(
|
| 568 |
-
word =
|
| 569 |
if len(word) > 1:
|
| 570 |
if len(word.split(" ")) > 1:
|
| 571 |
for x in word.split(" "):
|
|
@@ -593,22 +216,21 @@ class word2Vec():
|
|
| 593 |
Mouse is an animal.
|
| 594 |
Jerry is mouse.'''
|
| 595 |
texts = {}
|
| 596 |
-
|
| 597 |
#cl = cleanGenText()
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
if len(corpus[pos]) > 0:
|
| 601 |
texts["Paragraph "+str(pos)] = []
|
| 602 |
-
lines =
|
| 603 |
for line in lines.split("\n"):
|
| 604 |
for l in line.split("."):
|
| 605 |
if len(l) > 0:
|
| 606 |
-
|
| 607 |
l = l.lower()
|
| 608 |
newL = []
|
| 609 |
for word in l.split(" "):
|
| 610 |
if len(word) > 0:
|
| 611 |
-
word =
|
| 612 |
for w in word:
|
| 613 |
if len(w) > 0 and w.isnumeric()==False:
|
| 614 |
newL.append(w)
|
|
@@ -617,86 +239,49 @@ class word2Vec():
|
|
| 617 |
if len(texts["Paragraph "+str(pos)]) == 0:
|
| 618 |
del texts["Paragraph "+str(pos)]
|
| 619 |
return texts
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
"""
|
| 623 |
-
|
| 624 |
-
Heuristically determine Word2Vec parameters.
|
| 625 |
-
"""
|
| 626 |
corSize = len(corpus)
|
| 627 |
-
|
| 628 |
-
if corSize
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
sample = 1e-5
|
| 651 |
-
negative = 15
|
| 652 |
-
epochs = 15
|
| 653 |
-
sg = 1
|
| 654 |
-
elif corSize < 500000:
|
| 655 |
-
window = 10
|
| 656 |
-
vector_size = 250
|
| 657 |
-
sample = 1e-5
|
| 658 |
-
negative = 15
|
| 659 |
-
epochs = 10
|
| 660 |
-
sg = 0 # CBOW is okay when data is large
|
| 661 |
-
else:
|
| 662 |
-
# Very large corpus
|
| 663 |
-
window = 12
|
| 664 |
-
vector_size = 300
|
| 665 |
-
sample = 1e-6
|
| 666 |
-
negative = 20
|
| 667 |
-
epochs = 5
|
| 668 |
-
sg = 0
|
| 669 |
-
|
| 670 |
return window, vector_size, sample, negative, epochs, sg
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
vector_size=None,sample=None,negative=None,epochs=None,sg=None):
|
| 675 |
jsonFile = ""
|
| 676 |
jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
|
| 677 |
-
if not jsonFile:
|
| 678 |
-
print("No corpus to train")
|
| 679 |
-
return
|
| 680 |
cores = multiprocessing.cpu_count()
|
| 681 |
combinedCorpus = []
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
bigram = Phraser(phrases)
|
| 687 |
-
combinedCorpus = [bigram[sent] for sent in combinedCorpus]
|
| 688 |
-
|
| 689 |
-
if window==None and vector_size==None and sample==None and negative==None and epochs==None and sg==None:
|
| 690 |
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
accept = False
|
| 694 |
-
# add retry limit because if training keeps failing (bad corpus or corrupted input), it’ll keep retrying without limit.
|
| 695 |
-
retries = 0
|
| 696 |
-
while not accept and retries < 3:
|
| 697 |
-
if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
|
| 698 |
-
try:
|
| 699 |
-
w2vModel = Word2Vec(
|
| 700 |
min_count=1,
|
| 701 |
window=window,
|
| 702 |
vector_size=vector_size,
|
|
@@ -706,39 +291,43 @@ class word2Vec():
|
|
| 706 |
negative=negative,
|
| 707 |
workers=cores-1,
|
| 708 |
epochs = epochs,
|
| 709 |
-
sg=sg)
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
#return combinedCorpus
|
| 727 |
-
def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
|
| 728 |
-
if not newCorpus:
|
| 729 |
-
raise ValueError("New corpus is empty!")
|
| 730 |
-
|
| 731 |
-
model = Word2Vec.load(modelPath)
|
| 732 |
-
|
| 733 |
-
# Phrase detection on new data
|
| 734 |
-
phrases = Phrases(newCorpus, min_count=2, threshold=10)
|
| 735 |
-
bigram = Phraser(phrases)
|
| 736 |
-
newCorpus = [bigram[sent] for sent in newCorpus]
|
| 737 |
-
|
| 738 |
-
# Update vocab & retrain
|
| 739 |
-
model.build_vocab(newCorpus, update=True)
|
| 740 |
-
model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
|
| 741 |
-
|
| 742 |
def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
|
| 743 |
# might not be a meaningful keyword
|
| 744 |
#stopWords = ["show"]
|
|
@@ -765,32 +354,6 @@ class word2Vec():
|
|
| 765 |
results.append(moreNewResult)
|
| 766 |
currN +=1'''
|
| 767 |
return results
|
| 768 |
-
# add more data to existing word2vec model
|
| 769 |
-
def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
|
| 770 |
-
if not newCorpus:
|
| 771 |
-
raise ValueError("New corpus is empty!")
|
| 772 |
-
|
| 773 |
-
model = Word2Vec.load(modelPath)
|
| 774 |
-
|
| 775 |
-
# Phrase detection on new data
|
| 776 |
-
phrases = Phrases(newCorpus, min_count=2, threshold=10)
|
| 777 |
-
bigram = Phraser(phrases)
|
| 778 |
-
newCorpus = [bigram[sent] for sent in newCorpus]
|
| 779 |
-
|
| 780 |
-
# Update vocab & retrain
|
| 781 |
-
model.build_vocab(newCorpus, update=True)
|
| 782 |
-
model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
|
| 783 |
-
|
| 784 |
-
# Save updated model
|
| 785 |
-
if saveFolder:
|
| 786 |
-
os.makedirs(saveFolder, exist_ok=True)
|
| 787 |
-
name = os.path.basename(modelPath).replace(".model", "_updated.model")
|
| 788 |
-
model.save(f"{saveFolder}/{name}")
|
| 789 |
-
print(f"🔁 Model updated and saved to {saveFolder}/{name}")
|
| 790 |
-
else:
|
| 791 |
-
model.save(modelPath)
|
| 792 |
-
print(f"🔁 Model updated and overwritten at {modelPath}")
|
| 793 |
-
|
| 794 |
# adding our model into spacy
|
| 795 |
# this deals with command line; but instead of using it, we write python script to run command line
|
| 796 |
def loadWordVec(self,modelName,wordVec):
|
|
@@ -803,6 +366,4 @@ class word2Vec():
|
|
| 803 |
"en",
|
| 804 |
modelName, # this modelName comes from the saved modelName of function trainWord2Vec
|
| 805 |
"--vectors-loc",
|
| 806 |
-
wordVec])
|
| 807 |
-
>>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
|
| 808 |
-
print("done")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
'''WORD TO VECTOR'''
|
| 2 |
import pandas as pd
|
| 3 |
import json
|
|
|
|
| 10 |
from gensim.models.word2vec import Word2Vec
|
| 11 |
from gensim.scripts.glove2word2vec import glove2word2vec
|
| 12 |
from gensim.test.utils import datapath, get_tmpfile
|
|
|
|
|
|
|
| 13 |
import sys
|
| 14 |
import subprocess
|
|
|
|
| 15 |
# can try multiprocessing to run quicker
|
| 16 |
import multiprocessing
|
| 17 |
import copy
|
|
|
|
| 32 |
def __init__(self, nameFile=None, modelName=None):
|
| 33 |
self.nameFile = nameFile
|
| 34 |
self.modelName = modelName
|
|
|
|
|
|
|
| 35 |
def spacy_similarity(self, word):
|
| 36 |
# when use word2vec, try medium or large is better
|
| 37 |
# maybe try odc similarity?
|
| 38 |
+
nlp = spacy.load("en_core_web_lg")
|
| 39 |
+
doc = nlp(word)
|
| 40 |
for token1 in doc:
|
| 41 |
for token2 in doc:
|
| 42 |
print(token1.text, token2.text, token1.similarity(token2))
|
| 43 |
pass
|
| 44 |
# clean text before transform to corpus
|
| 45 |
def cleanTextBeforeCorpus(self,oriText, doi=None):
|
| 46 |
+
cl = cleanText.cleanGenText()
|
| 47 |
#cl = cleanGenText()
|
| 48 |
output = ""
|
| 49 |
alreadyRemoveDoi = False
|
|
|
|
| 51 |
# remove DOI
|
| 52 |
if doi != None and doi in oriText:
|
| 53 |
if alreadyRemoveDoi == False:
|
| 54 |
+
newWord = cl.removeDOI(word,doi)
|
| 55 |
if len(newWord) > 0 and newWord != word:
|
| 56 |
alreadyRemoveDoi = True
|
| 57 |
word = newWord
|
|
|
|
| 59 |
# split the sticked words
|
| 60 |
#word = cl.splitStickWords(word)
|
| 61 |
# remove punctuation
|
| 62 |
+
word = cl.removePunct(word,True)
|
| 63 |
# remove URL
|
| 64 |
+
word = cl.removeURL(word)
|
| 65 |
# remove HTMLTag
|
| 66 |
+
word = cl.removeHTMLTag(word)
|
| 67 |
# remove tab, white space, newline
|
| 68 |
+
word = cl.removeTabWhiteSpaceNewLine(word)
|
| 69 |
# optional: remove stopwords
|
| 70 |
#word = cl.removeStopWords(word)
|
| 71 |
if len(word)>0:
|
|
|
|
| 75 |
cleanOutput = ""
|
| 76 |
remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
|
| 77 |
if len(allText) > 0:
|
| 78 |
+
corpusText = allText
|
| 79 |
+
for pos in range(len(corpusText.split("\n\n"))):
|
| 80 |
+
if len(corpusText.split("\n\n")[pos]) > 0:
|
| 81 |
+
lines = corpusText.split("\n\n")[pos]
|
| 82 |
for line in lines.split("\n"):
|
| 83 |
if remove in line: line = line.replace(remove, "")
|
| 84 |
clean_text = self.cleanTextBeforeCorpus(line, doi)
|
| 85 |
cleanOutput += clean_text + "\n"
|
| 86 |
cleanOutput += "\n\n"
|
| 87 |
return cleanOutput
|
|
|
|
|
|
|
| 88 |
def tableTransformToCorpusText(self, df, excelFile=None):
|
| 89 |
# PDF, Excel, WordDoc
|
| 90 |
#cl = cleanText.cleanGenText()
|
|
|
|
| 119 |
try:
|
| 120 |
df = pd.ExcelFile(excelFile)
|
| 121 |
except:
|
| 122 |
+
if filepath.endswith('.xls'):
|
| 123 |
+
df = pd.read_excel(filepath, engine='xlrd')
|
| 124 |
else:
|
| 125 |
+
df = pd.read_excel(filepath, engine='openpyxl')
|
| 126 |
sheetNames = df.sheet_names
|
| 127 |
output = []
|
| 128 |
if len(sheetNames) > 0:
|
|
|
|
| 142 |
return corpus
|
| 143 |
def helperRowTableToCorpus(self, textList):
|
| 144 |
#cl = cleanGenText()
|
| 145 |
+
cl = cleanText.cleanGenText()
|
| 146 |
stopWords = ["NaN","Unnamed:","nan"]
|
| 147 |
outputDF = []
|
| 148 |
for line in textList:
|
|
|
|
| 154 |
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
| 155 |
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
| 156 |
#word = cl.splitStickWords(word)
|
| 157 |
+
word = cl.removePunct(word)
|
| 158 |
+
word = " ".join(cl.removeStopWords(word))
|
| 159 |
+
word = cl.removeTabWhiteSpaceNewLine(word)
|
| 160 |
if len(word) > 1:
|
| 161 |
if len(word.split(" ")) > 1:
|
| 162 |
for x in word.split(" "):
|
|
|
|
| 170 |
return outputDF
|
| 171 |
def helperColTableToCorpus(self, dfList):
|
| 172 |
#cl = cleanGenText()
|
| 173 |
+
cl = cleanText.cleanGenText()
|
| 174 |
stopWords = ["NaN","Unnamed:","nan"]
|
| 175 |
outputDF = []
|
| 176 |
# use the first length line as the column ref
|
|
|
|
| 186 |
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
| 187 |
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
| 188 |
#word = cl.splitStickWords(word)
|
| 189 |
+
word = cl.removePunct(word)
|
| 190 |
+
word = " ".join(cl.removeStopWords(word))
|
| 191 |
+
word = cl.removeTabWhiteSpaceNewLine(word)
|
| 192 |
if len(word) > 1:
|
| 193 |
if len(word.split(" ")) > 1:
|
| 194 |
for x in word.split(" "):
|
|
|
|
| 216 |
Mouse is an animal.
|
| 217 |
Jerry is mouse.'''
|
| 218 |
texts = {}
|
| 219 |
+
cl = cleanText.cleanGenText()
|
| 220 |
#cl = cleanGenText()
|
| 221 |
+
for pos in range(len(corpusText.split("\n\n"))):
|
| 222 |
+
if len(corpusText.split("\n\n")[pos]) > 0:
|
|
|
|
| 223 |
texts["Paragraph "+str(pos)] = []
|
| 224 |
+
lines = corpusText.split("\n\n")[pos]
|
| 225 |
for line in lines.split("\n"):
|
| 226 |
for l in line.split("."):
|
| 227 |
if len(l) > 0:
|
| 228 |
+
cl.removeTabWhiteSpaceNewLine(l)
|
| 229 |
l = l.lower()
|
| 230 |
newL = []
|
| 231 |
for word in l.split(" "):
|
| 232 |
if len(word) > 0:
|
| 233 |
+
word = cl.removeStopWords(word)
|
| 234 |
for w in word:
|
| 235 |
if len(w) > 0 and w.isnumeric()==False:
|
| 236 |
newL.append(w)
|
|
|
|
| 239 |
if len(texts["Paragraph "+str(pos)]) == 0:
|
| 240 |
del texts["Paragraph "+str(pos)]
|
| 241 |
return texts
|
| 242 |
+
def selectParaForWC(self, corpus):
    """Choose Word2Vec hyperparameters from the corpus size.

    corpus should be in the format:
    corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]

    Returns a 6-tuple ``(window, vector_size, sample, negative, epochs,
    sg)``. When the corpus size falls outside every known range (empty
    corpus, or one million sentences and up) all six values are ``None``
    so the caller can detect that no preset applies.
    """
    size = len(corpus)
    # Size-banded presets: (inclusive lower bound, exclusive upper bound,
    # (window, vector_size, sample, negative, epochs, sg)).
    presets = (
        # fewer than 2000 sentences: small windows, aggressive skip-gram
        (1, 2000, (3.5, 75, 1e-3, 10, 10, 1)),
        # 2000 - 100000: same shape, much stronger subsampling
        (2000, 100000, (3.5, 75, 1e-5, 10, 10, 1)),
        # 100000 - 1000000: wider window, bigger vectors, CBOW
        (100000, 1000000, (7.5, 150, 1e-5, 10, 6, 0)),
    )
    for lower, upper, params in presets:
        if lower <= size < upper:
            return params
    # no band matched -> signal "no parameters" exactly like the original
    return (None, None, None, None, None, None)
|
| 271 |
+
def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
|
| 272 |
+
vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
|
| 273 |
+
# if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
|
|
|
|
| 274 |
jsonFile = ""
|
| 275 |
jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
|
|
|
|
|
|
|
|
|
|
| 276 |
cores = multiprocessing.cpu_count()
|
| 277 |
combinedCorpus = []
|
| 278 |
+
window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
|
| 279 |
+
if len(jsonFile) > 0:
|
| 280 |
+
for key in jsonFile:
|
| 281 |
+
combinedCorpus.extend(jsonFile[key])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
|
| 283 |
+
# # min_count=1 ensures all words are included
|
| 284 |
+
'''w2vModel = Word2Vec(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
min_count=1,
|
| 286 |
window=window,
|
| 287 |
vector_size=vector_size,
|
|
|
|
| 291 |
negative=negative,
|
| 292 |
workers=cores-1,
|
| 293 |
epochs = epochs,
|
| 294 |
+
sg=sg)'''
|
| 295 |
+
#w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
|
| 296 |
+
accept = False
|
| 297 |
+
while not accept:
|
| 298 |
+
if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
|
| 299 |
+
try:
|
| 300 |
+
w2vModel = Word2Vec(
|
| 301 |
+
min_count=1,
|
| 302 |
+
window=window,
|
| 303 |
+
vector_size=vector_size,
|
| 304 |
+
sample=sample,
|
| 305 |
+
alpha=0.03,
|
| 306 |
+
min_alpha=0.0007,
|
| 307 |
+
negative=negative,
|
| 308 |
+
workers=cores-1,
|
| 309 |
+
epochs = epochs,
|
| 310 |
+
sg=sg)
|
| 311 |
+
w2vModel.build_vocab(combinedCorpus)
|
| 312 |
+
w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
|
| 313 |
+
accept = True
|
| 314 |
+
except:
|
| 315 |
+
for key in jsonFile:
|
| 316 |
+
combinedCorpus.extend(jsonFile[key])
|
| 317 |
+
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
|
| 318 |
+
print("next is " + str(len(combinedCorpus)))
|
| 319 |
+
else:
|
| 320 |
+
print("no parameter to train")
|
| 321 |
+
break
|
| 322 |
+
#w2vModel.build_vocab(combinedCorpus)
|
| 323 |
+
#w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
|
| 324 |
+
#w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
|
| 325 |
+
#w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
|
| 326 |
+
w2vModel.save(saveFolder+"/"+modelName+".model")
|
| 327 |
+
w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
|
| 328 |
+
print("done w2v")
|
| 329 |
+
else: print("no corpus to train")
|
| 330 |
#return combinedCorpus
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
|
| 332 |
# might not be a meaningful keyword
|
| 333 |
#stopWords = ["show"]
|
|
|
|
| 354 |
results.append(moreNewResult)
|
| 355 |
currN +=1'''
|
| 356 |
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
# adding our model into spacy
|
| 358 |
# this deals with command line; but instead of using it, we write python script to run command line
|
| 359 |
def loadWordVec(self,modelName,wordVec):
|
|
|
|
| 366 |
"en",
|
| 367 |
modelName, # this modelName comes from the saved modelName of function trainWord2Vec
|
| 368 |
"--vectors-loc",
|
| 369 |
+
wordVec])
|
|
|
|
|
|