Spaces:
Build error
Build error
Commit ·
645d04b
1
Parent(s): acdbc7f
Upload 10 files
Browse files- src/ml_ner.py +8 -17
- src/nn_model.py +1 -13
- src/tagging_text.py +2 -2
src/ml_ner.py
CHANGED
|
@@ -8,7 +8,7 @@ Created on Fri Jun 12 16:41:54 2020
|
|
| 8 |
import io
|
| 9 |
import time
|
| 10 |
import numpy as np
|
| 11 |
-
|
| 12 |
def ml_intext(infile):
|
| 13 |
fin=open(infile,'r',encoding='utf-8')
|
| 14 |
alltexts=fin.read().strip().split('\n\n')
|
|
@@ -462,7 +462,7 @@ def combine_strategy(test_decode_temp, T=0.8):
|
|
| 462 |
return fout.getvalue()
|
| 463 |
|
| 464 |
|
| 465 |
-
def model_predict(session,ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
|
| 466 |
if nn_model.model_type=='cnn':
|
| 467 |
#startTime=time.time()
|
| 468 |
test_set,test_label = ml_intext_fn(ml_input)
|
|
@@ -482,7 +482,6 @@ def model_predict(session,ml_input,nn_model,ml_input_txt,ml_input_index,Threshol
|
|
| 482 |
input_test.append(test_x[3])
|
| 483 |
# print('ml-model-represent:',time.time()-startTime)
|
| 484 |
# startTime=time.time()
|
| 485 |
-
K.set_session(session)
|
| 486 |
test_pre = nn_model.model.predict(input_test)
|
| 487 |
# print('ml-model-predict:',time.time()-startTime)
|
| 488 |
|
|
@@ -492,10 +491,6 @@ def model_predict(session,ml_input,nn_model,ml_input_txt,ml_input_index,Threshol
|
|
| 492 |
test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
|
| 493 |
#print('ml-model-represent:',time.time()-startTime)
|
| 494 |
#startTime=time.time()
|
| 495 |
-
#K.set_session(session)
|
| 496 |
-
#with session.as_default():
|
| 497 |
-
#with session.graph.as_default():
|
| 498 |
-
#print('......session')
|
| 499 |
test_pre = nn_model.model.predict(test_x)
|
| 500 |
#print('ml-model-modedpred:',time.time()-startTime)
|
| 501 |
# startTime=time.time()
|
|
@@ -527,19 +522,15 @@ def model_predict_old(ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
|
|
| 527 |
|
| 528 |
if nn_model.fea_dict['pos'] == 1:
|
| 529 |
input_test.append(test_x[3])
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
with nn_model.session.graph.as_default():
|
| 533 |
-
test_pre = nn_model.model.predict(input_test,batch_size=256)
|
| 534 |
|
| 535 |
elif nn_model.model_type=='bert' or nn_model.model_type=='bioformer':
|
| 536 |
|
| 537 |
test_set,test_label = ml_intext_fn(ml_input)
|
| 538 |
test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
with nn_model.session.graph.as_default():
|
| 542 |
-
test_pre = nn_model.model.predict(test_x,batch_size=128)
|
| 543 |
|
| 544 |
test_score=output_result(test_pre, nn_model.rep.label_2_index,Top_N=3)
|
| 545 |
#print('test_score:',test_score)
|
|
@@ -562,7 +553,7 @@ def output_txt(ml_input_txt):
|
|
| 562 |
|
| 563 |
return fout.getvalue()
|
| 564 |
|
| 565 |
-
def ml_tagging(session,ssplit_token,ml_model,Threshold):
|
| 566 |
# startTime=time.time()
|
| 567 |
ml_input, ml_input_txt,ml_input_index=build_ngram_testset_filted(ssplit_token)
|
| 568 |
# print('ml-ngrambuild:',time.time()-startTime)
|
|
@@ -570,7 +561,7 @@ def ml_tagging(session,ssplit_token,ml_model,Threshold):
|
|
| 570 |
#print(ml_input)
|
| 571 |
# startTime=time.time()
|
| 572 |
if len(ml_input_index)>0:
|
| 573 |
-
ml_pre_tsv=model_predict(session,ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
|
| 574 |
else:
|
| 575 |
ml_pre_tsv=output_txt(ml_input_txt)
|
| 576 |
# print('ml-modelpred:',time.time()-startTime)
|
|
|
|
| 8 |
import io
|
| 9 |
import time
|
| 10 |
import numpy as np
|
| 11 |
+
|
| 12 |
def ml_intext(infile):
|
| 13 |
fin=open(infile,'r',encoding='utf-8')
|
| 14 |
alltexts=fin.read().strip().split('\n\n')
|
|
|
|
| 462 |
return fout.getvalue()
|
| 463 |
|
| 464 |
|
| 465 |
+
def model_predict(ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
|
| 466 |
if nn_model.model_type=='cnn':
|
| 467 |
#startTime=time.time()
|
| 468 |
test_set,test_label = ml_intext_fn(ml_input)
|
|
|
|
| 482 |
input_test.append(test_x[3])
|
| 483 |
# print('ml-model-represent:',time.time()-startTime)
|
| 484 |
# startTime=time.time()
|
|
|
|
| 485 |
test_pre = nn_model.model.predict(input_test)
|
| 486 |
# print('ml-model-predict:',time.time()-startTime)
|
| 487 |
|
|
|
|
| 491 |
test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
|
| 492 |
#print('ml-model-represent:',time.time()-startTime)
|
| 493 |
#startTime=time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
test_pre = nn_model.model.predict(test_x)
|
| 495 |
#print('ml-model-modedpred:',time.time()-startTime)
|
| 496 |
# startTime=time.time()
|
|
|
|
| 522 |
|
| 523 |
if nn_model.fea_dict['pos'] == 1:
|
| 524 |
input_test.append(test_x[3])
|
| 525 |
+
|
| 526 |
+
test_pre = nn_model.model.predict(input_test,batch_size=256)
|
|
|
|
|
|
|
| 527 |
|
| 528 |
elif nn_model.model_type=='bert' or nn_model.model_type=='bioformer':
|
| 529 |
|
| 530 |
test_set,test_label = ml_intext_fn(ml_input)
|
| 531 |
test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
|
| 532 |
+
|
| 533 |
+
test_pre = nn_model.model.predict(test_x,batch_size=128)
|
|
|
|
|
|
|
| 534 |
|
| 535 |
test_score=output_result(test_pre, nn_model.rep.label_2_index,Top_N=3)
|
| 536 |
#print('test_score:',test_score)
|
|
|
|
| 553 |
|
| 554 |
return fout.getvalue()
|
| 555 |
|
| 556 |
+
def ml_tagging(ssplit_token,ml_model,Threshold):
|
| 557 |
# startTime=time.time()
|
| 558 |
ml_input, ml_input_txt,ml_input_index=build_ngram_testset_filted(ssplit_token)
|
| 559 |
# print('ml-ngrambuild:',time.time()-startTime)
|
|
|
|
| 561 |
#print(ml_input)
|
| 562 |
# startTime=time.time()
|
| 563 |
if len(ml_input_index)>0:
|
| 564 |
+
ml_pre_tsv=model_predict(ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
|
| 565 |
else:
|
| 566 |
ml_pre_tsv=output_txt(ml_input_txt)
|
| 567 |
# print('ml-modelpred:',time.time()-startTime)
|
src/nn_model.py
CHANGED
|
@@ -8,12 +8,10 @@ Created on Thu Mar 26 09:04:13 2020
|
|
| 8 |
import time
|
| 9 |
import sys
|
| 10 |
import numpy as np
|
| 11 |
-
import tensorflow as tf
|
| 12 |
import keras
|
| 13 |
from src.nn_represent import CNN_RepresentationLayer,BERT_RepresentationLayer
|
| 14 |
from keras.layers import *
|
| 15 |
from keras.models import Model
|
| 16 |
-
from keras import backend as K
|
| 17 |
from keras_bert import load_trained_model_from_checkpoint
|
| 18 |
|
| 19 |
|
|
@@ -37,7 +35,7 @@ class bioTag_CNN():
|
|
| 37 |
self.charfile=model_files['charfile']
|
| 38 |
self.labelfile=model_files['labelfile']
|
| 39 |
self.posfile=model_files['posfile']
|
| 40 |
-
|
| 41 |
vocab={'char':self.charfile,'label':self.labelfile,'pos':self.posfile}
|
| 42 |
print('loading w2v model.....')
|
| 43 |
self.rep = CNN_RepresentationLayer(self.w2vfile,vocab_file=vocab, frequency=400000)
|
|
@@ -94,8 +92,6 @@ class bioTag_CNN():
|
|
| 94 |
self.model = Model(inputs=all_fea, outputs=output)
|
| 95 |
def load_model(self,model_file):
|
| 96 |
self.model.load_weights(model_file)
|
| 97 |
-
self.session = K.get_session()
|
| 98 |
-
print(self.session)
|
| 99 |
#self.model.summary()
|
| 100 |
print('load cnn model done!')
|
| 101 |
|
|
@@ -107,7 +103,6 @@ class bioTag_BERT():
|
|
| 107 |
checkpoint_path = model_files['checkpoint_path']
|
| 108 |
vocab_path = model_files['vocab_path']
|
| 109 |
self.label_file=model_files['labelfile']
|
| 110 |
-
self.session = tf.Session()
|
| 111 |
|
| 112 |
self.rep = BERT_RepresentationLayer( vocab_path, self.label_file)
|
| 113 |
|
|
@@ -124,8 +119,6 @@ class bioTag_BERT():
|
|
| 124 |
|
| 125 |
def load_model(self,model_file):
|
| 126 |
self.model.load_weights(model_file)
|
| 127 |
-
self.session = K.get_session()
|
| 128 |
-
print(self.session)
|
| 129 |
#self.model.summary()
|
| 130 |
|
| 131 |
class bioTag_Bioformer():
|
|
@@ -152,11 +145,6 @@ class bioTag_Bioformer():
|
|
| 152 |
|
| 153 |
def load_model(self,model_file):
|
| 154 |
self.model.load_weights(model_file)
|
| 155 |
-
#self.model._make_predict_function()
|
| 156 |
-
#session = K.get_session()
|
| 157 |
-
#print(session)
|
| 158 |
#self.model.summary()
|
| 159 |
-
session=''
|
| 160 |
-
return session
|
| 161 |
print('load bioformer model done!')
|
| 162 |
|
|
|
|
| 8 |
import time
|
| 9 |
import sys
|
| 10 |
import numpy as np
|
|
|
|
| 11 |
import keras
|
| 12 |
from src.nn_represent import CNN_RepresentationLayer,BERT_RepresentationLayer
|
| 13 |
from keras.layers import *
|
| 14 |
from keras.models import Model
|
|
|
|
| 15 |
from keras_bert import load_trained_model_from_checkpoint
|
| 16 |
|
| 17 |
|
|
|
|
| 35 |
self.charfile=model_files['charfile']
|
| 36 |
self.labelfile=model_files['labelfile']
|
| 37 |
self.posfile=model_files['posfile']
|
| 38 |
+
|
| 39 |
vocab={'char':self.charfile,'label':self.labelfile,'pos':self.posfile}
|
| 40 |
print('loading w2v model.....')
|
| 41 |
self.rep = CNN_RepresentationLayer(self.w2vfile,vocab_file=vocab, frequency=400000)
|
|
|
|
| 92 |
self.model = Model(inputs=all_fea, outputs=output)
|
| 93 |
def load_model(self,model_file):
|
| 94 |
self.model.load_weights(model_file)
|
|
|
|
|
|
|
| 95 |
#self.model.summary()
|
| 96 |
print('load cnn model done!')
|
| 97 |
|
|
|
|
| 103 |
checkpoint_path = model_files['checkpoint_path']
|
| 104 |
vocab_path = model_files['vocab_path']
|
| 105 |
self.label_file=model_files['labelfile']
|
|
|
|
| 106 |
|
| 107 |
self.rep = BERT_RepresentationLayer( vocab_path, self.label_file)
|
| 108 |
|
|
|
|
| 119 |
|
| 120 |
def load_model(self,model_file):
|
| 121 |
self.model.load_weights(model_file)
|
|
|
|
|
|
|
| 122 |
#self.model.summary()
|
| 123 |
|
| 124 |
class bioTag_Bioformer():
|
|
|
|
| 145 |
|
| 146 |
def load_model(self,model_file):
|
| 147 |
self.model.load_weights(model_file)
|
|
|
|
|
|
|
|
|
|
| 148 |
#self.model.summary()
|
|
|
|
|
|
|
| 149 |
print('load bioformer model done!')
|
| 150 |
|
src/tagging_text.py
CHANGED
|
@@ -18,7 +18,7 @@ import time
|
|
| 18 |
import json
|
| 19 |
|
| 20 |
#hybrid method
|
| 21 |
-
def bioTag(session,text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
|
| 22 |
|
| 23 |
# startTime=time.time()
|
| 24 |
ssplit_token=ssplit_token_pos_lemma(text)
|
|
@@ -31,7 +31,7 @@ def bioTag(session,text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False,
|
|
| 31 |
# print('dict ner:',time.time()-startTime)
|
| 32 |
|
| 33 |
# startTime=time.time()
|
| 34 |
-
ml_tsv=ml_tagging(session,ssplit_token,ml_model,Threshold)
|
| 35 |
#print('ml_tsv:\n',ml_tsv)
|
| 36 |
# print('ml ner:',time.time()-startTime)
|
| 37 |
|
|
|
|
| 18 |
import json
|
| 19 |
|
| 20 |
#hybrid method
|
| 21 |
+
def bioTag(text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
|
| 22 |
|
| 23 |
# startTime=time.time()
|
| 24 |
ssplit_token=ssplit_token_pos_lemma(text)
|
|
|
|
| 31 |
# print('dict ner:',time.time()-startTime)
|
| 32 |
|
| 33 |
# startTime=time.time()
|
| 34 |
+
ml_tsv=ml_tagging(ssplit_token,ml_model,Threshold)
|
| 35 |
#print('ml_tsv:\n',ml_tsv)
|
| 36 |
# print('ml ner:',time.time()-startTime)
|
| 37 |
|