from conllu import parse
from io import open
from conllu import parse_incr
# from sklearn.externals import joblib
# import cloudpickle
import dill
def _read_sentences(path):
    """Parse a CoNLL-U file into a list of sentences (conllu TokenLists).

    Uses a context manager so the file handle is closed; the original code
    opened three files and never closed any of them.
    """
    with open(path, "r", encoding="utf-8") as data_file:
        return list(parse_incr(data_file))

# Training data: concatenate the TTB train and dev splits.
sentences = _read_sentences("UD_Tamil-TTB-master/ta_ttb-ud-train.conllu")
sentences += _read_sentences("UD_Tamil-TTB-master/ta_ttb-ud-dev.conllu")
test_sentences = _read_sentences("UD_Tamil-TTB-master/ta_ttb-ud-test.conllu")

# Kept as a separate name so the training set can be truncated here,
# e.g. sentences[:1500], without touching the loading code above.
train_sentences = sentences

# Flatten each sentence into (word, UPOS) pairs — the sequence format
# nltk's supervised HMM trainer consumes below.
train_data_hn = [[(token['form'], token['upostag']) for token in sentence] for sentence in train_sentences]
test_data_hn = [[(token['form'], token['upostag']) for token in sentence] for sentence in test_sentences]
import nltk
from nltk.tag import hmm

# Fit a supervised HMM POS tagger on the (word, tag) training sequences.
tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train_data_hn)

# Persist the trained tagger with dill so it can be reloaded later
# without retraining.
with open('my_tagger.dill', 'wb') as f:
    dill.dump(tagger, f)
print('MODEL SAVED')
# Demonstrate the tagger on a single held-out sentence (index 10):
# tag its word forms and keep the gold (word, UPOS) pairs for comparison.
sample = test_sentences[10]
test1 = [token['form'] for token in sample]
print('input :\n', test1)
out = tagger.tag(test1)
act = [(token['form'], token['upostag']) for token in sample]
def acc(test_sentences=None, model=None):
    """Compute, print, and return token-level tagging accuracy.

    Args:
        test_sentences: iterable of sentences, each a sequence of tokens
            exposing ['form'] and ['upostag'] (e.g. conllu TokenList).
            Defaults to the module-level ``test_sentences``.
        model: object with a ``tag(words) -> [(word, tag), ...]`` method.
            Defaults to the module-level ``tagger``.

    Returns:
        float: accuracy in [0, 1]; 0.0 when there are no tokens
        (the original raised ZeroDivisionError on empty input).
    """
    if test_sentences is None:
        # Late lookup preserves the original "use the module's test split"
        # default without freezing the global at function-definition time.
        test_sentences = globals()["test_sentences"]
    if model is None:
        model = tagger
    num = 0
    denom = 0
    for sen in test_sentences:
        words = [token['form'] for token in sen]
        predicted = model.tag(words)
        actual = [(token['form'], token['upostag']) for token in sen]
        # A prediction counts only if the whole (form, tag) pair matches.
        num += sum(p == a for p, a in zip(predicted, actual))
        denom += len(predicted)
    accuracy = num / denom if denom else 0.0
    print("acc=", accuracy)
    return accuracy
acc()  # report overall token-level accuracy on the test split
# s=sum(x == y for x, y in zip(out, act))/sum(x != y for x, y in zip(out, act))
# Show the earlier sample sentence: gold pairs vs. the tagger's output.
print("An Example \n")
print("True:\n",act)
print("\nPredicted:\n",out)
'''
test1=[token['form'] for token in test_sentences[10]]
print('input :\n',test1)
out = tagger.tag(test1)
'''