File size: 6,636 Bytes
382124c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pandas as pd
import numpy as np
import csv,pickle,json,bz2
from romtoslp import *
# Pickled mapping of DCS sentence objects; the script below indexes it with
# '<sentence id>.p2' keys and reads .lemmas / .cng from each object.
# NOTE: unpickling executes arbitrary code -- only load files from a trusted source.
# Use a context manager so the file handle is closed once the pickle is loaded.
with open('../Simultaneous_DCS_ho.p', 'rb') as _dcs_file:
    loaded_DCS = pickle.load(_dcs_file)
# Directory prefix of the per-sentence '<sentence id>.ds.bz2' files read by open_dsbz2.
folder =  '../NewData/skt_dcs_DS.bz2_4K_bigram_mir_heldout/'

def open_dsbz2(filename):
    """Load a bz2-compressed pickle and return seven of its entries as a tuple.

    The unpickled object is indexed by string keys. The returned tuple is, in
    order: (nodelist_correct, conflicts_Dict_correct, featVMat_correct,
    nodelist_to_correct_mapping, nodelist, conflicts_Dict, featVMat).

    Raises whatever the key lookup raises (KeyError for a dict) when one of
    the expected keys is absent.
    """
    with bz2.BZ2File(filename, 'r') as handle:
        payload = pickle.load(handle)

    # Keys are fetched in this fixed order, so the first missing key
    # determines which lookup error is raised.
    lookup_order = (
        'conflicts_Dict_correct',
        'nodelist_to_correct_mapping',
        'nodelist_correct',
        'featVMat_correct',
        'featVMat',
        'conflicts_Dict',
        'nodelist',
    )
    fields = {}
    for key in lookup_order:
        fields[key] = payload[key]

    return (
        fields['nodelist_correct'],
        fields['conflicts_Dict_correct'],
        fields['featVMat_correct'],
        fields['nodelist_to_correct_mapping'],
        fields['nodelist'],
        fields['conflicts_Dict'],
        fields['featVMat'],
    )

# Snippet for forming the ground-truth csv file (groundtruth2.csv).
#
# For every sentence in BM2_NLoss_proc0.csv .. BM2_NLoss_proc3.csv one output
# row is written per ground-truth (lemma, CNG) pair, with columns:
#   File, Lemma, CNG, lemmaCorr, lemmaCNGcorr, predCNG, Conflicts
#
# Input layout as consumed here: each sentence spans 6 consecutive rows.
#   row 0: sentence id followed by the predicted lemmas
#   row 1: one leading column followed by the predicted CNGs
#   rows 2-5: read and ignored


def _read_record(reader, rows_per_record=6):
    """Return the next `rows_per_record` rows of `reader`, or None at end of input.

    A trailing record with fewer rows than expected is treated as end of input.
    Using next(reader, None) avoids raising StopIteration, which is a subclass
    of Exception and would otherwise be swallowed by the per-sentence handler.
    """
    record = []
    for _ in range(rows_per_record):
        row = next(reader, None)
        if row is None:
            return None
        record.append(row)
    return record


def _group_by_lemma(lemmas, tags):
    """Map each lemma to the list of tags[i] over every position i it occupies.

    Raises IndexError when `tags` is shorter than `lemmas`; surplus tags are
    ignored.
    """
    grouped = dict()
    for idx, lemma in enumerate(lemmas):
        grouped.setdefault(lemma, []).append(tags[idx])
    return grouped


def _score_sentence(sentid, gtldict, pdldict):
    """Return the output rows for one sentence.

    gtldict maps ground-truth lemma -> list of (cng, conflict count) pairs.
    pdldict maps predicted lemma -> list of predicted CNGs; its lists are
    consumed (mutated) as predictions get matched.

    Pass 1 emits rows for exact lemma+CNG matches and for lemmas with no
    prediction left. Pass 2 pairs the remaining ground-truth entries with any
    leftover prediction of the same lemma (lemma correct, CNG wrong).
    """
    rows = []
    deferred = []
    for gtl, tagged in gtldict.items():
        for gtlcng, confcount in tagged:
            remaining = pdldict.get(gtl)
            if not remaining:
                # Lemma never predicted, or all its predictions already used.
                rows.append([sentid, gtl, gtlcng, 0, 0, 'nil', confcount])
            elif gtlcng in remaining:
                # NOTE(review): predicted CNGs are strings read from the CSV;
                # if dcsobj.cng holds non-string values this membership test
                # can never succeed -- confirm the types agree.
                remaining.remove(gtlcng)
                rows.append([sentid, gtl, gtlcng, 1, 1, gtlcng, confcount])
            else:
                # Lemma was predicted but not with this CNG; decide after all
                # exact matches have claimed their predictions.
                deferred.append((gtl, gtlcng, confcount))

    # All entries with lemmaCNGcorr == 1 are out of the way; revisit the
    # lemmas that had a prediction but no matching CNG.
    for gtl, gtlcng, confcount in deferred:
        remaining = pdldict.get(gtl)
        if remaining:
            rows.append([sentid, gtl, gtlcng, 1, 0, remaining.pop(0), confcount])
        else:
            rows.append([sentid, gtl, gtlcng, 0, 0, 'nil', confcount])
    return rows


count = 0
# Open the output once; newline='' is what the csv module expects so that no
# extra blank lines appear on platforms that translate line endings.
with open('groundtruth2.csv', 'w', newline='') as out_fh:
    out_writer = csv.writer(out_fh)
    out_writer.writerow(['File','Lemma','CNG','lemmaCorr','lemmaCNGcorr','predCNG','Conflicts'])
    for ii in range(4):
        with open("BM2_NLoss_proc"+str(ii)+".csv", 'r', newline='') as fh:
            rd = csv.reader(fh)
            while True:
                # Read the whole 6-row record up front so that a failure while
                # processing one sentence cannot leave the reader mid-record.
                record = _read_record(rd)
                if record is None:
                    break  # end of this input file
                print(count)
                count += 1
                try:
                    sentid = record[0][0]
                    pdlemmas = record[0][1:]
                    pdcngs = record[1][1:]

                    dcsobj = loaded_DCS[str(sentid)+'.p2']
                    nodelist_correct, conflicts_Dict_correct = \
                        open_dsbz2(folder+str(sentid)+'.ds.bz2')[:2]

                    # Diagnostic only: report sentences whose number of
                    # ground-truth lemmas differs from the correct node list.
                    dll = sum(len(chunk) for chunk in dcsobj.lemmas)
                    if dll != len(nodelist_correct):
                        print('here')
                        print(dcsobj.lemmas)
                        print(nodelist_correct)

                    gtlemmas = [rom_slp(element)
                                for outerlist in dcsobj.lemmas
                                for element in outerlist]

                    # Pair every ground-truth CNG with the number of conflicts
                    # recorded for the node at the same flattened position.
                    gtcngs = []
                    for outerlist in dcsobj.cng:
                        for element in outerlist:
                            position = len(gtcngs)
                            gtcngs.append((element, len(conflicts_Dict_correct[position])))

                    gtldict = _group_by_lemma(gtlemmas, gtcngs)
                    pdldict = _group_by_lemma(pdlemmas, pdcngs)

                    out_writer.writerows(_score_sentence(sentid, gtldict, pdldict))
                except Exception as e:
                    # Deliberate best-effort: report the problem and move on
                    # to the next sentence instead of aborting the whole run.
                    print(e)
                    print('been there')
                    continue