File size: 6,492 Bytes
4a08ba7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/usr/bin/python3
from __future__ import print_function

from math import log
from collections import deque

class contextRep(object):
    """
    Weighted prefix tree (trie) over sequences of hashable segments.

    Every node stands for one context, i.e. the prefix of segments that
    leads to it.  A node records how often the context was observed, how
    often a sequence ended exactly there, and one child node for every
    segment that was seen as a continuation of the context.

    Probabilities are relative frequencies derived from the counts.  They
    can be frozen with precalc(); the frozen values are a static snapshot
    and are NOT refreshed by later add() calls -- call precalc() again to
    rebuild them.
    """

    def __init__(self):
        self.count = 0.0         # times this context was observed
        self.contexts = dict()   # continuation:context dictionary
        self.precals = None      # probs can be precalculated
        self.terminal = 0.0      # number of times this context was final

    def __repr__(self):
        return repr([self.count, self.terminal, self.contexts])

    def __str__(self):
        return repr(self)

    def add(self, seq, count, func=lambda x: None):
        """
        Add a full sequence to the representation.

        seq   -- iterable of hashable segments (a string is read
                 character by character)
        count -- weight added to every context along the path and to the
                 terminal counter of the final context
        func  -- unused; kept so existing callers keep working

        The tree is walked iteratively, so the length of seq is not
        limited by the interpreter's recursion depth.
        """
        node = self
        for key in seq:
            node.count += count
            child = node.contexts.get(key)
            if child is None:
                child = contextRep()
                node.contexts[key] = child
            node = child

        # node is now the context in which the sequence ends
        node.count += count
        node.terminal += count

    def prob(self, key, log2=False):
        """
        Get the probability of observing a particular continuation in
        the given context.

        Returns 0.0 for a continuation that is unknown, both with and
        without precalculated probabilities.  With log2=True the base-2
        logarithm is returned instead; math.log raises ValueError when
        the probability is 0.
        """
        if self.precals is not None:
            # unknown keys are treated like in the non-cached branch
            ret = self.precals.get(key, 0.0)
        elif key in self.contexts:
            ret = self.contexts[key].count / self.count
        else:
            ret = 0.0

        return log(ret, 2) if log2 else ret

    def probs(self, log2=False):
        """
        Get the probabilities of getting all continuations in the given
        context as a {continuation: probability} dictionary.

        When probabilities were precalculated and log2 is False the
        cached dictionary itself is returned; treat it as read-only.
        """
        if self.precals is None:
            return {key: self.prob(key, log2=log2) for key in self.contexts}
        if not log2:
            return self.precals
        return {key: log(p, 2) for key, p in self.precals.items()}

    def precalc(self):
        """
        Create a static image of the probabilities for this context and,
        recursively, for all contexts below it.

        The image is always rebuilt from the current counts, so calling
        this again after add() refreshes it.
        """
        self.precals = {key: child.count / self.count
                        for key, child in self.contexts.items()}
        for child in self.contexts.values():
            child.precalc()

    def contextProb(self, seq, terminal=False):
        """
        Create for each item in a sequence the probability of observing
        it in the given context.

        Returns a list with one probability per item of seq.  As soon as
        an item is unknown in its context, it and all following items get
        0.0.  With terminal=True one more value is appended: the
        probability that the sequence ends after its last item (0.0 if
        the path is unknown or the final context was never observed).
        """
        context = self
        ret = []
        for key in seq:
            if context is not None and key in context.contexts:
                ret.append(context.prob(key, False))
                context = context.contexts[key]
            else:
                context = None
                ret.append(0.0)

        if terminal:
            # guard the division: an empty model has count == 0
            if context is not None and context.count:
                ret.append(context.terminal / context.count)
            else:
                ret.append(0.0)
        return ret

    def informativity_counts(self):
        """
        Compute, for every segment occurring anywhere below this context,
        the count-weighted mean of -log2 P(segment | context) over all
        contexts in which it occurs.

        Returns a tuple (values, counts) of two dictionaries keyed by
        segment: the weighted mean and the summed count it is based on.
        """
        retvals = {key: -log(child.count / self.count, 2)
                   for key, child in self.contexts.items()}
        retcounts = {key: child.count
                     for key, child in self.contexts.items()}

        for child in self.contexts.values():
            (subvals, subcounts) = child.informativity_counts()
            for seg, subval in subvals.items():
                subcount = subcounts[seg]
                selfval = retvals.get(seg, 0.0)
                selfcount = retcounts.get(seg, 0.0)
                # merge the running mean with the subtree's mean
                retvals[seg] = (selfval * selfcount +
                                subval * subcount) / (subcount + selfcount)
                retcounts[seg] = selfcount + subcount

        return (retvals, retcounts)

    def informativity(self):
        """
        Return only the values dictionary of informativity_counts().
        """
        (informativity, counts) = self.informativity_counts()
        return informativity

    def iter(self, terminal=False, log2=False):
        """
        Generate every stored sequence as a list of dictionaries with the
        keys "seg", "prob" and "count", one per segment, visiting
        continuations in sorted order.

        terminal -- also append an entry with "seg" None for the end of
                    the sequence
        log2     -- report -log2(prob) instead of prob (0 for prob >= 1)
        """
        if log2:
            def logfunc(x):
                return -log(x, 2) if x < 1 else 0
        else:
            def logfunc(x):
                return x

        if self.terminal > 0:
            if terminal:
                # NOTE(review): "count" is the context's total count, not
                # self.terminal -- kept as-is, confirm this is intended
                yield [{"seg": None,
                        "prob": logfunc(self.terminal / self.count),
                        "count": self.count}]
            else:
                # an empty tail ends the recursion for the callers above
                yield []

        for key in sorted(self.contexts):
            child = self.contexts[key]
            for cont in child.iter(terminal=terminal, log2=log2):
                yield [{"seg": key,
                        "prob": logfunc(child.count / self.count),
                        "count": child.count}
                ] + cont

    def __iter__(self):
        for value in self.iter(log2=True, terminal=False):
            yield value

    def asdict(self):
        """
        Return a pure dictionary representation of the object.

        "precals" is True when probabilities have been precalculated;
        the probabilities themselves are not stored because populate()
        can rebuild them from the counts.
        """
        return {"count": self.count,
                "contexts": {key: child.asdict()
                             for key, child in self.contexts.items()},
                "precals": self.precals is not None,
                "terminal": self.terminal}

    def populate(self, d):
        """
        Reconstruct an object from a dictionary created by asdict() and
        return self.

        The only real reason to use this method together with asdict()
        is to save contextRep objects in R / json easily.
        """
        self.count = d["count"]
        self.terminal = d["terminal"]
        self.contexts = {key: contextRep().populate(sub)
                         for key, sub in d["contexts"].items()}
        self.precals = None
        if d["precals"]:
            # precalc() stores its result itself and returns None
            self.precalc()
        return self

    def __eq__(self, other):
        """
        Object equality (only to check asdict / populate): same counts
        and recursively equal continuations.
        """
        if not isinstance(other, contextRep):
            return NotImplemented
        return (self.terminal == other.terminal
                and self.count == other.count
                and self.contexts == other.contexts)

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        result = self.__eq__(other)
        return result if result is NotImplemented else not result
        
            
if __name__ == "__main__":
    # Small self-check / demo: build a model, dump the derived statistics,
    # round-trip it through its dictionary form and list all sequences.
    demo_data = (
        ("ab", 5),
        ("ac", 5),
        ("a", 5),
        ("c", 15),
        ("P AO1 R T N OY0".split(), 1),
    )

    model = contextRep()
    for sequence, weight in demo_data:
        model.add(sequence, weight)

    print(model)
    print(model.informativity())
    print(model.probs())
    for query in ("ab", "a"):
        print(model.contextProb(query))
    snapshot = model.asdict()
    print(snapshot)

    # rebuild a second model from the plain dictionary and compare
    clone = contextRep()
    clone.populate(model.asdict())
    print(clone)
    print(clone == model)
    print(model.informativity() == clone.informativity())

    # every stored sequence, including the end-of-sequence entry
    for path in model.iter(terminal=True):
        print(path)