File size: 6,492 Bytes
4a08ba7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
#!/usr/bin/python3
from __future__ import print_function
from math import log
from collections import deque
class contextRep(object):
    """
    Prefix tree (trie) of weighted sequences.

    Every node stands for one context, i.e. the prefix that leads to it.
    A node records how often the context was observed (``count``), how
    often a sequence ended exactly there (``terminal``) and one child node
    per observed continuation (``contexts``).  Probabilities are relative
    frequencies derived from those counts.
    """

    # Instances are mutable and compare by value, so they are not hashable.
    __hash__ = None

    def __init__(self):
        self.count = 0.0        # times this context was observed
        self.contexts = dict()  # continuation -> child contextRep
        self.precals = None     # cached {continuation: probability} or None
        self.terminal = 0.0     # number of times this context was final

    def __repr__(self):
        return repr([self.count, self.terminal, self.contexts])

    def __str__(self):
        return repr(self)

    def add(self, seq, count, func=lambda x: None):
        """
        Add a full sequence to the representation.

        seq   -- iterable of hashable items (a string, a list of tokens...)
        count -- weight added to every context on the path of ``seq``; the
                 last context also adds it to its terminal count
        func  -- never called; kept so existing callers keep working

        Cached probabilities (see precalc) are not refreshed by this call.
        """
        # Walk the path iteratively: no per-item slicing and no recursion
        # limit for long sequences.
        node = self
        for key in seq:
            node.count += count
            child = node.contexts.get(key)
            if child is None:
                child = node.contexts[key] = contextRep()
            node = child
        node.terminal += count
        node.count += count

    def prob(self, key, log2=False):
        """
        Get the probability of observing a particular continuation in
        the given context.

        Returns 0.0 for a continuation that was never observed, whether or
        not the probabilities were precalculated.  With ``log2`` the base-2
        logarithm is returned instead; math.log raises ValueError when the
        probability is zero.
        """
        if self.precals is None:
            child = self.contexts.get(key)
            ret = child.count / self.count if child is not None else 0.0
        else:
            # .get keeps unseen keys at 0.0, like the uncached branch
            ret = self.precals.get(key, 0.0)
        return log(ret, 2) if log2 else ret

    def probs(self, log2=False):
        """
        Get the probabilities of all continuations in the given context
        as a {continuation: probability} dictionary.

        When probabilities were precalculated and ``log2`` is false, the
        cached dictionary itself is returned (not a copy).
        """
        if self.precals is None:
            return {key: self.prob(key, log2=log2) for key in self.contexts}
        if not log2:
            return self.precals
        return {key: log(p, 2) for key, p in self.precals.items()}

    def precalc(self):
        """
        Create a static image of the probabilities for this context and,
        recursively, for every context below it.

        Calling it again rebuilds the image from the current counts.
        """
        # Drop any earlier image first; otherwise probs() would hand the
        # old dictionary straight back and nothing would be recomputed.
        self.precals = None
        self.precals = self.probs()
        for child in self.contexts.values():
            child.precalc()

    def contextProb(self, seq, terminal=False):
        """
        Create for each item in a sequence the probability of observing
        it in the context formed by the items before it.

        Once an item is unknown in its context, it and all later items get
        probability 0.0.  With ``terminal`` one more value is appended: the
        probability that the sequence ends after its last item (0.0 when
        the path is unknown or the final context was never observed).
        """
        context = self
        ret = []
        for key in seq:
            if context is not None and key in context.contexts:
                ret.append(context.prob(key, False))
                context = context.contexts[key]
            else:
                context = None
                ret.append(0.0)
        if terminal:
            # context.count is zero only for a context nothing was added to
            if context is not None and context.count:
                ret.append(context.terminal / context.count)
            else:
                ret.append(0.0)
        return ret

    def informativity_counts(self):
        """
        Compute for every item its mean surprisal, -log2 of its probability
        in a context, averaged over all contexts at or below this node and
        weighted by how often the item occurred in each of them.

        Returns a pair (values, counts) of dictionaries keyed by item:
        the weighted mean and the total count it is based on.  The raw
        counts are always used, never the precalculated probabilities.
        """
        retvals = {key: -log(child.count / self.count, 2)
                   for key, child in self.contexts.items()}
        retcounts = {key: child.count
                     for key, child in self.contexts.items()}
        for child in self.contexts.values():
            (subvals, subcounts) = child.informativity_counts()
            # fold the subtree's averages into the running ones
            for seg, subval in subvals.items():
                subcount = subcounts[seg]
                selfval = retvals.get(seg, 0.0)
                selfcount = retcounts.get(seg, 0.0)
                retvals[seg] = (selfval * selfcount +
                                subval * subcount) / (subcount + selfcount)
                retcounts[seg] = selfcount + subcount
        return (retvals, retcounts)

    def informativity(self):
        """
        Return only the {item: mean surprisal} part of
        informativity_counts().
        """
        (informativity, counts) = self.informativity_counts()
        return informativity

    def iter(self, terminal=False, log2=False):
        """
        Generate every stored sequence, in sorted key order, as a list of
        {"seg", "prob", "count"} dictionaries, one per item.

        terminal -- also append an entry with "seg" None that carries the
                    probability of the sequence ending at that point
        log2     -- report -log2(prob) instead of prob (0 when prob >= 1)
        """
        if log2:
            def scale(p):
                return -log(p, 2) if p < 1 else 0
        else:
            def scale(p):
                return p
        if self.terminal > 0:
            if terminal:
                # NOTE(review): "count" is the context count here, not
                # self.terminal -- confirm this is intended.
                yield [{"seg": None,
                        "prob": scale(self.terminal / self.count),
                        "count": self.count}]
            else:
                yield []
        for key in sorted(self.contexts):
            child = self.contexts[key]
            step_prob = scale(child.count / self.count)
            for cont in child.iter(terminal=terminal, log2=log2):
                # a fresh dictionary per path, so paths share no state
                yield [{"seg": key,
                        "prob": step_prob,
                        "count": child.count}] + cont

    def __iter__(self):
        """
        Iterate over the stored sequences with -log2 probabilities and
        without terminal entries.
        """
        for value in self.iter(log2=True, terminal=False):
            yield value

    def asdict(self):
        """
        Return a pure dictionary representation of the object (nested
        dictionaries, numbers and a boolean), e.g. for saving as json.

        "precals" is True when the probabilities of this node are
        precalculated; the cached values themselves are not stored.
        """
        ret = {"count": self.count,
               "contexts": {key: child.asdict()
                            for key, child in self.contexts.items()},
               "precals": self.precals is not None,
               "terminal": self.terminal}
        return ret

    def populate(self, d):
        """
        Rebuild this object in place from a dictionary created by asdict
        and return self.

        Precalculated probabilities are recomputed from the counts for
        every node whose "precals" flag is set.
        """
        self.count = d["count"]
        self.terminal = d["terminal"]
        self.contexts = {key: contextRep().populate(sub)
                         for key, sub in d["contexts"].items()}
        # Reset first so probs() derives the values from the counts.
        self.precals = None
        if d["precals"]:
            self.precals = self.probs()
        return self

    @classmethod
    def fromdict(cls, d):
        """
        Create a new object from a dictionary created by asdict.
        """
        return cls().populate(d)

    def __eq__(self, other):
        """
        Object equality (only to check asdict / populate): same counts,
        same terminal counts and equal children under the same keys.
        Precalculated probabilities are not compared.
        """
        if not isinstance(other, contextRep):
            return False
        if self.terminal != other.terminal or self.count != other.count:
            return False
        if set(self.contexts) != set(other.contexts):
            return False
        return all(child == other.contexts[key]
                   for key, child in self.contexts.items())

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__.
        return not self == other
if __name__ == "__main__":
    # Smoke test: build a small model, print its statistics, then check
    # that it survives a round trip through its dictionary form.
    samples = [
        ("ab", 5),
        ("ac", 5),
        ("a", 5),
        ("c", 15),
        ("P AO1 R T N OY0".split(), 1),
    ]
    model = contextRep()
    for sequence, weight in samples:
        model.add(sequence, weight)

    print(model)
    print(model.informativity())
    print(model.probs())
    for probe in ("ab", "a"):
        print(model.contextProb(probe))

    # Serialise, then rebuild a second object from the dictionary.
    as_dictionary = model.asdict()
    print(as_dictionary)
    restored = contextRep()
    restored.populate(as_dictionary)
    print(restored)
    print(restored == model)
    print(model.informativity() == restored.informativity())

    # List every stored sequence, including the end-of-sequence entries.
    for path in model.iter(terminal=True):
        print(path)
|