XPF

File size: 6,492 Bytes

4a08ba7

#!/usr/bin/python3
from __future__ import print_function

from math import log
from collections import deque

class contextRep(object):
    """
    Count-based prefix tree (trie) over sequences of hashable items.

    Every node stands for one context (the prefix that leads to it) and
    stores how often that context was observed, how often a sequence
    ended exactly there, and one child node per observed continuation.
    Probabilities are relative frequencies: child count / node count.
    """

    def __init__(self):
        self.count = 0.0         # times this context was observed
        self.contexts = dict()   # continuation -> contextRep of the longer context
        self.precals = None      # cached {continuation: probability}, or None
        self.terminal = 0.0      # number of times this context was final

    def __repr__(self):
        return repr([self.count, self.terminal, self.contexts])

    def __str__(self):
        return repr(self)

    def add(self, seq, count, func=lambda x: None):
        """
        Add a full sequence to the representation.

        seq   -- iterable of hashable items (a string, a list of symbols...)
        count -- weight added to every node along the path and to the
                 terminal counter of the last node
        func  -- unused; kept so existing callers that pass it keep working

        Walks the path iteratively, so long sequences neither hit the
        recursion limit nor pay for repeated slicing.  Every node whose
        counts change drops its precalculated snapshot, because that
        snapshot would no longer match the counts.
        """
        node = self
        for key in seq:
            node.count += count
            node.precals = None
            child = node.contexts.get(key)
            if child is None:
                child = node.contexts[key] = contextRep()
            node = child

        # the last node is both observed and the end of the sequence
        node.count += count
        node.terminal += count
        node.precals = None

    def prob(self, key, log2=False):
        """
        Get the probability of observing a particular continuation in
        the given context.

        Returns 0.0 for a continuation that was never observed, whether
        or not the probabilities were precalculated.  With log2=True the
        base-2 logarithm is returned; a zero probability maps to -inf
        instead of raising a math domain error.
        """
        if self.precals is not None:
            ret = self.precals.get(key, 0.0)
        elif key in self.contexts:
            ret = self.contexts[key].count / self.count
        else:
            ret = 0.0

        if not log2:
            return ret
        return log(ret, 2) if ret > 0 else float("-inf")

    def probs(self, log2=False):
        """
        Get the probabilities of all continuations in the given context
        as a {continuation: probability} dictionary.

        When the probabilities were precalculated and log2 is False the
        cached dictionary itself is returned (not a copy).
        """
        if self.precals is None:
            return {key: self.prob(key, log2=log2) for key in self.contexts}
        if not log2:
            return self.precals
        return {key: self.prob(key, log2=True) for key in self.precals}

    def precalc(self):
        """
        Create a static image of the probabilities for this node and,
        recursively, for every node below it.
        """
        # Drop any earlier snapshot first so probs() recomputes from the
        # current counts instead of handing the old snapshot back.
        self.precals = None
        self.precals = self.probs()
        for child in self.contexts.values():
            child.precalc()

    def contextProb(self, seq, terminal=False):
        """
        Return, for each item of seq, the probability of observing it
        given the items before it.

        Once an item is unknown in its context, it and every later item
        get 0.0.  With terminal=True one extra value is appended: the
        probability that the sequence ends after its last item (0.0 if
        the path is unknown or the final context was never observed).
        """
        context = self
        ret = []
        for key in seq:
            if context is not None and key in context.contexts:
                ret.append(context.prob(key, False))
                context = context.contexts[key]
            else:
                context = None
                ret.append(0.0)

        if terminal:
            # context.count is 0 only for a node without observations,
            # e.g. querying an empty model with an empty sequence
            if context is not None and context.count:
                ret.append(context.terminal / context.count)
            else:
                ret.append(0.0)
        return ret

    def informativity_counts(self):
        """
        Compute, for every item occurring at or below this node, its
        count-weighted mean surprisal (-log2 of its probability in each
        context where it occurs).

        Returns a pair (values, counts): values maps item -> weighted
        mean surprisal, counts maps item -> total count used as weight.
        """
        retvals = {key: -log(child.count / self.count, 2)
                   for key, child in self.contexts.items()}
        retcounts = {key: child.count
                     for key, child in self.contexts.items()}

        for child in self.contexts.values():
            (subvals, subcounts) = child.informativity_counts()
            # fold the child's aggregate into ours, weighting by count
            for seg in subvals:
                selfval = retvals.get(seg, 0.0)
                selfcount = retcounts.get(seg, 0.0)
                subcount = subcounts[seg]
                total = selfcount + subcount
                retvals[seg] = (selfval * selfcount +
                                subvals[seg] * subcount) / total
                retcounts[seg] = total

        return (retvals, retcounts)

    def informativity(self):
        """
        Return only the item -> weighted mean surprisal mapping of
        informativity_counts().
        """
        (informativity, counts) = self.informativity_counts()
        return informativity

    def iter(self, terminal=False, log2=False):
        """
        Yield every stored sequence as a list of
        {"seg", "prob", "count"} dictionaries, continuations visited in
        sorted order.

        terminal -- if True, each path ends with an extra entry whose
                    "seg" is None and whose "prob" is the probability of
                    ending there; its "count" is the node's total count
        log2     -- if True, "prob" holds -log2(probability), with 0 for
                    probabilities of 1 or more
        """
        logfunc = (lambda x: -log(x, 2) if x < 1 else 0) if log2 else (lambda x: x)

        if self.terminal > 0:
            # a sequence ends here: emit the end of the path
            if terminal:
                yield [{"seg": None,
                        "prob": logfunc(self.terminal / self.count),
                        "count": self.count}]
            else:
                yield []

        for key in sorted(self.contexts):
            child = self.contexts[key]
            for cont in child.iter(terminal=terminal, log2=log2):
                # a fresh dictionary per path so that yielded paths do
                # not share mutable entries
                yield [{"seg": key,
                        "prob": logfunc(child.count / self.count),
                        "count": child.count}] + cont

    def __iter__(self):
        """Iterate over stored sequences with -log2 values, no terminal entry."""
        for value in self.iter(log2=True, terminal=False):
            yield value

    def asdict(self):
        """
        Return a pure dictionary representation of the object.

        "precals" is True when this node holds a precalculated
        snapshot; the snapshot itself is not stored because populate()
        can rebuild it from the counts.
        """
        ret = {"count": self.count,
               "contexts": {key: child.asdict()
                            for key, child in self.contexts.items()},
               "precals": self.precals is not None,
               "terminal": self.terminal}
        return ret

    def populate(self, d):
        """
        Rebuild this object in place from a dictionary created by
        asdict() and return self.

        The main reason to use asdict() + populate() is to save
        contextRep objects in R / json easily.  Dictionaries written by
        older versions carry an inverted "precals" flag; loading them is
        harmless because the snapshot only caches values that are
        otherwise computed from the counts.
        """
        self.count = d["count"]
        self.terminal = d["terminal"]
        self.contexts = {key: contextRep().populate(sub)
                         for key, sub in d["contexts"].items()}
        # Children restored their own snapshot from their own flag, so
        # only this node's snapshot is rebuilt here.
        self.precals = None
        if d.get("precals"):
            self.precals = self.probs()
        return self

    def __eq__(self, other):
        """
        Object equality (only to check asdict / populate): same counts,
        same terminal counts and recursively equal continuations.  The
        precalculated snapshot is not compared.
        """
        if not isinstance(other, contextRep):
            return False
        return (self.terminal == other.terminal
                and self.count == other.count
                and self.contexts == other.contexts)

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        return not self == other

    # Instances are mutable and compare by value, so they must not be hashed.
    __hash__ = None
        
            
if __name__ == "__main__":
    # Small demonstration: build a model from a few weighted sequences,
    # print its views, then round-trip it through asdict() / populate().
    model = contextRep()

    observations = (
        ("ab", 5),
        ("ac", 5),
        ("a", 5),
        ("c", 15),
        ("P AO1 R T N OY0".split(), 1),
    )
    for sequence, weight in observations:
        model.add(sequence, weight)

    print(model)
    print(model.informativity())
    print(model.probs())
    for query in ("ab", "a"):
        print(model.contextProb(query))
    print(model.asdict())

    # Rebuild a second model from the dictionary form; it should compare
    # equal to the first and give the same informativity values.
    clone = contextRep()
    clone.populate(model.asdict())
    print(clone)
    print(clone == model)
    print(model.informativity() == clone.informativity())

    # Dump every stored path, including the end-of-sequence entries.
    for path in model.iter(terminal=True):
        print(path)