Spaces:
No application file
No application file
| # Copyright 2002 by Jeffrey Chang. | |
| # All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
"""A state-emitting MarkovModel.

Note terminology similar to Manning and Schutze is used.

Functions:
train_bw        Train a markov model using the Baum-Welch algorithm.
train_visible   Train a visible markov model using MLE.
find_states     Find a state sequence that explains some observations.
load            Load a MarkovModel.
save            Save a MarkovModel.

Classes:
MarkovModel     Holds the description of a markov model
"""
| import numpy | |
try:
    # Prefer the fast C implementation when available.
    logaddexp = numpy.logaddexp
except AttributeError:
    # Numpy versions older than 1.3 do not contain logaddexp.
    # Once we require Numpy version 1.3 or later, we should revisit this
    # module to see if we can simplify some of the other functions in
    # this module.
    import warnings

    warnings.warn(
        "For optimal speed, please update to Numpy version 1.3 or later (current version is %s)"
        % numpy.__version__
    )

    def logaddexp(logx, logy):
        """Implement logaddexp method if Numpy version is older than 1.3.

        Computes log(exp(logx) + exp(logy)) without leaving log space.
        """
        # If one operand dominates by more than 100 in log space, the
        # other's contribution is below double precision; return it as-is.
        if logy - logx > 100:
            return logy
        elif logx - logy > 100:
            return logx
        # Factor out the smaller operand; the guard above bounds the
        # exponent difference, so the exponentials cannot overflow.
        minxy = min(logx, logy)
        return minxy + numpy.log(numpy.exp(logx - minxy) + numpy.exp(logy - minxy))
def itemindex(values):
    """Return a dictionary of values with their sequence offset as keys.

    For duplicated values, the offset of the first occurrence is kept
    (equivalent to a reverse scan where earlier entries overwrite later
    ones).
    """
    indices = {}
    for offset, item in enumerate(values):
        if item not in indices:
            indices[item] = offset
    return indices
# Seed the global RNG (used by _random_norm to draw random starting
# parameters for Baum-Welch training).
numpy.random.seed()

# Floor used to represent "probability zero" safely in log space.
VERY_SMALL_NUMBER = 1e-300
LOG0 = numpy.log(VERY_SMALL_NUMBER)
class MarkovModel:
    """A state-emitting Markov model.

    Holds the state names, the output alphabet, and the initial,
    transition and emission probability matrices (any of which may be
    None until trained or loaded).
    """

    def __init__(
        self, states, alphabet, p_initial=None, p_transition=None, p_emission=None
    ):
        """Store the model description."""
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission

    def __str__(self):
        """Return the text serialization of this model (see ``save``)."""
        from io import StringIO

        buffer = StringIO()
        save(self, buffer)
        return buffer.getvalue()
| def _readline_and_check_start(handle, start): | |
| """Read the first line and evaluate that begisn with the correct start (PRIVATE).""" | |
| line = handle.readline() | |
| if not line.startswith(start): | |
| raise ValueError(f"I expected {start!r} but got {line!r}") | |
| return line | |
def load(handle):
    """Parse a file handle into a MarkovModel object.

    The handle must contain text in the format produced by ``save``:
    STATES and ALPHABET header lines, then the INITIAL, TRANSITION and
    EMISSION sections, each with one "<state>: <values>" line per state.
    Raises ValueError (via _readline_and_check_start) on malformed input.
    """
    # Load the states.
    line = _readline_and_check_start(handle, "STATES:")
    states = line.split()[1:]
    # Load the alphabet.
    line = _readline_and_check_start(handle, "ALPHABET:")
    alphabet = line.split()[1:]
    mm = MarkovModel(states, alphabet)
    N, M = len(states), len(alphabet)
    # Load the initial probabilities (one "<state>: <p>" line per state).
    mm.p_initial = numpy.zeros(N)
    line = _readline_and_check_start(handle, "INITIAL:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, f" {states[i]}:")
        mm.p_initial[i] = float(line.split()[-1])
    # Load the transition matrix; row i holds P(next state | state i).
    mm.p_transition = numpy.zeros((N, N))
    line = _readline_and_check_start(handle, "TRANSITION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, f" {states[i]}:")
        mm.p_transition[i, :] = [float(v) for v in line.split()[1:]]
    # Load the emission matrix; row i holds P(output | state i).
    mm.p_emission = numpy.zeros((N, M))
    line = _readline_and_check_start(handle, "EMISSION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, f" {states[i]}:")
        mm.p_emission[i, :] = [float(v) for v in line.split()[1:]]
    return mm
def save(mm, handle):
    """Write MarkovModel ``mm`` to ``handle`` in the format read by ``load``."""
    # This will fail if there are spaces in the states or alphabet.
    handle.write(f"STATES: {' '.join(mm.states)}\n")
    handle.write(f"ALPHABET: {' '.join(mm.alphabet)}\n")
    handle.write("INITIAL:\n")
    for state, p in zip(mm.states, mm.p_initial):
        handle.write(f" {state}: {p:g}\n")
    handle.write("TRANSITION:\n")
    for state, row in zip(mm.states, mm.p_transition):
        handle.write(f" {state}: {' '.join(str(x) for x in row)}\n")
    handle.write("EMISSION:\n")
    for state, row in zip(mm.states, mm.p_emission):
        handle.write(f" {state}: {' '.join(str(x) for x in row)}\n")
# XXX allow them to specify starting points
def train_bw(
    states,
    alphabet,
    training_data,
    pseudo_initial=None,
    pseudo_transition=None,
    pseudo_emission=None,
    update_fn=None,
):
    """Train a MarkovModel using the Baum-Welch algorithm.

    states is a list of strings that describe the names of each state.
    alphabet is a list of objects that indicate the allowed outputs.
    training_data is a list of observations.  Each observation is a
    list of objects from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix, before
    normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    Raises ValueError for empty training data, zero-length
    observations, or pseudo-count matrices of the wrong shape.
    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError("No training data given.")
    # Validate the shapes of any pseudo-count matrices up front.
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition not shape len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission not shape len(states) X len(alphabet)")
    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_outputs = []
    indexes = itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])
    # Do some sanity checking on the outputs.
    lengths = [len(x) for x in training_outputs]
    if min(lengths) == 0:
        raise ValueError("I got training data with outputs of length 0")
    # Do the training with baum welch.
    x = _baum_welch(
        N,
        M,
        training_outputs,
        pseudo_initial=pseudo_initial,
        pseudo_transition=pseudo_transition,
        pseudo_emission=pseudo_emission,
        update_fn=update_fn,
    )
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
# Maximum number of Baum-Welch iterations before giving up on convergence.
MAX_ITERATIONS = 1000
def _baum_welch(
    N,
    M,
    training_outputs,
    p_initial=None,
    p_transition=None,
    p_emission=None,
    pseudo_initial=None,
    pseudo_transition=None,
    pseudo_emission=None,
    update_fn=None,
):
    """Implement the Baum-Welch algorithm to evaluate unknown parameters in the MarkovModel object (PRIVATE).

    Returns [p_initial, p_transition, p_emission] in probability space.
    Raises RuntimeError if the log likelihood has not converged within
    MAX_ITERATIONS iterations.
    """
    # Start from random (normalized) parameters unless explicit starting
    # points were supplied; supplied matrices are copied and validated.
    if p_initial is None:
        p_initial = _random_norm(N)
    else:
        p_initial = _copy_and_check(p_initial, (N,))
    if p_transition is None:
        p_transition = _random_norm((N, N))
    else:
        p_transition = _copy_and_check(p_transition, (N, N))
    if p_emission is None:
        p_emission = _random_norm((N, M))
    else:
        p_emission = _copy_and_check(p_emission, (N, M))
    # Do all the calculations in log space to avoid underflows.
    lp_initial = numpy.log(p_initial)
    lp_transition = numpy.log(p_transition)
    lp_emission = numpy.log(p_emission)
    if pseudo_initial is not None:
        lpseudo_initial = numpy.log(pseudo_initial)
    else:
        lpseudo_initial = None
    if pseudo_transition is not None:
        lpseudo_transition = numpy.log(pseudo_transition)
    else:
        lpseudo_transition = None
    if pseudo_emission is not None:
        lpseudo_emission = numpy.log(pseudo_emission)
    else:
        lpseudo_emission = None
    # Iterate through each sequence of output, updating the parameters
    # to the HMM.  Stop when the log likelihoods of the sequences
    # stops varying.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            # _baum_welch_one updates lp_* in place and returns the log
            # likelihood of this output sequence.
            llik += _baum_welch_one(
                N,
                M,
                outputs,
                lp_initial,
                lp_transition,
                lp_emission,
                lpseudo_initial,
                lpseudo_transition,
                lpseudo_emission,
            )
        if update_fn is not None:
            update_fn(i, llik)
        # Converged once the total log likelihood moves by less than 0.1.
        if prev_llik is not None and numpy.fabs(prev_llik - llik) < 0.1:
            break
        prev_llik = llik
    else:
        raise RuntimeError("HMM did not converge in %d iterations" % MAX_ITERATIONS)
    # Return everything back in normal space.
    return [numpy.exp(_) for _ in (lp_initial, lp_transition, lp_emission)]
def _baum_welch_one(
    N,
    M,
    outputs,
    lp_initial,
    lp_transition,
    lp_emission,
    lpseudo_initial,
    lpseudo_transition,
    lpseudo_emission,
):
    """Execute one step for Baum-Welch algorithm (PRIVATE).

    Do one iteration of Baum-Welch based on a sequence of output.
    Changes the value for lp_initial, lp_transition and lp_emission in
    place.  Returns the log likelihood of the output sequence under the
    pre-update parameters (see the note at the end).
    """
    T = len(outputs)
    # Forward matrix is N x (T+1); its last column accumulates the total
    # probability of the output (see _forward).
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)
    # Calculate the probability of traversing each arc for any given
    # transition.
    lp_arc = numpy.zeros((N, N, T))
    for t in range(T):
        k = outputs[t]
        lp_traverse = numpy.zeros((N, N))  # P going over one arc.
        for i in range(N):
            for j in range(N):
                # P(getting to this arc)
                # P(making this transition)
                # P(emitting this character)
                # P(going to the end)
                lp = (
                    fmat[i][t]
                    + lp_transition[i][j]
                    + lp_emission[i][k]
                    + bmat[j][t + 1]
                )
                lp_traverse[i][j] = lp
        # Normalize the probability for this time step.
        lp_arc[:, :, t] = lp_traverse - _logsum(lp_traverse)
    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = numpy.zeros((N, T))
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i, :, t])
    # Sum of all the transitions out of state i.
    lp_arcout = numpy.zeros(N)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i, :])
    # UPDATE P_INITIAL.
    lp_initial = lp_arcout_t[:, 0]
    if lpseudo_initial is not None:
        # Add pseudo-counts (already in log space) and renormalize.
        lp_initial = _logvecadd(lp_initial, lpseudo_initial)
        lp_initial = lp_initial - _logsum(lp_initial)
    # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the
    # transitions from i to j, normalized by the sum of the
    # transitions out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i, j, :]) - lp_arcout[i]
        if lpseudo_transition is not None:
            lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition)
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])
    # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the
    # transitions out of i when k is observed, divided by the sum of
    # the transitions out of i.
    for i in range(N):
        ksum = numpy.zeros(M) + LOG0  # ksum[k] is the sum of all i with k.
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = logaddexp(ksum[k], lp_arc[i, j, t])
        ksum = ksum - _logsum(ksum)  # Normalize
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)  # Renormalize
        lp_emission[i, :] = ksum
    # Calculate the log likelihood of the output based on the forward
    # matrix.  Since the parameters of the HMM has changed, the log
    # likelihoods are going to be a step behind, and we might be doing
    # one extra iteration of training.  The alternative is to rerun
    # the _forward algorithm and calculate from the clean one, but
    # that may be more expensive than overshooting the training by one
    # step.
    return _logsum(fmat[:, T])
def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    """Implement forward algorithm (PRIVATE).

    Calculate a Nx(T+1) matrix, where the last column is the total
    probability of the output.  All values are in log space.
    """
    matrix = numpy.zeros((N, T + 1))
    # Initialize the first column to be the initial values.
    matrix[:, 0] = lp_initial
    for t in range(1, T + 1):
        k = outputs[t - 1]
        for j in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t-1.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t - 1] + lp_transition[i][j] + lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[j][t] = lprob
    return matrix
def _backward(N, T, lp_transition, lp_emission, outputs):
    """Implement backward algorithm (PRIVATE).

    Calculate a Nx(T+1) matrix in log space.  The last column stays at
    zero (log 1) as the base case; columns are filled right-to-left.
    """
    matrix = numpy.zeros((N, T + 1))
    for t in range(T - 1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t+1.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t + 1] + lp_transition[i][j] + lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[i][t] = lprob
    return matrix
def train_visible(
    states,
    alphabet,
    training_data,
    pseudo_initial=None,
    pseudo_transition=None,
    pseudo_emission=None,
):
    """Train a visible MarkovModel using maximum likelihoood estimates for each of the parameters.

    states is a list of strings that describe the names of each state.
    alphabet is a list of objects that indicate the allowed outputs.
    training_data is a list of (outputs, observed states) where outputs
    is a list of the emission from the alphabet, and observed states is
    a list of states from states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix.

    Raises ValueError for mis-shaped pseudo-count matrices or
    misaligned (outputs, states) pairs.
    """
    N, M = len(states), len(alphabet)
    # Validate the shapes of any pseudo-count matrices up front.
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition not shape len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission not shape len(states) X len(alphabet)")
    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_states, training_outputs = [], []
    states_indexes = itemindex(states)
    outputs_indexes = itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError("states and outputs not aligned")
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])
    # Estimate the parameters by counting, then normalizing.
    x = _mle(
        N,
        M,
        training_outputs,
        training_states,
        pseudo_initial,
        pseudo_transition,
        pseudo_emission,
    )
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
| def _mle( | |
| N, | |
| M, | |
| training_outputs, | |
| training_states, | |
| pseudo_initial, | |
| pseudo_transition, | |
| pseudo_emission, | |
| ): | |
| """Implement Maximum likelihood estimation algorithm (PRIVATE).""" | |
| # p_initial is the probability that a sequence of states starts | |
| # off with a particular one. | |
| p_initial = numpy.zeros(N) | |
| if pseudo_initial: | |
| p_initial = p_initial + pseudo_initial | |
| for states in training_states: | |
| p_initial[states[0]] += 1 | |
| p_initial = _normalize(p_initial) | |
| # p_transition is the probability that a state leads to the next | |
| # one. C(i,j)/C(i) where i and j are states. | |
| p_transition = numpy.zeros((N, N)) | |
| if pseudo_transition: | |
| p_transition = p_transition + pseudo_transition | |
| for states in training_states: | |
| for n in range(len(states) - 1): | |
| i, j = states[n], states[n + 1] | |
| p_transition[i, j] += 1 | |
| for i in range(len(p_transition)): | |
| p_transition[i, :] = p_transition[i, :] / sum(p_transition[i, :]) | |
| # p_emission is the probability of an output given a state. | |
| # C(s,o)|C(s) where o is an output and s is a state. | |
| p_emission = numpy.zeros((N, M)) | |
| if pseudo_emission: | |
| p_emission = p_emission + pseudo_emission | |
| p_emission = numpy.ones((N, M)) | |
| for outputs, states in zip(training_outputs, training_states): | |
| for o, s in zip(outputs, states): | |
| p_emission[s, o] += 1 | |
| for i in range(len(p_emission)): | |
| p_emission[i, :] = p_emission[i, :] / sum(p_emission[i, :]) | |
| return p_initial, p_transition, p_emission | |
| def _argmaxes(vector, allowance=None): | |
| """Return indices of the maximum values aong the vector (PRIVATE).""" | |
| return [numpy.argmax(vector)] | |
def find_states(markov_model, output):
    """Find states in the given Markov model output.

    Returns a list of (states, score) tuples, where states is a list of
    state names and score is the probability of that path.
    """
    mm = markov_model
    N = len(mm.states)
    # _viterbi does calculations in log space.  Add a tiny bit to the
    # matrices so that the logs will not break.
    lp_initial = numpy.log(mm.p_initial + VERY_SMALL_NUMBER)
    lp_transition = numpy.log(mm.p_transition + VERY_SMALL_NUMBER)
    lp_emission = numpy.log(mm.p_emission + VERY_SMALL_NUMBER)
    # Change output into a list of indexes into the alphabet.
    indexes = itemindex(mm.alphabet)
    output = [indexes[x] for x in output]
    # Run the viterbi algorithm.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)
    # Map state indexes back to state names and move the scores from
    # log space back to probabilities.
    for i in range(len(results)):
        states, score = results[i]
        results[i] = [mm.states[x] for x in states], numpy.exp(score)
    return results
def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    """Implement Viterbi algorithm to find most likely states for a given input (PRIVATE).

    Works entirely in log space.  Returns a list of (states, log score)
    tuples, one per best path recovered from the backtrace.
    """
    T = len(output)
    # Store the backtrace in a NxT matrix.
    backtrace = []  # list of indexes of states in previous timestep.
    for i in range(N):
        backtrace.append([None] * T)
    # Store the best scores.
    scores = numpy.zeros((N, T))
    scores[:, 0] = lp_initial + lp_emission[:, output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:, t - 1] + lp_transition[:, j] + lp_emission[j, k]
            indexes = _argmaxes(i_scores)
            scores[j, t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes
    # Do the backtrace.  First, find a good place to start.  Then,
    # we'll follow the backtrace matrix to find the list of states.
    # In the event of ties, there may be multiple paths back through
    # the matrix, which implies a recursive solution.  We'll simulate
    # it by keeping our own stack.
    in_process = []  # list of (t, states, score)
    results = []  # return values.  list of (states, score)
    indexes = _argmaxes(scores[:, T - 1])  # pick the first place
    for i in indexes:
        in_process.append((T - 1, [i], scores[i][T - 1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            # Reached the start of the sequence; this path is complete.
            results.append((states, score))
        else:
            # Extend the partial path one step back in time.
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t - 1, [i] + states, score))
    return results
| def _normalize(matrix): | |
| """Normalize matrix object (PRIVATE).""" | |
| if len(matrix.shape) == 1: | |
| matrix = matrix / sum(matrix) | |
| elif len(matrix.shape) == 2: | |
| # Normalize by rows. | |
| for i in range(len(matrix)): | |
| matrix[i, :] = matrix[i, :] / sum(matrix[i, :]) | |
| else: | |
| raise ValueError("I cannot handle matrixes of that shape") | |
| return matrix | |
def _uniform_norm(shape):
    """Return a matrix of the given shape holding equal probabilities (PRIVATE)."""
    ones = numpy.ones(shape)
    return _normalize(ones)
def _random_norm(shape):
    """Return a randomly drawn, normalized matrix of the given shape (PRIVATE)."""
    draws = numpy.random.random(shape)
    return _normalize(draws)
| def _copy_and_check(matrix, desired_shape): | |
| """Copy a matrix and check its dimension. Normalize at the end (PRIVATE).""" | |
| # Copy the matrix. | |
| matrix = numpy.array(matrix, copy=1) | |
| # Check the dimensions. | |
| if matrix.shape != desired_shape: | |
| raise ValueError("Incorrect dimension") | |
| # Make sure it's normalized. | |
| if len(matrix.shape) == 1: | |
| if numpy.fabs(sum(matrix) - 1.0) > 0.01: | |
| raise ValueError("matrix not normalized to 1.0") | |
| elif len(matrix.shape) == 2: | |
| for i in range(len(matrix)): | |
| if numpy.fabs(sum(matrix[i]) - 1.0) > 0.01: | |
| raise ValueError("matrix %d not normalized to 1.0" % i) | |
| else: | |
| raise ValueError("I don't handle matrices > 2 dimensions") | |
| return matrix | |
def _logsum(matrix):
    """Return the log of the sum of the exponentials of all entries (PRIVATE).

    Accepts a vector or a matrix; accumulation stays in log space via
    logaddexp so very small probabilities do not underflow.
    """
    # Flatten so vectors and matrices share one code path.  ravel
    # replaces the old reshape(numpy.product(shape)) idiom, since
    # numpy.product was removed in NumPy 2.0.
    if len(matrix.shape) > 1:
        vec = numpy.ravel(matrix)
    else:
        vec = matrix
    # Named "total" to avoid shadowing the builtin sum.
    total = LOG0
    for num in vec:
        total = logaddexp(total, num)
    return total
| def _logvecadd(logvec1, logvec2): | |
| """Implement a log sum for two vector objects (PRIVATE).""" | |
| assert len(logvec1) == len(logvec2), "vectors aren't the same length" | |
| sumvec = numpy.zeros(len(logvec1)) | |
| for i in range(len(logvec1)): | |
| sumvec[i] = logaddexp(logvec1[i], logvec2[i]) | |
| return sumvec | |
def _exp_logsum(numbers):
    """Return the exponential of a logsum (PRIVATE)."""
    total = _logsum(numbers)
    return numpy.exp(total)