Spaces:

kabuda777
/

Code2MCP-esm

Running

Code2MCP-esm / esm /source /examples /lm-design /utils /ngram.py

kabudadada

Add esm folder and minimal app

e76b79a 6 months ago

2.3 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.
	#
	from collections import Counter
	from pathlib import Path
	import pickle
	import random
	import time

	from nltk import ngrams
	import numpy as np


	# Constants for preprocessing.
	# Amino-acid alphabet: the position of a letter in this list is its integer code.
	seq_encode = list('LAGVSERTIDPKQNFYMHWC')
	# Directory with the pickled n-gram tables; a relative path, so it is resolved
	# against the current working directory when the files are opened.
	BASE = Path('./utils/ngram_stats/')
	# One table per n-gram order, in loading order: monogram, bigram, trigram, quadgram.
	ngram_list = []
	for fn in ("monogram_seg.p", "bigram_seg.p", "trigram_seg.p", "quadgram_seg.p"):
	    with (BASE / fn).open("rb") as f:
	        ngram_list.append(pickle.load(f))

	# Recompute ngram frequency based off the valid sequences used for design
	def _reindex_ngram_table(ngram_dict):
	    """Re-key one n-gram table by residue indices and normalise its values.

	    Args:
	        ngram_dict: Mapping from an n-gram (an iterable of residue letters)
	            to a numeric weight.

	    Returns:
	        A dict mapping tuples of ``seq_encode`` indices to frequencies.
	        N-grams containing a symbol that is not in ``seq_encode`` are dropped
	        before the total is computed, and every frequency is floored at 1e-5.
	    """
	    aa_to_idx = {aa: idx for idx, aa in enumerate(seq_encode)}

	    idx_dict = {}
	    for ngram, weight in ngram_dict.items():
	        try:
	            key = tuple(aa_to_idx[aa] for aa in ngram)
	        except KeyError:
	            # At least one symbol is outside the design alphabet: skip the n-gram.
	            continue
	        idx_dict[key] = weight

	    total = sum(idx_dict.values())
	    # Min value for ngram is 1e-5
	    return {key: max(weight / total, 1e-5) for key, weight in idx_dict.items()}


	# Replace each loaded table in place, so ngram_list keeps its identity and order.
	for i, ngram_dict in enumerate(ngram_list):
	    ngram_list[i] = _reindex_ngram_table(ngram_dict)

	def encode(seq):
	    """Return ``seq`` as a numpy array of residue indices.

	    A string is converted letter by letter using each letter's position in
	    ``seq_encode``; a numpy array is assumed to be encoded already and is
	    returned unchanged (same object). Any other type raises ``ValueError``.
	    """
	    if isinstance(seq, str):
	        return np.array(list(map(seq_encode.index, seq)))
	    if isinstance(seq, np.ndarray):
	        # Already encoded: hand the caller's array straight back.
	        return seq
	    raise ValueError(f'Unknown seq type {seq}')


	def compute_kl_div(seq, order):
	    """Return the KL divergence of the n-gram frequencies of ``seq`` from the reference table.

	    Args:
	        seq: Sequence to score; anything ``encode`` accepts, i.e. a string of
	            letters from ``seq_encode`` or an already-encoded numpy array of
	            residue indices (0 to 19 inclusive).
	        order: 1-based n-gram order. ``ngram_list[order - 1]`` supplies the
	            reference frequencies, so with the four tables loaded by this
	            module the valid values are 1 (monogram) to 4 (quadgram).

	    Returns:
	        ``sum(p * log(p / q))`` over the n-grams that occur in ``seq``, where
	        ``p`` is the observed frequency in ``seq`` and ``q`` is the reference
	        frequency (1e-5 for n-grams missing from the table). A sequence
	        shorter than ``order`` has no n-grams and yields 0.0.

	    Raises:
	        ValueError: If ``order`` is smaller than 1, or ``encode`` rejects ``seq``.
	        IndexError: If ``order`` exceeds the number of loaded tables.
	    """
	    # Without this guard, order <= 0 would wrap around through negative
	    # indexing and silently pick a table for a different n-gram order.
	    if order < 1:
	        raise ValueError(f'order must be >= 1, got {order}')
	    order_dict = ngram_list[order - 1]
	    seq = encode(seq)

	    # Compute ngram frequency rate for the input sequence
	    observed = Counter(ngrams(seq, n=order))
	    total = sum(observed.values())

	    # p and q are built from the same dict, so their entries line up.
	    p = np.array([count / total for count in observed.values()])  # observed probabilities of ngrams
	    q = np.array([order_dict.get(gram, 1e-5) for gram in observed])  # learned ngram probabilities
	    return np.sum(p * np.log(p / q))