Axiovora-X / external /alphageometry /models.py

Add files using upload-large-folder tool

6d92a68 verified about 2 months ago

6.06 kB

	# Copyright 2023 DeepMind Technologies Limited
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================

	"""Transformer language model generate mode."""

	from typing import Any, Tuple
	import beam_search
	import decoder_stack
	import gin
	import jax
	import jax.numpy as jnp
	from transformer import models


	@gin.configurable
	class DecoderOnlyLanguageModelGenerate(models.DecoderOnlyLanguageModel):
	  """Decoder-only language model used in inference (generate) mode.

	  `__call__` accepts only the 'init' and 'beam_search' modes. It obtains a
	  decoder state, either by replaying the prompt through `generate` or by
	  reusing `inputs['dstate']`, and then runs `beam_search.beam_search_flat`
	  seeded with the last prompt token.
	  """

	  decoder_factory = decoder_stack.DecoderStackGenerate

	  # Both values must be supplied through gin; they size the fake decoder
	  # state built in `get_fake_input`.
	  num_heads: int = gin.REQUIRED
	  head_size: int = gin.REQUIRED

	  def get_fake_input(self) -> dict[str, Any]:
	    """Returns the parent's fake input extended with generate-mode entries.

	    Returns:
	      The dict produced by the parent class, updated with placeholder values
	      under the keys 'dstate', 'eos', 'mask', 'length' and 'temperature'.
	    """
	    fake_input_dict = super().get_fake_input()
	    b = self.task_config.batch_size
	    n = self.num_heads
	    h = self.head_size

	    # NOTE(review): 2048, 1024 and 12 are hard-coded here; presumably they
	    # mirror the decoder configuration (cache length, window / vocabulary
	    # size, number of layers) -- confirm against the gin config.
	    layer_state = {
	        'current_index': jnp.zeros((b,), dtype=jnp.int32),
	        'keys': jnp.zeros((b, 2048, n, h), dtype=jnp.bfloat16),
	        'values': jnp.zeros((b, 2048, n, h), dtype=jnp.bfloat16),
	        'recurrent_kvq': None,
	        'relative_position_bias': jnp.zeros(
	            (b, n, 1, 1024), dtype=jnp.bfloat16
	        ),
	    }
	    fake_input_dict.update({
	        # One dict per slot (sharing the same arrays) so that the entries
	        # are not twelve aliases of a single dict object.
	        'dstate': tuple(dict(layer_state) for _ in range(12)),
	        'eos': jnp.zeros([1024], dtype=jnp.bfloat16),
	        'mask': jnp.ones([1024], dtype=jnp.bfloat16),
	        'length': 1,
	        'temperature': 1.0,
	    })
	    return fake_input_dict

	  def __call__(self, inputs: ...) -> tuple[Any, dict[str, Any]]:
	    """Runs beam search starting from the prompt held in `inputs`.

	    Args:
	      inputs: mapping read under the keys 'targets', 'start_of_sequence',
	        'length', 'dstate', 'eos' and 'mask' ('temperature' is read by
	        `generate` when present). The caller's mapping is left unmodified.

	    Returns:
	      A pair `(0.0, outputs)` where `outputs` maps 'finished_seqs',
	      'finished_scores' and 'dstate' to the three values returned by
	      `beam_search.beam_search_flat`.

	    Raises:
	      ValueError: if `self.mode` is neither 'init' nor 'beam_search', or if
	        `self.decoder.supports_generate()` returns a true value.
	    """
	    # Make sure this code is not used on untested cases.
	    if self.mode not in ('init', 'beam_search'):
	      raise ValueError(f'{type(self)} cannot do mode {self.mode}')
	    if self.decoder.supports_generate():
	      raise ValueError(f'{type(self)}.decoder cannot supports_generate()')

	    targets = inputs['targets']

	    # The result of this call is discarded.
	    # NOTE(review): presumably it exists so the decoder is invoked once on
	    # the regular (state-less) path, e.g. during 'init' -- confirm.
	    self.decoder(
	        input_tokens=targets[:, 0:1],
	        target_tokens=None,
	        start_of_sequence=inputs['start_of_sequence'],
	    )

	    b = targets.shape[0]
	    no_start_of_seq = jnp.zeros((b,), dtype=jnp.bool_)

	    # This fn is used in both beam_search or topk_sampling.
	    def tokens_to_logits_fn(
	        input_token: jnp.ndarray, dstate: tuple[dict[str, jnp.ndarray], ...]
	    ) -> tuple[jnp.ndarray, tuple[dict[str, jnp.ndarray], ...]]:
	      (logits, dstate, _) = self.decoder(
	          input_tokens=input_token,
	          target_tokens=None,
	          start_of_sequence=no_start_of_seq,
	          decoder_state=dstate,
	      )
	      # Keep only the logits of the last position.
	      return logits[:, -1, :], dstate

	    last_token = jax.lax.dynamic_slice_in_dim(
	        targets, inputs['length'] - 1, 1, axis=1
	    )

	    # last token is used to seed beam_search, so `generate` only sees the
	    # targets without their final column. A shallow copy is used instead of
	    # overwriting `inputs['targets']` in the caller's mapping.
	    prefix_inputs = {**inputs, 'targets': targets[:, 0:-1]}
	    dstate = jax.lax.cond(
	        inputs['start_of_sequence'][0],
	        lambda: self.generate(prefix_inputs)[0],
	        lambda: inputs['dstate'],
	    )

	    # Then we run beam search, init with last_token & dstate.
	    finished_seqs, finished_scores, dstate = beam_search.beam_search_flat(
	        last_token,
	        dstate,
	        tokens_to_logits_fn,
	        max_decode_len=512,
	        eos=inputs['eos'].reshape((1, 1, -1)),
	        mask=inputs['mask'].reshape((1, 1, -1)),
	    )

	    return 0.0, {
	        'finished_seqs': finished_seqs,
	        'finished_scores': finished_scores,
	        'dstate': dstate,
	    }

	  def generate(
	      self, inputs: ...
	  ) -> tuple[tuple[dict[str, jnp.ndarray], ...], jnp.ndarray]:
	    """Feeds the tokens of `inputs['targets']` through the decoder.

	    Starting from the first token, the decoder is called one token at a time
	    for `inputs['length']` steps (one step fewer when `self.mode` is
	    'beam_search'); the next input is always taken from the given targets,
	    never sampled from the logits.

	    Args:
	      inputs: the same as argument to __call__.

	    Returns:
	      A pair `(dstate, logits)`: the decoder state after the last step and
	      the logits of the last step divided by the temperature. If no step is
	      executed, `logits` is the all-zero placeholder of shape
	      (batch_size, 1, 1024).
	    """
	    input_tokens = inputs['targets']  # [b,seq_len]
	    start_of_sequence = inputs['start_of_sequence']  # [b]
	    # Targets are the inputs shifted left by one, zero-padded at the end.
	    target_tokens = jnp.pad(input_tokens[:, 1:], [(0, 0), (0, 1)])
	    batch_size = target_tokens.shape[0]

	    # Assuming all sequences start at the same time.
	    start0 = start_of_sequence[0]
	    dstate = jax.lax.cond(
	        start0,
	        lambda: self.decoder.init_decoder_state_vanilla(  # pylint: disable=g-long-lambda
	            1024, start_of_sequence
	        ),
	        lambda: inputs['dstate'],
	    )

	    first_token = input_tokens[:, 0:1]
	    no_start_of_seq = jnp.zeros((batch_size,), dtype=jnp.bool_)
	    temperature = inputs['temperature'] if 'temperature' in inputs else 1

	    num_steps = inputs['length']
	    if self.mode == 'beam_search':
	      # Rebind instead of `-=` so that an array-valued length owned by the
	      # caller can never be modified in place.
	      num_steps = num_steps - 1

	    def cond_fn(scan_state) -> jnp.bool_:
	      _, _, i, _ = scan_state
	      return i < num_steps

	    def loop_fn(scan_state: Any) -> tuple[Any, Any, Any, Any]:
	      (dstate, input_token, i, _) = scan_state

	      (logits, dstate, _) = self.decoder(
	          input_tokens=input_token,
	          target_tokens=None,
	          start_of_sequence=no_start_of_seq,
	          decoder_state=dstate,
	      )

	      logits = logits / temperature
	      # The next input token comes from the provided targets.
	      output_token = jax.lax.dynamic_slice_in_dim(target_tokens, i, 1, axis=1)

	      return (dstate, output_token, i + 1, logits)

	    # Scan over the sequence length.
	    # The placeholder gives the loop carry a logits slot before step one.
	    dummy_logits = jnp.zeros((batch_size, 1, 1024))
	    initial_scan_state = (dstate, first_token, 0, dummy_logits)
	    dstate, _, _, logits = jax.lax.while_loop(
	        cond_fn, loop_fn, initial_scan_state
	    )
	    return dstate, logits