# Gin config: pretrain a LongT5-Base-sized encoder-decoder with T5X on the
# NCC Scandinavian span-corruption mixture.
# Fixed here: every line was wrapped in stray "| ... |" pipe characters
# (extraction artifact), which made the file invalid gin.

from __gin__ import dynamic_registration

import seqio
from t5x import adafactor
from t5x import models
import tasks  # side-effect import: registers the NCC tasks/mixtures with seqio

# Must be bound by an including/overriding config (selects the concrete
# flaxformer architecture object defined in the included architecture gin).
ARCHITECTURE = %gin.REQUIRED

# LongT5 1.1 flaxformer architecture definition.
include 'flaxformer/t5x/configs/longt5/architectures/longt5_1_1_flaxformer.gin'

# Default T5X pretraining run (train loop, checkpointing, eval cadence).
include 't5x/configs/runs/pretrain.gin'

# ----------------------------- Task / data ---------------------------------
MIXTURE_OR_TASK_NAME = "ncc_scandinavian_span_corruption_stream"
# NOTE(review): stock LongT5 pretraining uses inputs=4096 with targets=910;
# confirm 4048 is intentional and not a typo for 4096.
TASK_FEATURE_LENGTHS = {"inputs": 4048, "targets": 910}

BATCH_SIZE = 32
TRAIN_STEPS = 1_000_000
DROPOUT_RATE = 0.0  # no dropout during pretraining

# ------------------------- Model dimensions (Base) -------------------------
NUM_HEADS = 12
NUM_ENCODER_LAYERS = 12
NUM_DECODER_LAYERS = 12
HEAD_DIM = 64
EMBED_DIM = 768
MLP_DIM = 2048

# ------------------------------- Loss --------------------------------------
Z_LOSS = 0.0001
LABEL_SMOOTHING = 0.0
# None => T5X falls back to its default loss normalization.
LOSS_NORMALIZING_FACTOR = None

# ----------------------------- Vocabulary ----------------------------------
VOCABULARY = @seqio.SentencePieceVocabulary()
seqio.SentencePieceVocabulary.sentencepiece_model_file = "gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model"
# 32128 = 32000 SentencePiece tokens + 100 extra ids, rounded up to a
# multiple of 128 for TPU-friendly embedding shapes.
NUM_EMBEDDINGS = 32128

# ----------------------------- Optimizer -----------------------------------
OPTIMIZER = @adafactor.Adafactor()
adafactor.Adafactor:
  decay_rate = 0.8
  step_offset = 0

# ------------------------------- Model -------------------------------------
MODEL = @models.EncoderDecoderModel()
models.EncoderDecoderModel:
  module = %ARCHITECTURE
  input_vocabulary = %VOCABULARY
  output_vocabulary = %VOCABULARY
  optimizer_def = %OPTIMIZER
  z_loss = %Z_LOSS
  label_smoothing = %LABEL_SMOOTHING
  loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR