Spaces:
Sleeping
Sleeping
OliverPerrin
Fixed compiling issue, added length penalty, and attempting to freeze encoder layers 0-5 to lower parameters and preserve T5's language understanding.
baf3026
---
# Development/Testing Configuration for FLAN-T5-base
# FAST iteration for debugging - optimized for speed
# VRAM Usage: ~9-10GB peak (12GB available)
# Training time: ~5 minutes on RTX 4070 12GB
# Use: python scripts/train.py training=dev

dataloader:
  batch_size: 10  # Optimal with FlashAttention
  shuffle: true
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  prefetch_factor: 2

optimizer:
  name: adamw
  lr: 5.0e-5
  weight_decay: 0.01
  eps: 1.0e-8
  betas: [0.9, 0.999]

scheduler:
  name: cosine
  warmup_steps: 50  # Less warmup for short runs

trainer:
  max_epochs: 3
  gradient_clip_norm: 1.0
  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
  validation_max_length: 128
  label_smoothing: 0.0  # Simpler backward graph for dev
  task_weights:
    summarization: 1.0
    emotion: 1.5
    topic: 0.5  # Reduced - topic already saturated at 86%
  max_train_samples: 3000
  max_val_samples: 300
  early_stopping_patience: 5
  log_grad_norm_frequency: 100
  # Enable compile for speed (worth the startup cost)
  compile_encoder: true
  compile_decoder: true
  # Speed optimizations
  tokenizer_max_length: 256
  gradient_checkpointing: true
  # FLAN-T5 has NO learned positional embeddings - only relative position bias
  # Disabling this causes repetition loops (model can't track sequence position)
  use_relative_position_bias: true
  # Freeze lower encoder layers (0-5) to preserve pretrained knowledge
  # Upper layers (6-11) adapt to summarization style
  freeze_encoder_layers: 6