BitTransformerLM / scripts / testing / enhanced_generation_test.py

🚀 Refined BitTransformerLM: Organized codebase with best practices

72bc506 verified 5 months ago

13.2 kB

	#!/usr/bin/env python3
	"""
	Enhanced BitTransformerLM Generation Testing
	=============================================

	Test the promising generation improvements:
	1. Autoregressive generation with automatic parity correction
	2. Longer sequence generation (50, 100, 200+ characters)
	3. Optimized diffusion parameters (50+ steps)
	4. Direct comparison between generation methods

	Goal: See if we can get from "barely-contextual gibberish" to actual language!
	"""

	import sys
	import torch
	import torch.nn.functional as F
	from datetime import datetime

	sys.path.append('/data')
	sys.path.append('/data/BitTransformerLM')

	from bit_transformer import (
	BitTransformerLM,
	text_to_bits,
	bits_to_text,
	diffusion_inference,
	set_dropout,
	enforce_parity
	)

	def load_full_attention_model(checkpoint_path='/data/BitTransformerLM/checkpoints/checkpoint_best.pt'):
	    """Build the full-attention BitTransformerLM and load weights from a checkpoint.

	    Args:
	        checkpoint_path: File handed to ``torch.load``. Defaults to the
	            location this script has always used, so existing callers that
	            pass no argument behave as before. The loaded object must be a
	            mapping with a ``'model_state_dict'`` entry; ``'epoch'`` and
	            ``'loss'`` are optional and only printed.

	    Returns:
	        The model after ``model.eval()`` and ``set_dropout(model, 0.0)``
	        have been applied. Tensors are mapped to CPU while loading.

	    Raises:
	        KeyError: If the checkpoint has no ``'model_state_dict'`` entry.
	        Whatever ``torch.load`` / ``load_state_dict`` raise for a missing
	        file or mismatched weights is propagated unchanged.
	    """
	    print("🚀 Loading Full Attention BitTransformerLM for enhanced generation testing...")

	    # Architecture hyper-parameters must match the checkpoint being loaded.
	    model = BitTransformerLM(
	        d_model=512, nhead=16, num_layers=8, dim_feedforward=1024,
	        max_seq_len=512, reversible=True, use_checkpoint=False,
	        use_autocast=False, use_act=True, act_threshold=0.9,
	        lambda_K=0.05, lambda_C=0.05, lambda_S=0.05,
	        chunk_size=None, overlap=0, full_attn_logging=True,
	    )

	    # NOTE(security): torch.load unpickles the file, so only point this at
	    # checkpoints from a trusted source.
	    # NOTE(review): consider weights_only=True once it is confirmed that the
	    # checkpoint holds nothing but tensors and plain Python values.
	    checkpoint = torch.load(checkpoint_path, map_location='cpu')
	    model.load_state_dict(checkpoint['model_state_dict'])
	    model.eval()
	    set_dropout(model, 0.0)

	    epoch = checkpoint.get('epoch', 'unknown')
	    loss = checkpoint.get('loss', 'unknown')
	    print(f"✅ Model loaded! Epoch: {epoch}, Loss: {loss}")

	    return model

	def autoregressive_generate_with_parity_correction(model, prompt, max_new_chars=20, temperature=0.7):
	    """Generate text bit by bit, computing each character's parity bit directly.

	    Every character is emitted as 8 sampled data bits followed by one parity
	    bit equal to ``sum(data_bits) % 2``. The model is queried only for the
	    data bits; the parity bit is derived arithmetically, so no forward pass
	    is spent on a prediction that would be discarded.

	    Args:
	        model: Callable as ``model(tensor, causal=True)`` returning a
	            ``(logits, telemetry)`` pair; ``logits[0, -1, :]`` is read as the
	            next-bit distribution.
	        prompt: Text converted with ``text_to_bits`` and used as the initial
	            context.
	        max_new_chars: Number of 9-bit characters to append.
	        temperature: Values > 0 sample from ``softmax(logits / temperature)``;
	            values <= 0 select the argmax.

	    Returns:
	        dict: On success ``{'success': True, 'full_text', 'new_text',
	        'bits_generated', 'parity_corrections'}``; if decoding raises,
	        ``{'success': False, 'error', 'bits_generated'}``.
	        ``parity_corrections`` is the second value returned by
	        ``enforce_parity`` (presumably a correction count — confirm against
	        ``bit_transformer``).
	    """
	    print(f"\n🔄 Autoregressive generation with parity correction:")
	    print(f" Prompt: '{prompt}' → generating {max_new_chars} characters...")

	    # Convert prompt to bits; work on a copy so the prompt bits stay intact.
	    input_bits = text_to_bits(prompt)
	    generated_bits = input_bits.copy()

	    with torch.no_grad():
	        for _ in range(max_new_chars):
	            char_bits = []

	            # Sample the 8 data bits of this character one at a time.
	            for _ in range(8):
	                # Only the most recent 400 bits are fed to the model.
	                context = (generated_bits + char_bits)[-400:]
	                context_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(0)

	                logits, _telemetry = model(context_tensor, causal=True)
	                next_bit_logits = logits[0, -1, :]

	                if temperature > 0:
	                    # Temperature-scaled sampling for controlled randomness.
	                    probs = F.softmax(next_bit_logits / temperature, dim=-1)
	                    next_bit = torch.multinomial(probs, 1).item()
	                else:
	                    next_bit = torch.argmax(next_bit_logits).item()

	                char_bits.append(next_bit)

	            # Parity bit is fully determined by the data bits, so compute it
	            # instead of asking the model.
	            char_bits.append(sum(char_bits) % 2)

	            generated_bits.extend(char_bits)

	    # Keep only what was generated after the prompt.
	    new_bits = generated_bits[len(input_bits):]

	    # Run the library's parity enforcement as a second safety net.
	    new_bits_tensor = torch.tensor(new_bits, dtype=torch.long)
	    corrected_bits_tensor, parity_corrections = enforce_parity(new_bits_tensor)
	    corrected_bits = corrected_bits_tensor.tolist()

	    # Decoding is the only step expected to fail; the exception type raised
	    # by bits_to_text is not visible here, hence the broad catch at this
	    # reporting boundary.
	    try:
	        decoded_text = bits_to_text(corrected_bits)
	    except Exception as e:
	        print(f" ❌ DECODE FAILED: {e}")
	        return {
	            'success': False,
	            'error': str(e),
	            'bits_generated': len(new_bits),
	        }

	    full_result = prompt + decoded_text
	    print(f" ✅ SUCCESS: '{full_result}'")
	    return {
	        'success': True,
	        'full_text': full_result,
	        'new_text': decoded_text,
	        'bits_generated': len(new_bits),
	        'parity_corrections': parity_corrections,
	    }

	def long_diffusion_generation(model, prompt, target_chars, steps=50):
	    """Sample ``target_chars`` characters via ``diffusion_inference`` and decode them.

	    Requests ``target_chars * 9`` bits (9 bits per character) with a cosine
	    schedule and a batch of one, decodes them with ``bits_to_text`` and
	    appends the decoded text to ``prompt``.

	    NOTE(review): ``prompt`` is never passed to ``diffusion_inference``
	    (``init_bits=None``); within this function it is only printed and
	    prepended to the decoded text.

	    Args:
	        model: Model handed straight to ``diffusion_inference``.
	        prompt: Text placed in front of the decoded continuation.
	        target_chars: Number of characters to request.
	        steps: Number of diffusion steps.

	    Returns:
	        dict: ``{'success': True, 'full_text', 'new_text', 'bits_generated',
	        'diffusion_steps'}`` on success, otherwise ``{'success': False,
	        'error', 'diffusion_steps'}`` for any exception raised while sampling
	        or decoding.
	    """
	    print(f"\n🌊 Long diffusion generation:")
	    print(f" Prompt: '{prompt}' → generating {target_chars} characters with {steps} steps...")

	    try:
	        # 9 bits per character.
	        bit_count = target_chars * 9
	        sampled = diffusion_inference(
	            model,
	            length=bit_count,
	            steps=steps,
	            batch_size=1,
	            init_bits=None,
	            schedule="cosine",
	        )

	        # Drop singleton dimensions and decode.
	        bit_list = sampled.squeeze().tolist()
	        decoded = bits_to_text(bit_list)

	        combined = prompt + decoded
	        print(f" ✅ SUCCESS: '{combined}'")

	        outcome = {
	            'success': True,
	            'full_text': combined,
	            'new_text': decoded,
	            'bits_generated': len(bit_list),
	            'diffusion_steps': steps,
	        }
	    except Exception as exc:
	        print(f" ❌ FAILED: {exc}")
	        outcome = {
	            'success': False,
	            'error': str(exc),
	            'diffusion_steps': steps,
	        }

	    return outcome

	def test_length_scaling():
	    """Run both generators over every prompt/length pair and collect the outcomes.

	    Loads the model once, then for each of three prompts and each target
	    length (10, 25, 50 characters) runs autoregressive generation at
	    temperature 0.6 and diffusion generation with 50 steps.

	    Returns:
	        list[dict]: One entry per pair with keys ``'prompt'``,
	        ``'target_length'``, ``'autoregressive'`` and ``'diffusion'``.
	    """
	    print("\n📏 === LENGTH SCALING TESTS ===")
	    print("Testing if longer generations show improved coherence...")

	    model = load_full_attention_model()
	    prompts = ["Hello", "The weather today", "I think that"]
	    lengths = [10, 25, 50]

	    # Prompt-major ordering: all lengths for one prompt before the next.
	    cases = [(text, size) for text in prompts for size in lengths]

	    collected = []
	    for text, size in cases:
	        print(f"\n--- Testing '{text}' → {size} chars ---")

	        autoregressive_outcome = autoregressive_generate_with_parity_correction(
	            model, text, max_new_chars=size, temperature=0.6
	        )
	        diffusion_outcome = long_diffusion_generation(
	            model, text, target_chars=size, steps=50
	        )

	        collected.append(dict(
	            prompt=text,
	            target_length=size,
	            autoregressive=autoregressive_outcome,
	            diffusion=diffusion_outcome,
	        ))

	    return collected

	def test_parameter_optimization():
	    """Sweep sampling temperature and diffusion step count for a fixed prompt.

	    Uses the prompt "Hello world" and 20 generated characters throughout.
	    Autoregressive generation is run at temperatures 0.1, 0.5, 0.8, 1.0 and
	    1.2; diffusion generation at 10, 25, 50 and 100 steps.

	    Returns:
	        list[dict]: Autoregressive entries (``'method'``, ``'temperature'``,
	        ``'result'``) followed by diffusion entries (``'method'``,
	        ``'steps'``, ``'result'``).
	    """
	    print("\n⚙️ === PARAMETER OPTIMIZATION TESTS ===")
	    print("Testing different temperatures and diffusion steps...")

	    model = load_full_attention_model()
	    prompt = "Hello world"
	    temperature_grid = (0.1, 0.5, 0.8, 1.0, 1.2)
	    step_grid = (10, 25, 50, 100)

	    collected = []

	    # Autoregressive sweep over temperature.
	    print("\n🌡️ Testing autoregressive temperatures:")
	    for temperature in temperature_grid:
	        print(f"\n--- Temperature {temperature} ---")
	        outcome = autoregressive_generate_with_parity_correction(
	            model, prompt, max_new_chars=20, temperature=temperature
	        )
	        collected.append(dict(method='autoregressive', temperature=temperature, result=outcome))

	    # Diffusion sweep over step count.
	    print("\n🌊 Testing diffusion steps:")
	    for step_count in step_grid:
	        print(f"\n--- {step_count} steps ---")
	        outcome = long_diffusion_generation(
	            model, prompt, target_chars=20, steps=step_count
	        )
	        collected.append(dict(method='diffusion', steps=step_count, result=outcome))

	    return collected

	def _contains_common_word(text, common_words=frozenset(('the', 'and', 'is', 'in', 'to', 'a'))):
	    """Return True if ``text`` contains any of ``common_words`` as a whole word."""
	    import re

	    # Tokenise into alphabetic runs so that 'a' does not match every string
	    # containing the letter "a" and 'is' does not match "this".
	    tokens = re.findall(r"[a-z]+", text.lower())
	    return not common_words.isdisjoint(tokens)

	def test_coherence_prompts():
	    """Run both generators on prompts chosen to invite structured continuations.

	    For each prompt, 30 characters are generated autoregressively
	    (temperature 0.7) and by diffusion (75 steps). A celebratory line is
	    printed when a successful output contains one of a few common English
	    words as a whole word.

	    Returns:
	        list[dict]: One entry per prompt with keys ``'prompt'``,
	        ``'autoregressive'`` and ``'diffusion'``.
	    """
	    print("\n🎯 === COHERENCE PROMPTS TESTS ===")
	    print("Testing prompts designed to elicit coherent language patterns...")

	    model = load_full_attention_model()

	    # Prompts that might elicit more structured responses
	    coherence_prompts = [
	        "Once upon a time",
	        "The quick brown fox",
	        "In the beginning",
	        "Python code to print hello:",
	        "def main():",
	        "SELECT * FROM",
	        "Today is a beautiful",
	        "My name is",
	        "The answer is",
	        "import torch",
	    ]

	    results = []

	    for prompt in coherence_prompts:
	        print(f"\n--- Testing coherence with: '{prompt}' ---")

	        # Test both methods with longer generation
	        auto_result = autoregressive_generate_with_parity_correction(
	            model, prompt, max_new_chars=30, temperature=0.7
	        )
	        diff_result = long_diffusion_generation(
	            model, prompt, target_chars=30, steps=75
	        )

	        results.append({
	            'prompt': prompt,
	            'autoregressive': auto_result,
	            'diffusion': diff_result,
	        })

	        # Quick analysis: same whole-word check for both methods.
	        for label, outcome in (('Autoregressive', auto_result), ('Diffusion', diff_result)):
	            if outcome.get('success') and _contains_common_word(outcome.get('new_text', '')):
	                print(f" 🎉 {label} contains common words!")

	    return results

	def main():
	    """Run all enhanced generation tests and print an overall summary.

	    Executes the length-scaling, parameter-optimization and coherence test
	    suites, then reports per-method success counts over the length and
	    coherence suites and lists outputs that contain word-like tokens.

	    Returns:
	        dict: ``'length_results'``, ``'param_results'`` and
	        ``'coherence_results'`` (the raw suite outputs) plus ``'summary'``
	        with ``'autoregressive_successes'``, ``'diffusion_successes'``,
	        ``'promising_outputs'`` and ``'runs_per_method'``.
	    """
	    print("🚀 ENHANCED BITRANSFORMERLM GENERATION TESTING")
	    print("=" * 60)
	    print("Testing potential fixes:")
	    print("1. Autoregressive with parity correction")
	    print("2. Longer sequence generation")
	    print("3. Optimized generation parameters")
	    print("4. Coherence-focused prompts")
	    print("=" * 60)

	    # Run all tests
	    length_results = test_length_scaling()
	    param_results = test_parameter_optimization()
	    coherence_results = test_coherence_prompts()

	    # Summary analysis
	    print("\n🎯 === OVERALL ANALYSIS ===")

	    # Only the suites that run both methods side by side are scored.
	    all_results = length_results + coherence_results

	    # Derive the denominator from the data so it stays correct when the
	    # prompt or length lists change (it was previously a hard-coded 24).
	    runs_per_method = len(all_results)
	    total_auto = sum(
	        1 for r in all_results if r.get('autoregressive', {}).get('success')
	    )
	    total_diff = sum(
	        1 for r in all_results if r.get('diffusion', {}).get('success')
	    )

	    print(f"Autoregressive success rate: {total_auto}/{runs_per_method}")
	    print(f"Diffusion success rate: {total_diff}/{runs_per_method}")

	    # Look for promising outputs
	    print("\n🔍 Looking for signs of linguistic improvement...")

	    promising_outputs = []

	    for result in all_results:
	        for method in ('autoregressive', 'diffusion'):
	            outcome = result.get(method, {})
	            if not outcome.get('success'):
	                continue

	            text = outcome.get('new_text', '')
	            # Word-like pattern: more than 10 characters overall and at least
	            # one whitespace-separated token of 3+ purely alphabetic chars.
	            if len(text) > 10 and any(
	                len(word) > 2 and word.isalpha() for word in text.split()
	            ):
	                promising_outputs.append({
	                    'prompt': result['prompt'],
	                    'method': method,
	                    'text': text,
	                })

	    if promising_outputs:
	        print(f"\n🎉 Found {len(promising_outputs)} promising outputs with word-like patterns!")
	        for output in promising_outputs[:5]:  # Show first 5
	            print(f" {output['method']}: '{output['prompt']}' → '{output['text']}'")
	    else:
	        print("\n💭 No clear word patterns found yet - model may need more training or different approach")

	    return {
	        'length_results': length_results,
	        'param_results': param_results,
	        'coherence_results': coherence_results,
	        'summary': {
	            'autoregressive_successes': total_auto,
	            'diffusion_successes': total_diff,
	            'promising_outputs': len(promising_outputs),
	            'runs_per_method': runs_per_method,
	        },
	    }

	# Script entry point: run the full suite only when executed directly, and
	# keep the returned summary in ``results`` for interactive inspection.
	if __name__ == "__main__":
	    results = main()