"""
Test script for BitTransformerLM dataset creation (small version)
"""

import os
import sys
from pathlib import Path

# Make the repository root importable so the local package resolves
# without an editable install.
sys.path.insert(0, str(Path(__file__).parent))

from bit_transformer.dataset_builder import BitTransformerDatasetBuilder
|
def test_small_dataset():
    """Smoke-test the BitTransformerLM dataset builder on tiny inputs.

    Generates a small batch of each sample type (text-to-bits, synthetic
    patterns, safety benchmarks), prints a summary of the first
    text-to-bits sample, and returns True when everything completes
    without raising.
    """
    print("🧪 Testing BitTransformerLM Dataset Builder...")

    # BUG FIX: the env lookup was accidentally quoted as a string literal,
    # so the builder received the literal source text instead of the token.
    hf_token = os.environ.get("HF_TOKEN", "your-token-here")
    repo_id = "BitTransformerLM"

    builder = BitTransformerDatasetBuilder(hf_token, repo_id)

    # Text-to-bits conversion over a handful of short English sentences.
    print("📝 Testing text-to-bits conversion...")
    test_texts = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "Binary data processing with transformers.",
        "Information theory meets deep learning.",
        "Parity-protected bit sequences for safety.",
    ]

    text_samples = builder.generate_text_to_bits_data(test_texts, max_len=128)
    print(f"✅ Generated {len(text_samples)} text-to-bits samples")

    # Synthetic bit-pattern generation (10 samples, short sequences).
    print("🎨 Testing synthetic patterns...")
    synthetic_samples = builder.generate_synthetic_patterns(10, max_len=64)
    print(f"✅ Generated {len(synthetic_samples)} synthetic samples")

    # Safety-benchmark generation (8 samples).
    print("🛡️ Testing safety benchmarks...")
    safety_samples = builder.generate_safety_benchmarks(8)
    print(f"✅ Generated {len(safety_samples)} safety samples")

    # Show the structure of one sample; bit sequences can be long, so only
    # the length and a short prefix are printed for that key.
    print("\n📊 Sample Structure:")
    sample = text_samples[0]
    for key, value in sample.items():
        if key == "bit_sequence":
            print(f"  {key}: [{len(value)} bits] {value[:10]}...")
        else:
            print(f"  {key}: {value}")

    print("\n🎉 All tests passed! Dataset builder is working correctly.")
    return True
| |
|
# Script entry point: run the smoke test when executed directly.
if __name__ == "__main__":
    test_small_dataset()