| |
| """ |
| Test script for BitTransformerLM dataset creation (small version) |
| """ |
|
|
| import os |
| import sys |
| from pathlib import Path |
| sys.path.insert(0, str(Path(__file__).parent)) |
|
|
| |
| from bit_transformer.dataset_builder import BitTransformerDatasetBuilder |
|
|
def test_small_dataset() -> bool:
    """Smoke-test the dataset builder's sample generators on tiny inputs.

    Builds a ``BitTransformerDatasetBuilder``, generates a handful of
    text-to-bits, synthetic-pattern and safety-benchmark samples, prints how
    many of each were produced, and dumps the fields of the first
    text-to-bits sample.

    Returns:
        True once every step has completed without raising.
    """
    # NOTE(review): the non-ASCII prefixes in the status messages below look
    # like mis-decoded emoji; they are left as found -- restore them from a
    # correctly encoded copy of this file.
    print("π§ͺ Testing BitTransformerLM Dataset Builder...")

    # Read the token from the environment. This used to be a quoted string
    # containing the lookup expression, so the builder received that literal
    # text instead of the value of HF_TOKEN.
    hf_token = os.environ.get('HF_TOKEN', 'your-token-here')
    repo_id = "BitTransformerLM"

    builder = BitTransformerDatasetBuilder(hf_token, repo_id)

    # --- Text-to-bits conversion ---
    print("π Testing text-to-bits conversion...")
    test_texts = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "Binary data processing with transformers.",
        "Information theory meets deep learning.",
        "Parity-protected bit sequences for safety.",
    ]

    text_samples = builder.generate_text_to_bits_data(test_texts, max_len=128)
    print(f"β Generated {len(text_samples)} text-to-bits samples")

    # --- Synthetic patterns ---
    print("π¨ Testing synthetic patterns...")
    synthetic_samples = builder.generate_synthetic_patterns(10, max_len=64)
    print(f"β Generated {len(synthetic_samples)} synthetic samples")

    # --- Safety benchmarks ---
    print("π‘οΈ Testing safety benchmarks...")
    safety_samples = builder.generate_safety_benchmarks(8)
    print(f"β Generated {len(safety_samples)} safety samples")

    # --- Show what one sample looks like ---
    print("\nπ Sample Structure:")
    sample = text_samples[0]
    for key, value in sample.items():
        if key == "bit_sequence":
            # Summarise the bit sequence (length plus first 10 entries)
            # rather than printing it in full.
            print(f" {key}: [{len(value)} bits] {value[:10]}...")
        else:
            print(f" {key}: {value}")

    print("\nπ All tests passed! Dataset builder is working correctly.")
    return True
|
|
# Script entry point: run the smoke test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    test_small_dataset()