from LaughLM.data.dataset import DomainSampler from LaughLM.data.tokenizer import LaughTokenizer from LaughLM.data.shard_writer import BinaryShardWriter def main(): tokenizer = LaughTokenizer("tokenizer.json") sampler = DomainSampler( sources=[ {"name": "HuggingFaceFW/fineweb-edu", "weight": 0.4}, {"name": "bigcode/starcoderdata", "weight": 0.2}, ] ) writer = BinaryShardWriter( tokenizer=tokenizer, output_path="train_000.bin", shard_tokens=5_000_000_000, ) writer.build_shard(iter(sampler)) if __name__ == "__main__": main()