"""composer_replication.pipeline — the Stage-0 dataset-pipeline contract + driver. THE single reconciled dataset contract (supersedes the two divergent layouts in research/design-F1 and design-F2 — deepread finding V8/D-7), the pragmatic near-duplicate detector, and the local stage-driver that turns (tasks, env, policy) into a carded, deduped, holdout-split corpus. """ from composer_replication.pipeline.build_corpus import build_corpus from composer_replication.pipeline.dedup import ( dedup, find_near_duplicates, jaccard_estimate, minhash_signature, ) from composer_replication.pipeline.s3_contract import ( RunLayout, RunManifest, write_dataset_card, write_dpo_rows, write_sft_rows, write_tasks, write_tasks_full, ) __all__ = [ "RunLayout", "RunManifest", "build_corpus", "dedup", "find_near_duplicates", "jaccard_estimate", "minhash_signature", "write_dataset_card", "write_dpo_rows", "write_sft_rows", "write_tasks", "write_tasks_full", ]