""" Component 3 verification script. Runs a small pipeline pass to confirm: - HF loading works. - Cleaning + dedupe logic works. - Tokenized output files are created. """ from __future__ import annotations import sys from pathlib import Path # This makes script imports stable from project root. PROJECT_ROOT = Path(__file__).resolve().parents[1] if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) from scripts.run_component3_dataset_pipeline import main as run_pipeline_main # noqa: E402 if __name__ == "__main__": try: # We call the main runner with a small override by mutating argv. sys.argv = [ "verify_component3_dataset_pipeline.py", "--config", "configs/component3_dataset_pipeline.yaml", "--max_records_per_dataset", "200", ] run_pipeline_main() print("") print("Component 3 verification passed.") except Exception as exc: print("Component 3 verification failed.") print(f"What went wrong: {exc}") print("Fix suggestion: verify internet access and that Component 2 tokenizer exists.") raise SystemExit(1)