| """ | |
| Component 3 verification script. | |
| Runs a small pipeline pass to confirm: | |
| - HF loading works. | |
| - Cleaning + dedupe logic works. | |
| - Tokenized output files are created. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
| # This makes script imports stable from project root. | |
| PROJECT_ROOT = Path(__file__).resolve().parents[1] | |
| if str(PROJECT_ROOT) not in sys.path: | |
| sys.path.insert(0, str(PROJECT_ROOT)) | |
| from scripts.run_component3_dataset_pipeline import main as run_pipeline_main # noqa: E402 | |
| if __name__ == "__main__": | |
| try: | |
| # We call the main runner with a small override by mutating argv. | |
| sys.argv = [ | |
| "verify_component3_dataset_pipeline.py", | |
| "--config", | |
| "configs/component3_dataset_pipeline.yaml", | |
| "--max_records_per_dataset", | |
| "200", | |
| ] | |
| run_pipeline_main() | |
| print("") | |
| print("Component 3 verification passed.") | |
| except Exception as exc: | |
| print("Component 3 verification failed.") | |
| print(f"What went wrong: {exc}") | |
| print("Fix suggestion: verify internet access and that Component 2 tokenizer exists.") | |
| raise SystemExit(1) | |