---
# Reprocess config: no dataset download, no full pipeline rebuild.
# It reads existing cleaned data and regenerates tokenized output.
tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
input_clean_path: data/interim/combined_clean.jsonl
output_tokenized_path: data/processed/train_tokenized.jsonl
output_stats_path: data/processed/pipeline_stats.json

# Safety backups before overwriting existing outputs.
backup_existing_tokenized: true
backup_existing_stats: true

# Existing language labels in the clean file may be wrong from earlier runs.
# true = infer language from prompt+code content only.
ignore_existing_language_labels: true

# Optional quick-test mode: cap on the number of records processed.
# Set to null for a full reprocess.
max_records: null