---
# Reprocess config: no dataset download, no full pipeline rebuild.
# It reads existing cleaned data and regenerates tokenized output.
tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
input_clean_path: data/interim/combined_clean.jsonl
output_tokenized_path: data/processed/train_tokenized.jsonl
output_stats_path: data/processed/pipeline_stats.json

# Safety backups before overwrite.
backup_existing_tokenized: true
backup_existing_stats: true

# Existing language labels in clean file may be wrong from earlier runs.
# true = infer language from prompt+code content only.
ignore_existing_language_labels: true

# Optional quick test mode.
# Set null for full reprocess.
max_records: null