---
# Reprocess config: no dataset download, no full pipeline rebuild.
# It reads existing cleaned data and regenerates tokenized output.
tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
input_clean_path: data/interim/combined_clean.jsonl
output_tokenized_path: data/processed/train_tokenized.jsonl
output_stats_path: data/processed/pipeline_stats.json

# Safety backups before overwrite.
backup_existing_tokenized: true
backup_existing_stats: true

# Existing language labels in clean file may be wrong from earlier runs.
# true = infer language from prompt+code content only.
ignore_existing_language_labels: true

# Optional quick test mode.
# Set null for full reprocess.
max_records: null