mindi-backup / configs /component3_reprocess_from_clean.yaml
Mindigenous
Initial full project backup with Git LFS
53f0cc2
raw
history blame contribute delete
695 Bytes
# Reprocess config: no dataset download, no full pipeline rebuild.
# The reprocess step reads the existing cleaned data and regenerates the
# tokenized output (and its stats file) from it.
# NOTE(review): the relative paths below are presumably resolved against the
# project root / working directory — confirm against the config loader.
# Tokenizer to use when regenerating the tokenized output.
# (Presumably a directory holding a previously built tokenizer — confirm.)
tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
# Input: the existing cleaned dataset (JSON Lines, judging by the extension).
input_clean_path: data/interim/combined_clean.jsonl
# Output: the regenerated tokenized dataset.
output_tokenized_path: data/processed/train_tokenized.jsonl
# Output: statistics file written alongside the tokenized data.
output_stats_path: data/processed/pipeline_stats.json
# Safety backups before overwrite: when true, the existing tokenized/stats
# file is backed up before being replaced.
# NOTE(review): backup location and naming are not defined here — see the
# consuming script.
backup_existing_tokenized: true
backup_existing_stats: true
# Existing language labels in the clean file may be wrong from earlier runs.
# true = ignore those labels and infer the language from prompt+code content only.
# NOTE(review): false presumably keeps the stored labels — confirm in the
# consuming script.
ignore_existing_language_labels: true
# Optional quick test mode.
# Set to null for a full reprocess.
# NOTE(review): a non-null value presumably caps the number of records
# processed — confirm.
max_records: null