---
# Reprocess config: no dataset download, no full pipeline rebuild.
# It reads existing cleaned data and regenerates tokenized output.
tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
input_clean_path: data/interim/combined_clean.jsonl
output_tokenized_path: data/processed/train_tokenized.jsonl
output_stats_path: data/processed/pipeline_stats.json

# Safety backups before overwriting existing outputs.
backup_existing_tokenized: true
backup_existing_stats: true

# Existing language labels in the clean file may be wrong from earlier runs.
# true = infer language from prompt+code content only.
ignore_existing_language_labels: true

# Optional quick-test mode: cap on the number of records processed.
# Set to null for a full reprocess.
max_records: null