| # Incremental JS augmentation config. | |
| # It drives the script that appends new JavaScript samples to the existing Component 3 outputs. | |
| tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 | |
| existing_clean_path: data/interim/combined_clean.jsonl | |
| existing_tokenized_path: data/processed/train_tokenized.jsonl | |
| existing_stats_path: data/processed/pipeline_stats.json | |
| dedupe_db_path: data/interim/dedupe_hashes_incremental.sqlite | |
| # Chosen dataset for JS augmentation. | |
| new_dataset: | |
| hf_dataset_id: philschmid/code-alpaca-ruby-python-javascript | |
| split: train | |
| prompt_field: instruction | |
| code_field: output | |
| language_field: null | |
| default_language: auto | |
| # Hard target (requested by user): number of new JavaScript examples to add. | |
| target_new_javascript_examples: 20000 | |
| # Quality filters (same idea as Component 3): character-length bounds on prompt and code. | |
| min_prompt_chars: 8 | |
| min_code_chars: 16 | |
| max_code_chars: 40000 | |
| progress_every: 500 | |