---
# Incremental JS augmentation config.
# Read by the script that appends new JavaScript samples to the existing
# Component 3 outputs.

# Tokenizer and the existing artifacts that new samples are appended to.
tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
existing_clean_path: data/interim/combined_clean.jsonl
existing_tokenized_path: data/processed/train_tokenized.jsonl
existing_stats_path: data/processed/pipeline_stats.json
dedupe_db_path: data/interim/dedupe_hashes_incremental.sqlite

# Chosen dataset for JS augmentation.
# NOTE(review): the keys below are grouped under new_dataset based on their
# order in the original file — confirm against the consuming script.
new_dataset:
  hf_dataset_id: philschmid/code-alpaca-ruby-python-javascript
  split: train
  prompt_field: instruction
  code_field: output
  # No language column is configured; presumably default_language applies
  # to every row instead — TODO confirm.
  language_field: null
  default_language: auto

# Hard target requested by user.
target_new_javascript_examples: 20000

# Quality filters (same idea as Component 3).
min_prompt_chars: 8
min_code_chars: 16
max_code_chars: 40000

progress_every: 500