# Component 3 config: load, clean, deduplicate, tokenize.
tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
interim_output_dir: data/interim
processed_output_dir: data/processed
dedupe_db_path: data/interim/dedupe_hashes.sqlite
# Set to null for a full run.
# Use a small number (e.g. 500) for fast smoke testing.
max_records_per_dataset: null
min_prompt_chars: 8
min_code_chars: 16
max_code_chars: 40000
progress_every: 1000
datasets:
  - hf_dataset_id: iamtarun/python_code_instructions_18k_alpaca
    split: train
    prompt_field: instruction
    code_field: output
    language_field: null
    default_language: python
  - hf_dataset_id: sahil2801/CodeAlpaca-20k
    split: train
    prompt_field: instruction
    code_field: output
    language_field: null
    default_language: python
  - hf_dataset_id: TokenBender/code_instructions_122k_alpaca_style
    split: train
    prompt_field: instruction
    code_field: output
    language_field: null
    default_language: python