# Text Processing Pipeline Configuration
# For JuliaGPT character-level GPT training

# Directory locations used by the pipeline.
paths:
  inbox: inbox
  output: output
  archive: archive
  logs: logs
  parsed: parsed

# Text-cleaning switches and limits.
cleaning:
  lowercase: true
  strip_gutenberg: true
  strip_mit_classics: true
  strip_internet_archive: true
  strip_non_body: true
  normalize_unicode: true
  convert_numerals: true
  convert_roman_numerals: true
  remove_urls: true
  collapse_whitespace: true
  # NOTE(review): presumably a minimum length in characters — confirm.
  min_line_length: 20
  # Ancient script mode: only lowercase letters, space, and period
  # Inspired by Greek scriptio continua — minimal punctuation
  allowed_chars: "a-z ."

# Chunk size limits, in characters.
chunking:
  max_chars: 256
  min_chars: 40
  break_on_sentence: true

# Dataset split settings.
# NOTE(review): the split given to the remaining (1 - train_ratio) share is
# not visible here — confirm against the consumer.
splitting:
  train_ratio: 0.9
  shuffle: true
  seed: 42

# Hugging Face repository settings.
huggingface:
  repo_id: "LisaMegaWatts/philosophy-corpus"
  auto_push: false