---
# Text Processing Pipeline Configuration
# For JuliaGPT character-level GPT training

# Directory names used by the pipeline stages.
# NOTE(review): how these are resolved (relative to the config file or to the
# working directory) is decided by the consuming code — confirm there.
paths:
  inbox: inbox
  output: output
  archive: archive
  logs: logs
  parsed: parsed

# Toggles and limits for the text-cleaning stage.
cleaning:
  lowercase: true
  strip_gutenberg: true
  strip_mit_classics: true
  strip_internet_archive: true
  strip_non_body: true
  normalize_unicode: true
  convert_numerals: true
  convert_roman_numerals: true
  remove_urls: true
  collapse_whitespace: true
  min_line_length: 20
  # Ancient script mode: only lowercase letters, space, and period
  # Inspired by Greek scriptio continua — minimal punctuation
  # Quoted so the embedded space and trailing period are kept verbatim.
  allowed_chars: "a-z ."

# Chunk size bounds, in characters.
chunking:
  max_chars: 256
  min_chars: 40
  break_on_sentence: true

# Train/validation split settings.
# NOTE(review): presumably the remaining 0.1 of the data is the held-out
# split — confirm against the splitter.
splitting:
  train_ratio: 0.9
  shuffle: true
  seed: 42

# Hugging Face Hub target for the processed corpus.
huggingface:
  repo_id: "LisaMegaWatts/philosophy-corpus"
  auto_push: false