# Text Processing Pipeline Configuration
# For JuliaGPT character-level GPT training
# Directory names used by the pipeline, grouped under a single `paths` mapping.
# NOTE(review): the values look like relative directory names; what they are
# resolved against depends on the code that loads this file — confirm.
paths:
  inbox: inbox
  output: output
  archive: archive
  logs: logs
  parsed: parsed
# Text-cleaning switches and limits, grouped under a single `cleaning` mapping.
# Every flag below is currently enabled.
# NOTE(review): the exact effect of each flag is defined by the consuming code,
# not by this file — verify there before changing a value.
cleaning:
  lowercase: true
  # Source-specific stripping toggles.
  # NOTE(review): presumably these remove provider boilerplate
  # (headers, footers, licence text) — confirm.
  strip_gutenberg: true
  strip_mit_classics: true
  strip_internet_archive: true
  strip_non_body: true
  normalize_unicode: true
  convert_numerals: true
  convert_roman_numerals: true
  remove_urls: true
  collapse_whitespace: true
  # NOTE(review): presumably a minimum length in characters — confirm the unit.
  min_line_length: 20
  # Ancient script mode: only lowercase letters, space, and period
  # Inspired by Greek scriptio continua — minimal punctuation
  allowed_chars: "a-z ."
# Chunk-size settings, grouped under a single `chunking` mapping.
# NOTE(review): the key names suggest upper and lower bounds on chunk length
# in characters, with a preference for breaking at sentence boundaries —
# confirm against the chunking code.
chunking:
  max_chars: 256
  min_chars: 40
  break_on_sentence: true
# Dataset split settings, grouped under a single `splitting` mapping.
splitting:
  # NOTE(review): presumably the fraction of data assigned to the training
  # split, with the remainder held out — confirm.
  train_ratio: 0.9
  shuffle: true
  # NOTE(review): fixed seed, presumably so the shuffle is reproducible —
  # confirm.
  seed: 42
# Hugging Face settings, grouped under a single `huggingface` mapping.
huggingface:
  # Quoted so the "owner/name" identifier is always read as a plain string.
  repo_id: "LisaMegaWatts/philosophy-corpus"
  # Currently disabled.
  # NOTE(review): presumably controls whether results are uploaded to
  # `repo_id` automatically — confirm.
  auto_push: false