# Provenance: config.yaml uploaded with huggingface_hub by LisaMegaWatts
# (commit 163231a, verified; 822 bytes).
# Text Processing Pipeline Configuration
# For JuliaGPT character-level GPT training
# Directory names used by the pipeline, one entry per stage.
# NOTE(review): values are relative names; presumably resolved against the
# pipeline's working directory — confirm with the config loader.
paths:
  inbox: inbox
  output: output
  archive: archive
  logs: logs
  parsed: parsed
# Text-cleaning switches and limits applied to the corpus.
# NOTE(review): the exact effect of each flag is defined by the consuming
# pipeline code, which is not visible here — confirm before changing values.
cleaning:
  lowercase: true
  strip_gutenberg: true
  strip_mit_classics: true
  strip_internet_archive: true
  strip_non_body: true
  normalize_unicode: true
  convert_numerals: true
  convert_roman_numerals: true
  remove_urls: true
  collapse_whitespace: true
  # Presumably a minimum character count per line — TODO confirm unit and
  # whether shorter lines are dropped.
  min_line_length: 20
  # Ancient script mode: only lowercase letters, space, and period
  # Inspired by Greek scriptio continua — minimal punctuation
  allowed_chars: "a-z ."
# Chunk-size bounds for splitting cleaned text into training samples.
# NOTE(review): sizes appear to be measured in characters (per the key
# names); confirm how out-of-range chunks are handled by the pipeline.
chunking:
  max_chars: 256
  min_chars: 40
  break_on_sentence: true
# Dataset split settings.
# NOTE(review): train_ratio is presumably the fraction of chunks assigned to
# the training set, with the remainder held out — confirm with the splitter.
splitting:
  train_ratio: 0.9
  shuffle: true
  # Fixed seed; presumably makes the shuffle reproducible — TODO confirm.
  seed: 42
# Hugging Face Hub target for the processed corpus.
# NOTE(review): auto_push presumably controls whether output is uploaded
# without a manual step — confirm with the publishing code.
huggingface:
  repo_id: "LisaMegaWatts/philosophy-corpus"
  auto_push: false