File size: 822 Bytes
163231a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Text Processing Pipeline Configuration
# For JuliaGPT character-level GPT training

# Directory names used by the pipeline. Every value is a relative path.
# NOTE(review): presumably resolved against the pipeline's working
# directory, and each key names one stage's folder — confirm in the loader.
paths:
  inbox: inbox
  output: output
  archive: archive
  logs: logs
  parsed: parsed

# Text-cleaning options: boolean switches, one numeric threshold, and the
# permitted character set. All switches are currently enabled.
# NOTE(review): the order in which the cleaner applies these steps is not
# defined by this file — confirm in the consuming code.
cleaning:
  lowercase: true
  # Source-specific stripping switches (named after Project Gutenberg,
  # MIT Classics, and Internet Archive).
  # NOTE(review): presumably remove each source's boilerplate — confirm.
  strip_gutenberg: true
  strip_mit_classics: true
  strip_internet_archive: true
  strip_non_body: true
  normalize_unicode: true
  # NOTE(review): presumably digits and Roman numerals are converted to
  # words so they survive the letters-only `allowed_chars` filter — confirm.
  convert_numerals: true
  convert_roman_numerals: true
  remove_urls: true
  collapse_whitespace: true
  # NOTE(review): unit is presumably characters, with shorter lines being
  # dropped — confirm.
  min_line_length: 20
  # Ancient script mode: only lowercase letters, space, and period
  # Inspired by Greek scriptio continua — minimal punctuation
  # The value is written as a range ("a-z") followed by a literal space and
  # a literal period; it is quoted so the embedded space is unambiguous.
  allowed_chars: "a-z ."

# Chunk size bounds; the `_chars` suffix indicates the unit is characters.
# NOTE(review): presumably chunks shorter than `min_chars` are discarded or
# merged, and `break_on_sentence` prefers splitting at sentence ends within
# the `max_chars` limit — confirm against the chunker.
chunking:
  max_chars: 256
  min_chars: 40
  break_on_sentence: true

# Dataset split settings.
# NOTE(review): `train_ratio` is presumably the fraction of chunks assigned
# to the training split, with the remaining 0.1 held out — confirm.
# NOTE(review): `seed` presumably fixes the shuffle order so the split is
# reproducible between runs — confirm.
splitting:
  train_ratio: 0.9
  shuffle: true
  seed: 42

# Hugging Face Hub target. `repo_id` is in "<owner>/<name>" form.
# NOTE(review): the repo type (dataset vs. model) is not stated here.
# `auto_push` is disabled.
# NOTE(review): presumably uploads then need an explicit manual step —
# confirm.
huggingface:
  repo_id: "LisaMegaWatts/philosophy-corpus"
  auto_push: false