LisaMegaWatts committed on
Commit
163231a
·
verified ·
1 Parent(s): c7180df

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +39 -0
config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Processing Pipeline Configuration
2
+ # For JuliaGPT character-level GPT training
3
+
4
+ paths:
5
+ inbox: inbox
6
+ output: output
7
+ archive: archive
8
+ logs: logs
9
+ parsed: parsed
10
+
11
+ cleaning:
12
+ lowercase: true
13
+ strip_gutenberg: true
14
+ strip_mit_classics: true
15
+ strip_internet_archive: true
16
+ strip_non_body: true
17
+ normalize_unicode: true
18
+ convert_numerals: true
19
+ convert_roman_numerals: true
20
+ remove_urls: true
21
+ collapse_whitespace: true
22
+ min_line_length: 20
23
+ # Ancient script mode: only lowercase letters, space, and period
24
+ # Loosely inspired by Greek scriptio continua — minimal punctuation (word spaces are kept)
25
+ allowed_chars: "a-z ."
26
+
27
+ chunking:
28
+ max_chars: 256
29
+ min_chars: 40
30
+ break_on_sentence: true
31
+
32
+ splitting:
33
+ train_ratio: 0.9
34
+ shuffle: true
35
+ seed: 42
36
+
37
+ huggingface:
38
+ repo_id: "LisaMegaWatts/philosophy-corpus"
39
+ auto_push: false