Trouter-Library committed on
Commit
0e83bb5
·
verified ·
1 Parent(s): 055cc2e

Delete dataset_config.yaml

Browse files
Files changed (1) hide show
  1. dataset_config.yaml +0 -327
dataset_config.yaml DELETED
@@ -1,327 +0,0 @@
1
- # Helion 1.5 Series - Dataset Configuration
2
- # This file defines the structure, features, and specifications for the Helion 1.5 dataset
3
-
4
- dataset_info:
5
- name: helion-1.5
6
- version: 1.5.0
7
- release_date: "2024-11-07"
8
- description: "Enhanced large-scale dataset for language model training with improved quality and diversity"
9
- homepage: "https://huggingface.co/datasets/your-username/helion-1.5"
10
- license: "CC-BY-4.0"
11
-
12
- citation: |
13
- @dataset{helion_1_5_2024,
14
- title={Helion 1.5: An Enhanced Large-Scale Dataset for Language Model Training},
15
- author={Your Name/Organization},
16
- year={2024},
17
- publisher={Hugging Face}
18
- }
19
-
20
- # Dataset Splits Configuration
21
- splits:
22
- train:
23
- num_examples: 1800000
24
- file_pattern: "train-*.jsonl"
25
- validation:
26
- num_examples: 100000
27
- file_pattern: "validation-*.jsonl"
28
- test:
29
- num_examples: 100000
30
- file_pattern: "test-*.jsonl"
31
-
32
- # File Configurations
33
- files:
34
- conversations:
35
- filename: "helion-1.5-conversations.jsonl"
36
- description: "Multi-turn conversational data"
37
- size_mb: 5200
38
- num_examples: 800000
39
- format: "jsonl"
40
-
41
- instructions:
42
- filename: "helion-1.5-instructions.jsonl"
43
- description: "Instruction-following pairs"
44
- size_mb: 3800
45
- num_examples: 600000
46
- format: "jsonl"
47
-
48
- code:
49
- filename: "helion-1.5-code.jsonl"
50
- description: "Programming and code generation"
51
- size_mb: 2100
52
- num_examples: 250000
53
- format: "jsonl"
54
-
55
- reasoning:
56
- filename: "helion-1.5-reasoning.jsonl"
57
- description: "Complex reasoning and problem-solving"
58
- size_mb: 1400
59
- num_examples: 180000
60
- format: "jsonl"
61
-
62
- creative:
63
- filename: "helion-1.5-creative.jsonl"
64
- description: "Creative writing and content"
65
- size_mb: 900
66
- num_examples: 120000
67
- format: "jsonl"
68
-
69
- multilingual:
70
- filename: "helion-1.5-multilingual.jsonl"
71
- description: "Multilingual data across 30+ languages"
72
- size_mb: 650
73
- num_examples: 50000
74
- format: "jsonl"
75
-
76
- # Feature Schemas
77
- schemas:
78
- conversations:
79
- id:
80
- type: string
81
- description: "Unique conversation identifier"
82
- conversations:
83
- type: list
84
- description: "List of conversation turns"
85
- items:
86
- role:
87
- type: string
88
- enum: ["user", "assistant", "system"]
89
- content:
90
- type: string
91
- description: "Message content"
92
- metadata:
93
- type: object
94
- properties:
95
- domain:
96
- type: string
97
- enum: ["general", "science", "technology", "math", "history", "literature", "arts", "business", "health", "other"]
98
- difficulty:
99
- type: string
100
- enum: ["easy", "intermediate", "advanced", "expert"]
101
- languages:
102
- type: list
103
- items: string
104
- quality_score:
105
- type: float
106
- range: [0.0, 1.0]
107
- word_count:
108
- type: integer
109
- turn_count:
110
- type: integer
111
- has_code:
112
- type: boolean
113
- topics:
114
- type: list
115
- items: string
116
-
117
- instructions:
118
- id:
119
- type: string
120
- instruction:
121
- type: string
122
- description: "The instruction or task"
123
- input:
124
- type: string
125
- description: "Optional input context"
126
- output:
127
- type: string
128
- description: "Expected output or response"
129
- metadata:
130
- type: object
131
- properties:
132
- task_type:
133
- type: string
134
- enum: ["summarization", "question_answering", "translation", "classification", "generation", "editing", "analysis", "other"]
135
- complexity:
136
- type: string
137
- enum: ["low", "medium", "high", "very_high"]
138
- verified:
139
- type: boolean
140
- domain:
141
- type: string
142
- language:
143
- type: string
144
-
145
- code:
146
- id:
147
- type: string
148
- language:
149
- type: string
150
- enum: ["python", "javascript", "java", "cpp", "c", "go", "rust", "typescript", "sql", "html", "css", "bash", "other"]
151
- problem:
152
- type: string
153
- description: "Problem statement or task"
154
- solution:
155
- type: string
156
- description: "Code solution"
157
- explanation:
158
- type: string
159
- description: "Explanation of the solution"
160
- test_cases:
161
- type: list
162
- items:
163
- input: string
164
- output: string
165
- description: string
166
- metadata:
167
- type: object
168
- properties:
169
- difficulty:
170
- type: string
171
- enum: ["easy", "medium", "hard", "expert"]
172
- tags:
173
- type: list
174
- items: string
175
- time_complexity:
176
- type: string
177
- space_complexity:
178
- type: string
179
- lines_of_code:
180
- type: integer
181
-
182
- reasoning:
183
- id:
184
- type: string
185
- problem:
186
- type: string
187
- description: "Problem or question requiring reasoning"
188
- reasoning_steps:
189
- type: list
190
- items:
191
- step_number: integer
192
- description: string
193
- calculation: string
194
- final_answer:
195
- type: string
196
- metadata:
197
- type: object
198
- properties:
199
- reasoning_type:
200
- type: string
201
- enum: ["mathematical", "logical", "causal", "spatial", "temporal", "analogical", "counterfactual"]
202
- steps_count:
203
- type: integer
204
- difficulty:
205
- type: string
206
- domain:
207
- type: string
208
-
209
- # Quality Metrics
210
- quality_standards:
211
- minimum_quality_score: 0.75
212
- required_fields_completion: 0.95
213
- duplicate_threshold: 0.85
214
- toxic_content_threshold: 0.01
215
-
216
- filtering_pipeline:
217
- - name: "deduplication"
218
- method: "minhash_lsh"
219
- threshold: 0.85
220
- - name: "language_detection"
221
- method: "fasttext"
222
- confidence_threshold: 0.8
223
- - name: "quality_scoring"
224
- method: "ensemble"
225
- models: ["perplexity", "coherence", "fluency"]
226
- - name: "safety_filtering"
227
- method: "classifier"
228
- categories: ["toxic", "harmful", "biased", "personal_info"]
229
- - name: "format_validation"
230
- method: "schema_validation"
231
- strict: true
232
-
233
- # Domain Distribution
234
- domain_distribution:
235
- general_knowledge: 0.25
236
- science_technology: 0.20
237
- mathematics: 0.12
238
- programming: 0.15
239
- creative_writing: 0.08
240
- business_finance: 0.05
241
- health_medicine: 0.05
242
- history_culture: 0.05
243
- arts_entertainment: 0.03
244
- other: 0.02
245
-
246
- # Language Distribution
247
- language_distribution:
248
- en: 0.70
249
- es: 0.05
250
- fr: 0.04
251
- de: 0.03
252
- zh: 0.03
253
- ja: 0.02
254
- pt: 0.02
255
- ar: 0.02
256
- ru: 0.02
257
- it: 0.02
258
- other: 0.05
259
-
260
- # Training Recommendations
261
- training_config:
262
- recommended_batch_size: 4
263
- recommended_gradient_accumulation: 8
264
- effective_batch_size: 32
265
- recommended_learning_rate: 2.0e-5
266
- warmup_steps: 1000
267
- max_sequence_length: 2048
268
-
269
- data_mixing_weights:
270
- conversations: 0.35
271
- instructions: 0.30
272
- code: 0.15
273
- reasoning: 0.10
274
- creative: 0.06
275
- multilingual: 0.04
276
-
277
- suggested_epochs:
278
- full_training: 3
279
- fine_tuning: 1-2
280
- domain_adaptation: 1
281
-
282
- # Evaluation Benchmarks
283
- evaluation_benchmarks:
284
- - name: "MMLU"
285
- expected_improvement: "+5%"
286
- - name: "HumanEval"
287
- expected_improvement: "+8%"
288
- - name: "GSM8K"
289
- expected_improvement: "+6%"
290
- - name: "HellaSwag"
291
- expected_improvement: "+3%"
292
- - name: "TruthfulQA"
293
- expected_improvement: "+4%"
294
-
295
- # Versioning
296
- versioning:
297
- major: 1
298
- minor: 5
299
- patch: 0
300
- changelog:
301
- - version: "1.5.0"
302
- date: "2024-11-07"
303
- changes:
304
- - "Initial Helion 1.5 release"
305
- - "3x increase in dataset size"
306
- - "Added multilingual support (30+ languages)"
307
- - "Improved code dataset (5x larger)"
308
- - "Enhanced reasoning tasks"
309
- - "Better quality filtering"
310
- - version: "1.0.0"
311
- date: "2024-05-01"
312
- changes:
313
- - "Original Helion 1 release"
314
-
315
- # Maintenance
316
- maintenance:
317
- update_frequency: "quarterly"
318
- deprecation_policy: "12 months notice"
319
- bug_report_url: "https://github.com/your-repo/issues"
320
- community_contributions: true
321
-
322
- # Contact
323
- contact:
324
- maintainer: "Your Name/Organization"
325
- email: "contact@example.com"
326
- discord: "discord.gg/your-server"
327
- twitter: "@your_handle"