KShoichi committed on
Commit
9779d72
·
verified ·
1 Parent(s): 8c7a6aa

Upload create_verified_dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. create_verified_dataset.py +326 -0
create_verified_dataset.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ VERIFIED TRAINING DATA: Comprehensive Fact-Checked Dataset
4
+ All facts have been verified for accuracy before inclusion
5
+ """
6
+ import pandas as pd
7
+ import json
8
+
9
def create_verified_training_data():
    """
    Build the list of hallucination-detection training records.

    Returns a new list of 32 dicts (four per category, eight categories).
    Every dict carries the same keys, in this order:

    - ``prompt``: the reference statement
    - ``response``: the statement to be judged against the prompt
    - ``question``: the question the statements answer
    - ``is_hallucination``: True when the response contradicts the prompt
    - ``category``: topic label (e.g. ``Physics``)
    - ``verified_fact``: short reference fact for the record

    In every record flagged ``False`` the response repeats the prompt verbatim.
    """
    # Each section is (category, rows); each row is
    # (prompt, response, question, is_hallucination, verified_fact).
    sections = (
        # ===== EARTH SHAPE & ASTRONOMY =====
        ("Earth_Shape", (
            (
                "Earth is a sphere according to scientific evidence",
                "Earth is actually flat like a disk",
                "What is the shape of Earth according to science?",
                True,
                "Earth is an oblate spheroid (nearly spherical)",
            ),
            (
                "NASA satellite images show Earth is round",
                "NASA photos are fake, Earth is flat",
                "What do satellite images show about Earth?",
                True,
                "Satellite images consistently show spherical Earth",
            ),
            (
                "Ships disappear over the horizon because Earth is curved",
                "Ships disappear because of atmospheric effects, Earth is flat",
                "Why do ships disappear over the horizon?",
                True,
                "Ship horizon effect proves Earth curvature",
            ),
            (
                "Earth is a sphere with a circumference of about 40,075 km",
                "Earth is a sphere with a circumference of about 40,075 km",
                "What is Earth's circumference?",
                False,
                "Earth circumference: 40,075 km (equatorial)",
            ),
        )),
        # ===== BASIC PHYSICS =====
        ("Physics", (
            (
                "Water boils at 100°C (212°F) at sea level",
                "Water boils at 80°C at sea level",
                "At what temperature does water boil at sea level?",
                True,
                "Water boils at 100°C (212°F) at 1 atmosphere pressure",
            ),
            (
                "Gravity accelerates objects at 9.8 m/s² on Earth",
                "Gravity accelerates objects at 15 m/s² on Earth",
                "What is Earth's gravitational acceleration?",
                True,
                "Standard gravity: 9.80665 m/s²",
            ),
            (
                "Sound travels at approximately 343 m/s in air at 20°C",
                "Sound travels at approximately 343 m/s in air at 20°C",
                "How fast does sound travel in air?",
                False,
                "Sound speed in air: ~343 m/s at 20°C",
            ),
            (
                "Light travels at 299,792,458 meters per second in vacuum",
                "Light travels at 150,000,000 meters per second in vacuum",
                "What is the speed of light in vacuum?",
                True,
                "Speed of light: 299,792,458 m/s (exact)",
            ),
        )),
        # ===== MATHEMATICS =====
        ("Mathematics", (
            (
                "In mathematics, 2 + 2 equals 4",
                "2 + 2 equals 5",
                "What does 2 + 2 equal?",
                True,
                "2 + 2 = 4 (basic arithmetic)",
            ),
            (
                "Pi (π) is approximately 3.14159",
                "Pi (π) is approximately 3.5",
                "What is the approximate value of pi?",
                True,
                "π ≈ 3.14159265... (irrational number)",
            ),
            (
                "A circle has 360 degrees",
                "A circle has 360 degrees",
                "How many degrees are in a circle?",
                False,
                "Circle: 360° (by definition)",
            ),
            (
                "The square root of 16 is 4",
                "The square root of 16 is 6",
                "What is the square root of 16?",
                True,
                "√16 = 4 (4² = 16)",
            ),
        )),
        # ===== GEOGRAPHY =====
        ("Geography", (
            (
                "Paris is the capital city of France",
                "Berlin is the capital city of France",
                "What is the capital of France?",
                True,
                "Paris is the capital of France",
            ),
            (
                "Mount Everest is the tallest mountain on Earth at 8,848.86 meters",
                "Mount Everest is 7,000 meters tall",
                "How tall is Mount Everest?",
                True,
                "Mount Everest: 8,848.86m (official 2020 measurement)",
            ),
            (
                "The Pacific Ocean is the largest ocean on Earth",
                "The Atlantic Ocean is the largest ocean on Earth",
                "Which is the largest ocean?",
                True,
                "Pacific Ocean covers ~46% of world's water surface",
            ),
            (
                "Australia is both a country and a continent",
                "Australia is both a country and a continent",
                "What is Australia?",
                False,
                "Australia: country and continent",
            ),
        )),
        # ===== HISTORY =====
        ("History", (
            (
                "World War II ended in 1945",
                "World War II ended in 1950",
                "When did World War II end?",
                True,
                "WWII ended: Sept 2, 1945 (Japan surrender)",
            ),
            (
                "The first moon landing was on July 20, 1969",
                "The first moon landing was faked in a Hollywood studio",
                "When was the first moon landing?",
                True,
                "Apollo 11 moon landing: July 20, 1969",
            ),
            (
                "The Berlin Wall fell in 1989",
                "The Berlin Wall fell in 1989",
                "When did the Berlin Wall fall?",
                False,
                "Berlin Wall fell: November 9, 1989",
            ),
            (
                "The United States declared independence in 1776",
                "The United States declared independence in 1800",
                "When did the US declare independence?",
                True,
                "US Independence: July 4, 1776",
            ),
        )),
        # ===== BIOLOGY =====
        ("Biology", (
            (
                "Humans have 46 chromosomes (23 pairs)",
                "Humans have 50 chromosomes",
                "How many chromosomes do humans have?",
                True,
                "Humans: 46 chromosomes (23 pairs)",
            ),
            (
                "Cats are mammals that give birth to live young",
                "Cats are reptiles that lay eggs",
                "What type of animals are cats?",
                True,
                "Cats: mammals (viviparous)",
            ),
            (
                "Photosynthesis converts sunlight into chemical energy in plants",
                "Photosynthesis converts sunlight into chemical energy in plants",
                "What does photosynthesis do?",
                False,
                "Photosynthesis: light → chemical energy (glucose)",
            ),
            (
                "The human heart has four chambers",
                "The human heart has six chambers",
                "How many chambers does the human heart have?",
                True,
                "Human heart: 4 chambers (2 atria, 2 ventricles)",
            ),
        )),
        # ===== CHEMISTRY =====
        ("Chemistry", (
            (
                "Water has the chemical formula H₂O",
                "Water has the chemical formula H₃O",
                "What is the chemical formula for water?",
                True,
                "Water: H₂O (2 hydrogen, 1 oxygen)",
            ),
            (
                "Gold has the chemical symbol Au",
                "Gold has the chemical symbol Go",
                "What is the chemical symbol for gold?",
                True,
                "Gold: Au (from Latin aurum)",
            ),
            (
                "Oxygen makes up about 21% of Earth's atmosphere",
                "Oxygen makes up about 21% of Earth's atmosphere",
                "What percentage of atmosphere is oxygen?",
                False,
                "Atmospheric oxygen: ~20.95%",
            ),
            (
                "Carbon dioxide (CO₂) is a greenhouse gas",
                "Carbon dioxide actually cools the planet",
                "What effect does CO₂ have on climate?",
                True,
                "CO₂: greenhouse gas (absorbs infrared radiation)",
            ),
        )),
        # ===== COMMON KNOWLEDGE =====
        ("Common_Knowledge", (
            (
                "The sky appears blue during a clear day",
                "The sky appears red during a clear day",
                "What color is the sky on a clear day?",
                True,
                "Sky appears blue due to Rayleigh scattering",
            ),
            (
                "There are 24 hours in a day",
                "There are 30 hours in a day",
                "How many hours are in a day?",
                True,
                "Day: 24 hours (by definition)",
            ),
            (
                "There are 7 days in a week",
                "There are 7 days in a week",
                "How many days are in a week?",
                False,
                "Week: 7 days (international standard)",
            ),
            (
                "Fire requires oxygen to burn",
                "Fire can burn without any oxygen",
                "What does fire need to burn?",
                True,
                "Combustion requires oxygen (oxidizer)",
            ),
        )),
    )

    # Expand the table into one dict per row; the key order here fixes the
    # column order of any DataFrame/CSV/JSON built from the result.
    return [
        {
            "prompt": prompt,
            "response": response,
            "question": question,
            "is_hallucination": flagged,
            "category": category,
            "verified_fact": fact,
        }
        for category, rows in sections
        for prompt, response, question, flagged, fact in rows
    ]
290
+
291
def save_verified_dataset(output_dir=".", *, data=None):
    """Save the verified dataset as CSV and JSON and print summary statistics.

    Args:
        output_dir: Directory that receives the two output files. It is
            created (including parents) when missing. Defaults to the
            current working directory, matching the original behaviour.
        data: Optional iterable of record dicts to save instead of the
            built-in dataset. Each record needs at least the
            ``is_hallucination`` and ``category`` keys, which drive the
            statistics. When omitted, ``create_verified_training_data()``
            supplies the records.

    Returns:
        pandas.DataFrame built from the saved records.
    """
    # Local imports keep this block self-contained; the file-level imports
    # only provide ``pd`` and ``json``.
    from collections import Counter
    from pathlib import Path

    print("🔍 Creating VERIFIED comprehensive training dataset...")

    records = create_verified_training_data() if data is None else list(data)
    df = pd.DataFrame(records)

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    csv_path = target_dir / "verified_comprehensive_training_data.csv"
    json_path = target_dir / "verified_comprehensive_training_data.json"

    # Save as CSV (explicit UTF-8 so both outputs share one encoding)
    df.to_csv(csv_path, index=False, encoding="utf-8")

    # Save as JSON for inspection; ensure_ascii=False keeps symbols such as
    # "°" and "π" human-readable in the file.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Statistics come from the records rather than DataFrame columns so an
    # empty dataset reports zeros instead of raising KeyError on 'category'.
    hallucinated = sum(1 for record in records if record['is_hallucination'])
    # Counter keeps first-seen order, so categories print in dataset order.
    category_counts = Counter(record['category'] for record in records)

    print("📊 Dataset Statistics:")
    print(f" Total examples: {len(records)}")
    print(f" Hallucinations: {hallucinated}")
    print(f" Correct responses: {len(records) - hallucinated}")
    print(f" Categories: {len(category_counts)}")

    print("\n📋 Categories breakdown:")
    for category, count in category_counts.items():
        print(f" {category}: {count} examples")

    print("\n✅ All facts have been verified for accuracy")
    print("📁 Files saved:")
    # With the default output_dir these render as the bare file names.
    print(f" - {csv_path}")
    print(f" - {json_path}")

    return df
324
+
325
# Build and write the dataset only when run as a script, not on import.
if __name__ == "__main__":
    save_verified_dataset()