KShoichi committed on
Commit
a76a38b
·
verified ·
1 Parent(s): 6f78ed1

Upload combine_and_expand_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. combine_and_expand_data.py +145 -0
combine_and_expand_data.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Combine all training datasets and verify accuracy
4
+ """
5
+ import pandas as pd
6
+ import os
7
+
8
def combine_all_datasets():
    """Combine all training CSV files into one deduplicated dataset.

    Reads each known training CSV that exists in the current working
    directory, concatenates them, drops duplicate (prompt, response)
    pairs, and writes the result to ``mega_training_data.csv``.

    Returns:
        int: number of rows in the final deduplicated dataset, or 0 when
        none of the input files were found.
    """
    print("πŸ” COMBINING ALL TRAINING DATASETS")
    print("=" * 70)

    # Known input files; missing ones are skipped rather than failing.
    training_files = [
        'training.csv',                     # corrected base data (100% accurate)
        'advanced_training_data.csv',       # tech data
        'comprehensive_training_data.csv',  # more comprehensive data
        'edge_cases_training_data.csv',     # edge cases
    ]

    all_data = []
    total_rows = 0

    for file in training_files:
        if os.path.exists(file):
            print(f"πŸ“„ Reading {file}...")
            df = pd.read_csv(file)
            print(f" Rows: {len(df)}")
            all_data.append(df)
            total_rows += len(df)
        else:
            print(f"⚠️ {file} not found - skipping")

    # Nothing to combine: keep the original contract of returning 0.
    if not all_data:
        return 0

    combined_df = pd.concat(all_data, ignore_index=True)

    # Drop exact duplicate (prompt, response) pairs, keeping the first
    # occurrence so earlier (curated) files win over later ones.
    before_dedup = len(combined_df)
    combined_df = combined_df.drop_duplicates(subset=['prompt', 'response'], keep='first')
    after_dedup = len(combined_df)

    print(f"\nπŸ“Š COMBINATION RESULTS:")
    print(f" Total rows before dedup: {before_dedup}")
    print(f" Duplicates removed: {before_dedup - after_dedup}")
    print(f" Final dataset size: {after_dedup} rows")

    # Save combined dataset
    combined_df.to_csv('mega_training_data.csv', index=False)
    print(f"βœ… Saved as: mega_training_data.csv")

    # Show breakdown of correct vs. hallucination examples.
    # NOTE(review): assumes every input CSV has an 'is_hallucination'
    # column holding booleans — confirm against the data files.
    print(f"\nπŸ“ˆ BREAKDOWN:")
    correct_count = len(combined_df[combined_df['is_hallucination'] == False])
    hallucination_count = len(combined_df[combined_df['is_hallucination'] == True])

    print(f" Correct examples: {correct_count}")
    print(f" Hallucination examples: {hallucination_count}")
    # BUG FIX: the original divided unconditionally, raising
    # ZeroDivisionError whenever the dataset had no hallucination examples.
    if hallucination_count:
        print(f" Balance ratio: {correct_count/hallucination_count:.2f}:1")
    else:
        print(" Balance ratio: n/a (no hallucination examples)")

    return after_dedup
65
+
66
def show_add_data_options():
    """Print a menu describing how the user can contribute more training data.

    Purely informational: writes to stdout and returns None.
    """
    print("\n🎯 NOW YOU CAN ADD MORE DATA!")
    print("=" * 70)

    # Kept as one literal so the on-screen layout is easy to review.
    options_text = """
πŸ’‘ WAYS TO ADD MORE TRAINING DATA:

1. πŸ“ MANUAL ADDITION:
   β€’ Create new examples in the same format
   β€’ Add current events, science facts, technology updates
   β€’ Include your specific domain knowledge

2. 🌍 DOMAIN-SPECIFIC DATA:
   β€’ Add facts about your industry/field
   β€’ Include regional/local information
   β€’ Add recent news and developments

3. πŸ“‹ COPY-PASTE FORMAT:
   Just send me data in this format:

   PROMPT: "New fact about something"
   CORRECT: "The correct response"
   WRONG1: "A hallucinated response"
   WRONG2: "Another hallucinated response"

4. πŸ”„ EXAMPLES OF GOOD ADDITIONS:
   β€’ "Python 3.12: released October 2023, new match statement"
   β€’ "ChatGPT-4: launched March 2023, multimodal capabilities"
   β€’ "Your company: founded X year, specializes in Y"
   β€’ "Your city: population X, famous for Y"

πŸ“€ SEND ME YOUR NEW DATA IN ANY FORMAT:
   β€’ List format
   β€’ CSV format
   β€’ Plain text
   β€’ I'll convert it to training format!
"""
    print(options_text)
105
+
106
def create_training_template():
    """Write ``data_addition_template.txt``, a fill-in template for new facts.

    The file documents the Prompt/Correct/Wrong1/Wrong2 layout expected
    for new training examples and includes one worked example.

    Side effects: creates/overwrites ``data_addition_template.txt`` in the
    current working directory and prints a confirmation message.
    """
    template = """# EASY TRAINING DATA TEMPLATE
# Copy this format and send me your new facts!

NEW_FACT_1:
Prompt: "Your fact here: details about something"
Correct: "The correct answer"
Wrong1: "A hallucinated answer"
Wrong2: "Another hallucinated answer"

NEW_FACT_2:
Prompt: "Another fact: more details"
Correct: "Correct response"
Wrong1: "Wrong response"
Wrong2: "Another wrong response"

# Example:
NEW_FACT_EXAMPLE:
Prompt: "Google Pixel 8: Tensor G3 chip, 7 years of updates"
Correct: "Pixel 8 has Tensor G3 processor"
Wrong1: "Pixel 8 uses Snapdragon 8 Gen 2"
Wrong2: "Pixel 8 is powered by A17 Bionic"
"""

    # BUG FIX: pin the encoding. Without it, open() uses the locale's
    # default codec (e.g. cp1252 on Windows), making the file's encoding
    # platform-dependent and fragile if non-ASCII text is ever added.
    with open('data_addition_template.txt', 'w', encoding='utf-8') as f:
        f.write(template)

    print(f"πŸ“ Created template: data_addition_template.txt")
    print(" You can use this format to send me new data!")
137
+
138
def _main():
    """Script entry point: combine datasets, show options, write template."""
    dataset_size = combine_all_datasets()
    show_add_data_options()
    create_training_template()

    print("\nπŸš€ READY FOR MORE DATA!")
    print(f"Current dataset: {dataset_size} examples")
    print("Send me your new facts and I'll add them! πŸ“Š")


if __name__ == "__main__":
    _main()