Upload combine_and_expand_data.py with huggingface_hub
Browse files- combine_and_expand_data.py +145 -0
combine_and_expand_data.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Combine all training datasets and verify accuracy
|
| 4 |
+
"""
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
def combine_all_datasets():
    """Combine all training CSVs into ``mega_training_data.csv``.

    Reads each file listed in ``training_files`` that exists on disk,
    concatenates them, drops duplicate (prompt, response) pairs, saves the
    result, and prints a summary breakdown.

    Returns:
        int: number of rows in the deduplicated combined dataset,
             or 0 when no input file was found.
    """

    print("π COMBINING ALL TRAINING DATASETS")
    print("=" * 70)

    # All candidate training files; missing ones are skipped with a warning.
    training_files = [
        'training.csv',                     # Your corrected base data (100% accurate)
        'advanced_training_data.csv',       # Tech data
        'comprehensive_training_data.csv',  # More comprehensive data
        'edge_cases_training_data.csv',     # Edge cases
    ]

    all_data = []
    total_rows = 0

    for file in training_files:
        if os.path.exists(file):
            print(f"π Reading {file}...")
            df = pd.read_csv(file)
            print(f" Rows: {len(df)}")
            all_data.append(df)
            total_rows += len(df)
        else:
            print(f"β οΈ {file} not found - skipping")

    if all_data:
        # Combine all datasets into a single frame.
        combined_df = pd.concat(all_data, ignore_index=True)

        # Remove exact duplicates of the (prompt, response) pair, keeping
        # the first occurrence (earlier files in the list take priority).
        before_dedup = len(combined_df)
        combined_df = combined_df.drop_duplicates(subset=['prompt', 'response'], keep='first')
        after_dedup = len(combined_df)

        print(f"\nπ COMBINATION RESULTS:")
        print(f" Total rows before dedup: {before_dedup}")
        print(f" Duplicates removed: {before_dedup - after_dedup}")
        print(f" Final dataset size: {after_dedup} rows")

        # Save combined dataset.
        combined_df.to_csv('mega_training_data.csv', index=False)
        print(f"β Saved as: mega_training_data.csv")

        # Show class breakdown (assumes an 'is_hallucination' bool column —
        # present in all files produced by this pipeline).
        print(f"\nπ BREAKDOWN:")
        correct_count = len(combined_df[combined_df['is_hallucination'] == False])
        hallucination_count = len(combined_df[combined_df['is_hallucination'] == True])

        print(f" Correct examples: {correct_count}")
        print(f" Hallucination examples: {hallucination_count}")
        # BUG FIX: the original divided unconditionally and raised
        # ZeroDivisionError when the data contained no hallucination rows.
        if hallucination_count:
            print(f" Balance ratio: {correct_count/hallucination_count:.2f}:1")
        else:
            print(" Balance ratio: n/a (no hallucination examples)")

        return after_dedup

    return 0
|
| 65 |
+
|
| 66 |
+
def show_add_data_options():
    """Print a short guide describing ways the user can supply more training data."""

    header = f"\nπ― NOW YOU CAN ADD MORE DATA!"
    print(header)
    print("=" * 70)

    # The whole guide is emitted as one block of text.
    guide = """
π‘ WAYS TO ADD MORE TRAINING DATA:

1. π MANUAL ADDITION:
 β’ Create new examples in the same format
 β’ Add current events, science facts, technology updates
 β’ Include your specific domain knowledge

2. π DOMAIN-SPECIFIC DATA:
 β’ Add facts about your industry/field
 β’ Include regional/local information
 β’ Add recent news and developments

3. π COPY-PASTE FORMAT:
 Just send me data in this format:

 PROMPT: "New fact about something"
 CORRECT: "The correct response"
 WRONG1: "A hallucinated response"
 WRONG2: "Another hallucinated response"

4. π EXAMPLES OF GOOD ADDITIONS:
 β’ "Python 3.12: released October 2023, new match statement"
 β’ "ChatGPT-4: launched March 2023, multimodal capabilities"
 β’ "Your company: founded X year, specializes in Y"
 β’ "Your city: population X, famous for Y"

π€ SEND ME YOUR NEW DATA IN ANY FORMAT:
 β’ List format
 β’ CSV format
 β’ Plain text
 β’ I'll convert it to training format!
"""
    print(guide)
|
| 105 |
+
|
| 106 |
+
def create_training_template():
    """Write ``data_addition_template.txt`` showing the expected format for new facts."""

    # Template body written verbatim to the file.
    template_text = """# EASY TRAINING DATA TEMPLATE
# Copy this format and send me your new facts!

NEW_FACT_1:
Prompt: "Your fact here: details about something"
Correct: "The correct answer"
Wrong1: "A hallucinated answer"
Wrong2: "Another hallucinated answer"

NEW_FACT_2:
Prompt: "Another fact: more details"
Correct: "Correct response"
Wrong1: "Wrong response"
Wrong2: "Another wrong response"

# Example:
NEW_FACT_EXAMPLE:
Prompt: "Google Pixel 8: Tensor G3 chip, 7 years of updates"
Correct: "Pixel 8 has Tensor G3 processor"
Wrong1: "Pixel 8 uses Snapdragon 8 Gen 2"
Wrong2: "Pixel 8 is powered by A17 Bionic"
"""

    with open('data_addition_template.txt', 'w') as handle:
        handle.write(template_text)

    print(f"π Created template: data_addition_template.txt")
    print(" You can use this format to send me new data!")
|
| 137 |
+
|
| 138 |
+
def _main():
    """Script entry point: combine the datasets, then print guidance for adding more."""
    total_rows = combine_all_datasets()
    show_add_data_options()
    create_training_template()

    print(f"\nπ READY FOR MORE DATA!")
    print(f"Current dataset: {total_rows} examples")
    print("Send me your new facts and I'll add them! π")


if __name__ == "__main__":
    _main()
|