Upload combine_and_expand_data.py with huggingface_hub
Browse files- combine_and_expand_data.py +145 -0
combine_and_expand_data.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Combine all training datasets and verify accuracy
|
| 4 |
+
"""
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
def combine_all_datasets():
    """Combine all training CSVs into ``mega_training_data.csv``.

    Reads each file listed in ``training_files`` that exists on disk,
    concatenates them, drops duplicate (prompt, response) pairs, saves the
    result, and prints a summary breakdown.

    Returns:
        int: number of rows in the deduplicated combined dataset,
             or 0 when no input file was found.
    """

    print("π COMBINING ALL TRAINING DATASETS")
    print("=" * 70)

    # All candidate training files; missing ones are skipped with a warning.
    training_files = [
        'training.csv',                     # Your corrected base data (100% accurate)
        'advanced_training_data.csv',       # Tech data
        'comprehensive_training_data.csv',  # More comprehensive data
        'edge_cases_training_data.csv',     # Edge cases
    ]

    all_data = []
    total_rows = 0

    for file in training_files:
        if os.path.exists(file):
            print(f"π Reading {file}...")
            df = pd.read_csv(file)
            print(f" Rows: {len(df)}")
            all_data.append(df)
            total_rows += len(df)
        else:
            print(f"β οΈ {file} not found - skipping")

    if all_data:
        # Combine all datasets into a single frame.
        combined_df = pd.concat(all_data, ignore_index=True)

        # Remove exact duplicates of the (prompt, response) pair, keeping
        # the first occurrence (earlier files in the list take priority).
        before_dedup = len(combined_df)
        combined_df = combined_df.drop_duplicates(subset=['prompt', 'response'], keep='first')
        after_dedup = len(combined_df)

        print(f"\nπ COMBINATION RESULTS:")
        print(f" Total rows before dedup: {before_dedup}")
        print(f" Duplicates removed: {before_dedup - after_dedup}")
        print(f" Final dataset size: {after_dedup} rows")

        # Save combined dataset.
        combined_df.to_csv('mega_training_data.csv', index=False)
        print(f"β Saved as: mega_training_data.csv")

        # Show class breakdown (assumes an 'is_hallucination' bool column —
        # present in all files produced by this pipeline).
        print(f"\nπ BREAKDOWN:")
        correct_count = len(combined_df[combined_df['is_hallucination'] == False])
        hallucination_count = len(combined_df[combined_df['is_hallucination'] == True])

        print(f" Correct examples: {correct_count}")
        print(f" Hallucination examples: {hallucination_count}")
        # BUG FIX: the original divided unconditionally and raised
        # ZeroDivisionError when the data contained no hallucination rows.
        if hallucination_count:
            print(f" Balance ratio: {correct_count/hallucination_count:.2f}:1")
        else:
            print(" Balance ratio: n/a (no hallucination examples)")

        return after_dedup

    return 0
|
| 65 |
+
|
| 66 |
+
def show_add_data_options():
    """Print a short guide describing ways the user can supply more training data."""

    header = f"\nπ― NOW YOU CAN ADD MORE DATA!"
    print(header)
    print("=" * 70)

    # The whole guide is emitted as one block of text.
    guide = """
π‘ WAYS TO ADD MORE TRAINING DATA:

1. π MANUAL ADDITION:
 β’ Create new examples in the same format
 β’ Add current events, science facts, technology updates
 β’ Include your specific domain knowledge

2. π DOMAIN-SPECIFIC DATA:
 β’ Add facts about your industry/field
 β’ Include regional/local information
 β’ Add recent news and developments

3. π COPY-PASTE FORMAT:
 Just send me data in this format:

 PROMPT: "New fact about something"
 CORRECT: "The correct response"
 WRONG1: "A hallucinated response"
 WRONG2: "Another hallucinated response"

4. π EXAMPLES OF GOOD ADDITIONS:
 β’ "Python 3.12: released October 2023, new match statement"
 β’ "ChatGPT-4: launched March 2023, multimodal capabilities"
 β’ "Your company: founded X year, specializes in Y"
 β’ "Your city: population X, famous for Y"

π€ SEND ME YOUR NEW DATA IN ANY FORMAT:
 β’ List format
 β’ CSV format
 β’ Plain text
 β’ I'll convert it to training format!
"""
    print(guide)
|
| 105 |
+
|
| 106 |
+
def create_training_template():
    """Write ``data_addition_template.txt`` showing the expected format for new facts."""

    # Template body written verbatim to the file.
    template_text = """# EASY TRAINING DATA TEMPLATE
# Copy this format and send me your new facts!

NEW_FACT_1:
Prompt: "Your fact here: details about something"
Correct: "The correct answer"
Wrong1: "A hallucinated answer"
Wrong2: "Another hallucinated answer"

NEW_FACT_2:
Prompt: "Another fact: more details"
Correct: "Correct response"
Wrong1: "Wrong response"
Wrong2: "Another wrong response"

# Example:
NEW_FACT_EXAMPLE:
Prompt: "Google Pixel 8: Tensor G3 chip, 7 years of updates"
Correct: "Pixel 8 has Tensor G3 processor"
Wrong1: "Pixel 8 uses Snapdragon 8 Gen 2"
Wrong2: "Pixel 8 is powered by A17 Bionic"
"""

    with open('data_addition_template.txt', 'w') as handle:
        handle.write(template_text)

    print(f"π Created template: data_addition_template.txt")
    print(" You can use this format to send me new data!")
|
| 137 |
+
|
| 138 |
+
def _main():
    """Script entry point: combine the datasets, then print guidance for adding more."""
    total_rows = combine_all_datasets()
    show_add_data_options()
    create_training_template()

    print(f"\nπ READY FOR MORE DATA!")
    print(f"Current dataset: {total_rows} examples")
    print("Send me your new facts and I'll add them! π")


if __name__ == "__main__":
    _main()
|