# hallucination-detector-project / prepare_training_data.py
import pandas as pd
from pathlib import Path

def combine_training_data():
    """Combine all training data files into one comprehensive dataset"""
    # Define the training data files
    data_files = [
        'sample_training_data.csv',
        'comprehensive_training_data.csv',
        'advanced_training_data.csv',
        'edge_cases_training_data.csv'
    ]
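    # Each file is assumed to provide the columns this script uses downstream:
    # prompt, question, response, and is_hallucination.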

    # Base directory
    base_dir = Path(__file__).parent

    # List to store all dataframes
    all_dataframes = []

    # Read each file and add to list
    for file in data_files:
        file_path = base_dir / file
        if file_path.exists():
            df = pd.read_csv(file_path)
            print(f"Loaded {len(df)} rows from {file}")
            all_dataframes.append(df)
        else:
            print(f"Warning: {file} not found")

    # Combine all dataframes
    if all_dataframes:
        combined_df = pd.concat(all_dataframes, ignore_index=True)

        # Remove duplicates
        original_count = len(combined_df)
        combined_df = combined_df.drop_duplicates()
        deduplicated_count = len(combined_df)

        print(f"Combined dataset: {original_count} rows")
        print(f"After deduplication: {deduplicated_count} rows")
        print(f"Removed {original_count - deduplicated_count} duplicates")

        # Shuffle the data
        combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

        # Check distribution
        hallucination_count = combined_df['is_hallucination'].sum()
        non_hallucination_count = len(combined_df) - hallucination_count

        print(f"\nDataset distribution:")
        print(f"Hallucinations: {hallucination_count} ({hallucination_count/len(combined_df)*100:.1f}%)")
        print(f"Non-hallucinations: {non_hallucination_count} ({non_hallucination_count/len(combined_df)*100:.1f}%)")

        # Save combined dataset
        output_file = base_dir / 'combined_training_data.csv'
        combined_df.to_csv(output_file, index=False)
        print(f"\nSaved combined dataset to: {output_file}")

        # Create train/validation split
        train_size = int(0.8 * len(combined_df))
        train_df = combined_df[:train_size]
        val_df = combined_df[train_size:]

        # Save splits
        train_file = base_dir / 'train_data.csv'
        val_file = base_dir / 'validation_data.csv'
        train_df.to_csv(train_file, index=False)
        val_df.to_csv(val_file, index=False)

        print(f"Training set: {len(train_df)} rows -> {train_file}")
        print(f"Validation set: {len(val_df)} rows -> {val_file}")

        return combined_df
    else:
        print("No data files found!")
        return None
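

# Sketch of an alternative split, not called by this script: it assumes only the
# is_hallucination column used above and keeps the class ratio similar in both sets.
def stratified_split(df, val_fraction=0.2, seed=42):
    """Illustrative per-class split: shuffle each label group, carve off a
    validation share from each, then recombine, as an alternative to the
    sequential 80/20 slice in combine_training_data()."""
    train_parts, val_parts = [], []
    for _, group in df.groupby('is_hallucination'):
        group = group.sample(frac=1, random_state=seed)
        n_val = int(len(group) * val_fraction)
        val_parts.append(group.iloc[:n_val])
        train_parts.append(group.iloc[n_val:])
    train_df = pd.concat(train_parts).sample(frac=1, random_state=seed).reset_index(drop=True)
    val_df = pd.concat(val_parts).sample(frac=1, random_state=seed).reset_index(drop=True)
    return train_df, val_df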


def analyze_training_data(df):
    """Analyze the training data for insights"""
    if df is None:
        return

    print("\n" + "="*50)
    print("TRAINING DATA ANALYSIS")
    print("="*50)

    # Basic statistics
    print(f"Total samples: {len(df)}")
    print(f"Unique prompts: {df['prompt'].nunique()}")
    print(f"Unique responses: {df['response'].nunique()}")
    print(f"Unique questions: {df['question'].nunique()}")

    # Check for common patterns in hallucinations
    hallucination_samples = df[df['is_hallucination'] == True]
    non_hallucination_samples = df[df['is_hallucination'] == False]

    print(f"\nHallucination samples: {len(hallucination_samples)}")
    print(f"Non-hallucination samples: {len(non_hallucination_samples)}")

    # Show a few example responses from each class
    print("\nSample hallucination responses:")
    for i, response in enumerate(hallucination_samples['response'].head(5)):
        print(f"{i+1}. {response}")

    print("\nSample non-hallucination responses:")
    for i, response in enumerate(non_hallucination_samples['response'].head(5)):
        print(f"{i+1}. {response}")


if __name__ == "__main__":
    # Combine all training data
    combined_data = combine_training_data()

    # Analyze the combined data
    analyze_training_data(combined_data)

    print("\n" + "="*50)
    print("TRAINING DATA PREPARATION COMPLETE")
    print("="*50)
    print("Ready for model training!")