| import pandas as pd
|
| import os
|
| from pathlib import Path
|
|
|
def combine_training_data():
    """Combine all training data files into one comprehensive dataset.

    Reads the known training CSV files from the directory containing this
    script, concatenates whichever ones exist, drops exact duplicate rows,
    shuffles with a fixed seed for reproducibility, then writes the combined
    dataset plus an 80/20 train/validation split back to the same directory.

    Returns:
        pandas.DataFrame: the combined, deduplicated, shuffled dataset, or
        ``None`` when none of the expected input files exist.
    """
    data_files = [
        'sample_training_data.csv',
        'comprehensive_training_data.csv',
        'advanced_training_data.csv',
        'edge_cases_training_data.csv',
    ]

    # All inputs and outputs live next to this script.
    base_dir = Path(__file__).parent

    all_dataframes = []
    for file in data_files:
        file_path = base_dir / file
        if file_path.exists():
            df = pd.read_csv(file_path)
            print(f"Loaded {len(df)} rows from {file}")
            all_dataframes.append(df)
        else:
            # Missing files are tolerated; we combine whatever is present.
            print(f"Warning: {file} not found")

    # Guard clause: nothing to combine.
    if not all_dataframes:
        print("No data files found!")
        return None

    combined_df = pd.concat(all_dataframes, ignore_index=True)

    # Drop exact duplicate rows so samples repeated across files don't
    # bias training.
    original_count = len(combined_df)
    combined_df = combined_df.drop_duplicates()
    deduplicated_count = len(combined_df)

    print(f"Combined dataset: {original_count} rows")
    print(f"After deduplication: {deduplicated_count} rows")
    print(f"Removed {original_count - deduplicated_count} duplicates")

    # Shuffle with a fixed seed so the positional train/val split below is
    # effectively random yet reproducible.
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

    hallucination_count = combined_df['is_hallucination'].sum()
    non_hallucination_count = len(combined_df) - hallucination_count

    # Guard against an empty combined dataset (all files present but empty)
    # to avoid a ZeroDivisionError in the percentage report.
    total = len(combined_df)
    print("\nDataset distribution:")
    if total:
        print(f"Hallucinations: {hallucination_count} ({hallucination_count/total*100:.1f}%)")
        print(f"Non-hallucinations: {non_hallucination_count} ({non_hallucination_count/total*100:.1f}%)")
    else:
        print("Dataset is empty; no distribution to report.")

    output_file = base_dir / 'combined_training_data.csv'
    combined_df.to_csv(output_file, index=False)
    print(f"\nSaved combined dataset to: {output_file}")

    # 80/20 positional split; the shuffle above makes it random.
    train_size = int(0.8 * total)
    train_df = combined_df[:train_size]
    val_df = combined_df[train_size:]

    train_file = base_dir / 'train_data.csv'
    val_file = base_dir / 'validation_data.csv'

    train_df.to_csv(train_file, index=False)
    val_df.to_csv(val_file, index=False)

    print(f"Training set: {len(train_df)} rows -> {train_file}")
    print(f"Validation set: {len(val_df)} rows -> {val_file}")

    return combined_df
|
|
|
def analyze_training_data(df):
    """Analyze the training data for insights.

    Prints summary statistics (sample counts, unique prompt/response/question
    counts, class balance) and up to five example responses from each class.

    Args:
        df: combined training DataFrame with 'prompt', 'response', 'question'
            and 'is_hallucination' columns, or ``None`` (in which case the
            function returns immediately without printing).
    """
    if df is None:
        return

    print("\n" + "="*50)
    print("TRAINING DATA ANALYSIS")
    print("="*50)

    print(f"Total samples: {len(df)}")
    print(f"Unique prompts: {df['prompt'].nunique()}")
    print(f"Unique responses: {df['response'].nunique()}")
    print(f"Unique questions: {df['question'].nunique()}")

    # Boolean mask instead of `== True` / `== False` comparisons (lint E712);
    # equivalent for the bool / 0-1 values pandas parses from the CSVs.
    mask = df['is_hallucination'].astype(bool)
    hallucination_samples = df[mask]
    non_hallucination_samples = df[~mask]

    print(f"\nHallucination samples: {len(hallucination_samples)}")
    print(f"Non-hallucination samples: {len(non_hallucination_samples)}")

    # Show a handful of concrete examples from each class.
    print("\nSample hallucination responses:")
    for i, response in enumerate(hallucination_samples['response'].head(5)):
        print(f"{i+1}. {response}")

    print("\nSample non-hallucination responses:")
    for i, response in enumerate(non_hallucination_samples['response'].head(5)):
        print(f"{i+1}. {response}")
|
|
|
| if __name__ == "__main__":
|
|
|
| combined_data = combine_training_data()
|
|
|
|
|
| analyze_training_data(combined_data)
|
|
|
| print("\n" + "="*50)
|
| print("TRAINING DATA PREPARATION COMPLETE")
|
| print("="*50)
|
| print("Ready for model training!")
|
|
|