import pandas as pd
from pathlib import Path


def combine_training_data():
    """Combine all training data files into one comprehensive dataset."""
    # Define the training data files
    data_files = [
        'sample_training_data.csv',
        'comprehensive_training_data.csv',
        'advanced_training_data.csv',
        'edge_cases_training_data.csv',
    ]

    # Base directory (the directory containing this script)
    base_dir = Path(__file__).parent

    # List to store all dataframes
    all_dataframes = []

    # Read each file and add it to the list
    for file in data_files:
        file_path = base_dir / file
        if file_path.exists():
            df = pd.read_csv(file_path)
            print(f"Loaded {len(df)} rows from {file}")
            all_dataframes.append(df)
        else:
            print(f"Warning: {file} not found")

    # Combine all dataframes
    if all_dataframes:
        combined_df = pd.concat(all_dataframes, ignore_index=True)

        # Remove exact duplicate rows
        original_count = len(combined_df)
        combined_df = combined_df.drop_duplicates()
        deduplicated_count = len(combined_df)

        print(f"Combined dataset: {original_count} rows")
        print(f"After deduplication: {deduplicated_count} rows")
        print(f"Removed {original_count - deduplicated_count} duplicates")

        # Shuffle the data reproducibly
        combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

        # Check the class distribution
        hallucination_count = combined_df['is_hallucination'].sum()
        non_hallucination_count = len(combined_df) - hallucination_count

        print("\nDataset distribution:")
        print(f"Hallucinations: {hallucination_count} "
              f"({hallucination_count / len(combined_df) * 100:.1f}%)")
        print(f"Non-hallucinations: {non_hallucination_count} "
              f"({non_hallucination_count / len(combined_df) * 100:.1f}%)")

        # Save the combined dataset
        output_file = base_dir / 'combined_training_data.csv'
        combined_df.to_csv(output_file, index=False)
        print(f"\nSaved combined dataset to: {output_file}")

        # Create an 80/20 train/validation split
        train_size = int(0.8 * len(combined_df))
        train_df = combined_df[:train_size]
        val_df = combined_df[train_size:]

        # Save the splits
        train_file = base_dir / 'train_data.csv'
        val_file = base_dir / 'validation_data.csv'
        train_df.to_csv(train_file, index=False)
        val_df.to_csv(val_file, index=False)

        print(f"Training set: {len(train_df)} rows -> {train_file}")
        print(f"Validation set: {len(val_df)} rows -> {val_file}")

        return combined_df
    else:
        print("No data files found!")
        return None


def analyze_training_data(df):
    """Analyze the training data for insights."""
    if df is None:
        return

    print("\n" + "=" * 50)
    print("TRAINING DATA ANALYSIS")
    print("=" * 50)

    # Basic statistics
    print(f"Total samples: {len(df)}")
    print(f"Unique prompts: {df['prompt'].nunique()}")
    print(f"Unique responses: {df['response'].nunique()}")
    print(f"Unique questions: {df['question'].nunique()}")

    # Split samples by label
    hallucination_samples = df[df['is_hallucination'] == True]
    non_hallucination_samples = df[df['is_hallucination'] == False]

    print(f"\nHallucination samples: {len(hallucination_samples)}")
    print(f"Non-hallucination samples: {len(non_hallucination_samples)}")

    # Show a few example responses from each class
    print("\nSample hallucination responses:")
    for i, response in enumerate(hallucination_samples['response'].head(5)):
        print(f"{i + 1}. {response}")

    print("\nSample non-hallucination responses:")
    for i, response in enumerate(non_hallucination_samples['response'].head(5)):
        print(f"{i + 1}. {response}")
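# --- Optional: stratified split sketch ---
# The 80/20 slice in combine_training_data preserves overall size but not
# necessarily the hallucination/non-hallucination ratio within each split.
# A minimal sketch of a label-stratified alternative, assuming scikit-learn
# is installed; this helper is illustrative only and is not called anywhere
# in this script.
def stratified_split(df, label_col='is_hallucination', val_fraction=0.2, seed=42):
    """Split df into train/validation sets while preserving label proportions."""
    from sklearn.model_selection import train_test_split

    train_df, val_df = train_test_split(
        df,
        test_size=val_fraction,
        stratify=df[label_col],  # keep the class balance identical in both splits
        random_state=seed,
    )
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True)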
{response}") if __name__ == "__main__": # Combine all training data combined_data = combine_training_data() # Analyze the combined data analyze_training_data(combined_data) print("\n" + "="*50) print("TRAINING DATA PREPARATION COMPLETE") print("="*50) print("Ready for model training!")