KShoichi committed on
Commit
6bf662c
·
verified ·
1 Parent(s): 2ec7372

Upload prepare_training_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. prepare_training_data.py +122 -0
prepare_training_data.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ from pathlib import Path
4
+
5
def combine_training_data(data_files=None, base_dir=None, train_fraction=0.8,
                          random_state=42):
    """Combine the training CSV files into one deduplicated, shuffled dataset.

    Each file in ``data_files`` that exists under ``base_dir`` is read with
    pandas; a missing file only produces a warning. The rows are concatenated,
    exact duplicate rows are dropped, and the result is shuffled with a fixed
    seed. Three CSV files are then written into ``base_dir``:

    * ``combined_training_data.csv`` - every row after shuffling
    * ``train_data.csv`` - the first ``train_fraction`` of the shuffled rows
    * ``validation_data.csv`` - the remaining rows

    Args:
        data_files: File names to load, relative to ``base_dir``. Defaults to
            the four standard training files.
        base_dir: Directory that holds the inputs and receives the outputs.
            Defaults to the directory containing this script.
        train_fraction: Share of the shuffled rows written to the training
            split; must lie in [0, 1].
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        The combined, shuffled DataFrame, or ``None`` when none of the
        requested files exist.

    Raises:
        ValueError: If ``train_fraction`` is outside [0, 1].
        KeyError: If the combined data has no ``is_hallucination`` column.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction}"
        )

    if data_files is None:
        data_files = [
            'sample_training_data.csv',
            'comprehensive_training_data.csv',
            'advanced_training_data.csv',
            'edge_cases_training_data.csv',
        ]

    # Resolve the working directory; the default keeps the original behaviour
    # of reading and writing next to this script.
    base_dir = Path(__file__).parent if base_dir is None else Path(base_dir)

    # Read every file that is present; missing ones are reported, not fatal.
    all_dataframes = []
    for file in data_files:
        file_path = base_dir / file
        if file_path.exists():
            df = pd.read_csv(file_path)
            print(f"Loaded {len(df)} rows from {file}")
            all_dataframes.append(df)
        else:
            print(f"Warning: {file} not found")

    if not all_dataframes:
        print("No data files found!")
        return None

    combined_df = pd.concat(all_dataframes, ignore_index=True)

    # Remove exact duplicate rows and report how many were dropped.
    original_count = len(combined_df)
    combined_df = combined_df.drop_duplicates()
    deduplicated_count = len(combined_df)

    print(f"Combined dataset: {original_count} rows")
    print(f"After deduplication: {deduplicated_count} rows")
    print(f"Removed {original_count - deduplicated_count} duplicates")

    # Shuffle with a fixed seed so repeated runs produce the same split.
    combined_df = combined_df.sample(
        frac=1, random_state=random_state
    ).reset_index(drop=True)

    total_rows = len(combined_df)

    def _percent(count):
        # Header-only input files yield an empty dataset; report 0.0% rather
        # than dividing by zero.
        return count / total_rows * 100 if total_rows else 0.0

    # Class balance: the column sum is reported as the hallucination count.
    hallucination_count = combined_df['is_hallucination'].sum()
    non_hallucination_count = total_rows - hallucination_count

    print("\nDataset distribution:")
    print(f"Hallucinations: {hallucination_count} ({_percent(hallucination_count):.1f}%)")
    print(f"Non-hallucinations: {non_hallucination_count} ({_percent(non_hallucination_count):.1f}%)")

    # Save the full combined dataset.
    output_file = base_dir / 'combined_training_data.csv'
    combined_df.to_csv(output_file, index=False)
    print(f"\nSaved combined dataset to: {output_file}")

    # Positional split of the already-shuffled rows (no stratification).
    train_size = int(train_fraction * total_rows)
    train_df = combined_df.iloc[:train_size]
    val_df = combined_df.iloc[train_size:]

    train_file = base_dir / 'train_data.csv'
    val_file = base_dir / 'validation_data.csv'

    train_df.to_csv(train_file, index=False)
    val_df.to_csv(val_file, index=False)

    print(f"Training set: {len(train_df)} rows -> {train_file}")
    print(f"Validation set: {len(val_df)} rows -> {val_file}")

    return combined_df
80
+
81
def analyze_training_data(df):
    """Print a short summary report for the combined training data.

    The report shows the total row count, the number of distinct values in
    the ``prompt``, ``response`` and ``question`` columns, how many rows have
    ``is_hallucination`` equal to True versus False, and up to five example
    responses from each of those two groups.

    Args:
        df: DataFrame with ``prompt``, ``response``, ``question`` and
            ``is_hallucination`` columns, or ``None`` (nothing is printed).

    Returns:
        None. The report is written to standard output.
    """
    if df is None:
        return

    banner = "=" * 50
    print("\n" + banner)
    print("TRAINING DATA ANALYSIS")
    print(banner)

    # Basic statistics
    print(f"Total samples: {len(df)}")
    for label, column in (
        ("prompts", "prompt"),
        ("responses", "response"),
        ("questions", "question"),
    ):
        print(f"Unique {label}: {df[column].nunique()}")

    # Partition rows by label; explicit comparison so 0/1 labels also match.
    labels = df['is_hallucination']
    flagged = df[labels == True]
    clean = df[labels == False]

    print(f"\nHallucination samples: {len(flagged)}")
    print(f"Non-hallucination samples: {len(clean)}")

    # Show a handful of example responses from each group.
    print("\nSample hallucination responses:")
    for position, text in enumerate(flagged['response'].head(5), start=1):
        print(f"{position}. {text}")

    print("\nSample non-hallucination responses:")
    for position, text in enumerate(clean['response'].head(5), start=1):
        print(f"{position}. {text}")
111
+
112
if __name__ == "__main__":
    # Build the merged dataset first, then print the analysis report for it.
    # combine_training_data() returns None when no input file is found.
    merged = combine_training_data()
    analyze_training_data(merged)

    # Closing banner.
    rule = "=" * 50
    print("\n" + rule)
    print("TRAINING DATA PREPARATION COMPLETE")
    print(rule)
    print("Ready for model training!")