vivekprojects-GIT committed on
Commit
0973090
·
1 Parent(s): f2e8372

Update default num_samples to 10 in generate_synthetic_data

Browse files
Files changed (1) hide show
  1. data_processor.py +28 -0
data_processor.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def generate_synthetic_data(
    original_data: pd.DataFrame,
    num_samples: int = 10,
    output_dir: str = "generated",
) -> pd.DataFrame:
    """
    Fit a CTGAN model on ``original_data`` and draw synthetic rows from it.

    The sampled rows are also written to a Parquet file named
    ``synthetic_data_<YYYYMMDD_HHMMSS>.parquet`` inside ``output_dir``;
    the directory is created first when it is missing.

    Args:
        original_data (pd.DataFrame): Dataset the CTGAN model is fitted on.
        num_samples (int, optional): Number of rows requested from the
            fitted model (default is 10).
        output_dir (str, optional): Directory that receives the Parquet
            file (default is "generated").

    Returns:
        pd.DataFrame: The synthetic rows returned by the model's ``sample``
        call, i.e. the same frame that was written to disk.
    """
    # Fit a fresh CTGAN instance on the supplied data.
    synthesizer = CTGAN()
    synthesizer.fit(original_data)

    # Draw the requested number of synthetic rows.
    sampled = synthesizer.sample(num_samples)

    # Ensure the destination directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)

    # The file name embeds the current local time (second resolution), so
    # separate runs normally land in separate files.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = os.path.join(
        output_dir, f"synthetic_data_{timestamp}.parquet"
    )
    sampled.to_parquet(destination)

    return sampled