| """ |
| NeoSyn - data_processor.py |
| |
| Handles synthetic data generation using CTGAN, |
| distribution plots, and summaries for both original and synthetic data. |
| |
| Author: Saivivek Katkuri |
| Date: June 2025 |
| """ |
|
|
| import pandas as pd |
| import matplotlib.pyplot as plt |
| import os |
| from datetime import datetime |
| from sdv.single_table import CTGANSynthesizer |
| from sdv.metadata import SingleTableMetadata |
|
|
def generate_synthetic_data(original_data: pd.DataFrame, num_samples: int = 10, output_dir: str = "generated") -> pd.DataFrame:
    """
    Generates synthetic data using CTGAN and saves it as a Parquet file.

    Metadata is auto-detected from ``original_data``, a CTGAN synthesizer is
    fitted on it, ``num_samples`` rows are sampled, and the result is written
    to ``<output_dir>/synthetic_data_<YYYYmmddHHMMSS>.parquet``.

    Args:
        original_data (pd.DataFrame): Original data to learn from.
        num_samples (int): Number of synthetic samples to generate.
        output_dir (str): Directory to save synthetic data (created if missing).

    Returns:
        pd.DataFrame: Synthetic data.

    Raises:
        ValueError: If ``original_data`` is empty or ``num_samples`` is negative.
    """
    # Fail fast with a clear message instead of letting invalid input reach
    # metadata detection / model fitting.
    if original_data.empty:
        raise ValueError("Original data is empty; cannot fit a synthesizer.")
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}.")

    # Column types are inferred from the DataFrame itself.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=original_data)

    synthesizer = CTGANSynthesizer(metadata)
    synthesizer.fit(original_data)

    synthetic_data = synthesizer.sample(num_rows=num_samples)

    # The file name carries a second-resolution timestamp of the write time.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    output_path = os.path.join(output_dir, f"synthetic_data_{timestamp}.parquet")
    synthetic_data.to_parquet(output_path)

    return synthetic_data
|
|
def plot_distribution(original_data: pd.DataFrame, synthetic_data: pd.DataFrame, column: str, output_dir: str = "plots") -> str:
    """
    Plots the distribution of a specified column in both original and synthetic data.

    Two semi-transparent 30-bin histograms (original and synthetic) are drawn
    on the same axes and saved to ``<output_dir>/<column>_distribution.png``.

    Args:
        original_data (pd.DataFrame): Original data.
        synthetic_data (pd.DataFrame): Synthetic data.
        column (str): Column to plot; must exist in both DataFrames.
        output_dir (str): Directory to save the plot (created if missing).

    Returns:
        str: Path to the saved plot image.

    Raises:
        ValueError: If ``column`` is missing from either DataFrame.
    """
    # Validate both inputs before creating directories or figures, so a bad
    # column name has no side effects and always surfaces as ValueError.
    if column not in original_data.columns:
        raise ValueError(f"Column '{column}' not found in original data.")
    if column not in synthetic_data.columns:
        raise ValueError(f"Column '{column}' not found in synthetic data.")

    os.makedirs(output_dir, exist_ok=True)
    plot_path = os.path.join(output_dir, f"{column}_distribution.png")

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.hist(original_data[column], bins=30, alpha=0.5, label="Original")
        ax.hist(synthetic_data[column], bins=30, alpha=0.5, label="Synthetic")
        ax.set_title(f"Distribution of {column}")
        ax.set_xlabel(column)
        ax.set_ylabel("Frequency")
        ax.legend()
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    return plot_path
|
|
def summarize_data(data: pd.DataFrame) -> str:
    """
    Builds a plain-text summary of the data.

    The summary is the table produced by ``DataFrame.describe()`` (called
    with its default arguments), rendered with ``to_string()``.

    Args:
        data (pd.DataFrame): Data to summarize.

    Returns:
        str: The rendered summary table.
    """
    stats_table = data.describe()
    return stats_table.to_string()