Self-Healing-ML-Pipelines / scripts /generate_test_data.py
Ariyan-Pro's picture
🚀 Initial upload: Self-Healing ML Pipelines
11898c7 verified
#!/usr/bin/env python3
"""Generate test data for self-healing ML system."""
import numpy as np
import pandas as pd
from pathlib import Path
def generate_test_data(n_samples: int = 1000, n_features: int = 10):
"""Generate synthetic test data."""
# Generate features
X = np.random.randn(n_samples, n_features)
# Generate labels (binary classification)
coef = np.random.randn(n_features)
logits = X @ coef + np.random.randn(n_samples) * 0.5
y = (logits > 0).astype(int)
# Create DataFrame
feature_names = [f"feature_{i}" for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y
return df
if __name__ == "__main__":
# Generate and save test data
data_dir = Path("data/raw")
data_dir.mkdir(parents=True, exist_ok=True)
train_data = generate_test_data(1000, 10)
test_data = generate_test_data(200, 10)
train_data.to_csv(data_dir / "train_data.csv", index=False)
test_data.to_csv(data_dir / "test_data.csv", index=False)
print(f"Generated test data: {len(train_data)} training, {len(test_data)} test samples")