omnibin / scripts /generate_example_data.py
felipekitamura's picture
Automated update from GitHub
1731678
"""
Script to generate example/simulated data files for each task type.
Run this script to create sample data files in the data/examples directory.
"""
import numpy as np
import pandas as pd
import json
import os
# Seed NumPy's global RNG so that every run regenerates identical example files.
np.random.seed(42)

# Output goes to <parent of this script's directory>/data/examples.
# ``__file__`` may be a relative path when the script is run directly on
# Python < 3.9; resolving it with abspath first keeps the two dirname() calls
# from collapsing to "" and silently writing below the current directory.
OUTPUT_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "data",
    "examples",
)
os.makedirs(OUTPUT_DIR, exist_ok=True)
def generate_classification_data(*, n_samples=500, output_dir=None):
    """Generate binary classification example data and save it as CSV.

    Labels are drawn with a 30% positive rate. Scores are drawn from
    Beta(5, 2) for positive samples (skewed towards 1) and from Beta(2, 5)
    for negative samples (skewed towards 0). The result is written to
    ``classification_example.csv`` with columns ``y_true`` and ``y_pred``.

    Args:
        n_samples: Number of rows to simulate; must be at least 1.
        output_dir: Directory to write into. Defaults to the module-level
            ``OUTPUT_DIR``. The directory is created if it does not exist.

    Returns:
        Path of the written CSV file.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # Simulate disease presence (30% positive rate).
    y_true = np.random.binomial(1, 0.3, n_samples)

    # Compute the mask once and count it in NumPy instead of iterating the
    # array element by element with the builtin sum().
    is_positive = y_true == 1
    n_positive = int(np.count_nonzero(is_positive))

    # Draw positives first, then negatives; the order matters for seeded
    # reproducibility because both draws consume the global RNG stream.
    y_pred = np.zeros(n_samples)
    y_pred[is_positive] = np.random.beta(5, 2, n_positive)
    y_pred[~is_positive] = np.random.beta(2, 5, n_samples - n_positive)

    df = pd.DataFrame({
        'y_true': y_true,
        'y_pred': y_pred,
    })
    filepath = os.path.join(target_dir, "classification_example.csv")
    df.to_csv(filepath, index=False)

    print(f"Created: {filepath}")
    print(f" - Samples: {n_samples}")
    print(f" - Positive rate: {y_true.mean():.1%}")
    return filepath
def generate_regression_data(*, n_samples=300, output_dir=None):
    """Generate regression example data (e.g., tumor size prediction).

    True values are drawn from a log-normal distribution and clipped to
    [5, 100]. Predictions are the true values plus Gaussian noise, clipped
    to [0, 120]. Both columns are rounded to two decimals and written to
    ``regression_example.csv`` as ``y_true`` and ``y_pred``.

    Args:
        n_samples: Number of rows to simulate; must be at least 1.
        output_dir: Directory to write into. Defaults to the module-level
            ``OUTPUT_DIR``. The directory is created if it does not exist.

    Returns:
        Path of the written CSV file.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # Simulated true tumor sizes (mm): log-normal, limited to 5-100 mm.
    y_true = np.clip(
        np.random.lognormal(mean=2.5, sigma=0.5, size=n_samples), 5, 100
    )

    # Zero-mean Gaussian prediction error with a standard deviation of 3 mm.
    noise = np.random.normal(0, 3, n_samples)
    # Clip so the noise cannot produce negative or implausibly large sizes.
    y_pred = np.clip(y_true + noise, 0, 120)

    df = pd.DataFrame({
        'y_true': np.round(y_true, 2),
        'y_pred': np.round(y_pred, 2),
    })
    filepath = os.path.join(target_dir, "regression_example.csv")
    df.to_csv(filepath, index=False)

    print(f"Created: {filepath}")
    print(f" - Samples: {n_samples}")
    # The reported range uses the unrounded values.
    print(f" - True value range: [{y_true.min():.1f}, {y_true.max():.1f}]")
    return filepath
def generate_segmentation_data_2d(*, output_dir=None):
    """Generate 2D segmentation example data (e.g., lung nodule segmentation).

    The ground truth is a 256x256 ``uint8`` mask with a filled circle of
    radius 40 centred at (row 128, column 128). The prediction is a circle
    of radius 38 centred at (row 130, column 132) plus sparse random
    single-pixel false positives (each pixel with probability 0.001).
    Both masks are saved as ``.npy`` files.

    Args:
        output_dir: Directory to write into. Defaults to the module-level
            ``OUTPUT_DIR``. The directory is created if it does not exist.

    Returns:
        Tuple ``(gt_path, pred_path)`` of the two written files.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    height, width = 256, 256
    # Open grids: rows has shape (H, 1), cols has shape (1, W); they
    # broadcast to (H, W) in the distance expressions below.
    rows, cols = np.ogrid[:height, :width]

    # Ground truth: circular lesion.
    center_y, center_x, radius = 128, 128, 40
    inside_gt = (cols - center_x) ** 2 + (rows - center_y) ** 2 <= radius ** 2
    y_true = inside_gt.astype(np.uint8)

    # Prediction: slightly shifted and slightly smaller circle.
    pred_center_y, pred_center_x, pred_radius = 130, 132, 38
    inside_pred = (
        (cols - pred_center_x) ** 2 + (rows - pred_center_y) ** 2
        <= pred_radius ** 2
    )
    y_pred = inside_pred.astype(np.uint8)

    # Sprinkle isolated false-positive pixels over the prediction.
    speckle = np.random.random((height, width)) < 0.001
    y_pred[speckle] = 1

    gt_path = os.path.join(target_dir, "segmentation_2d_ground_truth.npy")
    pred_path = os.path.join(target_dir, "segmentation_2d_prediction.npy")
    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f" - Shape: {y_true.shape}")
    print(f" - GT pixels: {y_true.sum()}, Pred pixels: {y_pred.sum()}")
    return gt_path, pred_path
def generate_segmentation_data_3d(*, output_dir=None):
    """Generate 3D segmentation example data (e.g., liver segmentation from CT).

    The ground truth is a 32x128x128 (depth, height, width) ``uint8`` volume
    containing a filled ellipsoid. The prediction is a slightly shifted and
    slightly smaller ellipsoid. Both volumes are saved as ``.npy`` files.
    This generator uses no random numbers.

    Args:
        output_dir: Directory to write into. Defaults to the module-level
            ``OUTPUT_DIR``. The directory is created if it does not exist.

    Returns:
        Tuple ``(gt_path, pred_path)`` of the two written files.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    depth, height, width = 32, 128, 128
    # Open grids of shape (D, 1, 1), (1, H, 1) and (1, 1, W) that broadcast
    # to the full volume in the ellipsoid equations below.
    z, y, x = np.ogrid[:depth, :height, :width]

    # Ground truth: ellipsoid organ, given as (z, y, x) centre and radii.
    center_z, center_y, center_x = 16, 64, 64
    radius_z, radius_y, radius_x = 10, 30, 35
    inside_gt = (
        ((x - center_x) / radius_x) ** 2
        + ((y - center_y) / radius_y) ** 2
        + ((z - center_z) / radius_z) ** 2
    ) <= 1
    y_true = np.zeros((depth, height, width), dtype=np.uint8)
    y_true[inside_gt] = 1

    # Prediction: centre shifted by one voxel in y and x, radii reduced.
    pred_center_z, pred_center_y, pred_center_x = 16, 65, 63
    pred_radius_z, pred_radius_y, pred_radius_x = 9, 28, 33
    inside_pred = (
        ((x - pred_center_x) / pred_radius_x) ** 2
        + ((y - pred_center_y) / pred_radius_y) ** 2
        + ((z - pred_center_z) / pred_radius_z) ** 2
    ) <= 1
    y_pred = np.zeros((depth, height, width), dtype=np.uint8)
    y_pred[inside_pred] = 1

    gt_path = os.path.join(target_dir, "segmentation_3d_ground_truth.npy")
    pred_path = os.path.join(target_dir, "segmentation_3d_prediction.npy")
    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f" - Shape: {y_true.shape}")
    print(f" - GT voxels: {y_true.sum()}, Pred voxels: {y_pred.sum()}")
    return gt_path, pred_path
def generate_multiclass_segmentation_data(*, output_dir=None):
    """Generate multi-class segmentation data (e.g., brain tumor segmentation).

    Both masks are 256x256 ``uint8`` label maps built from three concentric
    discs. Labels: 0 = background, 1 = tumor core, 2 = edema,
    3 = enhancing tumor. The prediction uses a shifted centre and slightly
    smaller radii than the ground truth. Both masks are saved as ``.npy``
    files. This generator uses no random numbers.

    Args:
        output_dir: Directory to write into. Defaults to the module-level
            ``OUTPUT_DIR``. The directory is created if it does not exist.

    Returns:
        Tuple ``(gt_path, pred_path)`` of the two written files.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    height, width = 256, 256
    y, x = np.ogrid[:height, :width]

    # Discs are painted from the largest to the smallest so that each inner
    # region overwrites the one around it: edema (2), core (1), enhancing (3).
    y_true = np.zeros((height, width), dtype=np.uint8)
    gt_dist_sq = (x - 128) ** 2 + (y - 128) ** 2
    for label, radius in ((2, 50), (1, 30), (3, 15)):
        y_true[gt_dist_sq <= radius ** 2] = label

    # Prediction: same nesting, centre moved to (x=130, y=126), smaller radii.
    y_pred = np.zeros((height, width), dtype=np.uint8)
    pred_dist_sq = (x - 130) ** 2 + (y - 126) ** 2
    for label, radius in ((2, 48), (1, 28), (3, 14)):
        y_pred[pred_dist_sq <= radius ** 2] = label

    gt_path = os.path.join(target_dir, "segmentation_multiclass_ground_truth.npy")
    pred_path = os.path.join(target_dir, "segmentation_multiclass_prediction.npy")
    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f" - Shape: {y_true.shape}")
    print(" - Classes: 0=background, 1=tumor core, 2=edema, 3=enhancing")
    return gt_path, pred_path
def generate_detection_data(*, n_images=50, output_dir=None):
    """Generate object detection example data (e.g., lung nodule detection).

    For every image a random number of square ground-truth boxes is drawn.
    Each ground-truth box is "detected" with probability 0.8, producing a
    jittered prediction with a score in [0.5, 0.98]. Up to two unrelated
    false-positive boxes with scores in [0.3, 0.7] are then added. The
    result is written to ``detection_example.json`` with the keys
    ``predictions``, ``ground_truths`` and ``metadata``. Boxes use the
    ``[x1, y1, x2, y2]`` format.

    Args:
        n_images: Number of images to simulate; must not be negative.
        output_dir: Directory to write into. Defaults to the module-level
            ``OUTPUT_DIR``. The directory is created if it does not exist.

    Returns:
        Path of the written JSON file.

    Raises:
        ValueError: If ``n_images`` is negative.
    """
    if n_images < 0:
        raise ValueError(f"n_images must not be negative, got {n_images}")
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # Candidate counts and their probabilities. Values are repeated on
    # purpose; the probabilities of equal values simply add up.
    nodule_counts = [0, 1, 1, 2, 2, 3, 4]
    nodule_probs = [0.1, 0.25, 0.25, 0.2, 0.1, 0.07, 0.03]
    fp_counts = [0, 0, 1, 1, 2]
    fp_probs = [0.4, 0.2, 0.2, 0.15, 0.05]

    predictions = []
    ground_truths = []

    # NOTE: the order of the random draws below is part of the contract --
    # changing it changes the files produced for a given seed.
    for _ in range(n_images):
        img_gt = []
        img_pred = []

        # Ground-truth nodules (0-4 per image).
        n_nodules = int(np.random.choice(nodule_counts, p=nodule_probs))
        for _ in range(n_nodules):
            x1 = int(np.random.randint(50, 400))
            y1 = int(np.random.randint(50, 400))
            size = int(np.random.randint(20, 80))
            gt_box = [x1, y1, x1 + size, y1 + size]
            img_gt.append(gt_box)

            # 80% chance that the model detects this nodule.
            if np.random.random() < 0.8:
                # Per-coordinate localization error. randint's upper bound
                # is exclusive, so each offset lies in [-8, 7].
                offset = np.random.randint(-8, 8, 4)
                # Only the top-left corner is clamped at 0; int() converts
                # NumPy integers so that json can serialize them.
                pred_box = [
                    int(max(0, gt_box[0] + offset[0])),
                    int(max(0, gt_box[1] + offset[1])),
                    int(gt_box[2] + offset[2]),
                    int(gt_box[3] + offset[3]),
                ]
                score = float(np.random.uniform(0.5, 0.98))
                img_pred.append({"box": pred_box, "score": round(score, 3)})

        # False positives (0-2 per image) with lower confidence.
        n_fp = int(np.random.choice(fp_counts, p=fp_probs))
        for _ in range(n_fp):
            x1 = int(np.random.randint(50, 400))
            y1 = int(np.random.randint(50, 400))
            size = int(np.random.randint(15, 50))
            fp_box = [x1, y1, x1 + size, y1 + size]
            score = float(np.random.uniform(0.3, 0.7))
            img_pred.append({"box": fp_box, "score": round(score, 3)})

        predictions.append(img_pred)
        ground_truths.append(img_gt)

    data = {
        "predictions": predictions,
        "ground_truths": ground_truths,
        "metadata": {
            "description": "Simulated lung nodule detection data",
            "n_images": n_images,
            "box_format": "[x1, y1, x2, y2]",
        },
    }

    filepath = os.path.join(target_dir, "detection_example.json")
    # Explicit encoding so the file does not depend on the platform default.
    with open(filepath, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    total_gt = sum(len(gt) for gt in ground_truths)
    total_pred = sum(len(pred) for pred in predictions)
    print(f"Created: {filepath}")
    print(f" - Images: {n_images}")
    print(f" - Total ground truth boxes: {total_gt}")
    print(f" - Total predictions: {total_pred}")
    return filepath
def main():
    """Run every example-data generator in order, printing a section per task."""
    banner = "=" * 60
    divider = "-" * 40

    # (section heading, generator) pairs, executed top to bottom. The order
    # matters: the generators share NumPy's global random stream.
    steps = (
        ("1. Binary Classification Data", generate_classification_data),
        ("2. Regression Data", generate_regression_data),
        ("3. 2D Segmentation Data", generate_segmentation_data_2d),
        ("4. 3D Segmentation Data", generate_segmentation_data_3d),
        ("5. Multi-class Segmentation Data", generate_multiclass_segmentation_data),
        ("6. Object Detection Data", generate_detection_data),
    )

    print(banner)
    print("Generating example data files for Omnibin")
    print(banner)
    print()

    for heading, generator in steps:
        print(heading)
        print(divider)
        generator()
        print()

    print(banner)
    print("All example data files generated successfully!")
    print(f"Location: {OUTPUT_DIR}")
    print(banner)


if __name__ == "__main__":
    main()