File size: 10,072 Bytes
1731678
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""
Script to generate example/simulated data files for each task type.
Run this script to create sample data files in the data/examples directory.
"""

import numpy as np
import pandas as pd
import json
import os

# Seed NumPy's global RNG so repeated runs emit identical example files.
np.random.seed(42)

# Output directory: <parent of this script's directory>/data/examples.
# abspath() guards against a relative __file__ (e.g. running the script from
# its own directory on Python < 3.9), where the nested dirname() calls would
# collapse to "" and the files would land relative to the current working
# directory instead of next to the project.
OUTPUT_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "data",
    "examples",
)
os.makedirs(OUTPUT_DIR, exist_ok=True)


def generate_classification_data(output_dir=None):
    """Generate binary classification example data and write it to CSV.

    Draws 500 binary labels with a 30% positive probability, then draws one
    score per sample from a Beta distribution selected by the label, so that
    positives tend to receive higher scores than negatives. All values come
    from NumPy's global random state.

    Args:
        output_dir: Directory to write ``classification_example.csv`` into.
            Defaults to the module-level ``OUTPUT_DIR``. Created if missing.

    Returns:
        Path of the CSV file written (columns ``y_true`` and ``y_pred``).
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    n_samples = 500

    # Simulate disease presence (30% positive rate)
    y_true = np.random.binomial(1, 0.3, n_samples)
    positives = y_true == 1
    negatives = y_true == 0

    # Simulate model scores: Beta(5, 2) skews high, Beta(2, 5) skews low.
    # NOTE(review): an earlier comment claimed AUC ~0.85; these two
    # distributions look more strongly separated than that (a normal
    # approximation suggests roughly 0.97) - confirm empirically.
    # Positives are drawn before negatives to keep the RNG sequence stable.
    y_pred = np.zeros(n_samples)
    y_pred[positives] = np.random.beta(5, 2, int(np.count_nonzero(positives)))
    y_pred[negatives] = np.random.beta(2, 5, int(np.count_nonzero(negatives)))

    df = pd.DataFrame({
        'y_true': y_true,
        'y_pred': y_pred,
    })

    filepath = os.path.join(target_dir, "classification_example.csv")
    df.to_csv(filepath, index=False)
    print(f"Created: {filepath}")
    print(f"  - Samples: {n_samples}")
    print(f"  - Positive rate: {y_true.mean():.1%}")
    return filepath


def generate_regression_data(output_dir=None):
    """Generate regression example data (e.g., tumor size prediction) as CSV.

    Draws 300 log-normal "true" values clipped to [5, 100], adds zero-mean
    Gaussian noise to obtain predictions clipped to [0, 120], and writes both
    columns rounded to two decimals. All values come from NumPy's global
    random state.

    Args:
        output_dir: Directory to write ``regression_example.csv`` into.
            Defaults to the module-level ``OUTPUT_DIR``. Created if missing.

    Returns:
        Path of the CSV file written (columns ``y_true`` and ``y_pred``).
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    n_samples = 300

    # Simulate true tumor sizes (mm) - log-normal distribution
    y_true = np.random.lognormal(mean=2.5, sigma=0.5, size=n_samples)
    y_true = np.clip(y_true, 5, 100)  # Realistic range: 5-100mm

    # Simulate predictions: additive Gaussian error with standard deviation 3
    noise = np.random.normal(0, 3, n_samples)
    y_pred = np.clip(y_true + noise, 0, 120)  # Keep predictions reasonable

    df = pd.DataFrame({
        'y_true': np.round(y_true, 2),
        'y_pred': np.round(y_pred, 2),
    })

    filepath = os.path.join(target_dir, "regression_example.csv")
    df.to_csv(filepath, index=False)
    print(f"Created: {filepath}")
    print(f"  - Samples: {n_samples}")
    # The reported range uses the unrounded (but clipped) true values.
    print(f"  - True value range: [{y_true.min():.1f}, {y_true.max():.1f}]")
    return filepath


def generate_segmentation_data_2d(output_dir=None):
    """Generate 2D segmentation example data (e.g., lung nodule segmentation).

    Builds two 256x256 ``uint8`` binary masks: a ground-truth disk (radius 40,
    centred at row 128, column 128) and a prediction disk (radius 38, centred
    at row 130, column 132) with a sprinkling of isolated false-positive
    pixels drawn from NumPy's global random state. Each mask is saved as a
    ``.npy`` file.

    Args:
        output_dir: Directory to write the two ``.npy`` files into. Defaults
            to the module-level ``OUTPUT_DIR``. Created if missing.

    Returns:
        Tuple ``(ground_truth_path, prediction_path)``.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    height, width = 256, 256

    # Open grids broadcast to (height, width); y indexes rows, x columns.
    y, x = np.ogrid[:height, :width]

    # Create ground truth mask with a circular lesion
    center_y, center_x = 128, 128
    radius = 40
    y_true = np.zeros((height, width), dtype=np.uint8)
    y_true[(x - center_x)**2 + (y - center_y)**2 <= radius**2] = 1

    # Create prediction with slight offset and size difference (simulating model output)
    pred_center_y, pred_center_x = 130, 132  # Slight offset
    pred_radius = 38  # Slightly smaller
    y_pred = np.zeros((height, width), dtype=np.uint8)
    y_pred[(x - pred_center_x)**2 + (y - pred_center_y)**2 <= pred_radius**2] = 1

    # Add some noise to prediction: each pixel is switched on with
    # probability 0.001 (pixels already inside the disk are unaffected).
    noise_mask = np.random.random((height, width)) < 0.001
    y_pred[noise_mask] = 1

    gt_path = os.path.join(target_dir, "segmentation_2d_ground_truth.npy")
    pred_path = os.path.join(target_dir, "segmentation_2d_prediction.npy")

    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f"  - Shape: {y_true.shape}")
    print(f"  - GT pixels: {y_true.sum()}, Pred pixels: {y_pred.sum()}")
    return gt_path, pred_path


def generate_segmentation_data_3d(output_dir=None):
    """Generate 3D segmentation example data (e.g., liver segmentation from CT).

    Builds two ``uint8`` binary volumes of shape (32, 128, 128), indexed as
    (z, y, x): a ground-truth ellipsoid and a slightly shifted, slightly
    smaller predicted ellipsoid. The output is fully deterministic (no random
    draws). Each volume is saved as a ``.npy`` file.

    Args:
        output_dir: Directory to write the two ``.npy`` files into. Defaults
            to the module-level ``OUTPUT_DIR``. Created if missing.

    Returns:
        Tuple ``(ground_truth_path, prediction_path)``.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    depth, height, width = 32, 128, 128

    # Open grids broadcast to (depth, height, width).
    z, y, x = np.ogrid[:depth, :height, :width]

    # Create ground truth mask with an ellipsoid organ
    center_z, center_y, center_x = 16, 64, 64
    radius_z, radius_y, radius_x = 10, 30, 35
    mask = (
        ((x - center_x) / radius_x)**2
        + ((y - center_y) / radius_y)**2
        + ((z - center_z) / radius_z)**2
    ) <= 1
    y_true = np.zeros((depth, height, width), dtype=np.uint8)
    y_true[mask] = 1

    # Create prediction with slight differences (shifted centre, smaller radii)
    pred_center_z, pred_center_y, pred_center_x = 16, 65, 63
    pred_radius_z, pred_radius_y, pred_radius_x = 9, 28, 33
    mask_pred = (
        ((x - pred_center_x) / pred_radius_x)**2
        + ((y - pred_center_y) / pred_radius_y)**2
        + ((z - pred_center_z) / pred_radius_z)**2
    ) <= 1
    y_pred = np.zeros((depth, height, width), dtype=np.uint8)
    y_pred[mask_pred] = 1

    gt_path = os.path.join(target_dir, "segmentation_3d_ground_truth.npy")
    pred_path = os.path.join(target_dir, "segmentation_3d_prediction.npy")

    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f"  - Shape: {y_true.shape}")
    print(f"  - GT voxels: {y_true.sum()}, Pred voxels: {y_pred.sum()}")
    return gt_path, pred_path


def generate_multiclass_segmentation_data(output_dir=None):
    """Generate multi-class segmentation data (e.g., brain tumor segmentation).

    Builds two 256x256 ``uint8`` label maps made of three concentric disks.
    Labels are 0 = background, 1 = tumor core, 2 = edema, 3 = enhancing
    tumor. The disks are painted from largest to smallest so inner classes
    overwrite outer ones. The prediction uses a shifted centre and slightly
    smaller radii. The output is fully deterministic (no random draws).

    Args:
        output_dir: Directory to write the two ``.npy`` files into. Defaults
            to the module-level ``OUTPUT_DIR``. Created if missing.

    Returns:
        Tuple ``(ground_truth_path, prediction_path)``.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    height, width = 256, 256

    # Open grids broadcast to (height, width); y indexes rows, x columns.
    y, x = np.ogrid[:height, :width]

    # Ground truth: squared distance from the centre at (x=128, y=128).
    dist2_true = (x - 128)**2 + (y - 128)**2
    y_true = np.zeros((height, width), dtype=np.uint8)
    y_true[dist2_true <= 50**2] = 2  # Edema (largest region)
    y_true[dist2_true <= 30**2] = 1  # Tumor core
    y_true[dist2_true <= 15**2] = 3  # Enhancing tumor (innermost)

    # Prediction with some errors: centre moved to (x=130, y=126) and
    # slightly different boundaries.
    dist2_pred = (x - 130)**2 + (y - 126)**2
    y_pred = np.zeros((height, width), dtype=np.uint8)
    y_pred[dist2_pred <= 48**2] = 2
    y_pred[dist2_pred <= 28**2] = 1
    y_pred[dist2_pred <= 14**2] = 3

    gt_path = os.path.join(target_dir, "segmentation_multiclass_ground_truth.npy")
    pred_path = os.path.join(target_dir, "segmentation_multiclass_prediction.npy")

    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f"  - Shape: {y_true.shape}")
    print("  - Classes: 0=background, 1=tumor core, 2=edema, 3=enhancing")
    return gt_path, pred_path


def generate_detection_data(output_dir=None):
    """Generate object detection example data (e.g., lung nodule detection).

    Simulates 50 images. For each image it draws 0-4 square ground-truth
    boxes, "detects" each with 80% probability (jittering every coordinate by
    an integer in [-8, 7] and assigning a score in [0.5, 0.98]), then adds
    0-2 false-positive boxes with scores in [0.3, 0.7]. All values come from
    NumPy's global random state. The result is written as JSON with keys
    ``predictions`` (per image: list of ``{"box", "score"}``),
    ``ground_truths`` (per image: list of boxes) and ``metadata``. Boxes are
    ``[x1, y1, x2, y2]`` integer lists.

    Args:
        output_dir: Directory to write ``detection_example.json`` into.
            Defaults to the module-level ``OUTPUT_DIR``. Created if missing.

    Returns:
        Path of the JSON file written.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    n_images = 50

    predictions = []
    ground_truths = []

    for _ in range(n_images):
        # Random number of ground truth nodules (0-4 per image). Repeated
        # values in the candidate list simply pool their probabilities.
        n_nodules = np.random.choice([0, 1, 1, 2, 2, 3, 4], p=[0.1, 0.25, 0.25, 0.2, 0.1, 0.07, 0.03])

        img_gt = []
        img_pred = []

        for _ in range(n_nodules):
            # Generate ground truth box (square, side 20-79 px)
            x1 = int(np.random.randint(50, 400))
            y1 = int(np.random.randint(50, 400))
            size = int(np.random.randint(20, 80))
            gt_box = [x1, y1, x1 + size, y1 + size]
            img_gt.append(gt_box)

            # 80% chance of detecting this nodule
            if np.random.random() < 0.8:
                # Add some localization error; the top-left corner is
                # clamped so it never goes negative.
                offset = np.random.randint(-8, 8, 4)
                pred_box = [
                    int(max(0, gt_box[0] + offset[0])),
                    int(max(0, gt_box[1] + offset[1])),
                    int(gt_box[2] + offset[2]),
                    int(gt_box[3] + offset[3]),
                ]
                score = float(np.random.uniform(0.5, 0.98))
                img_pred.append({"box": pred_box, "score": round(score, 3)})

        # Add some false positives (0-2 per image)
        n_fp = np.random.choice([0, 0, 1, 1, 2], p=[0.4, 0.2, 0.2, 0.15, 0.05])
        for _ in range(n_fp):
            x1 = int(np.random.randint(50, 400))
            y1 = int(np.random.randint(50, 400))
            size = int(np.random.randint(15, 50))
            fp_box = [x1, y1, x1 + size, y1 + size]
            score = float(np.random.uniform(0.3, 0.7))  # Lower confidence for FPs
            img_pred.append({"box": fp_box, "score": round(score, 3)})

        predictions.append(img_pred)
        ground_truths.append(img_gt)

    data = {
        "predictions": predictions,
        "ground_truths": ground_truths,
        "metadata": {
            "description": "Simulated lung nodule detection data",
            "n_images": n_images,
            "box_format": "[x1, y1, x2, y2]",
        },
    }

    filepath = os.path.join(target_dir, "detection_example.json")
    # Explicit encoding keeps the file independent of the platform locale.
    with open(filepath, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    total_gt = sum(len(gt) for gt in ground_truths)
    total_pred = sum(len(pred) for pred in predictions)

    print(f"Created: {filepath}")
    print(f"  - Images: {n_images}")
    print(f"  - Total ground truth boxes: {total_gt}")
    print(f"  - Total predictions: {total_pred}")
    return filepath


def main():
    """Run every example-data generator in sequence with console banners.

    Prints a header, then for each task type prints its numbered heading and
    a rule, invokes the generator, and prints a blank line. Finishes with a
    footer that reports the output directory.
    """
    banner = "=" * 60
    rule = "-" * 40

    # (heading, generator) pairs, executed in the order listed.
    steps = (
        ("1. Binary Classification Data", generate_classification_data),
        ("2. Regression Data", generate_regression_data),
        ("3. 2D Segmentation Data", generate_segmentation_data_2d),
        ("4. 3D Segmentation Data", generate_segmentation_data_3d),
        ("5. Multi-class Segmentation Data", generate_multiclass_segmentation_data),
        ("6. Object Detection Data", generate_detection_data),
    )

    print(banner)
    print("Generating example data files for Omnibin")
    print(banner)
    print()

    for heading, run_generator in steps:
        print(heading)
        print(rule)
        run_generator()
        print()

    print(banner)
    print("All example data files generated successfully!")
    print(f"Location: {OUTPUT_DIR}")
    print(banner)


# Generate the files only when executed as a script. Merely importing this
# module does not call main(), although the module-level seeding and
# output-directory creation above still run on import.
if __name__ == "__main__":
    main()