Preetham22 committed on
Commit
9d9cc25
·
1 Parent(s): d936f35

script for generating test data

Browse files
Files changed (1) hide show
  1. src/generate_test_csv.py +149 -0
src/generate_test_csv.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import csv
3
+ import string
4
+ from pathlib import Path
5
+
6
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# How many images are sampled for every label.
SAMPLES_PER_CLASS = 10

# All paths are anchored on this file's location: the project root is the
# parent of the directory that contains this script.
CURRENT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = CURRENT_DIR.parent
IMAGE_DIR = PROJECT_ROOT.joinpath("data", "images")
TRAIN_CSV_PATH = PROJECT_ROOT.joinpath("data", "emr_records.csv")
OUTPUT_CSV = PROJECT_ROOT.joinpath("data", "test_samples.csv")

# Class labels; each one is also the name of a sub-directory of IMAGE_DIR.
LABELS = ["COVID", "NORMAL", "VIRAL PNEUMONIA"]

# ---------------------------------------------------------------------------
# Sentence pools used to assemble the synthetic EMR text
# ---------------------------------------------------------------------------

# Symptom descriptions (one is picked per record).
alt_symptoms = [
    "The patient has noted intermittent chest pressure and occasional shortness of breath.",
    "A gradual onset of dry cough with mild respiratory discomfort has been documented.",
    "Reported complaints include mild fatigue and sporadic episodes of wheezing.",
    "Mild respiratory symptoms have progressed over several days.",
    "Episodes of throat irritation and general malaise observed.",
]

# Deliberately non-committal diagnostic statements (one is picked per record).
alt_diagnosis = [
    "Clinical features are suggestive of a nonspecific viral etiology.",
    "Diagnosis remains unclear pending further laboratory confirmation.",
    "Preliminary indicators fall into a diagnostic grey area.",
    "No definitive pattern observed; further evaluation is warranted.",
    "Presentation overlaps multiple pulmonary conditions.",
]

# Label-neutral filler observations that may be mixed into a record.
alt_noise = [
    "Patient remains oriented with stable hemodynamics.",
    "No remarkable family history or chronic illness reported.",
    "Nutritional intake and sleep patterns appear adequate.",
    "No prior admissions or surgical history disclosed.",
    "Standard precautions have been advised post-evaluation.",
]
38
+
39
+
40
def random_token():
    """Return a pseudo-random patient identifier of the form ``TEST-AB12``.

    The identifier is the fixed prefix ``TEST-`` followed by two uppercase
    ASCII letters and two decimal digits.
    """
    # Letters are drawn before digits so the RNG is consumed in a fixed order.
    alpha_part = random.choices(string.ascii_uppercase, k=2)
    numeric_part = random.choices(string.digits, k=2)
    return "TEST-" + "".join(alpha_part + numeric_part)
45
+
46
+
47
def get_oxygen(label):
    """Return a random integer oxygen-saturation percentage for ``label``.

    Ranges (inclusive): ``"NORMAL"`` 94-100, ``"VIRAL PNEUMONIA"`` 90-96,
    any other label 87-94.
    """
    if label == "NORMAL":
        low, high = 94, 100
    elif label == "VIRAL PNEUMONIA":
        low, high = 90, 96
    else:
        low, high = 87, 94
    return random.randint(low, high)
54
+
55
+
56
def get_temp(label):
    """Return a random body temperature for ``label``, rounded to 0.1.

    ``"NORMAL"`` draws uniformly from 97.5-99.0; every other label draws
    uniformly from 98.8-102.5.
    """
    low, high = (97.5, 99.0) if label == "NORMAL" else (98.8, 102.5)
    reading = random.uniform(low, high)
    return round(reading, 1)
61
+
62
+
63
def get_age():
    """Return a random integer age between 18 and 85, both inclusive."""
    youngest, oldest = 18, 85
    return random.randint(youngest, oldest)
65
+
66
+
67
def get_days():
    """Return a random integer symptom duration between 1 and 10, inclusive."""
    shortest, longest = 1, 10
    return random.randint(shortest, longest)
69
+
70
+
71
def build_alt_emr(label):
    """Build one synthetic EMR paragraph for the given class label.

    The text always opens with an introductory sentence (patient token, age,
    symptom duration). It is followed, in random order, by:

    * one sentence from ``alt_symptoms``,
    * a vitals sentence (temperature and oxygen saturation for ``label``),
    * one sentence from ``alt_diagnosis``,
    * with probability 0.3, a label-specific hint sentence (only for the
      labels "COVID", "VIRAL PNEUMONIA" and "NORMAL"),
    * zero, one or two sentences from ``alt_noise``.

    Args:
        label: Class label; passed to ``get_temp`` and ``get_oxygen`` and
            used to select the optional hint sentence.

    Returns:
        The sentences joined by single spaces.
    """
    pid = random_token()
    age = f"{get_age()} years old"
    days = get_days()
    temp = get_temp(label)
    oxygen = get_oxygen(label)

    sent_intro = f"Patient {pid}, a {age} individual presented after experiencing symptoms for approximately {days} days."
    sent_vitals = f"Vital measurements include a body temperature of {temp}°F and an oxygen saturation level of {oxygen}%."

    body = [
        sent_intro,
        random.choice(alt_symptoms),
        sent_vitals,
        random.choice(alt_diagnosis),
    ]

    if random.random() < 0.3:
        if label == "COVID":
            body.append("Anosmia has been intermittently observed over recent days.")
        elif label == "VIRAL PNEUMONIA":
            body.append("Radiographic evidence reveals dispersed infiltrative patterns.")
        elif label == "NORMAL":
            body.append("There are currently no active complaints from the patient.")

    # Inject 0-2 neutral clinical observations (each draw is independent).
    # Positions start at 1 so the introduction always stays first.
    if random.random() < 0.9:
        body.insert(random.randint(1, len(body)), random.choice(alt_noise))
    if random.random() < 0.5:
        body.insert(random.randint(1, len(body)), random.choice(alt_noise))

    # random.shuffle works in place, so shuffling the expression ``body[1:]``
    # would only reorder a throwaway copy. Shuffle a named list and rebuild
    # the text with the introduction kept as the first sentence.
    remainder = body[1:]
    random.shuffle(remainder)
    return " ".join([body[0], *remainder])
104
+
105
+
106
def get_training_image_set():
    """Return the set of image paths listed in the training CSV.

    Reads the ``image_path`` column of ``TRAIN_CSV_PATH`` and strips
    surrounding whitespace from every value.

    Raises:
        FileNotFoundError: If ``TRAIN_CSV_PATH`` does not exist.
    """
    training_csv_missing = not TRAIN_CSV_PATH.exists()
    if training_csv_missing:
        raise FileNotFoundError(f"Training CSV not found at {TRAIN_CSV_PATH}")

    with TRAIN_CSV_PATH.open(newline="") as handle:
        return {entry["image_path"].strip() for entry in csv.DictReader(handle)}
112
+
113
+
114
def generate_test_csv():
    """Write a shuffled test CSV of images that are not in the training set.

    For every label in ``LABELS`` the function lists the ``.png``/``.jpg``/
    ``.jpeg`` files in ``IMAGE_DIR / label``, drops those whose path
    (relative to ``PROJECT_ROOT``) appears in the training CSV, and randomly
    samples ``SAMPLES_PER_CLASS`` of the rest. Each sampled image is paired
    with a freshly generated EMR text. The rows are shuffled and written to
    ``OUTPUT_CSV`` with the header ``text, image_path, label``.

    Raises:
        FileNotFoundError: If the training CSV is missing (raised by
            ``get_training_image_set``).
        ValueError: If a label directory holds fewer than
            ``SAMPLES_PER_CLASS`` unseen images.
    """
    training_images = get_training_image_set()
    image_suffixes = {".png", ".jpg", ".jpeg"}
    records = []

    for label in LABELS:
        label_dir = IMAGE_DIR / label
        # Sorted so the candidate list does not depend on directory order.
        image_files = sorted(
            f for f in label_dir.glob("*") if f.suffix.lower() in image_suffixes
        )
        unseen_images = [
            f for f in image_files
            if str(f.relative_to(PROJECT_ROOT)) not in training_images
        ]

        if len(unseen_images) < SAMPLES_PER_CLASS:
            # Note the trailing space: the two literals are concatenated.
            raise ValueError(
                f"Not enough unseen images in {label_dir}. "
                f"Needed {SAMPLES_PER_CLASS}, found {len(unseen_images)}"
            )
        sampled_images = random.sample(unseen_images, SAMPLES_PER_CLASS)

        for img_path in sampled_images:
            relative_path = str(img_path.relative_to(PROJECT_ROOT))
            text = build_alt_emr(label)
            records.append([text, relative_path, label])

    random.shuffle(records)
    # The generated text contains the non-ASCII "°" character, so the
    # encoding is pinned instead of relying on the platform default.
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "image_path", "label"])
        writer.writerows(records)

    print(f"✅ test CSV file generated: {OUTPUT_CSV}")
    print(f"📦 Total samples: {len(records)} ({SAMPLES_PER_CLASS} per class)")


# Generate the CSV only when this file is executed as a script.
if __name__ == "__main__":
    generate_test_csv()