Preetham22 committed on
Commit
376c77f
·
1 Parent(s): c6cd3a4

Made directory changes after finding the optimal data generation script

Browse files
{assets → experiments/Training_curves_iterations}/model_training_curve_fuzzytext.png RENAMED
File without changes
{assets → experiments/Training_curves_iterations}/model_training_curve_image.png RENAMED
File without changes
{assets → experiments/Training_curves_iterations}/model_training_curve_multimodal.png RENAMED
File without changes
experiments/Training_curves_iterations/model_training_curve_richfuzzytext.png ADDED

Git LFS Details

  • SHA256: bc50d9088d15142d92ae729d4e584d307fcd0bebf22dbfb3e28d3fd161fde6d7
  • Pointer size: 130 Bytes
  • Size of remote file: 20.5 kB
experiments/Training_curves_iterations/model_training_curve_softlabelstext.png ADDED

Git LFS Details

  • SHA256: 8e27b194bc0d22b690bec961c9e5f3cc4f41d32e504790462e3f6340a3a2b8ba
  • Pointer size: 130 Bytes
  • Size of remote file: 31.4 kB
{assets → experiments/Training_curves_iterations}/model_training_curve_text.png RENAMED
File without changes
experiments/csv_file_generator_iterations/generate_emr_csv_final.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import csv
3
+ import string
4
+ from pathlib import Path
5
+
6
+ # Paths
7
+ CURRENT_DIR = Path(__file__).resolve().parent
8
+ IMAGES_DIR = CURRENT_DIR.parent / "data" / "images"
9
+ OUTPUT_FILE = CURRENT_DIR.parent / "data" / "emr_records_softlabels.csv"
10
+
11
+ # Label to triage
12
+ triage_map = {"COVID": "high", "NORMAL": "low", "VIRAL PNEUMONIA": "medium"}
13
+ SAMPLES_PER_CLASS = 300
14
+
15
+ # Folders
16
+ categories = {
17
+ "COVID": IMAGES_DIR / "COVID",
18
+ "NORMAL": IMAGES_DIR / "NORMAL",
19
+ "VIRAL PNEUMONIA": IMAGES_DIR / "VIRAL PNEUMONIA"
20
+ }
21
+
22
+ # Shared ambiguous templates
23
+ shared_symptoms = [
24
+ "Mild cough and slight fever reported.",
25
+ "General fatigue and throat irritation present.",
26
+ "Breathing mildly labored during physical exertion.",
27
+ "No major respiratory distress; mild wheezing noted.",
28
+ "Occasional chest tightness reported.",
29
+ "Vital signs mostly stable; slight variation in temperature.",
30
+ ]
31
+
32
+ # Overlapping diagnosis clues
33
+ shared_diagnosis = [
34
+ "Symptoms could relate to a range of viral infections.",
35
+ "Presentation not distinctly matching any single infection.",
36
+ "Further tests required to confirm diagnosis.",
37
+ "Findings are borderline; clinical judgment advised.",
38
+ "Observation warranted due to overlapping signs.",
39
+ "Initial assessment inconclusive."
40
+ ]
41
+
42
+ # Noise sentences
43
+ neutral_noise = [
44
+ "Patient is cooperative and alert.",
45
+ "Dietary habits unremarkable.",
46
+ "Hydration status normal.",
47
+ "Follow-up advised if symptoms persist.",
48
+ "No notable family medical history.",
49
+ "No medications currently administered.",
50
+ ]
51
+
52
def random_token():
    """Return a random patient token of the form ``ID-<2 uppercase letters><2 digits>``."""
    # Letters are drawn before digits so the RNG is consumed in a fixed order.
    letter_part = random.choices(string.ascii_uppercase, k=2)
    digit_part = random.choices(string.digits, k=2)
    suffix = "".join(letter_part) + "".join(digit_part)
    return "-".join(("ID", suffix))
57
+
58
def get_oxygen(label):
    """Return a random integer SPO2 percentage for the given class label.

    The per-class ranges overlap on purpose (soft blur across classes):
    "NORMAL" draws from 94-100, "VIRAL PNEUMONIA" from 90-96, and every
    other label from 87-94 (both ends inclusive).
    """
    spo2_bounds = {
        "NORMAL": (94, 100),
        "VIRAL PNEUMONIA": (90, 96),
    }
    # Any label not listed above falls back to the lowest range.
    lowest, highest = spo2_bounds.get(label, (87, 94))
    return random.randint(lowest, highest)
66
+
67
def get_temp(label):
    """Return a random temperature, rounded to one decimal place.

    "NORMAL" draws uniformly from 97.5-99.0; every other label draws from
    98.8-102.5, so the two ranges overlap slightly.
    """
    low, high = (97.5, 99.0) if label == "NORMAL" else (98.8, 102.5)
    reading = random.uniform(low, high)
    return round(reading, 1)
72
+
73
def get_age():
    """Return a random integer age between 18 and 85, both inclusive."""
    # randrange excludes its stop value, so 86 keeps 85 reachable.
    return random.randrange(18, 86)
75
+
76
def get_days():
    """Return a random symptom duration in days, between 1 and 10 inclusive."""
    # randrange excludes its stop value, so 11 keeps 10 reachable.
    return random.randrange(1, 11)
78
+
79
def build_emr(label, i):
    """Compose one synthetic EMR note for the given class label.

    The note starts with a patient intro, a shared symptom sentence, the
    vitals line and a shared diagnosis sentence.  With probability 0.3 a
    class-specific clue is appended, and zero to two neutral noise sentences
    are inserted at random positions after the intro.  Finally every sentence
    except the intro is shuffled.

    Args:
        label: Class name. It is passed to get_temp/get_oxygen, and the values
            "COVID", "VIRAL PNEUMONIA" and "NORMAL" select the optional clue.
        i: Sample index. Not used by this function; kept so existing callers
            keep working.

    Returns:
        The sentences joined with single spaces.
    """
    pid = random_token()
    age = f"{get_age()}-year-old"
    days = get_days()
    temp = get_temp(label)
    oxygen = get_oxygen(label)

    intro = f"Patient {pid}, a {age}, reports symptoms for {days} days."
    vitals = f"Temperature recorded at {temp}°F and SPO2 at {oxygen}%."

    # Shared symptoms + blurred logic
    body = [
        intro,
        random.choice(shared_symptoms),
        vitals,
        random.choice(shared_diagnosis),
    ]

    # Optionally inject a mild class-specific clue (with low probability)
    class_clues = {
        "COVID": "Patient reports recent loss of taste.",
        "VIRAL PNEUMONIA": "Chest X-ray shows scattered infiltrates.",
        "NORMAL": "No active complaints at this time.",
    }
    if random.random() < 0.3:
        clue = class_clues.get(label)
        if clue is not None:
            body.append(clue)

    # Inject 0-2 noise sentences; index >= 1 keeps them after the intro.
    if random.random() < 0.8:
        body.insert(random.randint(1, len(body)), random.choice(neutral_noise))
    if random.random() < 0.5:
        body.insert(random.randint(1, len(body)), random.choice(neutral_noise))

    # random.shuffle() works in place, so shuffling the temporary copy made by
    # body[1:] had no effect. Shuffle the tail explicitly and write it back so
    # the intro stays in position 0.
    tail = body[1:]
    random.shuffle(tail)
    body[1:] = tail
    return " ".join(body)
114
+
115
# Generate records: SAMPLES_PER_CLASS rows per category, each pairing a
# randomly chosen image with a freshly generated EMR text.
records = []
for label, img_dir in categories.items():
    image_files = sorted(
        f for f in img_dir.glob("*") if f.suffix.lower() in {".png", ".jpg", ".jpeg"}
    )
    if not image_files:
        # random.choice() on an empty list would raise an opaque IndexError;
        # fail with the directory name instead.
        raise FileNotFoundError(f"No .png/.jpg/.jpeg images found in {img_dir}")
    triage = triage_map[label]
    for i in range(SAMPLES_PER_CLASS):
        # Stored path is relative to the directory two levels above IMAGES_DIR.
        image_path = str(random.choice(image_files).relative_to(IMAGES_DIR.parent.parent))
        text = build_emr(label, i)
        records.append([f"{label}-{i+1}", image_path, text, triage])

# Shuffle + write
random.shuffle(records)
# The EMR text contains non-ASCII characters ("°"), so pin the encoding rather
# than relying on the platform default.
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["patient_id", "image_path", "emr_text", "triage_level"])
    writer.writerows(records)

print(f"✅ Softlabel EMR dataset generated at {OUTPUT_FILE}")
experiments/{generate_emr_csv_fuzzy.py → csv_file_generator_iterations/generate_emr_csv_v1.py} RENAMED
@@ -5,10 +5,10 @@ from pathlib import Path
5
  # Setup paths
6
  CURRENT_DIR = Path(__file__).resolve().parent
7
  IMAGES_DIR = CURRENT_DIR.parent / "data" / "images"
8
- OUTPUT_FILE = CURRENT_DIR.parent / "data" / "emr_records_fuzzy.csv"
9
 
10
  # Sample size
11
- SAMPLES_PER_CLASS = 300 # 900 total
12
 
13
  # Categories and labels
14
  categories = {
@@ -17,13 +17,14 @@ categories = {
17
  "VIRAL PNEUMONIA": IMAGES_DIR / "VIRAL PNEUMONIA"
18
  }
19
 
 
20
  triage_map = {
21
  "COVID": "high",
22
  "NORMAL": "low",
23
  "VIRAL PNEUMONIA": "medium"
24
  }
25
 
26
- # --- Shared & ambiguous templates ---
27
  noise_sentences = [
28
  "Follow-up scheduled for next week.",
29
  "Patient advised to maintain hydration and rest.",
@@ -31,17 +32,21 @@ noise_sentences = [
31
  "Patient remains alert and oriented.",
32
  "Vitals are within acceptable ranges.",
33
  "No complications noted during assessment.",
34
- "Doctor recommends continued observation.",
35
  "Patient has no known drug allergies.",
 
36
  "Supportive care was initiated.",
37
  "Patient advised to avoid strenuous activity.",
 
 
38
  "Mild discomfort reported with no severe symptoms.",
39
  "Symptoms are self-limiting according to patient.",
 
40
  "No medication administered at this stage.",
41
- "Doctor recommends home rest and observation.",
42
  "Evaluation ongoing for possible infection."
43
  ]
44
 
 
45
  ambiguous_templates = [
46
  "Mild fever noted. No cough. Patient recently traveled.",
47
  "Normal oxygen levels observed. Slight wheeze on auscultation.",
@@ -50,45 +55,62 @@ ambiguous_templates = [
50
  "Slight fatigue without other systemic symptoms."
51
  ]
52
 
53
- # --- Vitals ---
54
  def get_oxygen(label):
55
- ranges = {
56
  "COVID": (85, 94),
57
  "VIRAL PNEUMONIA": (88, 95),
58
  "NORMAL": (96, 99)
59
  }
60
- low, high = ranges[label]
61
- oxygen = random.randint(low - 1, high + 1)
 
62
  return min(100, max(80, oxygen))
63
 
64
  def get_temp(label):
65
  if label == "NORMAL":
66
- temp = random.uniform(96.5, 99.0)
67
  else:
68
- temp = random.uniform(98.5, 104.0)
69
- return round(temp, 1)
 
 
 
 
 
 
70
 
71
- def get_days(): return random.randint(1, 14)
72
- def get_age(): return random.randint(18, 85)
73
 
74
- # --- Build EMR ---
75
  def build_emr(label, i):
76
  name = f"Patient-{label}-{i+1}"
77
  age = f"{get_age()}-year-old"
78
  days = get_days()
79
  temp = get_temp(label)
80
  oxygen = get_oxygen(label)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- # Shared symptoms across labels
83
- shared_symptoms = [
84
- f"{name} ({age}) reports dry cough and fatigue for {days} days.",
85
- f"{name} reports breathlessness. Temp recorded as {temp}°F.",
86
- f"{name} is experiencing low oxygen levels at {oxygen}%.",
87
- f"{name} complains of throat irritation and tiredness.",
88
- f"{name} has fever, but vitals are otherwise stable."
89
- ]
90
-
91
- # Label-specific diagnosis
92
  diagnosis = {
93
  "COVID": [
94
  "Findings suggest viral respiratory infection.",
@@ -97,48 +119,51 @@ def build_emr(label, i):
97
  ],
98
  "NORMAL": [
99
  "No signs of respiratory infection.",
100
- "Checkup results within normal limits.",
101
- "No abnormal findings detected."
102
  ],
103
  "VIRAL PNEUMONIA": [
104
  "X-ray shows patchy infiltrates.",
105
- "Clinical signs indicate viral pneumonia.",
106
- "Suspected viral origin of symptoms."
107
  ]
108
  }
109
 
110
- # Build full body
111
- emr = [random.choice(shared_symptoms), random.choice(diagnosis[label])]
112
 
113
- # Add ambiguity (~60%)
114
- if random.random() < 0.6:
115
- emr.insert(random.randint(0, len(emr)), random.choice(ambiguous_templates))
116
 
117
- # Add noise (~90%)
118
  if random.random() < 0.9:
119
- for _ in range(random.randint(1, 2)):
120
- emr.insert(random.randint(0, len(emr)), random.choice(noise_sentences))
121
-
122
- random.shuffle(emr)
123
- return " ".join(emr)
124
 
125
- # --- Generate records ---
126
  records = []
127
  for label, img_dir in categories.items():
128
- files = sorted([f for f in img_dir.glob("*") if f.suffix.lower() in [".png", ".jpg", ".jpeg"]])
 
 
 
129
  for i in range(SAMPLES_PER_CLASS):
130
- image_path = str(random.choice(files).relative_to(IMAGES_DIR.parent.parent))
 
131
  emr_text = build_emr(label, i)
132
- triage = triage_map[label]
133
- pid = f"{label}-{i+1}"
134
- records.append([pid, image_path, emr_text, triage])
135
 
136
  random.shuffle(records)
137
 
138
- # --- Save to CSV ---
139
  with open(OUTPUT_FILE, "w", newline="") as f:
140
  writer = csv.writer(f)
141
  writer.writerow(["patient_id", "image_path", "emr_text", "triage_level"])
142
  writer.writerows(records)
143
 
144
- print(f"✅ Regenerated {len(records)} fuzzy EMR records at: {OUTPUT_FILE}")
 
5
  # Setup paths
6
  CURRENT_DIR = Path(__file__).resolve().parent
7
  IMAGES_DIR = CURRENT_DIR.parent / "data" / "images"
8
+ OUTPUT_FILE = CURRENT_DIR.parent / "data" / "emr_records_extended.csv"
9
 
10
  # Sample size
11
+ SAMPLES_PER_CLASS = 300 # 300 * 3 = 900 total
12
 
13
  # Categories and labels
14
  categories = {
 
17
  "VIRAL PNEUMONIA": IMAGES_DIR / "VIRAL PNEUMONIA"
18
  }
19
 
20
+ # Triage mapping
21
  triage_map = {
22
  "COVID": "high",
23
  "NORMAL": "low",
24
  "VIRAL PNEUMONIA": "medium"
25
  }
26
 
27
+ # --- Noise Sentences ---
28
  noise_sentences = [
29
  "Follow-up scheduled for next week.",
30
  "Patient advised to maintain hydration and rest.",
 
32
  "Patient remains alert and oriented.",
33
  "Vitals are within acceptable ranges.",
34
  "No complications noted during assessment.",
 
35
  "Patient has no known drug allergies.",
36
+ "Doctor recommends continued observation.",
37
  "Supportive care was initiated.",
38
  "Patient advised to avoid strenuous activity.",
39
+ "No complications noted during assessment",
40
+ "No prior history of respiratory illness.",
41
  "Mild discomfort reported with no severe symptoms.",
42
  "Symptoms are self-limiting according to patient.",
43
+ "Patient remains alert and cooperative.",
44
  "No medication administered at this stage.",
45
+ "Doctor recommends home resr and observation.",
46
  "Evaluation ongoing for possible infection."
47
  ]
48
 
49
+ # --- ambiguity sentences ---
50
  ambiguous_templates = [
51
  "Mild fever noted. No cough. Patient recently traveled.",
52
  "Normal oxygen levels observed. Slight wheeze on auscultation.",
 
55
  "Slight fatigue without other systemic symptoms."
56
  ]
57
 
58
+ # --- Vitals & Symptoms ---
59
  def get_oxygen(label):
60
+ base_ranges = {
61
  "COVID": (85, 94),
62
  "VIRAL PNEUMONIA": (88, 95),
63
  "NORMAL": (96, 99)
64
  }
65
+ base_min, base_max = base_ranges[label]
66
+ # Apply + or - 1 blur, clamping between 80 and 100
67
+ oxygen = random.randint(base_min - 1, base_max + 1)
68
  return min(100, max(80, oxygen))
69
 
70
  def get_temp(label):
71
  if label == "NORMAL":
72
+ base_min, base_max = 97.0, 98.6
73
  else:
74
+ base_min, base_max = 99.0, 103.5
75
+
76
+ # Apply + or - 0.5°F blur and clamp between 95-105°F
77
+ temp = random.uniform(base_min - 0.5, base_max + 0.5)
78
+ return round(min(105.0, max(95.0, temp)), 1)
79
+
80
+ def get_days():
81
+ return random.randint(1, 14)
82
 
83
+ def get_age():
84
+ return random.randint(18, 80)
85
 
86
+ # --- Templates ---
87
  def build_emr(label, i):
88
  name = f"Patient-{label}-{i+1}"
89
  age = f"{get_age()}-year-old"
90
  days = get_days()
91
  temp = get_temp(label)
92
  oxygen = get_oxygen(label)
93
+
94
+ # Symptoms Pool
95
+ symptoms = {
96
+ "COVID": [
97
+ f"{name} ({age}) reports fatigue and dry cough for {days} days.",
98
+ f"{name} complains of shortness of breath and fever of {temp}°F.",
99
+ f"{name} reports loss of taste. SPO2 at {oxygen}%.",
100
+ ],
101
+ "NORMAL": [
102
+ f"{name} ({age}) presents for routine check-up. Vitals stable.",
103
+ f"{name} shows no respiratory distress. Oxygen at {oxygen}%.",
104
+ f"{name} denies any recent illness. Temperature is {temp}°F.",
105
+ ],
106
+ "VIRAL PNEUMONIA": [
107
+ f"{name} ({age}) complains of dry cough for {days} days.",
108
+ f"{name} experiencing low-grade fever and SPO2 at {oxygen}%.",
109
+ f"{name} reports breathlessness. X-ray indicates mild infiltrates.",
110
+ ]
111
+ }
112
 
113
+ # Diagnosis Observations
 
 
 
 
 
 
 
 
 
114
  diagnosis = {
115
  "COVID": [
116
  "Findings suggest viral respiratory infection.",
 
119
  ],
120
  "NORMAL": [
121
  "No signs of respiratory infection.",
122
+ "No abnormal findings detected.",
123
+ "Checkup results within normal limits."
124
  ],
125
  "VIRAL PNEUMONIA": [
126
  "X-ray shows patchy infiltrates.",
127
+ "Suspected viral origin of symptoms.",
128
+ "Clinical signs indicate viral pneumonia."
129
  ]
130
  }
131
 
132
+ # Construct sentence pool
133
+ body = [random.choice(symptoms[label]), random.choice(diagnosis[label])]
134
 
135
+ # adding ambiguous cases randomly (~70% of cases)
136
+ if random.random() < 0.7:
137
+ body.insert(random.randint(0, len(body)), random.choice(ambiguous_templates))
138
 
139
+ # adding noise to 90% of cases
140
  if random.random() < 0.9:
141
+ for _ in range(random.randint(1,2)):
142
+ body.insert(random.randint(0, len(body)), random.choice(noise_sentences))
143
+
144
+ random.shuffle(body)
145
+ return " ".join(body)
146
 
147
+ # Generate dataset
148
  records = []
149
  for label, img_dir in categories.items():
150
+ valid_exts = [".png", ".jpg", ".jpeg"]
151
+ image_files = sorted(
152
+ [f for f in img_dir.glob("*") if f.suffix.lower() in valid_exts]
153
+ )
154
  for i in range(SAMPLES_PER_CLASS):
155
+ patient_id = f"{label}-{i+1}"
156
+ image_path = str(random.choice(image_files).relative_to(IMAGES_DIR.parent.parent))
157
  emr_text = build_emr(label, i)
158
+ triage_level = triage_map[label]
159
+ records.append([patient_id, image_path, emr_text, triage_level])
 
160
 
161
  random.shuffle(records)
162
 
163
+ # Save to CSV
164
  with open(OUTPUT_FILE, "w", newline="") as f:
165
  writer = csv.writer(f)
166
  writer.writerow(["patient_id", "image_path", "emr_text", "triage_level"])
167
  writer.writerows(records)
168
 
169
+ print(f"✅Generated {len(records)} EMR records in {OUTPUT_FILE}")
experiments/csv_file_generator_iterations/generate_emr_csv_v2.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import csv
3
+ import string
4
+ from pathlib import Path
5
+
6
+ # Paths
7
+ CURRENT_DIR = Path(__file__).resolve().parent
8
+ IMAGES_DIR = CURRENT_DIR.parent / "data" / "images"
9
+ OUTPUT_FILE = CURRENT_DIR.parent / "data" / "emr_records_richfuzzy.csv"
10
+
11
+ # Label to triage
12
+ triage_map = {"COVID": "high", "NORMAL": "low", "VIRAL PNEUMONIA": "medium"}
13
+ SAMPLES_PER_CLASS = 300
14
+
15
+ # Folders
16
+ categories = {
17
+ "COVID": IMAGES_DIR / "COVID",
18
+ "NORMAL": IMAGES_DIR / "NORMAL",
19
+ "VIRAL PNEUMONIA": IMAGES_DIR / "VIRAL PNEUMONIA"
20
+ }
21
+
22
+ # Shared ambiguous templates
23
+ ambiguous_phrases = [
24
+ "Slight throat irritation without systemic symptoms.",
25
+ "Mild dyspnea but normal vitals.",
26
+ "Minor dry cough reported, patient stable.",
27
+ "Chest X-ray inconclusive.",
28
+ "No recent exposure or travel noted.",
29
+ "Intermittent headache without fever.",
30
+ ]
31
+
32
+ # Noise sentences
33
+ neutral_noise = [
34
+ "Patient is cooperative and alert.",
35
+ "Dietary habits unremarkable.",
36
+ "Follow-up recommended if symptoms persist.",
37
+ "Hydration status is normal.",
38
+ "No family history of chronic illness.",
39
+ "Patient expresses concern about possible flu.",
40
+ ]
41
+
42
+ # --- Patient random token generator ---
43
def random_token():
    """Return a random patient token of the form ``ID-<2 uppercase letters><2 digits>``."""
    # Letters are drawn before digits so the RNG is consumed in a fixed order.
    letter_part = random.choices(string.ascii_uppercase, k=2)
    digit_part = random.choices(string.digits, k=2)
    suffix = "".join(letter_part) + "".join(digit_part)
    return "-".join(("ID", suffix))
48
+
49
+ # Vitals (blurred)
50
def get_oxygen(label):
    """Return a random integer SPO2 percentage for the given class label.

    Each class has a base range that is widened by one point on both sides
    before sampling; the sampled value is then clamped to 80-100.

    Raises:
        KeyError: If ``label`` is not one of the three known classes.
    """
    spo2_ranges = {
        "COVID": (85, 94),
        "VIRAL PNEUMONIA": (89, 96),
        "NORMAL": (96, 99),
    }
    floor, ceiling = spo2_ranges[label]
    reading = random.randint(floor - 1, ceiling + 1)
    # Clamp to the 80-100 window.
    if reading < 80:
        return 80
    if reading > 100:
        return 100
    return reading
54
+
55
def get_temp(label):
    """Return a random temperature, rounded to one decimal place.

    "NORMAL" uses a base range of 97.0-98.5 and every other label uses
    99.0-103.0; the base range is widened by 0.6 on each side before a
    uniform draw.
    """
    base_low, base_high = (97.0, 98.5) if label == "NORMAL" else (99.0, 103.0)
    reading = random.uniform(base_low - 0.6, base_high + 0.6)
    return round(reading, 1)
61
+
62
def get_age():
    """Return a random integer age between 18 and 85, both inclusive."""
    # randrange excludes its stop value, so 86 keeps 85 reachable.
    return random.randrange(18, 86)
63
def get_days():
    """Return a random symptom duration in days, between 1 and 10 inclusive."""
    # randrange excludes its stop value, so 11 keeps 10 reachable.
    return random.randrange(1, 11)
64
+
65
+ # EMR generator
66
def build_emr(label, i):
    """Compose one synthetic "rich fuzzy" EMR note for the given class label.

    The note starts with a general intro, a label-specific symptom sentence,
    the vitals line and a label-specific diagnosis sentence.  An ambiguous
    phrase is inserted with probability 0.8 and a neutral noise sentence with
    probability 0.7, each at a random position after the intro.  Finally
    every sentence except the intro is shuffled.

    Args:
        label: Class name; must be "COVID", "NORMAL" or "VIRAL PNEUMONIA"
            because it indexes the symptom and diagnosis tables.
        i: Sample index. Not used by this function; kept so existing callers
            keep working.

    Returns:
        The sentences joined with single spaces.

    Raises:
        KeyError: If ``label`` is not one of the three known classes.
    """
    patient_id = random_token()
    age = f"{get_age()}-year-old"
    oxygen = get_oxygen(label)
    temp = get_temp(label)
    days = get_days()

    general_intro = f"Patient {patient_id}, a {age}, presents with symptoms for {days} days."
    vitals = f"Temperature recorded at {temp}°F, SPO2 levels at {oxygen}%."

    # Label-specific (but fuzzy) symptoms
    symptoms = {
        "COVID": ["Complains of fatigue and shortness of breath.", "Dry cough with mild fever noted."],
        "NORMAL": ["No major complaints; here for general checkup.", "Reports good health, no active issues."],
        "VIRAL PNEUMONIA": ["Persistent cough and mild fever observed.", "Slight wheezing with chest tightness."],
    }

    diagnosis = {
        "COVID": ["Viral etiology suspected.", "COVID infection not ruled out."],
        "NORMAL": ["Unlikely presence of infection.", "Clinical impression is benign."],
        "VIRAL PNEUMONIA": ["Signs may indicate atypical pneumonia.", "Possible viral infection of lower tract."],
    }

    body = [
        general_intro,
        random.choice(symptoms[label]),
        vitals,
        random.choice(diagnosis[label]),
    ]

    # Inject 0-2 ambiguous or neutral sentences; index >= 1 keeps them after
    # the intro.
    if random.random() < 0.8:
        body.insert(random.randint(1, len(body)), random.choice(ambiguous_phrases))
    if random.random() < 0.7:
        body.insert(random.randint(1, len(body)), random.choice(neutral_noise))

    # random.shuffle() works in place, so shuffling the temporary copy made by
    # body[1:] had no effect. Shuffle the tail explicitly and write it back so
    # the intro stays in position 0.
    tail = body[1:]
    random.shuffle(tail)
    body[1:] = tail
    return " ".join(body)
104
+
105
# Generate records: SAMPLES_PER_CLASS rows per category, each pairing a
# randomly chosen image with a freshly generated EMR text.
records = []
for label, img_dir in categories.items():
    image_files = sorted(
        f for f in img_dir.glob("*") if f.suffix.lower() in {".png", ".jpg", ".jpeg"}
    )
    if not image_files:
        # random.choice() on an empty list would raise an opaque IndexError;
        # fail with the directory name instead.
        raise FileNotFoundError(f"No .png/.jpg/.jpeg images found in {img_dir}")
    triage = triage_map[label]
    for i in range(SAMPLES_PER_CLASS):
        # Stored path is relative to the directory two levels above IMAGES_DIR.
        image_path = str(random.choice(image_files).relative_to(IMAGES_DIR.parent.parent))
        text = build_emr(label, i)
        records.append([f"{label}-{i+1}", image_path, text, triage])

# Shuffle + Write
random.shuffle(records)
# The EMR text contains non-ASCII characters ("°"), so pin the encoding rather
# than relying on the platform default.
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["patient_id", "image_path", "emr_text", "triage_level"])
    writer.writerows(records)

print(f"✅ Rich fuzzy EMR dataset saved at {OUTPUT_FILE}")
experiments/csv_file_iterations/emr_records.csv ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ patient_id,image_path,emr_text,triage_level
2
+ COVID-1,data/images/COVID/COVID-1.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
3
+ COVID-2,data/images/COVID/COVID-10.png,"The patient reports loss of taste and smell, with a persistent cough.",high
4
+ COVID-3,data/images/COVID/COVID-100.png,"The patient reports loss of taste and smell, with a persistent cough.",high
5
+ COVID-4,data/images/COVID/COVID-1000.png,"The patient reports loss of taste and smell, with a persistent cough.",high
6
+ COVID-5,data/images/COVID/COVID-1001.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
7
+ COVID-6,data/images/COVID/COVID-1002.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
8
+ COVID-7,data/images/COVID/COVID-1003.png,Progressive difficulty in breathing. Oxygen saturation is below the normal range.,high
9
+ COVID-8,data/images/COVID/COVID-1004.png,Progressive difficulty in breathing. Oxygen saturation is below the normal range.,high
10
+ COVID-9,data/images/COVID/COVID-1005.png,Progressive difficulty in breathing. Oxygen saturation is below the normal range.,high
11
+ COVID-10,data/images/COVID/COVID-1006.png,"The patient reports loss of taste and smell, with a persistent cough.",high
12
+ COVID-11,data/images/COVID/COVID-1007.png,"The patient reports loss of taste and smell, with a persistent cough.",high
13
+ COVID-12,data/images/COVID/COVID-1008.png,"The patient reports loss of taste and smell, with a persistent cough.",high
14
+ COVID-13,data/images/COVID/COVID-1009.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
15
+ COVID-14,data/images/COVID/COVID-101.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
16
+ COVID-15,data/images/COVID/COVID-1010.png,"The patient reports loss of taste and smell, with a persistent cough.",high
17
+ COVID-16,data/images/COVID/COVID-1011.png,"The patient reports loss of taste and smell, with a persistent cough.",high
18
+ COVID-17,data/images/COVID/COVID-1012.png,"The patient reports loss of taste and smell, with a persistent cough.",high
19
+ COVID-18,data/images/COVID/COVID-1013.png,"The patient reports loss of taste and smell, with a persistent cough.",high
20
+ COVID-19,data/images/COVID/COVID-1014.png,"The patient reports loss of taste and smell, with a persistent cough.",high
21
+ COVID-20,data/images/COVID/COVID-1015.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
22
+ COVID-21,data/images/COVID/COVID-1016.png,"The patient reports loss of taste and smell, with a persistent cough.",high
23
+ COVID-22,data/images/COVID/COVID-1017.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
24
+ COVID-23,data/images/COVID/COVID-1018.png,Progressive difficulty in breathing. Oxygen saturation is below the normal range.,high
25
+ COVID-24,data/images/COVID/COVID-1019.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
26
+ COVID-25,data/images/COVID/COVID-102.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
27
+ COVID-26,data/images/COVID/COVID-1020.png,Progressive difficulty in breathing. Oxygen saturation is below the normal range.,high
28
+ COVID-27,data/images/COVID/COVID-1021.png,"The patient reports loss of taste and smell, with a persistent cough.",high
29
+ COVID-28,data/images/COVID/COVID-1022.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
30
+ COVID-29,data/images/COVID/COVID-1023.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
31
+ COVID-30,data/images/COVID/COVID-1024.png,"The patient presents with a dry cough, fever, and shortness of breath. Symptoms began 5 days ago.",high
32
+ NORMAL-1,data/images/NORMAL/Normal-1.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
33
+ NORMAL-2,data/images/NORMAL/Normal-10.png,Routine checkup with no abnormal findings. The patient denies cough or chest pain.,low
34
+ NORMAL-3,data/images/NORMAL/Normal-100.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
35
+ NORMAL-4,data/images/NORMAL/Normal-1000.png,No complaints. Normal vitals and physical exam.,low
36
+ NORMAL-5,data/images/NORMAL/Normal-10000.png,No complaints. Normal vitals and physical exam.,low
37
+ NORMAL-6,data/images/NORMAL/Normal-10001.png,Routine checkup with no abnormal findings. The patient denies cough or chest pain.,low
38
+ NORMAL-7,data/images/NORMAL/Normal-10002.png,Routine checkup with no abnormal findings. The patient denies cough or chest pain.,low
39
+ NORMAL-8,data/images/NORMAL/Normal-10003.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
40
+ NORMAL-9,data/images/NORMAL/Normal-10004.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
41
+ NORMAL-10,data/images/NORMAL/Normal-10005.png,No complaints. Normal vitals and physical exam.,low
42
+ NORMAL-11,data/images/NORMAL/Normal-10006.png,No complaints. Normal vitals and physical exam.,low
43
+ NORMAL-12,data/images/NORMAL/Normal-10007.png,No complaints. Normal vitals and physical exam.,low
44
+ NORMAL-13,data/images/NORMAL/Normal-10008.png,Routine checkup with no abnormal findings. The patient denies cough or chest pain.,low
45
+ NORMAL-14,data/images/NORMAL/Normal-10009.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
46
+ NORMAL-15,data/images/NORMAL/Normal-1001.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
47
+ NORMAL-16,data/images/NORMAL/Normal-10010.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
48
+ NORMAL-17,data/images/NORMAL/Normal-10011.png,Routine checkup with no abnormal findings. The patient denies cough or chest pain.,low
49
+ NORMAL-18,data/images/NORMAL/Normal-10012.png,Routine checkup with no abnormal findings. The patient denies cough or chest pain.,low
50
+ NORMAL-19,data/images/NORMAL/Normal-10013.png,No complaints. Normal vitals and physical exam.,low
51
+ NORMAL-20,data/images/NORMAL/Normal-10014.png,No complaints. Normal vitals and physical exam.,low
52
+ NORMAL-21,data/images/NORMAL/Normal-10015.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
53
+ NORMAL-22,data/images/NORMAL/Normal-10016.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
54
+ NORMAL-23,data/images/NORMAL/Normal-10017.png,No complaints. Normal vitals and physical exam.,low
55
+ NORMAL-24,data/images/NORMAL/Normal-10018.png,No complaints. Normal vitals and physical exam.,low
56
+ NORMAL-25,data/images/NORMAL/Normal-10019.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
57
+ NORMAL-26,data/images/NORMAL/Normal-1002.png,Routine checkup with no abnormal findings. The patient denies cough or chest pain.,low
58
+ NORMAL-27,data/images/NORMAL/Normal-10020.png,No complaints. Normal vitals and physical exam.,low
59
+ NORMAL-28,data/images/NORMAL/Normal-10021.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
60
+ NORMAL-29,data/images/NORMAL/Normal-10022.png,Clear lungs on auscultation. No signs of infection. Chest x-ray was unremarkable.,low
61
+ NORMAL-30,data/images/NORMAL/Normal-10023.png,No complaints. Normal vitals and physical exam.,low
62
+ VIRAL PNEUMONIA-1,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
63
+ VIRAL PNEUMONIA-2,data/images/VIRAL PNEUMONIA/Viral Pneumonia-10.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
64
+ VIRAL PNEUMONIA-3,data/images/VIRAL PNEUMONIA/Viral Pneumonia-100.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
65
+ VIRAL PNEUMONIA-4,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1000.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
66
+ VIRAL PNEUMONIA-5,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1001.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
67
+ VIRAL PNEUMONIA-6,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1002.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
68
+ VIRAL PNEUMONIA-7,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1003.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
69
+ VIRAL PNEUMONIA-8,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1004.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
70
+ VIRAL PNEUMONIA-9,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1005.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
71
+ VIRAL PNEUMONIA-10,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1006.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
72
+ VIRAL PNEUMONIA-11,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1007.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
73
+ VIRAL PNEUMONIA-12,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1008.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
74
+ VIRAL PNEUMONIA-13,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1009.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
75
+ VIRAL PNEUMONIA-14,data/images/VIRAL PNEUMONIA/Viral Pneumonia-101.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
76
+ VIRAL PNEUMONIA-15,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1010.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
77
+ VIRAL PNEUMONIA-16,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1011.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
78
+ VIRAL PNEUMONIA-17,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1012.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
79
+ VIRAL PNEUMONIA-18,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1013.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
80
+ VIRAL PNEUMONIA-19,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1014.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
81
+ VIRAL PNEUMONIA-20,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1015.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
82
+ VIRAL PNEUMONIA-21,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1016.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
83
+ VIRAL PNEUMONIA-22,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1017.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
84
+ VIRAL PNEUMONIA-23,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1018.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
85
+ VIRAL PNEUMONIA-24,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1019.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
86
+ VIRAL PNEUMONIA-25,data/images/VIRAL PNEUMONIA/Viral Pneumonia-102.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
87
+ VIRAL PNEUMONIA-26,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1020.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
88
+ VIRAL PNEUMONIA-27,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1021.png,Crackles are auscultated in the lower lobes. The patient presents with fatigue and mild respiratory distress.,medium
89
+ VIRAL PNEUMONIA-28,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1022.png,The X-ray shows patchy infiltrates in the lungs. The patient is recovering from a recent viral infection.,medium
90
+ VIRAL PNEUMONIA-29,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1023.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
91
+ VIRAL PNEUMONIA-30,data/images/VIRAL PNEUMONIA/Viral Pneumonia-1024.png,"Mild fever, chest tightness, and dry cough for the past 3 days. Oxygen levels are normal.",medium
experiments/csv_file_iterations/emr_records_extended.csv ADDED
The diff for this file is too large to render. See raw diff
 
experiments/csv_file_iterations/emr_records_fuzzy.csv ADDED
The diff for this file is too large to render. See raw diff
 
experiments/csv_file_iterations/emr_records_richfuzzy.csv ADDED
The diff for this file is too large to render. See raw diff
 
experiments/csv_file_iterations/emr_records_softlabels.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/generate_emr_csv.py CHANGED
@@ -1,169 +1,132 @@
1
  import random
2
  import csv
 
3
  from pathlib import Path
4
 
5
- # Setup paths
6
  CURRENT_DIR = Path(__file__).resolve().parent
7
  IMAGES_DIR = CURRENT_DIR.parent / "data" / "images"
8
- OUTPUT_FILE = CURRENT_DIR.parent / "data" / "emr_records_extended.csv"
9
 
10
- # Sample size
11
- SAMPLES_PER_CLASS = 300 # 300 * 3 = 900 total
 
12
 
13
- # Categories and labels
14
  categories = {
15
  "COVID": IMAGES_DIR / "COVID",
16
  "NORMAL": IMAGES_DIR / "NORMAL",
17
  "VIRAL PNEUMONIA": IMAGES_DIR / "VIRAL PNEUMONIA"
18
  }
19
 
20
- # Triage mapping
21
- triage_map = {
22
- "COVID": "high",
23
- "NORMAL": "low",
24
- "VIRAL PNEUMONIA": "medium"
25
- }
 
 
 
26
 
27
- # --- Noise Sentences ---
28
- noise_sentences = [
29
- "Follow-up scheduled for next week.",
30
- "Patient advised to maintain hydration and rest.",
31
- "No previous episodes of similar symptoms.",
32
- "Patient remains alert and oriented.",
33
- "Vitals are within acceptable ranges.",
34
- "No complications noted during assessment.",
35
- "Patient has no known drug allergies.",
36
- "Doctor recommends continued observation.",
37
- "Supportive care was initiated.",
38
- "Patient advised to avoid strenuous activity.",
39
- "No complications noted during assessment",
40
- "No prior history of respiratory illness.",
41
- "Mild discomfort reported with no severe symptoms.",
42
- "Symptoms are self-limiting according to patient.",
43
- "Patient remains alert and cooperative.",
44
- "No medication administered at this stage.",
45
- "Doctor recommends home resr and observation.",
46
- "Evaluation ongoing for possible infection."
47
  ]
48
 
49
- # --- ambiguity sentences ---
50
- ambiguous_templates = [
51
- "Mild fever noted. No cough. Patient recently traveled.",
52
- "Normal oxygen levels observed. Slight wheeze on auscultation.",
53
- "Patient reports chest discomfort but vitals are stable.",
54
- "No known exposure. Minor throat irritation present.",
55
- "Slight fatigue without other systemic symptoms."
 
56
  ]
57
 
58
- # --- Vitals & Symptoms ---
 
 
 
 
 
59
  def get_oxygen(label):
60
- base_ranges = {
61
- "COVID": (85, 94),
62
- "VIRAL PNEUMONIA": (88, 95),
63
- "NORMAL": (96, 99)
64
- }
65
- base_min, base_max = base_ranges[label]
66
- # Apply + or - 1 blur, clamping between 80 and 100
67
- oxygen = random.randint(base_min - 1, base_max + 1)
68
- return min(100, max(80, oxygen))
69
 
70
  def get_temp(label):
71
  if label == "NORMAL":
72
- base_min, base_max = 97.0, 98.6
73
  else:
74
- base_min, base_max = 99.0, 103.5
75
-
76
- # Apply + or - 0.5°F blur and clamp between 95-105°F
77
- temp = random.uniform(base_min - 0.5, base_max + 0.5)
78
- return round(min(105.0, max(95.0, temp)), 1)
79
-
80
- def get_days():
81
- return random.randint(1, 14)
82
 
83
  def get_age():
84
- return random.randint(18, 80)
 
 
 
85
 
86
- # --- Templates ---
87
  def build_emr(label, i):
88
- name = f"Patient-{label}-{i+1}"
89
  age = f"{get_age()}-year-old"
90
  days = get_days()
91
  temp = get_temp(label)
92
  oxygen = get_oxygen(label)
93
-
94
- # Symptoms Pool
95
- symptoms = {
96
- "COVID": [
97
- f"{name} ({age}) reports fatigue and dry cough for {days} days.",
98
- f"{name} complains of shortness of breath and fever of {temp}°F.",
99
- f"{name} reports loss of taste. SPO2 at {oxygen}%.",
100
- ],
101
- "NORMAL": [
102
- f"{name} ({age}) presents for routine check-up. Vitals stable.",
103
- f"{name} shows no respiratory distress. Oxygen at {oxygen}%.",
104
- f"{name} denies any recent illness. Temperature is {temp}°F.",
105
- ],
106
- "VIRAL PNEUMONIA": [
107
- f"{name} ({age}) complains of dry cough for {days} days.",
108
- f"{name} experiencing low-grade fever and SPO2 at {oxygen}%.",
109
- f"{name} reports breathlessness. X-ray indicates mild infiltrates.",
110
- ]
111
- }
112
-
113
- # Diagnosis Observations
114
- diagnosis = {
115
- "COVID": [
116
- "Findings suggest viral respiratory infection.",
117
- "Signs consistent with COVID-19 infection.",
118
- "Clinical features align with COVID diagnosis."
119
- ],
120
- "NORMAL": [
121
- "No signs of respiratory infection.",
122
- "No abnormal findings detected.",
123
- "Checkup results within normal limits."
124
- ],
125
- "VIRAL PNEUMONIA": [
126
- "X-ray shows patchy infiltrates.",
127
- "Suspected viral origin of symptoms.",
128
- "Clinical signs indicate viral pneumonia."
129
- ]
130
- }
131
-
132
- # Construct sentence pool
133
- body = [random.choice(symptoms[label]), random.choice(diagnosis[label])]
134
-
135
- # adding ambiguous cases randomly (~70% of cases)
136
- if random.random() < 0.7:
137
- body.insert(random.randint(0, len(body)), random.choice(ambiguous_templates))
138
-
139
- # adding noise to 90% of cases
140
- if random.random() < 0.9:
141
- for _ in range(random.randint(1,2)):
142
- body.insert(random.randint(0, len(body)), random.choice(noise_sentences))
143
-
144
- random.shuffle(body)
145
  return " ".join(body)
146
 
147
- # Generate dataset
148
  records = []
149
  for label, img_dir in categories.items():
150
- valid_exts = [".png", ".jpg", ".jpeg"]
151
- image_files = sorted(
152
- [f for f in img_dir.glob("*") if f.suffix.lower() in valid_exts]
153
- )
154
  for i in range(SAMPLES_PER_CLASS):
155
- patient_id = f"{label}-{i+1}"
156
  image_path = str(random.choice(image_files).relative_to(IMAGES_DIR.parent.parent))
157
- emr_text = build_emr(label, i)
158
- triage_level = triage_map[label]
159
- records.append([patient_id, image_path, emr_text, triage_level])
160
 
 
161
  random.shuffle(records)
162
-
163
- # Save to CSV
164
  with open(OUTPUT_FILE, "w", newline="") as f:
165
  writer = csv.writer(f)
166
  writer.writerow(["patient_id", "image_path", "emr_text", "triage_level"])
167
  writer.writerows(records)
168
 
169
- print(f"✅Generated {len(records)} EMR records in {OUTPUT_FILE}")
 
1
  import random
2
  import csv
3
+ import string
4
  from pathlib import Path
5
 
6
+ # Paths
7
  CURRENT_DIR = Path(__file__).resolve().parent
8
  IMAGES_DIR = CURRENT_DIR.parent / "data" / "images"
9
+ OUTPUT_FILE = CURRENT_DIR.parent / "data" / "emr_records_softlabels.csv"
10
 
11
+ # Label to triage
12
+ triage_map = {"COVID": "high", "NORMAL": "low", "VIRAL PNEUMONIA": "medium"}
13
+ SAMPLES_PER_CLASS = 300
14
 
15
+ # Folders
16
  categories = {
17
  "COVID": IMAGES_DIR / "COVID",
18
  "NORMAL": IMAGES_DIR / "NORMAL",
19
  "VIRAL PNEUMONIA": IMAGES_DIR / "VIRAL PNEUMONIA"
20
  }
21
 
22
# Shared ambiguous templates
shared_symptoms = [
    "Mild cough and slight fever reported.",
    "General fatigue and throat irritation present.",
    "Breathing mildly labored during physical exertion.",
    "No major respiratory distress; mild wheezing noted.",
    "Occasional chest tightness reported.",
    "Vital signs mostly stable; slight variation in temperature.",
]

# Overlapping diagnosis clues
shared_diagnosis = [
    "Symptoms could relate to a range of viral infections.",
    "Presentation not distinctly matching any single infection.",
    "Further tests required to confirm diagnosis.",
    "Findings are borderline; clinical judgment advised.",
    "Observation warranted due to overlapping signs.",
    "Initial assessment inconclusive.",
]

# Noise sentences
neutral_noise = [
    "Patient is cooperative and alert.",
    "Dietary habits unremarkable.",
    "Hydration status normal.",
    "Follow-up advised if symptoms persist.",
    "No notable family medical history.",
    "No medications currently administered.",
]


def random_token():
    """Return a random patient token shaped like ``ID-AB12``.

    The token is "ID-" followed by two uppercase ASCII letters and two
    digits, all drawn from the module-level ``random`` generator.
    """
    prefix = "ID"
    letters = ''.join(random.choices(string.ascii_uppercase, k=2))
    digits = ''.join(random.choices(string.digits, k=2))
    return f"{prefix}-{letters}{digits}"


def get_oxygen(label):
    """Return a random integer SPO2 percentage for the given class label.

    The per-class ranges overlap on purpose (soft blur across classes):
    NORMAL 94-100, VIRAL PNEUMONIA 90-96, anything else 87-94.
    """
    if label == "NORMAL":
        return random.randint(94, 100)
    elif label == "VIRAL PNEUMONIA":
        return random.randint(90, 96)
    else:
        return random.randint(87, 94)


def get_temp(label):
    """Return a random temperature, rounded to one decimal, for the label.

    NORMAL draws from 97.5-99.0; every other label draws from 98.8-102.5,
    so the two ranges overlap slightly.
    """
    if label == "NORMAL":
        return round(random.uniform(97.5, 99.0), 1)
    else:
        return round(random.uniform(98.8, 102.5), 1)


def get_age():
    """Return a random age between 18 and 85 inclusive."""
    return random.randint(18, 85)


def get_days():
    """Return a random symptom duration between 1 and 10 days inclusive."""
    return random.randint(1, 10)


def build_emr(label, i):
    """Build one synthetic EMR note for a sample of class ``label``.

    The note always starts with an intro sentence (token, age, duration).
    The remaining sentences (a shared symptom, the vitals, a shared
    diagnosis, an optional class-specific clue and up to two noise
    sentences) follow in random order.

    Args:
        label: Class name: "COVID", "NORMAL" or "VIRAL PNEUMONIA".
        i: Sample index. Not used by the text itself; kept so existing
            callers keep working.

    Returns:
        The sentences joined with single spaces.
    """
    pid = random_token()
    age = f"{get_age()}-year-old"
    days = get_days()
    temp = get_temp(label)
    oxygen = get_oxygen(label)

    intro = f"Patient {pid}, a {age}, reports symptoms for {days} days."
    vitals = f"Temperature recorded at {temp}°F and SPO2 at {oxygen}%."

    # Shared symptoms + blurred logic
    body = [
        intro,
        random.choice(shared_symptoms),
        vitals,
        random.choice(shared_diagnosis),
    ]

    # Optionally inject a mild class-specific clue (with low probability)
    if random.random() < 0.3:
        if label == "COVID":
            body.append("Patient reports recent loss of taste.")
        elif label == "VIRAL PNEUMONIA":
            body.append("Chest X-ray shows scattered infiltrates.")
        elif label == "NORMAL":
            body.append("No active complaints at this time.")

    # Inject 1–2 noise sentences
    # NOTE(review): the second draw is assumed to be nested under the first
    # (indentation was not recoverable from the rendered diff) - confirm.
    if random.random() < 0.8:
        body.insert(random.randint(1, len(body)), random.choice(neutral_noise))
        if random.random() < 0.5:
            body.insert(random.randint(1, len(body)), random.choice(neutral_noise))

    # random.shuffle() works in place and body[1:] is a fresh copy, so
    # shuffling the slice expression directly would discard the result.
    # Shuffle a named copy and rebuild the note with the intro kept first.
    tail = body[1:]
    random.shuffle(tail)
    return " ".join([intro, *tail])
114
 
115
# Generate records
records = []
for label, img_dir in categories.items():
    image_files = sorted(
        f for f in img_dir.glob("*")
        if f.suffix.lower() in (".png", ".jpg", ".jpeg")
    )
    # Fail with a clear message instead of the opaque IndexError that
    # random.choice() raises on an empty sequence.
    if not image_files:
        raise FileNotFoundError(f"No image files found in {img_dir}")
    for i in range(SAMPLES_PER_CLASS):
        # Images are sampled with replacement; the stored path is relative
        # to the directory two levels above IMAGES_DIR.
        image_path = str(random.choice(image_files).relative_to(IMAGES_DIR.parent.parent))
        text = build_emr(label, i)
        triage = triage_map[label]
        records.append([f"{label}-{i+1}", image_path, text, triage])

# Shuffle + write
random.shuffle(records)
# The notes contain non-ASCII text ("°F"), so pin the encoding rather than
# depending on the platform's locale default.
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["patient_id", "image_path", "emr_text", "triage_level"])
    writer.writerows(records)

print(f"✅ Softlabel EMR dataset generated at {OUTPUT_FILE}")
src/train.py CHANGED
@@ -56,7 +56,7 @@ def train_model(mode="multimodal"): # Function to instantiate model and data, tr
56
  config = load_config()
57
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if available or else use CPU
58
 
59
- dataset_dir = os.path.join(base_dir, "data", "emr_records_fuzzy.csv")
60
  dataset = TriageDataset(
61
  csv_file=dataset_dir,
62
  mode=mode
@@ -152,11 +152,11 @@ def train_model(mode="multimodal"): # Function to instantiate model and data, tr
152
  print(f"Val Accuracy: {val_acc_epoch:.4f}, F1 Score: {val_f1:.4f}")
153
 
154
  # Save model
155
- model_path = os.path.join(base_dir, f"medi_llm_model_fuzzy{mode}.pth")
156
  torch.save(model.state_dict(), model_path) # Saves the model weights only not total architecture to reuse later
157
 
158
  # Plot accuracy
159
- plot_path = os.path.join(base_dir, "assets", f"model_training_curve_fuzzy{mode}.png")
160
  plt.plot(train_acc, label="Train Acc")
161
  plt.plot(val_acc, label="Val Acc")
162
  plt.legend()
 
56
  config = load_config()
57
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if available or else use CPU
58
 
59
+ dataset_dir = os.path.join(base_dir, "data", "emr_records_softlabels.csv")
60
  dataset = TriageDataset(
61
  csv_file=dataset_dir,
62
  mode=mode
 
152
  print(f"Val Accuracy: {val_acc_epoch:.4f}, F1 Score: {val_f1:.4f}")
153
 
154
  # Save model
155
+ model_path = os.path.join(base_dir, f"medi_llm_model_softlabels{mode}.pth")
156
  torch.save(model.state_dict(), model_path) # Saves the model weights only not total architecture to reuse later
157
 
158
  # Plot accuracy
159
+ plot_path = os.path.join(base_dir, "assets", f"model_training_curve_softlabels{mode}.png")
160
  plt.plot(train_acc, label="Train Acc")
161
  plt.plot(val_acc, label="Val Acc")
162
  plt.legend()