jen900704 commited on
Commit
cbad063
·
verified ·
1 Parent(s): 6dc62ac

Upload utils.py

Browse files
Files changed (1) hide show
  1. api/utils.py +1347 -0
api/utils.py ADDED
@@ -0,0 +1,1347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Blueprint, send_file, make_response, request, jsonify
2
+ from services.nifti_processor import NiftiProcessor
3
+ from services.session_manager import SessionManager, generate_uuid
4
+ from services.auto_segmentor import run_auto_segmentation
5
+ from models.application_session import ApplicationSession
6
+ from models.combined_labels import CombinedLabels
7
+ from models.base import db
8
+ from constants import Constants
9
+
10
+ from io import BytesIO
11
+ from datetime import datetime
12
+ from reportlab.pdfgen import canvas
13
+ from reportlab.lib.pagesizes import letter
14
+
15
+ from typing import Any, Dict, Optional, Set, List, Tuple
16
+
17
+ import os
18
+ import uuid
19
+ import re
20
+ import time
21
+ import math
22
+ import numpy as np
23
+ import nibabel as nib
24
+ from scipy.ndimage import distance_transform_edt
25
+ from collections import defaultdict
26
+ from services.npz_processor import NpzProcessor
27
+ from PIL import Image
28
+ from openpyxl import load_workbook
29
+ import requests
30
+ import pandas as pd
31
+ # Track last session validation time
32
+ last_session_check = datetime.now()
33
+
34
+ # Progress tracking structure: {session_id: (start_time, expected_total_seconds)}
35
+ progress_tracker = {}
36
+
37
def id_is_training(index):
    """Return True when the case index belongs to the training split (indices below 9000)."""
    return not index >= 9000
39
+
40
+
41
+
42
def combine_label_npz(index: int):
    """Combine the per-organ label files of one case into a single volume via NpzProcessor."""
    processor = NpzProcessor()
    processor.combine_labels(index)
46
def get_panTS_id(index):
    """Return the canonical PanTS case id for an index, e.g. 17 -> 'PanTS_00000017'.

    The numeric part is left-padded with zeros to 8 characters; longer
    indices are kept as-is. (Replaces a manual padding loop that shadowed
    the builtin ``iter``.)
    """
    digits = str(index)
    padding = "0" * max(0, 8 - len(digits))
    return f"PanTS_{padding}{digits}"
53
+
54
def clean_nan(obj):
    """Recursively replace float NaN values with None so the structure is JSON-serializable."""
    if isinstance(obj, float) and math.isnan(obj):
        return None
    if isinstance(obj, dict):
        return {key: clean_nan(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return [clean_nan(item) for item in obj]
    return obj
64
+
65
def format_value(value):
    """Render a metric for display; the 999999 sentinel and None become 'N/A'."""
    if value is None or value == 999999:
        return "N/A"
    return str(value)
68
+
69
def organname_to_name(filename):
    """Convert a NIfTI mask filename (e.g. 'kidney_left.nii.gz') to a display name ('Kidney Left')."""
    stem = filename.replace(".nii.gz", "")
    return stem.replace("_", " ").title()
73
+
74
def get_mask_data_internal(id, fallback=False):
    """Compute per-organ metadata (via NiftiProcessor) for one case.

    Loads the case CT and combined-label volumes, regenerating the combined
    labels and organ-intensity map when either file is missing, then returns
    the metric dict with NaNs replaced by None. On any failure a
    ``{"error": ...}`` dict is returned instead of raising.
    """
    try:
        case = get_panTS_id(id)
        training = int(id) < 9000
        image_dir = "ImageTr" if training else "ImageTe"
        label_dir = "LabelTr" if training else "LabelTe"
        ct_path = f"{Constants.PANTS_PATH}/data/{image_dir}/{case}/{Constants.MAIN_NIFTI_FILENAME}"
        labels_path = f"{Constants.PANTS_PATH}/data/{label_dir}/{case}/{Constants.COMBINED_LABELS_NIFTI_FILENAME}"
        intensities_path = f"{Constants.PANTS_PATH}/data/{label_dir}/{case}/{Constants.ORGAN_INTENSITIES_FILENAME}"
        print(f"[INFO] Processing NIFTI for id {id}")

        if os.path.exists(intensities_path) and os.path.exists(labels_path):
            # Both artifacts exist on disk: reuse the cached intensity map.
            with open(intensities_path, "r") as f:
                organ_intensities = json.load(f)
        else:
            # Rebuild the combined labels and intensity map from the raw npz.
            _, organ_intensities = NpzProcessor().combine_labels(
                int(id), keywords={"pancrea": "pancreas"}, save=True
            )

        processor = NiftiProcessor(ct_path, labels_path)
        processor.set_organ_intensities(organ_intensities)
        return clean_nan(processor.calculate_metrics())
    except Exception as e:
        print(f"[ERROR] get_mask_data_internal: {e}")
        return {"error": str(e)}
102
+
103
def generate_distinct_colors(n):
    """Generate n visually distinct RGB tuples by spacing hues evenly (S=0.7, V=0.9)."""
    import colorsys
    palette = []
    for i in range(n):
        r, g, b = colorsys.hsv_to_rgb(i / n, 0.7, 0.9)
        palette.append((int(r * 255), int(g * 255), int(b * 255)))
    return palette
109
+
110
def fill_voids_with_nearest_label(label_array):
    """Fill every 0-valued voxel with the label of its nearest non-zero voxel.

    Uses a Euclidean distance transform to locate, for each background voxel,
    the coordinates of the closest labelled voxel. Returns a filled copy; the
    input array itself is returned unchanged when it contains no zeros.
    (An unused O(n) ``nonzero_coords`` computation was removed.)
    """
    void_mask = label_array == 0
    if not np.any(void_mask):
        return label_array

    # indices[:, ...] holds, per voxel, the coordinate of the nearest non-void voxel.
    _, indices = distance_transform_edt(void_mask, return_indices=True)
    filled = label_array.copy()
    filled[void_mask] = label_array[tuple(indices[:, void_mask])]
    return filled
121
+
122
def build_adjacency_graph(label_array):
    """Build an adjacency graph of label connectivity (6-connected neighborhood).

    Returns a defaultdict mapping each label to the set of distinct non-zero
    labels it touches face-to-face.

    Bug fix: the previous implementation used ``np.roll``, which wraps around
    the volume edges and falsely marked labels on opposite faces as adjacent.
    Offset slices are compared instead, so no wrap-around pairs are produced.
    """
    adjacency = defaultdict(set)
    for axis in range(3):
        # Compare each voxel with its +1 neighbor along this axis (no wrap).
        lo = [slice(None)] * 3
        hi = [slice(None)] * 3
        lo[axis] = slice(None, -1)
        hi[axis] = slice(1, None)
        a = label_array[tuple(lo)]
        b = label_array[tuple(hi)]
        touching = (a != b) & (a != 0) & (b != 0)
        for u, v in zip(a[touching], b[touching]):
            adjacency[u].add(v)
            adjacency[v].add(u)
    return adjacency
139
+
140
def assign_colors_with_high_contrast(label_ids, adjacency_graph, min_initial_colors=20, max_total_colors=50):
    """
    Greedily assign palette indices to labels so adjacent labels receive
    different colors, balancing usage across the palette.

    Starts with ``min_initial_colors`` evenly spaced hues and grows the
    palette (up to ``max_total_colors``) until a conflict-free assignment
    exists; beyond the cap, neighbors may share a color and a warning is
    printed.

    Args:
        label_ids: iterable of numeric label ids to color.
        adjacency_graph: mapping label -> set of adjacent labels.
        min_initial_colors: palette size to start from.
        max_total_colors: hard cap on palette growth.

    Returns:
        Tuple ``(color_map, color_usage_count)`` where ``color_map`` maps
        ``str(round(label))`` to ``{"R", "G", "B", "A": 128}`` and
        ``color_usage_count`` maps palette index to its usage count.

    Cleanups: removed an unused ``itertools.combinations`` import and a
    palette that was rebuilt (and discarded) on every retry; the nested
    palette helper no longer shadows the module-level
    ``generate_distinct_colors``.
    """
    import colorsys

    def generate_palette(n):
        # n evenly spaced hues at fixed saturation/value.
        hsv = [(x / n, 0.7, 0.9) for x in range(n)]
        return [tuple(int(c * 255) for c in colorsys.hsv_to_rgb(*h)) for h in hsv]

    def can_use_color(label, color_idx, assignments):
        # A color is usable if no already-assigned neighbor holds it.
        for neighbor in adjacency_graph[label]:
            if assignments.get(neighbor) == color_idx:
                return False
        return True

    label_ids = sorted(label_ids)
    assignments = {}
    num_colors = min_initial_colors
    color_usage_count = {i: 0 for i in range(num_colors)}

    while True:
        assignments.clear()
        color_usage_count = {i: 0 for i in range(num_colors)}
        success = True

        for label in label_ids:
            # Prefer the least-used color (ties broken by index) for balance.
            color_order = sorted(range(num_colors), key=lambda c: (color_usage_count[c], c))
            for color_idx in color_order:
                if can_use_color(label, color_idx, assignments):
                    assignments[label] = color_idx
                    color_usage_count[color_idx] += 1
                    break
            else:
                # No legal color for this label: retry with a larger palette.
                success = False
                break

        if success:
            break
        elif num_colors >= max_total_colors:
            print(f"⚠️ Warning: reached max color count {max_total_colors}, some neighbors may share color")
            break
        else:
            num_colors += 1

    final_colors = generate_palette(num_colors)
    print(f"✅ Final color count used: {len(set(assignments.values()))}")

    color_map = {
        str(round(label)): {
            "R": final_colors[color_idx][0],
            "G": final_colors[color_idx][1],
            "B": final_colors[color_idx][2],
            "A": 128
        }
        for label, color_idx in assignments.items()
    }

    return color_map, color_usage_count
203
+
204
def wait_for_file(filepath, timeout=30, check_interval=0.5):
    """Block until ``filepath`` exists; raise TimeoutError after ``timeout`` seconds."""
    deadline = time.time() + timeout
    while not os.path.exists(filepath):
        if time.time() > deadline:
            raise TimeoutError(f"Timeout: File {filepath} not found after {timeout} seconds.")
        time.sleep(check_interval)
211
+
212
def volume_to_png(volume, axis=2, index=None):
    """Render one slice of a 3D volume as an 8-bit grayscale PNG.

    Args:
        volume: 3D numpy array of intensities.
        axis: axis to slice along (default 2).
        index: slice index; defaults to the middle slice.

    Returns:
        BytesIO buffer positioned at 0 containing the PNG bytes.
    """
    if index is None:
        index = volume.shape[axis] // 2

    slice_ = np.take(volume, index, axis=axis)

    # Min-max normalize to [0, 255]. A constant slice would previously
    # divide by zero; map it to all-black instead.
    lo, hi = np.min(slice_), np.max(slice_)
    if hi > lo:
        slice_norm = (255 * (slice_ - lo) / (hi - lo)).astype(np.uint8)
    else:
        slice_norm = np.zeros_like(slice_, dtype=np.uint8)

    # Rotate/flip to the viewer's expected display orientation.
    slice_norm = np.rot90(slice_norm, k=1)
    slice_norm = np.flip(slice_norm, axis=0)

    pil_img = Image.fromarray(slice_norm)
    buf = BytesIO()
    pil_img.save(buf, format="PNG")
    buf.seek(0)
    return buf
235
def generate_pdf_with_template(
    output_pdf,
    folder_name,
    ct_path,
    mask_path,
    template_pdf,
    temp_pdf_path,
    id,
    extracted_data=None,
    column_headers=None,
):
    """Build a one-case medical-report PDF and merge it onto a template.

    Renders patient info (looked up in metadata.xlsx by PanTS id), imaging
    details, and a per-organ volume / mean-HU table into a temporary PDF,
    then overlays each content page on the first page of ``template_pdf``
    and writes the merged result to ``output_pdf``.

    Args:
        output_pdf: destination path of the merged PDF.
        folder_name: numeric case index used for the metadata lookup.
        ct_path: path to the CT NIfTI volume.
        mask_path: path to the combined-labels NIfTI volume.
        template_pdf: path of the letterhead/template PDF.
        temp_pdf_path: scratch path for the content-only PDF (always deleted).
        id: case id, kept for API compatibility; not used directly here.
        extracted_data, column_headers: kept for API compatibility; unused.

    Raises:
        RuntimeError: wrapping any underlying failure.

    Cleanups vs. previous version: removed ~150 lines of dead commented-out
    code, a duplicate "postcava" dict key, an unused ``safe_extract`` helper,
    unused ``scanner_info``/``mean_hu`` computations, identical if/else row
    branches, and debug prints; exception chaining added.
    """
    import os
    import nibabel as nib
    import numpy as np
    from PyPDF2 import PdfReader, PdfWriter
    from PyPDF2._page import PageObject
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter

    # label id -> organ name
    LABELS = {v: k for k, v in Constants.PREDEFINED_LABELS.items()}

    # Sub-structures map to their parent organ; standalone organs map to
    # themselves. NOTE(review): "celic_artery" looks like a typo but is kept
    # as-is to match the label vocabulary — confirm against PREDEFINED_LABELS.
    NAME_TO_ORGAN = {
        # Pancreas and its sub-structures / lesions
        "pancreas": "pancreas",
        "pancreas_body": "pancreas",
        "pancreas_head": "pancreas",
        "pancreas_tail": "pancreas",
        "pancreatic_lesion": "pancreas",
        "pancreatic_duct": "pancreas",

        # All other organs: map to self
        "aorta": "aorta",
        "adrenal_gland_left": "adrenal_gland_left",
        "adrenal_gland_right": "adrenal_gland_right",
        "bladder": "bladder",
        "common_bile_duct": "common_bile_duct",
        "celic_artery": "celiac_artery",
        "colon": "colon",
        "duodenum": "duodenum",
        "femur_right": "femur_right",
        "femur_left": "femur_left",
        "gall_bladder": "gall_bladder",
        "postcava": "postcava",
        "kidney_left": "kidney_left",
        "kidney_right": "kidney_right",
        "liver": "liver",
        "prostate": "prostate",
        "superior_mesenteric_artery": "superior_mesenteric_artery",
        "intestine": "intestine",
        "spleen": "spleen",
        "stomach": "stomach",
        "veins": "veins",
    }

    try:
        temp_pdf = canvas.Canvas(temp_pdf_path, pagesize=letter)
        width, height = letter
        left_margin, top_margin = 50, 100
        line_height, section_spacing = 12, 30
        y_position = height - top_margin

        def reset_page():
            # Start a fresh page and reset cursor/font.
            nonlocal y_position
            temp_pdf.showPage()
            y_position = height - 120
            temp_pdf.setFont("Helvetica", 10)

        def write_wrapped_text(x, y, content, bold=False, font_size=10, max_width=None):
            # Draw word-wrapped text; returns the y just below the last line.
            temp_pdf.setFont("Helvetica-Bold" if bold else "Helvetica", font_size)
            words = content.split()
            current_line = ""
            max_width = max_width or width - left_margin * 2
            for word in words:
                if temp_pdf.stringWidth(current_line + word + " ", "Helvetica", font_size) > max_width:
                    temp_pdf.drawString(x, y, current_line.strip())
                    y -= line_height
                    current_line = f"{word} "
                    if y < 50:
                        reset_page()
                        y = y_position
                else:
                    current_line += f"{word} "
            if current_line:
                temp_pdf.drawString(x, y, current_line.strip())
                y -= line_height
            return y

        # --- Patient metadata lookup (row keyed by PanTS id) ---
        wb = load_workbook(os.path.join(Constants.PANTS_PATH, "data", "metadata.xlsx"))
        sheet = wb["PanTS_metadata"]
        age = None
        sex = "-"
        contrast = ""
        study_detail = ""
        for row in sheet.iter_rows(values_only=True):
            if row[0] == get_panTS_id(folder_name):
                age = row[5]
                sex = row[4]
                contrast = row[3]
                study_detail = row[8]
                break

        # --- Title ---
        temp_pdf.setFont("Helvetica-Bold", 26)
        title_text = "MEDICAL REPORT"
        title_width = temp_pdf.stringWidth(title_text, "Helvetica-Bold", 26)
        temp_pdf.drawString((width - title_width) / 2, height - 70, title_text)
        y_position = height - 100

        # --- Patient info ---
        temp_pdf.setFont("Helvetica-Bold", 12)
        temp_pdf.drawString(left_margin, y_position, "PATIENT INFORMATION")
        y_position -= line_height

        left_y = write_wrapped_text(left_margin, y_position, f"PANTS ID: {folder_name}")
        right_y = write_wrapped_text(width / 2, y_position, f"Sex: {sex}")
        y_position -= line_height

        write_wrapped_text(left_margin, y_position, f"Age: {age}")

        y_position = min(left_y, right_y) - section_spacing

        # --- Imaging detail ---
        temp_pdf.setFont("Helvetica-Bold", 12)
        temp_pdf.drawString(left_margin, y_position, "IMAGING DETAIL")
        y_position -= line_height

        ct_nii = nib.load(ct_path)
        spacing = ct_nii.header.get_zooms()
        shape = ct_nii.shape

        y_position = write_wrapped_text(left_margin, y_position, f"Spacing: {spacing}")
        y_position = write_wrapped_text(left_margin, y_position, f"Shape: {shape}")
        y_position = write_wrapped_text(left_margin, y_position, f"Study type: {study_detail}")
        y_position = write_wrapped_text(left_margin, y_position, f"Contrast: {contrast}")
        y_position -= section_spacing

        # --- Load image data ---
        ct_array = ct_nii.get_fdata()
        mask_nii = nib.load(mask_path)
        mask_array = mask_nii.get_fdata().astype(np.uint8)
        voxel_volume = np.prod(mask_nii.header.get_zooms()) / 1000  # mm^3 -> cm^3

        # --- AI measurements table ---
        temp_pdf.setFont("Helvetica-Bold", 12)
        temp_pdf.drawString(left_margin, y_position, "AI MEASUREMENTS")
        y_position -= line_height

        headers = ["Organ", "Volume (cc)", "Mean HU"]
        col_widths = [120, 100, 100]
        row_height = 20

        def draw_table_row(row_data, is_header=False):
            # Draw one table row, spilling to a continuation page when needed.
            nonlocal y_position
            if y_position - row_height < 50:
                reset_page()
                temp_pdf.setFont("Helvetica-Bold", 12)
                temp_pdf.drawString(left_margin, y_position, "AI MEASUREMENTS (continued)")
                y_position -= line_height
                draw_table_row(headers, is_header=True)
            x = left_margin
            temp_pdf.setFont("Helvetica-Bold" if is_header else "Helvetica", 9)
            for i, cell in enumerate(row_data):
                temp_pdf.drawString(x + 2, y_position - row_height + 5, str(cell))
                temp_pdf.line(x, y_position, x, y_position - row_height)
                x += col_widths[i]
            temp_pdf.line(left_margin + sum(col_widths), y_position, left_margin + sum(col_widths), y_position - row_height)
            temp_pdf.line(left_margin, y_position, left_margin + sum(col_widths), y_position)
            y_position -= row_height
            temp_pdf.line(left_margin, y_position, left_margin + sum(col_widths), y_position)

        draw_table_row(headers, is_header=True)

        # Aggregate sub-structure (lesion/duct/...) volumes under the parent organ.
        lesion_volume_dict = {}
        for organ, label_id in LABELS.items():
            if organ in NAME_TO_ORGAN and NAME_TO_ORGAN[organ] != organ:
                mask = (mask_array == label_id)
                if not np.any(mask):
                    continue
                volume = np.sum(mask) * voxel_volume
                parent = NAME_TO_ORGAN[organ]
                entry = lesion_volume_dict.setdefault(parent, {"number": 0, "volume": 0.0})
                entry["number"] += 1
                entry["volume"] += volume

        # One table row per standalone organ present in the mask.
        for organ, label_id in LABELS.items():
            if organ in NAME_TO_ORGAN and NAME_TO_ORGAN[organ] != organ:
                continue
            if label_id == 0:
                continue
            mask = (mask_array == label_id)
            if not np.any(mask):
                continue
            volume = np.sum(mask) * voxel_volume
            mean_hu = np.mean(ct_array[mask])
            draw_table_row([organ.replace('_', ' '), f"{volume:.2f}", f"{mean_hu:.1f}"])

        temp_pdf.save()

        # --- Merge every content page onto the template's first page ---
        template_reader = PdfReader(template_pdf)
        content_reader = PdfReader(temp_pdf_path)
        writer = PdfWriter()

        for page in content_reader.pages:
            template_page = template_reader.pages[0]
            merged_page = PageObject.create_blank_page(
                width=template_page.mediabox.width,
                height=template_page.mediabox.height
            )
            merged_page.merge_page(template_page)
            merged_page.merge_page(page)
            writer.add_page(merged_page)

        with open(output_pdf, "wb") as f:
            writer.write(f)

    except Exception as e:
        raise RuntimeError(f"Error generating PDF for {folder_name}: {e}") from e
    finally:
        # Always remove the scratch content PDF.
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)
552
+
553
+ # Helper Function to Process CT and Mask
554
def get_most_labeled_slice(ct_path, mask_path, output_png, contrast_min=-150, contrast_max=250):
    """
    Load CT and mask, reorient both to RAS, pick the slice with the largest
    labeled area, and save a red-contour overlay PNG to ``output_png``.

    Returns:
        True on success, False on any failure. The failure is now printed
        instead of being swallowed by a bare ``except``.
    """
    try:
        import SimpleITK as sitk
        import matplotlib
        matplotlib.use('Agg')  # headless backend: never touch a GUI
        import matplotlib.pyplot as plt

        # Load the CT scan and mask
        ct_scan = sitk.ReadImage(ct_path)
        mask = sitk.ReadImage(mask_path)

        # Reorient to RAS so slicing/mirroring is consistent across inputs
        ct_scan = sitk.DICOMOrient(ct_scan, 'RAS')
        mask = sitk.DICOMOrient(mask, 'RAS')

        ct_array = sitk.GetArrayFromImage(ct_scan)
        mask_array = sitk.GetArrayFromImage(mask)

        if ct_array.shape != mask_array.shape:
            raise ValueError(f"Shape mismatch: CT shape {ct_array.shape}, Mask shape {mask_array.shape}")

        # Axis-0 slice with the largest labeled area
        slice_sums = np.sum(mask_array, axis=(1, 2))
        most_labeled_slice_index = np.argmax(slice_sums)

        ct_slice = ct_array[most_labeled_slice_index]
        mask_slice = mask_array[most_labeled_slice_index]

        # Mirror to the display orientation
        ct_slice = np.fliplr(ct_slice)
        mask_slice = np.fliplr(mask_slice)

        # Window to [contrast_min, contrast_max] and rescale to 8-bit
        ct_slice = np.clip(ct_slice, contrast_min, contrast_max)
        ct_slice = (ct_slice - contrast_min) / (contrast_max - contrast_min) * 255
        ct_slice = ct_slice.astype(np.uint8)

        # Overlay mask contours on the CT slice
        plt.figure(figsize=(6, 6))
        plt.imshow(ct_slice, cmap='gray', origin='lower')
        plt.contour(mask_slice, colors='red', linewidths=1)
        plt.axis('off')
        plt.savefig(output_png, bbox_inches="tight", pad_inches=0)
        plt.close()
        return True
    except Exception as e:
        # Narrowed from a bare except; report the failure before degrading.
        print(f"[ERROR] get_most_labeled_slice({ct_path}): {e}")
        return False
611
+
612
def create_overlay_image(ct_path, mask_path, output_path, color="red"):
    """Render the most-labeled-slice overlay PNG; thin wrapper around get_most_labeled_slice.

    The ``color`` argument is accepted for API symmetry but not forwarded
    (the underlying helper always draws red contours).
    """
    result = get_most_labeled_slice(ct_path, mask_path, output_path)
    return result
617
+
618
+
619
+ # Helper Function to Zoom into Labeled Area
620
def zoom_into_labeled_area(ct_path, mask_path, output_path, color="red"):
    """
    Save a zoomed-in contour overlay of the largest labeled area, using the
    same RAS orientation conventions as get_most_labeled_slice.

    Returns:
        True on success, False on failure. Fixes: the Agg backend is now
        selected before pyplot is imported (the previous import could try to
        start a GUI backend on a headless server), and failures are printed
        instead of silently discarded.
    """
    import SimpleITK as sitk
    import matplotlib
    matplotlib.use("Agg")  # must be set before pyplot is imported
    import matplotlib.pyplot as plt
    try:
        # Load the CT scan and mask
        ct_scan = sitk.ReadImage(ct_path)
        mask = sitk.ReadImage(mask_path)

        # Reorient to RAS
        ct_scan = sitk.DICOMOrient(ct_scan, 'RAS')
        mask = sitk.DICOMOrient(mask, 'RAS')

        ct_array = sitk.GetArrayFromImage(ct_scan)
        mask_array = sitk.GetArrayFromImage(mask)

        if ct_array.shape != mask_array.shape:
            raise ValueError(f"Shape mismatch: CT shape {ct_array.shape}, Mask shape {mask_array.shape}")

        # Slice with the most labeled voxels
        slice_sums = np.sum(mask_array, axis=(1, 2))
        largest_slice_idx = np.argmax(slice_sums)
        if slice_sums[largest_slice_idx] == 0:
            raise ValueError("No labeled area found in the mask.")

        # Bounding box of the label in that slice, padded by 20 px and clamped
        mask_slice = mask_array[largest_slice_idx]
        coords = np.array(np.where(mask_slice))
        min_row, max_row = np.min(coords[0]), np.max(coords[0])
        min_col, max_col = np.min(coords[1]), np.max(coords[1])
        padding = 20
        min_row = max(min_row - padding, 0)
        max_row = min(max_row + padding, mask_slice.shape[0])
        min_col = max(min_col - padding, 0)
        max_col = min(max_col + padding, mask_slice.shape[1])

        zoomed_image = ct_array[largest_slice_idx][min_row:max_row, min_col:max_col]
        zoomed_mask = mask_array[largest_slice_idx][min_row:max_row, min_col:max_col]

        # Mirror to the display orientation
        zoomed_image = np.fliplr(zoomed_image)
        zoomed_mask = np.fliplr(zoomed_mask)

        # Window to [-150, 250] HU and rescale to 8-bit
        zoomed_image = np.clip(zoomed_image, -150, 250)
        zoomed_image = (zoomed_image + 150) / 400 * 255
        zoomed_image = zoomed_image.astype(np.uint8)

        # Save the zoomed-in image with the contour overlay
        plt.figure(figsize=(6, 6))
        plt.imshow(zoomed_image, cmap="gray", origin="lower")
        plt.contour(zoomed_mask, colors=color, linewidths=1)
        plt.axis("off")
        plt.savefig(output_path, bbox_inches="tight")
        plt.close()
        return True
    except Exception as e:
        # Previously the exception was discarded unseen; surface it.
        print(f"[ERROR] zoom_into_labeled_area({ct_path}): {e}")
        return False
683
+
684
def get_pdac_staging(clabel_id):
    """Compute PDAC/SMA staging for one case via NiftiProcessor.

    Returns {"staging_description": ...} on success or {"error": ...} on
    failure (the traceback is printed).
    """
    try:
        case = get_panTS_id(clabel_id)
        training = int(clabel_id) < 9000
        image_dir = "ImageTr" if training else "ImageTe"
        label_dir = "LabelTr" if training else "LabelTe"
        ct_path = f"{Constants.PANTS_PATH}/data/{image_dir}/{case}/{Constants.MAIN_NIFTI_FILENAME}"
        labels_path = f"{Constants.PANTS_PATH}/data/{label_dir}/{case}/{Constants.COMBINED_LABELS_NIFTI_FILENAME}"

        staging_result = NiftiProcessor(ct_path, labels_path).calculate_pdac_sma_staging()
        return {"staging_description": staging_result}
    except Exception as e:
        import traceback
        traceback.print_exc()
        return {"error": f"PDAC staging failed: {str(e)}"}
700
+
701
+ import json
702
def download_clean_folder(root):
    """
    If the folder contains exactly the four expected prediction files, delete
    two of them (plans / predict args) and split combined_labels.nii.gz into
    one binary mask file per organ, using the label map from dataset.json.
    Otherwise the folder is left untouched.
    """
    target_files = {
        "combined_labels.nii.gz",
        "dataset.json",
        "plans.json",
        "predict_from_raw_data_args.json"
    }

    actual_files = set(os.listdir(root))
    if actual_files == target_files:
        # Remove plans.json and predict_from_raw_data_args.json
        for fname in ["plans.json", "predict_from_raw_data_args.json"]:
            fpath = os.path.join(root, fname)
            if os.path.exists(fpath):
                os.remove(fpath)
                print(f"🗑️ Removed during zip: {fpath}")

        # Read dataset.json for the label-name -> label-id mapping
        dataset_json_path = os.path.join(root, "dataset.json")
        with open(dataset_json_path, 'r') as f:
            dataset_info = json.load(f)

        labels = dataset_info["labels"]  # mapping: label name -> label id

        # Load the combined label volume
        combined_path = os.path.join(root, "combined_labels.nii.gz")
        combined_img = nib.load(combined_path)
        combined_data = combined_img.get_fdata()
        affine = combined_img.affine

        # Create the segmentations output folder
        seg_folder = os.path.join(root, "segmentations")
        os.makedirs(seg_folder, exist_ok=True)

        # Write one binary (0/1) mask file per label
        for label_name, label_value in labels.items():
            mask = (combined_data == label_value).astype(np.uint8)
            label_img = nib.Nifti1Image(mask, affine)
            out_path = os.path.join(seg_folder, f"{label_name}.nii.gz")
            nib.save(label_img, out_path)
            print(f"✅ Saved: {out_path}")
        os.remove(dataset_json_path)
    else:
        print("ℹ️ Folder content does not match the expected file set. Skipping cleanup and split.")
749
+
750
async def store_files(combined_labels_id):
    """Download the case CT and every predefined organ mask from HuggingFace
    into the local PanTS data tree.

    NOTE(review): declared ``async`` but uses the blocking ``requests``
    library throughout, so awaiting this coroutine will block the event
    loop for the duration of the downloads — consider aiohttp or
    run_in_executor.
    """
    subfolder = "LabelTr" if int(combined_labels_id) < 9000 else "LabelTe"
    image_subfolder = "ImageTr" if int(combined_labels_id) < 9000 else "ImageTe"

    def download(url, path):
        # Stream the response to disk in 1 KiB chunks; non-200 responses are
        # only logged, not raised.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, stream=True, headers=headers, allow_redirects=True)
        if res.status_code == 200:
            with open(path, "wb") as f:
                for chunk in res.iter_content(1024):
                    f.write(chunk)
            print(f"Saved: {path}")
        else:
            print(f"Failed: {url} ({res.status_code})")

    # main CT
    image_url = f"https://huggingface.co/datasets/BodyMaps/iPanTSMini/resolve/main/image_only/{get_panTS_id(combined_labels_id)}/ct.nii.gz"
    image_path = f"{Constants.PANTS_PATH}/data/{image_subfolder}/{get_panTS_id(combined_labels_id)}/ct.nii.gz"
    download(image_url, image_path)

    # labels
    for label in list(Constants.PREDEFINED_LABELS.values()):
        mask_url = f"https://huggingface.co/datasets/BodyMaps/iPanTSMini/resolve/main/mask_only/{get_panTS_id(combined_labels_id)}/segmentations/{label}.nii.gz"
        mask_path = f"{Constants.PANTS_PATH}/data/{subfolder}/{get_panTS_id(combined_labels_id)}/segmentations/{label}.nii.gz"
        download(mask_url, mask_path)
776
+
777
# NOTE(review): this value is reassigned later in this module to the
# BASE_PATH-derived path (os.path.join(BASE_PATH, "data", "metadata.xlsx")),
# so this Constants-based assignment is effectively dead — confirm which
# location is the intended source of truth.
META_FILE = f"{Constants.PANTS_PATH}/data/metadata.xlsx"
# ---------------------------
# Helpers
# ---------------------------
781
def _arg(name: str, default=None):
    """Read a single query-string parameter from the current Flask request."""
    params = request.args
    return params.get(name, default)
783
+
784
def to_int(x) -> Optional[int]:
    """Coerce *x* to ``int``; return ``None`` when conversion fails."""
    try:
        value = int(x)
    except Exception:
        return None
    return value
789
+
790
+ def _to_float(x) -> Optional[float]:
791
+ try:
792
+ return float(x)
793
+ except Exception:
794
+ return None
795
+
796
+ def _to01_query(x) -> Optional[int]:
797
+ if x is None: return None
798
+ s = str(x).strip().lower()
799
+ if s in ("1","true","yes","y"): return 1
800
+ if s in ("0","false","no","n"): return 0
801
+ return None
802
+
803
def _collect_list_params(names: List[str]) -> List[str]:
    """Gather multi-valued query parameters, splitting comma-separated values.

    Returns a flat list of stripped, non-empty tokens collected from every
    name in *names* that appears in the current request's query string.
    """
    raw: List[str] = []
    for name in names:
        if name in request.args:
            raw.extend(request.args.getlist(name))
    tokens: List[str] = []
    for value in raw:
        if "," in value:
            tokens.extend(part.strip() for part in value.split(",") if part.strip())
        else:
            tokens.append(value.strip())
    return [t for t in tokens if t]
815
+
816
+ def _nan2none(v):
817
+ try:
818
+ if v is None: return None
819
+ if pd.isna(v): return None
820
+ except Exception:
821
+ pass
822
+ return v
823
+
824
def clean_json_list(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert numpy scalar values in a list of dicts to native Python types.

    Makes the records JSON-serialisable: np.integer -> int,
    np.floating -> float, np.bool_ -> bool; everything else is untouched.
    """
    def _native(value):
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            return float(value)
        if isinstance(value, np.bool_):
            return bool(value)
        return value

    cleaned: List[Dict[str, Any]] = []
    for record in items:
        cleaned.append({key: _native(val) for key, val in record.items()})
    return cleaned
831
+
832
+ def _canon_letters_digits(s: str) -> str:
833
+ # 把 "LightSpeed16" 變成 "LightSpeed 16"
834
+ s2 = re.sub(r"([A-Za-z])(\d)", r"\1 \2", s)
835
+ s2 = re.sub(r"(\d)([A-Za-z])", r"\1 \2", s2)
836
+ return re.sub(r"\s+", " ", s2).strip()
837
+
838
def canon_model(s: str) -> str:
    """Canonicalise a scanner model name.

    Normalises separators and case, applies the alias table in
    ``Constants.MODEL_ALIASES``, and otherwise falls back to a safe
    letter/digit-separated form of the original string.
    """
    if not s: return ""
    base = str(s).strip()
    # Normalise whitespace / underscores / case for the alias lookup.
    low = re.sub(r"[_\-]+", " ", base).strip().lower()
    low = _canon_letters_digits(low)
    # Apply the alias table.
    if low in Constants.MODEL_ALIASES:
        return Constants.MODEL_ALIASES[low]
    # Not in the alias table: keep the original casing, only separating
    # letters from digits (e.g. "LightSpeed16" -> "LightSpeed 16").
    spaced = _canon_letters_digits(base)
    # Fixed capitalisation for common vendor prefixes.
    spaced = re.sub(r"(?i)^somatom", "SOMATOM", spaced)
    spaced = re.sub(r"(?i)^iqon", "IQon", spaced)
    return spaced
853
+
854
# ---------------------------
# Load & normalize
# ---------------------------
def _norm_cols(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Normalise metadata columns and derive the helper columns used for search/sort.

    Adds (among others) ``__case_str``, ``__tumor01``, ``__sex``, ``__ct``,
    ``__mfr``, ``model``, ``__year_int``, ``__age``, ``study_type`` and
    ``site_nationality``, plus ``_orig_cols`` which records, per row, which
    original spreadsheet column each derived field came from.
    """
    df = df_raw.copy()

    # ---- Case ID ----
    case_cols = ["PanTS ID", "PanTS_ID", "case_id", "id", "case", "CaseID"]
    def _first_nonempty(row, cols):
        # First candidate column with a non-blank, non-NA value.
        for c in cols:
            if c in row.index and pd.notna(row[c]) and str(row[c]).strip():
                return str(row[c]).strip(), c
        return "", None

    cases, mapping = [], []
    for _, r in df.iterrows():
        s, c = _first_nonempty(r, case_cols)
        cases.append(s); mapping.append({"case": c} if c else {})
    df["__case_str"] = cases
    df["_orig_cols"] = mapping

    # ---- Tumor -> __tumor01 ----
    def _canon(s: str) -> str: return re.sub(r"[^a-z]+", "", str(s).lower())
    tumor_names = [c for c in df.columns if "tumor" in _canon(c)] or []
    tcol = tumor_names[0] if tumor_names else None

    def _to01_v(v):
        # Per-cell coercion to 1/0/NaN; accepts yes/no style tokens and numbers.
        if pd.isna(v): return np.nan
        s = str(v).strip().lower()
        if s in ("1","yes","y","true","t"): return 1
        if s in ("0","no","n","false","f"): return 0
        try:
            iv = int(float(s))
            return 1 if iv == 1 else (0 if iv == 0 else np.nan)
        except Exception:
            return np.nan

    df["__tumor01"] = (df[tcol].map(_to01_v) if tcol else pd.Series([np.nan]*len(df), index=df.index))
    if tcol:
        df["_orig_cols"] = [{**(df["_orig_cols"].iat[i] or {}), "tumor": tcol} for i in range(len(df))]

    # ---- Sex -> __sex ----  (anything other than F/M becomes "")
    df["__sex"] = df.get("sex", pd.Series([""]*len(df))).astype(str).str.strip().str.upper()
    df["__sex"] = df["__sex"].where(df["__sex"].isin(["F","M"]), "")

    # ---- Generic column finder ----
    def _find_col(prefer, keyword_sets=None):
        # Exact-name match first, then canonical substring-keyword match.
        for c in prefer:
            if c in df.columns: return c
        if keyword_sets:
            canon_map = {c: re.sub(r"[^a-z0-9]+", "", str(c).lower()) for c in df.columns}
            for c, cs in canon_map.items():
                for ks in keyword_sets:
                    if all(k in cs for k in ks): return c
        return None

    # ---- CT phase -> __ct / __ct_lc ----
    ct_col = _find_col(
        prefer=["ct phase","CT phase","ct_phase","CT_phase","ct"],
        keyword_sets=[["ct","phase"],["phase"]],
    )
    if ct_col:
        df["__ct"] = df[ct_col].astype(str).str.strip()
        df["__ct_lc"] = df["__ct"].str.lower()
        df["_orig_cols"] = [{**(df["_orig_cols"].iat[i] or {}), "ct_phase": ct_col} for i in range(len(df))]
    else:
        df["__ct"], df["__ct_lc"] = "", ""

    # ---- Manufacturer -> __mfr / __mfr_lc ----
    mfr_col = _find_col(
        prefer=["manufacturer","Manufacturer","mfr","MFR","vendor","Vendor","manufacturer name","Manufacturer Name"],
        keyword_sets=[["manufactur"],["vendor"],["brand"],["maker"]],
    )
    if mfr_col:
        df["__mfr"] = df[mfr_col].astype(str).str.strip()
        df["__mfr_lc"] = df["__mfr"].str.lower()
        df["_orig_cols"] = [{**(df["_orig_cols"].iat[i] or {}), "manufacturer": mfr_col} for i in range(len(df))]
    else:
        df["__mfr"], df["__mfr_lc"] = "", ""

    # ---- Manufacturer model -> model / __model_lc ----
    model_col = _find_col(
        prefer=["manufacturer model", "Manufacturer model", "model", "Model"],
        keyword_sets=[["model"]],
    )
    if model_col:
        # Keep the raw string for traceability.
        df["model_raw"] = df[model_col].astype(str).str.strip()
        # Canonicalise to a standard model name (case, whitespace,
        # digits glued to letters, alias table).
        df["model"] = df["model_raw"].map(canon_model)
        df["__model_lc"] = df["model"].str.lower()
        df["_orig_cols"] = [
            {**(df["_orig_cols"].iat[i] or {}), "model": model_col}
            for i in range(len(df))
        ]
    else:
        # Ensure the columns exist so downstream readers never miss them.
        df["model_raw"] = ""
        df["model"] = ""
        df["__model_lc"] = ""

    # ---- Year -> __year_int ----
    year_col = _find_col(prefer=["study year", "Study year", "study_year", "year", "Year"],
                         keyword_sets=[["year"]])
    df["__year_int"] = (
        pd.to_numeric(df[year_col], errors="coerce")
        if year_col else pd.Series([np.nan] * len(df), index=df.index)
    )
    if year_col:
        df["_orig_cols"] = [
            {**(df["_orig_cols"].iat[i] or {}), "year": year_col}
            for i in range(len(df))
        ]

    # ---- Age -> __age ----
    age_col = _find_col(prefer=["age", "Age"], keyword_sets=[["age"]])
    df["__age"] = (
        pd.to_numeric(df[age_col], errors="coerce")
        if age_col else pd.Series([np.nan] * len(df), index=df.index)
    )
    if age_col:
        df["_orig_cols"] = [
            {**(df["_orig_cols"].iat[i] or {}), "age": age_col}
            for i in range(len(df))
        ]

    # ---- Study type -> study_type / __st_lc ----
    st_col = _find_col(
        prefer=["study type", "Study type", "study_type", "Study_type"],
        keyword_sets=[["study", "type"]],
    )
    if st_col:
        df["study_type"] = df[st_col].astype(str)
        df["__st_lc"] = df["study_type"].astype(str).str.strip().str.lower()
        df["_orig_cols"] = [
            {**(df["_orig_cols"].iat[i] or {}), "study_type": st_col}
            for i in range(len(df))
        ]
    else:
        df["study_type"] = ""
        df["__st_lc"] = ""

    # ---- Site nationality -> site_nationality / __sn_lc ----
    sn_col = _find_col(
        prefer=[
            "site nationality", "Site nationality", "site_nationality", "Site_nationality",
            "nationality", "Nationality", "site country", "Site country", "country", "Country"
        ],
        keyword_sets=[["site", "national"], ["nationality"], ["site", "country"], ["country"]],
    )
    if sn_col:
        df["site_nationality"] = df[sn_col].astype(str)
        df["__sn_lc"] = df["site_nationality"].astype(str).str.strip().str.lower()
        df["_orig_cols"] = [
            {**(df["_orig_cols"].iat[i] or {}), "site_nationality": sn_col}
            for i in range(len(df))
        ]
    else:
        df["site_nationality"] = ""
        df["__sn_lc"] = ""

    return df
1017
+
1018
+
1019
+ def _safe_float(x) -> Optional[float]:
1020
+ try:
1021
+ if x is None: return None
1022
+ if isinstance(x, float) and np.isnan(x): return None
1023
+ if isinstance(x, str):
1024
+ s = x.strip().replace(",", " ")
1025
+ if not s: return None
1026
+ return float(s)
1027
+ return float(x)
1028
+ except Exception:
1029
+ return None
1030
+
1031
+ def _take_first_str(row, cols: List[str]) -> str:
1032
+ for c in cols:
1033
+ if c in row and pd.notna(row[c]) and str(row[c]).strip():
1034
+ return str(row[c]).strip()
1035
+ return ""
1036
+
1037
def _case_key(row) -> int:
    """Numeric sort key from the row's case identifier (0 when absent)."""
    ident = _take_first_str(row, ["PanTS ID", "PanTS_ID", "case_id", "id", "__case_str"])
    if not ident:
        return 0
    match = re.search(r"(\d+)", str(ident))
    return int(match.group(1)) if match else 0
1042
+
1043
def _parse_3tuple_from_row(row, name_candidates: List[str]) -> List[Optional[float]]:
    """Extract a 3-component vector (spacing/shape style) from a row.

    Tries ``<base>_x/_y/_z`` column triples first, then single-column string
    forms such as "1.0x1.0x3.0" or "(512, 512, 120)". Returns
    ``[None, None, None]`` when nothing parses.
    """
    # Three separate columns.
    for base in name_candidates:
        cx, cy, cz = f"{base}_x", f"{base}_y", f"{base}_z"
        if cx in row and cy in row and cz in row:
            xs = [_safe_float(row[c]) for c in (cx, cy, cz)]
            if all(v is not None for v in xs):
                return xs
    # Single string column: strip brackets, split on common separators.
    seps = [",", "x", " ", "×", "X", ";", "|"]
    str_cols = []
    for base in name_candidates:
        str_cols += [base, f"{base}_str", base.replace(" ", "_")]
    for c in str_cols:
        if c in row and pd.notna(row[c]):
            s = str(row[c]).strip()
            if not s: continue
            s2 = re.sub(r"[\[\]\(\)\{\}]", " ", s)
            for sep in seps:
                s2 = s2.replace(sep, " ")
            parts = [p for p in s2.split() if p]
            vals = [_safe_float(p) for p in parts[:3]]
            if len(vals) == 3 and all(v is not None for v in vals):
                return vals
    return [None, None, None]
1068
+
1069
def _spacing_sum(row) -> Optional[float]:
    """Sum of the three voxel-spacing components, or None when unavailable."""
    x, y, z = _parse_3tuple_from_row(row, ["spacing", "voxel_spacing", "voxel_size", "pixel_spacing"])
    if x is None or y is None or z is None:
        return None
    return float(x + y + z)
1073
+
1074
def _shape_sum(row) -> Optional[float]:
    """Sum of the three image-shape components, or None when unavailable."""
    x, y, z = _parse_3tuple_from_row(row, ["shape", "dim", "size", "image_shape", "resolution"])
    if x is None or y is None or z is None:
        return None
    return float(x + y + z)
1078
+
1079
def ensure_sort_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Add the derived sort/completeness columns to *df* (mutates in place).

    Idempotent: each of ``__case_sortkey``, ``__spacing_sum`` and
    ``__shape_sum`` is only computed when missing; ``__complete`` is
    recomputed on every call.
    """
    if "__case_sortkey" not in df.columns:
        df["__case_sortkey"] = df.apply(_case_key, axis=1)
    if "__spacing_sum" not in df.columns:
        df["__spacing_sum"] = df.apply(_spacing_sum, axis=1)
    if "__shape_sum" not in df.columns:
        df["__shape_sum"] = df.apply(_shape_sum, axis=1)

    # Completeness flag: used by Browse and the "top" sort ordering.
    need_cols = ["__spacing_sum", "__shape_sum", "__sex", "__age"]
    complete = pd.Series(True, index=df.index)
    for c in need_cols:
        if c not in df.columns:
            complete &= False
        elif c == "__sex":
            # Sex counts as present only when non-blank.
            complete &= (df[c].astype(str).str.strip() != "")
        else:
            complete &= df[c].notna()
    df["__complete"] = complete
    return df
1099
+
1100
# Expected to be imported earlier in this file:
# import os, re
# import pandas as pd
# from typing import Optional, Set
# etc...

# =========================
# Load metadata (module-global)
# =========================
BASE_PATH = os.getenv("BASE_PATH") or "/app"

# NOTE(review): this overrides the Constants.PANTS_PATH-based META_FILE
# defined earlier in this module.
META_FILE = os.path.join(BASE_PATH, "data", "metadata.xlsx")

if not os.path.exists(META_FILE):
    print(f"[WARNING] metadata not found at {META_FILE}")
    DF_RAW = pd.DataFrame()  # empty DataFrame so the module still imports
else:
    DF_RAW = pd.read_excel(META_FILE)

# Normalise; fall back to the raw frame if normalisation fails
# (e.g. on the empty DataFrame above).
try:
    DF = _norm_cols(DF_RAW)
except Exception:
    DF = DF_RAW
1124
+
1125
+
1126
# =========================
# apply_filters starts here
# =========================
def apply_filters(base: pd.DataFrame, exclude: Optional[Set[str]] = None) -> pd.DataFrame:
    """Filter *base* according to the current request's query-string parameters.

    *exclude* names filter groups to skip (e.g. {"sex", "year"}). Relies on
    the derived ``__*`` columns produced by ``_norm_cols`` and reads the
    query string through the module helpers (_arg, _collect_list_params, ...).
    Returns the filtered (possibly identical) DataFrame; *base* is not copied.
    """
    exclude = exclude or set()
    df = base

    # --- Case ID / keyword (exact match) ---
    q = (_arg("q") or _arg("caseid") or "").strip()
    if q and "caseid" not in exclude and "__case_str" in df.columns:
        s = df["__case_str"].astype(str)
        if q.isdigit():
            # Pull every numeric token out of each row and compare numerically;
            # 77 will not match 177/077 (leading zeros are ignored).
            qq = int(q)
            nums = s.str.findall(r"\d+")
            mask_num = nums.apply(lambda xs: any(int(x) == qq for x in xs))
            # Fallback: also allow "Case 77".
            patt = rf"(?i)\b(?:case\s*)?{re.escape(q)}\b"
            mask_regex = s.str.contains(patt, na=False, regex=True)
            df = df[mask_num | mask_regex]
        else:
            # Plain text search (case-insensitive; do not treat the query as a regex).
            df = df[s.str.contains(re.escape(q), na=False, case=False, regex=False)]

    # --- Tumor ---
    tv = _to01_query(_arg("tumor"))
    tnull = _to01_query(_arg("tumor_is_null"))
    if (_arg("tumor", "").strip().lower() == "unknown"):
        tnull, tv = 1, None
    if "__tumor01" in df.columns and "tumor" not in exclude:
        if tnull in (0, 1) and "tumor_is_null" not in exclude:
            df = df[df["__tumor01"].isna()] if tnull == 1 else df[df["__tumor01"].notna()]
        elif tv in (0, 1):
            df = df[df["__tumor01"] == tv]

    # --- Sex (multi-select + Unknown) ---
    sv_list = _collect_list_params(["sex", "sex[]"])
    snull = _to01_query(_arg("sex_is_null"))
    if not sv_list:
        sv = (_arg("sex", "") or "").strip().upper()
        if sv:
            sv_list = [sv]
    sv_norm = []
    for s_ in sv_list:
        s2 = (s_ or "").strip().upper()
        if s2 in ("M", "F"):
            sv_norm.append(s2)
        elif s2 in ("U", "UNKNOWN"):
            sv_norm.append("UNKNOWN")
    if "__sex" in df.columns and "sex" not in exclude and (sv_norm or snull in (0, 1)):
        ser = df["__sex"].fillna("").str.strip().str.upper()
        take = pd.Series(False, index=df.index)
        vals = [s for s in sv_norm if s in ("F", "M")]
        if vals:
            take |= ser.isin(vals)
        if ("UNKNOWN" in sv_norm) or (snull == 1):
            take |= (ser == "")
        df = df[take]

    # --- Age: supports age_bin[] (incl. "90+" / UNKNOWN); otherwise
    #     falls back to age_from / age_to range parameters ---
    bins = _collect_list_params(["age_bin", "age_bin[]"])
    age_null = _to01_query(_arg("age_is_null"))
    if "__age" in df.columns and bins:
        age_series = pd.to_numeric(df["__age"], errors="coerce")
        mask = pd.Series(False, index=df.index)
        for b in bins:
            s = (b or "").strip()
            m_plus = re.match(r"^\s*(\d+)\s*\+\s*$", s)
            if m_plus:
                lo = int(m_plus.group(1))
                mask |= (age_series >= lo)
                continue
            m_rng = re.match(r"^\s*(\d+)\s*[-–—]\s*(\d+)\s*$", s)
            if m_rng:
                lo, hi = int(m_rng.group(1)), int(m_rng.group(2))
                mask |= age_series.between(lo, hi, inclusive="both")
        if (age_null == 1) or any((t or "").strip().upper() == "UNKNOWN" for t in bins):
            mask |= age_series.isna() | (df["__age"].astype(str).str.strip().str.upper() == "UNKNOWN")
        df = df[mask]
    elif "__age" in df.columns:
        af = _to_float(_arg("age_from")); at = _to_float(_arg("age_to"))
        age_series = pd.to_numeric(df["__age"], errors="coerce")
        if "age_from" not in exclude and af is not None:
            df = df[age_series >= af]
        if "age_to" not in exclude and at is not None:
            df = df[age_series <= at]

    # --- CT phase ---
    ct = (_arg("ct_phase", "") or "").strip().lower()
    ct_list = _collect_list_params(["ct_phase", "ct_phase[]"])
    if ct == "unknown" or any((s or "").lower() == "unknown" for s in ct_list):
        # "Unknown" selects rows whose phase is blank/NA-like.
        if "__ct" in df.columns:
            s_ct = df["__ct"].astype(str).str.strip().str.lower()
            tokens_null_ct = {'', 'unknown', 'nan', 'n/a', 'na', 'none', '(blank)', '(null)'}
            df = df[df["__ct"].isna() | s_ct.isin(tokens_null_ct)]
    elif (ct or ct_list) and "__ct_lc" in df.columns:
        parts = []
        if ct:
            parts += [p.strip() for p in re.split(r"[;,/]+", ct) if p.strip()]
        parts += [p.strip().lower() for p in ct_list if p.strip()]
        patt = "|".join(re.escape(p) for p in parts)
        df = df[df["__ct_lc"].str.contains(patt, na=False)]

    # --- Manufacturer (exact, case-insensitive) ---
    m_list = _collect_list_params(["manufacturer", "manufacturer[]", "mfr"])
    m_raw = (_arg("manufacturer", "") or "").strip()
    if m_raw and not m_list:
        m_list = [p.strip() for p in m_raw.split(",") if p.strip()]
    if m_list and "__mfr_lc" in df.columns:
        m_lc = [s.lower() for s in m_list]
        df = df[df["__mfr_lc"].isin(m_lc)]

    # --- Model (canonicalised; substring match when model_fuzzy is set) ---
    model_list = _collect_list_params(["model", "model[]", "manufacturer_model"])
    model_raw = (_arg("model", "") or "").strip()
    if model_raw and not model_list:
        model_list = [p.strip() for p in re.split(r"[;,/|]+", model_raw) if p.strip()]
    if model_list and "__model_lc" in df.columns and "model" not in exclude:
        wants = [canon_model(p).lower() for p in model_list if p]
        wants = [w for w in wants if w]
        fuzzy = str(_arg("model_fuzzy", "0")).lower() in ("1", "true", "yes")
        if fuzzy:
            patt = "|".join(re.escape(w) for w in wants)
            df = df[df["__model_lc"].str.contains(patt, na=False)]
        else:
            df = df[df["__model_lc"].isin(set(wants))]

    # --- Study type (substring match) ---
    st_list = _collect_list_params(["study_type", "study_type[]"])
    st_raw = (_arg("study_type", "") or "").strip()
    if st_raw and not st_list:
        st_list = [p.strip() for p in re.split(r"[;,/|]+", st_raw) if p.strip()]
    if st_list and "__st_lc" in df.columns and "study_type" not in exclude:
        parts = [p.lower() for p in st_list]
        patt = "|".join(re.escape(p) for p in parts)
        df = df[df["__st_lc"].str.contains(patt, na=False)]

    # --- Site nationality (substring match) ---
    nat_list = _collect_list_params(["site_nat", "site_nat[]", "site_nationality", "site_nationality[]"])
    nat_raw = (_arg("site_nationality", "") or _arg("site_nat", "") or "").strip()
    if nat_raw and not nat_list:
        nat_list = [p.strip() for p in re.split(r"[;,/|]+", nat_raw) if p.strip()]
    if nat_list and "__sn_lc" in df.columns and "site_nationality" not in exclude:
        parts = [p.lower() for p in nat_list]
        patt = "|".join(re.escape(p) for p in parts)
        df = df[df["__sn_lc"].str.contains(patt, na=False)]

    # --- Year ---
    if "year" not in exclude:
        _year_cols_pref = ["__year_int", "study_year", "Study year", "study year", "Year", "year"]
        _found_cols = [c for c in _year_cols_pref if c in df.columns]
        if _found_cols:
            yser = pd.to_numeric(df[_found_cols[0]], errors="coerce")

            # 1) multi-select years
            year_list = _collect_list_params(["year", "year[]"])
            year_raw = (_arg("year", "") or "").strip()
            if year_raw and not year_list:
                year_list = [p.strip() for p in re.split(r"[;,/|]+", year_raw) if p.strip()]

            # 2) range
            y_from = to_int(_arg("year_from"))
            y_to = to_int(_arg("year_to"))

            # 3) Unknown / Null
            y_is_null = _to01_query(_arg("year_is_null"))
            _unk_tokens = {"unknown", "nan", "none", "n/a", "na", "(blank)", "(null)"}
            wants_unknown = (y_is_null == 1) or any(
                (s or "").strip().lower() in _unk_tokens for s in year_list
            )

            mask = pd.Series(True, index=df.index)

            exact_years = []
            for s in year_list:
                try:
                    exact_years.append(int(s))
                except Exception:
                    pass
            if exact_years:
                mask &= yser.isin(set(exact_years))

            if y_from is not None:
                mask &= (yser >= y_from)
            if y_to is not None:
                mask &= (yser <= y_to)

            # Unknown widens the selection rather than narrowing it.
            if wants_unknown:
                mask = mask | yser.isna()

            df = df[mask]

    return df
1319
+
1320
+
1321
def row_to_item(row: pd.Series) -> Dict[str, Any]:
    """Serialise one normalised metadata row into the API response dict.

    Prefers the original spreadsheet column recorded in ``_orig_cols`` by
    ``_norm_cols``; falls back to the derived ``__*`` helper columns.
    NaN-like values are mapped to None via ``_nan2none``.
    """
    cols = row.get("_orig_cols")
    cols = cols if isinstance(cols, dict) else {}

    def pick(k, fallback=None):
        # Look up the source column name recorded for key *k*, if any.
        col = cols.get(k)
        if col and col in row.index:
            return row[col]
        return fallback

    return {
        "PanTS ID": _nan2none(pick("case") or row.get("__case_str")),
        "case_id": _nan2none(pick("case") or row.get("__case_str")),
        "tumor": (int(row.get("__tumor01")) if pd.notna(row.get("__tumor01")) else None),
        "sex": _nan2none(row.get("__sex")),
        "age": _nan2none(row.get("__age")),
        "ct phase": _nan2none(pick("ct_phase") or row.get("__ct")),
        "manufacturer": _nan2none(pick("manufacturer") or row.get("__mfr")),
        "manufacturer model": _nan2none(pick("model") or row.get("model")),
        "study year": _nan2none(row.get("__year_int")),
        "study type": _nan2none(pick("study_type") or row.get("study_type")),
        "site nationality": _nan2none(pick("site_nationality") or row.get("site_nationality")),
        # sort-helper outputs
        "spacing_sum": _nan2none(row.get("__spacing_sum")),
        "shape_sum": _nan2none(row.get("__shape_sum")),
        "complete": bool(row.get("__complete")) if "__complete" in row else None,
    }