rntc committed on
Commit
d2d1011
·
verified ·
1 Parent(s): 5693960

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +493 -0
app.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio app to explore pancreas cancer clinical report annotations.
3
+ Loads data from rntc/biomed-fr-pancreas-annotations on HuggingFace.
4
+ """
5
+
6
+ import gradio as gr
7
+ from datasets import load_dataset
8
+ from difflib import SequenceMatcher
9
+
10
# Fetch the annotated reports once, at import time. The resulting
# module-level `dataset` is what display_sample() and search_samples() read.
print("Loading dataset from HuggingFace...")
dataset = load_dataset(
    "rntc/biomed-fr-pancreas-annotations",
    split="train",
)
print(f"Loaded {len(dataset)} samples")
14
+
15
+
16
def fuzzy_find_span(text: str, span: str, threshold: float = 0.85) -> "tuple[int, int] | None":
    """
    Locate ``span`` inside ``text``, tolerating small differences.

    An exact substring search is tried first. If it fails and the span is at
    least 10 characters long (shorter spans are not matched approximately),
    a window of ``len(span)`` characters is slid over ``text`` and scored with
    ``difflib.SequenceMatcher``. The scan runs in two passes: a coarse pass
    with a stride of a quarter of the span length, then a character-by-character
    pass around the best coarse candidate, so the returned offsets line up
    with the best-scoring alignment instead of the nearest stride position.

    Args:
        text: The text to search in.
        span: The snippet to look for.
        threshold: Minimum similarity ratio (0..1) a fuzzy match must reach.

    Returns:
        ``(start, end)`` offsets into ``text``, or ``None`` when no exact match
        exists and no window reaches ``threshold``.
    """
    # Exact match is the common case and by far the cheapest one.
    idx = text.find(span)
    if idx != -1:
        return (idx, idx + len(span))

    span_len = len(span)
    if span_len < 10 or span_len > len(text):
        return None

    def similarity(pos: int) -> float:
        # Score the window of span_len characters starting at pos.
        return SequenceMatcher(None, span, text[pos:pos + span_len]).ratio()

    last_start = len(text) - span_len
    stride = max(1, span_len // 4)

    # Coarse pass: keep the best-scoring stride position even if it is still
    # below the threshold, because the refinement below may lift it above.
    best_pos = 0
    best_ratio = -1.0
    for pos in range(0, last_start + 1, stride):
        ratio = similarity(pos)
        if ratio > best_ratio:
            best_ratio, best_pos = ratio, pos

    # Fine pass: the true alignment lies less than one stride away from the
    # best coarse candidate, so probe every offset in that neighbourhood.
    if stride > 1:
        lo = max(0, best_pos - stride + 1)
        hi = min(last_start, best_pos + stride - 1)
        for pos in range(lo, hi + 1):
            ratio = similarity(pos)
            if ratio > best_ratio:
                best_ratio, best_pos = ratio, pos

    if best_ratio >= threshold:
        return (best_pos, best_pos + span_len)
    return None
48
+
49
+
50
def escape_html(text: str) -> str:
    """
    Escape a value so it can be embedded in HTML text or attribute values.

    ``None`` yields an empty string; any other value is converted with
    ``str()`` first, so falsy values such as ``0`` are rendered rather than
    dropped. Both quote characters are escaped, which keeps the result safe
    inside single- and double-quoted attributes alike.

    Args:
        text: The value to escape (normally a string).

    Returns:
        The escaped string.
    """
    if text is None:
        return ""
    return (
        str(text)
        # "&" must be handled first, otherwise the entities produced by the
        # following replacements would themselves be escaped again.
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#x27;")
    )
59
+
60
+
61
# Pastel highlight palette. Entries are handed out in order, one per
# annotated variable, wrapping around when there are more variables than
# colours.
COLORS = [
    "#FFE082", "#A5D6A7", "#90CAF9",  # amber, green, blue
    "#FFAB91", "#CE93D8", "#80DEEA",  # deep orange, purple, cyan
    "#C5E1A5", "#FFCC80", "#B39DDB",  # light green, orange, deep purple
    "#81D4FA", "#EF9A9A", "#FFF59D",  # light blue, red, yellow
    "#F48FB1", "#80CBC4", "#BCAAA4",  # pink, teal, brown
]
79
+
80
+
81
def highlight_spans_in_text(cr_text: str, annotation: dict) -> str:
    """
    Render a clinical report as HTML with the annotated spans highlighted.

    Every annotation entry that carries both a ``span`` (at least 5
    characters) and a ``value`` is located in ``cr_text`` with
    ``fuzzy_find_span``. Located spans are wrapped in
    ``<mark class="entity">`` elements, one background colour per variable,
    with a two-line tooltip (variable label, then the extracted value).
    When located spans overlap, only the one that starts first is kept.

    Args:
        cr_text: Raw text of the clinical report.
        annotation: Mapping of variable name to a dict that may hold
            ``"span"`` and ``"value"`` keys; entries of any other shape are
            ignored.

    Returns:
        An HTML string: a ``<div class='cr-text'>`` containing the escaped
        report text, with highlights where spans were found.
    """
    def plain() -> str:
        # Fallback rendering: the escaped report without any highlight.
        return f"<div class='cr-text'>{escape_html(cr_text)}</div>"

    if not cr_text or not annotation:
        return plain()

    # Collect (span, variable name, value) for every usable annotation.
    candidates = []
    for var_name, var_data in annotation.items():
        if not isinstance(var_data, dict):
            continue
        span = var_data.get("span")
        value = var_data.get("value")
        if span and value and len(span) >= 5:  # Skip very short spans
            candidates.append((span, var_name, str(value)))

    if not candidates:
        return plain()

    # Longest first: the sorts are stable, so when two spans are found at the
    # same start offset the longer one is seen first below.
    candidates.sort(key=lambda item: len(item[0]), reverse=True)

    # Locate each span in the report (exact match, then fuzzy).
    found_spans = []
    for span, var_name, value in candidates:
        match = fuzzy_find_span(cr_text, span)
        if match:
            start, end = match
            found_spans.append({
                "start": start,
                "end": end,
                "var_name": var_name,
                "value": value,
                "span": cr_text[start:end],  # Use actual text from CR
            })

    if not found_spans:
        return plain()

    found_spans.sort(key=lambda item: item["start"])

    # Drop every span that begins before the previously kept one has ended.
    non_overlapping = []
    for item in found_spans:
        if not non_overlapping or item["start"] >= non_overlapping[-1]["end"]:
            non_overlapping.append(item)

    # One colour per variable, assigned in order of first appearance.
    var_colors = {}
    for item in non_overlapping:
        if item["var_name"] not in var_colors:
            var_colors[item["var_name"]] = COLORS[len(var_colors) % len(COLORS)]

    # Stitch the output together: escaped plain text between the highlights.
    html_parts = []
    last_end = 0
    for item in non_overlapping:
        if item["start"] > last_end:
            html_parts.append(escape_html(cr_text[last_end:item["start"]]))

        color = var_colors[item["var_name"]]
        # Underscores become spaces; split/join collapses the runs of spaces
        # that consecutive underscores would otherwise leave behind.
        var_label = " ".join(item["var_name"].replace("_", " ").split()).title()
        # A real line feed, so the tooltip shows label and value on separate
        # lines rather than a literal backslash-n.
        tooltip = f"{var_label}\n→ {item['value']}"

        html_parts.append(
            f'<mark class="entity" style="background-color: {color};" '
            f'title="{escape_html(tooltip)}" '
            f'data-var="{escape_html(var_label)}">'
            f'{escape_html(item["span"])}'
            f'<span class="entity-label">{escape_html(var_label[:20])}</span>'
            f'</mark>'
        )
        last_end = item["end"]

    # Whatever follows the last highlight.
    if last_end < len(cr_text):
        html_parts.append(escape_html(cr_text[last_end:]))

    html = "".join(html_parts)
    return f"<div class='cr-text'>{html}</div>"
174
+
175
+
176
def format_annotations_table(annotation: dict) -> str:
    """Build the per-category HTML tables listing every extracted value."""
    if not annotation:
        return "<p>No annotations</p>"

    # Category -> name fragments. A variable belongs to the first category
    # owning a fragment that occurs in its lower-cased name.
    categories = {
        "Patient Info": ["date_of_birth", "age_at_cancer_diagnosis", "biological_gender", "vital_status", "date_of_death"],
        "Diagnosis": ["date_of_cancer_diagnostic", "primary_tumor_localisation", "ctnm_stage", "stage_as_per_ehr", "histological_type", "epithelial_tumor_subtype"],
        "Tumor Characteristics": ["resectability_status", "two_largest_diameters", "metastasis_localisation", "number_of_metastatic_sites"],
        "Lab Results": ["crp_at_diagnosis", "albumin_at_diagnosis", "alanine_transaminase", "aspartate_aminotransferase", "conjugated_bilirubin", "ca19_9"],
        "Treatment": ["surgery", "loco_regional_radiotherapy", "immunotherapy", "targeted_therapy", "full_course_of_initial_treatment"],
        "Molecular": ["germline_mutation", "tumor_molecular_profiling"],
        "Progression": ["date_of_first_progression", "type_of_first_progression", "treatment_at_first_progression", "best_response", "reason_for_treatment_end"],
    }

    def get_category(var_name):
        lowered = var_name.lower()
        return next(
            (
                cat
                for cat, keywords in categories.items()
                if any(kw in lowered for kw in keywords)
            ),
            "Other",
        )

    # Bucket every entry that actually carries a value.
    categorized = {}
    for var_name, var_data in annotation.items():
        if isinstance(var_data, dict) and var_data.get("value"):
            categorized.setdefault(get_category(var_name), []).append((var_name, var_data))

    if not categorized:
        return "<p class='no-data'>No extracted values</p>"

    html_parts = []

    # Fixed display order: the declared categories, then the catch-all.
    for category in [*categories, "Other"]:
        rows = categorized.get(category)
        if rows is None:
            continue

        html_parts.append(f"<div class='category'><h4>{category}</h4>")
        html_parts.append("<table class='annotations-table'>")

        for var_name, var_data in rows:
            value = var_data.get("value", "")
            span = var_data.get("span", "")
            var_label = var_name.replace("_", " ").title()

            # Long source spans are cut to 80 characters in the table.
            if span and len(span) > 80:
                span_preview = span[:80] + "..."
            else:
                span_preview = span

            html_parts.append(f"""
                <tr>
                    <td class='var-name'>{escape_html(var_label)}</td>
                    <td class='var-value'>{escape_html(str(value))}</td>
                    <td class='var-span'>{escape_html(span_preview) if span_preview else '-'}</td>
                </tr>
            """)

        html_parts.append("</table></div>")

    return "".join(html_parts)
240
+
241
+
242
def get_stats(annotation: dict) -> str:
    """
    Summarise how many annotation variables carry an extracted value.

    Args:
        annotation: Mapping of variable name to an entry dict; an entry counts
            as extracted when it is a dict with a truthy ``"value"``.

    Returns:
        ``"No data"`` for an empty or missing annotation, otherwise a one-line
        summary with the extracted/total counts and the floored percentage.
    """
    if not annotation:
        return "No data"

    total = len(annotation)  # >= 1 here, so the division below is safe
    extracted = sum(
        1
        for entry in annotation.values()
        if isinstance(entry, dict) and entry.get("value")
    )

    # The leading character is the bar-chart emoji (U+1F4CA).
    return f"📊 Extracted: {extracted}/{total} variables ({100 * extracted // total}%)"
251
+
252
+
253
def display_sample(sample_idx: int):
    """
    Render one dataset row for the UI.

    Returns a 3-tuple: highlighted report HTML, annotations table HTML and
    the statistics line. An out-of-range index yields three placeholder
    messages instead.
    """
    out_of_range = sample_idx < 0 or sample_idx >= len(dataset)
    if out_of_range:
        return "Invalid sample index", "<p>Invalid sample index</p>", "Invalid"

    # The index is passed through int() before it is used to address a row.
    record = dataset[int(sample_idx)]
    report = record.get("CR", "")
    labels = record.get("annotation", {})

    return (
        highlight_spans_in_text(report, labels),
        format_annotations_table(labels),
        get_stats(labels),
    )
267
+
268
+
269
def search_samples(query: str):
    """
    Search the reports for a case-insensitive substring.

    Args:
        query: Text typed by the user. Queries shorter than 3 characters are
            not searched; the first 20 samples are listed instead.

    Returns:
        A list of ``[index, preview]`` rows (at most 50 matches), or a single
        ``["No results", message]`` row when nothing matches.
    """
    def preview(report) -> str:
        # A missing report becomes an empty preview instead of raising, and
        # the ellipsis is only added when text was actually cut off.
        text = report or ""
        return text[:80] + "..." if len(text) > 80 else text

    if not query or len(query) < 3:
        # Return first 20 samples
        return [
            [i, preview(dataset[i].get("CR"))]
            for i in range(min(20, len(dataset)))
        ]

    needle = query.lower()
    results = []
    for i, sample in enumerate(dataset):
        report = sample.get("CR") or ""
        if needle in report.lower():
            results.append([i, preview(report)])
            if len(results) >= 50:  # cap the size of the result table
                break

    if not results:
        return [["No results", f"No samples found containing '{query}'"]]

    return results
288
+
289
+
290
# Custom CSS for better styling.
# Passed to gr.Blocks(css=...). The class names match the ones emitted by
# highlight_spans_in_text() (.cr-text, .entity, .entity-label) and
# format_annotations_table() (.category, .annotations-table, .var-*, .no-data);
# .stats-badge is attached to the statistics Markdown component.
custom_css = """
.cr-text {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    font-size: 14px;
    line-height: 1.8;
    padding: 20px;
    background: #fafafa;
    border-radius: 8px;
    white-space: pre-wrap;
    max-height: 500px;
    overflow-y: auto;
}

.entity {
    padding: 2px 6px;
    border-radius: 4px;
    cursor: help;
    position: relative;
    display: inline;
    transition: all 0.2s;
}

.entity:hover {
    filter: brightness(0.9);
    box-shadow: 0 2px 8px rgba(0,0,0,0.15);
}

.entity-label {
    display: none;
    position: absolute;
    bottom: 100%;
    left: 0;
    background: #333;
    color: white;
    padding: 4px 8px;
    border-radius: 4px;
    font-size: 11px;
    white-space: nowrap;
    z-index: 100;
}

.entity:hover .entity-label {
    display: block;
}

.category {
    margin-bottom: 20px;
}

.category h4 {
    color: #1976d2;
    border-bottom: 2px solid #1976d2;
    padding-bottom: 8px;
    margin-bottom: 12px;
}

.annotations-table {
    width: 100%;
    border-collapse: collapse;
    font-size: 13px;
}

.annotations-table tr:nth-child(even) {
    background: #f5f5f5;
}

.annotations-table td {
    padding: 10px 12px;
    border-bottom: 1px solid #e0e0e0;
    vertical-align: top;
}

.var-name {
    font-weight: 600;
    color: #333;
    width: 30%;
}

.var-value {
    color: #1976d2;
    font-weight: 500;
    width: 25%;
}

.var-span {
    color: #666;
    font-style: italic;
    font-size: 12px;
    width: 45%;
}

.no-data {
    color: #999;
    font-style: italic;
    padding: 20px;
    text-align: center;
}

.stats-badge {
    background: #e3f2fd;
    color: #1976d2;
    padding: 8px 16px;
    border-radius: 20px;
    font-weight: 500;
    display: inline-block;
}
"""
398
+
399
+
400
# Build the Gradio interface: a slider and a search box to pick a sample,
# the highlighted report, and the table of extracted annotations.
with gr.Blocks(
    title="Pancreas Cancer Annotations Explorer",
    theme=gr.themes.Soft(primary_hue="blue"),
    css=custom_css
) as demo:

    gr.Markdown("""
    # 🔬 Pancreas Cancer Clinical Report Annotations Explorer

    Explore structured annotations extracted from synthetic French clinical reports about pancreas cancer.

    **How to use:**
    - Use the slider or search to navigate samples
    - Hover over highlighted text to see extracted variables
    - View the complete annotation table below
    """)

    with gr.Row():
        with gr.Column(scale=2):
            sample_slider = gr.Slider(
                minimum=0,
                maximum=len(dataset) - 1,
                step=1,
                value=0,
                label=f"📌 Sample Index (0 - {len(dataset) - 1})",
                info="Drag to browse samples"
            )
        with gr.Column(scale=1):
            stats_display = gr.Markdown("", elem_classes=["stats-badge"])

    with gr.Row():
        with gr.Column(scale=1):
            search_box = gr.Textbox(
                label="🔍 Search",
                placeholder="Type to search in clinical reports...",
                info="Min 3 characters"
            )
            search_results = gr.Dataframe(
                headers=["#", "Preview"],
                label="Results",
                interactive=False,
                height=200
            )

    gr.Markdown("---")
    gr.Markdown("### 📄 Clinical Report with Entity Highlighting")
    gr.Markdown("*Hover over colored text to see the extracted variable and value*")

    cr_display = gr.HTML()

    gr.Markdown("---")
    gr.Markdown("### 📊 Extracted Annotations")

    annotations_display = gr.HTML()

    # Event handlers
    sample_slider.change(
        fn=display_sample,
        inputs=[sample_slider],
        outputs=[cr_display, annotations_display, stats_display]
    )

    search_box.change(
        fn=search_samples,
        inputs=[search_box],
        outputs=[search_results]
    )

    def on_select(evt: gr.SelectData, data):
        """
        Map a click in the results table to a sample index for the slider.

        Returns the integer in the first column of the clicked row, or 0 when
        that cell is not an index (e.g. the "No results" placeholder row) or
        the row cannot be read.
        """
        try:
            row_idx = evt.index[0]
            # Per the Gradio docs, gr.Dataframe passes its value as a pandas
            # DataFrame unless configured otherwise; there data[row] is a
            # column lookup by label, so positional access must use .iloc.
            # A plain list of rows (type="array") is indexed directly.
            if hasattr(data, "iloc"):
                cell = data.iloc[row_idx, 0]
            else:
                cell = data[row_idx][0]
            return int(cell)
        except (ValueError, IndexError, KeyError, TypeError):
            return 0

    search_results.select(
        fn=on_select,
        inputs=[search_results],
        outputs=[sample_slider]
    )

    # Load first sample on start
    demo.load(
        fn=display_sample,
        inputs=[sample_slider],
        outputs=[cr_display, annotations_display, stats_display]
    )


if __name__ == "__main__":
    demo.launch()