Spaces:
Sleeping
Sleeping
File size: 7,038 Bytes
d2d1011 1c5482c 60291ef d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 60291ef d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c d2d1011 1c5482c 60291ef 1c5482c 60291ef 1c5482c 60291ef 911ca7b 1c5482c 60291ef 911ca7b 1c5482c d2d1011 60291ef 911ca7b d2d1011 1c5482c 60291ef d2d1011 60291ef d2d1011 1c5482c 911ca7b 1c5482c 911ca7b f65575b 1c5482c 911ca7b d2d1011 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | """
Gradio app to explore pancreas cancer clinical report annotations.
"""
import gradio as gr
from datasets import load_dataset
# Load the dataset
# This runs at import time: the "train" split of the annotations dataset is
# loaded once and kept in the module-level ``full_dataset`` used by every
# callback below. Progress is reported on stdout.
print("Loading dataset...")
full_dataset = load_dataset("rntc/biomed-fr-pancreas-annotations", split="train")
print(f"Loaded {len(full_dataset)} samples")
# Filter: keep only samples with >= 10 real annotations
# "Real" is defined by count_real_annotations(); the threshold is also shown
# to the user in the page header.
MIN_ANNOTATIONS = 10
def count_real_annotations(annotation):
    """Count the annotations that carry an actual extracted value.

    An entry is counted when it is a dict with a truthy ``"value"``, unless
    it is a "not found" placeholder, i.e. its ``"span"`` contains
    "pas de mention" or its value contains "not performed" (both checks are
    case-insensitive).

    Args:
        annotation: Mapping of variable name to a dict with ``"value"`` and
            ``"span"`` keys. ``None`` or an empty mapping is accepted.

    Returns:
        The number of non-placeholder annotations; 0 when ``annotation`` is
        ``None`` or empty.
    """
    # A sample can come without any annotation; report zero rather than
    # failing on ``None.values()`` (the other renderers guard the same way).
    if not annotation:
        return 0

    count = 0
    for var_data in annotation.values():
        if not var_data or not isinstance(var_data, dict):
            continue
        value = var_data.get("value")
        if not value:
            continue
        span = var_data.get("span", "")
        # Placeholder spans/values mean the variable was not found in the text.
        if span and "pas de mention" in str(span).lower():
            continue
        if "not performed" in str(value).lower():
            continue
        count += 1
    return count
# Filter dataset
# ``filtered_indices`` holds positions into ``full_dataset`` (not the samples
# themselves) for every sample with at least MIN_ANNOTATIONS real
# annotations. The UI slider indexes into this list.
filtered_indices = [
    i for i, sample in enumerate(full_dataset)
    if count_real_annotations(sample.get("annotation", {})) >= MIN_ANNOTATIONS
]
print(f"Filtered to {len(filtered_indices)} samples with >= {MIN_ANNOTATIONS} annotations")
# Colors for highlighting
# Background colors for <mark> elements. highlight_text() assigns one per
# distinct variable and wraps around when there are more variables than
# colors.
COLORS = [
    "#FFEB3B", "#4CAF50", "#2196F3", "#FF9800", "#E91E63",
    "#9C27B0", "#00BCD4", "#8BC34A", "#FF5722", "#607D8B",
]
def escape_html(text):
    """Return ``text`` as a string that is safe to embed in HTML.

    The characters ``&``, ``<`` and ``>`` are replaced with their entities
    so dataset text cannot be interpreted as markup. Double and single
    quotes are escaped as well, because the result is also placed inside
    ``title="..."`` attributes.

    Args:
        text: Any value; it is converted with ``str()`` before escaping.

    Returns:
        The escaped string, or ``""`` when ``text`` is falsy (``None``,
        empty string, ...).
    """
    # Function-scope import so the block does not depend on file-level imports.
    from html import escape

    if not text:
        return ""
    return escape(str(text), quote=True)
def highlight_text(cr_text, annotation):
    """Render the CR text as HTML with annotated spans wrapped in <mark>.

    Only annotations whose span occurs verbatim in ``cr_text`` and whose
    value is truthy are considered; each is located at the first occurrence
    of its span. When two spans overlap, the one starting earlier wins.
    Each distinct variable gets its own color from ``COLORS`` and the mark's
    tooltip shows "<Variable>: <value>".

    Args:
        cr_text: The report text to display.
        annotation: Mapping of variable name to a dict with ``"span"`` and
            ``"value"`` keys.

    Returns:
        A ``<pre>`` HTML fragment; the plain escaped text when there is
        nothing to highlight.
    """

    def _plain():
        # Fallback rendering without any highlight.
        return f"<pre style='white-space:pre-wrap;'>{escape_html(cr_text)}</pre>"

    if not (cr_text and annotation):
        return _plain()

    # Gather (start, end, label, value, snippet) for every locatable span.
    candidates = []
    for name, entry in annotation.items():
        if not (entry and isinstance(entry, dict)):
            continue
        snippet = entry.get("span")
        val = entry.get("value")
        if snippet and val and snippet in cr_text:
            begin = cr_text.find(snippet)
            candidates.append(
                (
                    begin,
                    begin + len(snippet),
                    name.replace("_", " ").title(),
                    str(val),
                    snippet,
                )
            )

    if not candidates:
        return _plain()

    # Order by start position (stable) and drop anything that overlaps the
    # previously kept span.
    candidates.sort(key=lambda cand: cand[0])
    kept = []
    for cand in candidates:
        if kept and cand[0] < kept[-1][1]:
            continue
        kept.append(cand)

    # Stitch together escaped plain text and <mark> elements.
    pieces = []
    cursor = 0
    palette = {}
    for begin, end, label, val, snippet in kept:
        if begin > cursor:
            pieces.append(escape_html(cr_text[cursor:begin]))
        if label not in palette:
            # The number of labels seen so far selects the next color.
            palette[label] = COLORS[len(palette) % len(COLORS)]
        shade = palette[label]
        pieces.append(
            f'<mark style="background:{shade};padding:1px 3px;border-radius:3px;" '
            f'title="{escape_html(label)}: {escape_html(val)}">'
            f'{escape_html(snippet)}</mark>'
        )
        cursor = end

    if cursor < len(cr_text):
        pieces.append(escape_html(cr_text[cursor:]))

    body = "".join(pieces)
    return f"<pre style='white-space:pre-wrap;line-height:1.6;'>{body}</pre>"
def format_table(annotation):
    """Render the annotations as a three-column HTML table.

    Columns are the variable label (name with underscores replaced by
    spaces, title-cased), the value, and the source span truncated to 60
    characters followed by "...". Entries without a value, or flagged as
    placeholders ("pas de mention" in the span, "not performed" in the
    value), show "/" as value and an empty source. Entries that are not
    dicts are skipped.

    Args:
        annotation: Mapping of variable name to a dict with ``"value"`` and
            ``"span"`` keys.

    Returns:
        The table as an HTML string, or ``"<p>No annotations</p>"`` when
        ``annotation`` is empty.
    """
    if not annotation:
        return "<p>No annotations</p>"

    body_rows = []
    for name, entry in annotation.items():
        if not (entry and isinstance(entry, dict)):
            continue

        val = entry.get("value")
        src = entry.get("span", "")
        label = name.replace("_", " ").title()

        # Missing values and "not found" placeholders are both shown as "/".
        is_placeholder = (
            not val
            or (src and "pas de mention" in src.lower())
            or "not performed" in str(val).lower()
        )
        if is_placeholder:
            shown_value = "/"
            shown_span = ""
        else:
            shown_value = str(val)
            if src and len(src) > 60:
                shown_span = src[:60] + "..."
            else:
                shown_span = src or ""

        body_rows.append(
            "<tr>\n"
            f'<td style="padding:6px 10px;border-bottom:1px solid #ddd;font-weight:500;">{escape_html(label)}</td>\n'
            f'<td style="padding:6px 10px;border-bottom:1px solid #ddd;color:#1565C0;">{escape_html(shown_value)}</td>\n'
            f'<td style="padding:6px 10px;border-bottom:1px solid #ddd;color:#666;font-size:12px;font-style:italic;">{escape_html(shown_span)}</td>\n'
            "</tr>"
        )

    joined_rows = "".join(body_rows)
    return (
        '<table style="width:100%;border-collapse:collapse;font-size:13px;">\n'
        '<thead><tr style="background:#f5f5f5;">\n'
        '<th style="padding:8px 10px;text-align:left;border-bottom:2px solid #ddd;">Variable</th>\n'
        '<th style="padding:8px 10px;text-align:left;border-bottom:2px solid #ddd;">Value</th>\n'
        '<th style="padding:8px 10px;text-align:left;border-bottom:2px solid #ddd;">Source</th>\n'
        "</tr></thead>\n"
        f"<tbody>{joined_rows}</tbody>\n"
        "</table>"
    )
def display_sample(slider_idx):
    """Build the three HTML panels for the sample selected by the slider.

    Args:
        slider_idx: Position in ``filtered_indices`` (converted with
            ``int()``), not an index into the full dataset.

    Returns:
        A tuple ``(original_html, cr_html, table_html)``: the escaped
        original text, the highlighted CR preceded by a header with the
        dataset index and the real-annotation count, and the variables
        table. All three are ``"Invalid"`` when the position is out of
        range.
    """
    position = int(slider_idx)
    if not 0 <= position < len(filtered_indices):
        return "Invalid", "Invalid", "Invalid"

    # Map the slider position back to the index in the unfiltered dataset.
    dataset_idx = filtered_indices[position]
    record = full_dataset[dataset_idx]

    source_text = record.get("original_text", "")
    report_text = record.get("CR", "")
    annotation = record.get("annotation", {})
    real_count = count_real_annotations(annotation)

    original_panel = (
        "<pre style='white-space:pre-wrap;line-height:1.6;'>"
        f"{escape_html(source_text)}</pre>"
    )
    header = f"<p><b>Sample #{dataset_idx}</b> — {real_count} annotations</p>"
    cr_panel = header + highlight_text(report_text, annotation)
    return original_panel, cr_panel, format_table(annotation)
# Build UI
# Layout: a title and summary line, a slider that selects the sample, then
# one row with three columns (original text, highlighted CR, variables table).
with gr.Blocks(title="Pancreas Annotations", theme=gr.themes.Base()) as demo:
    gr.Markdown("# 🔬 Pancreas Cancer Annotations Explorer")
    gr.Markdown(f"Showing {len(filtered_indices)} samples with >= {MIN_ANNOTATIONS} annotations. Hover over highlights to see values.")
    with gr.Row():
        # Slider values are positions in filtered_indices, not dataset indices.
        # NOTE(review): if no sample passes the filter the maximum is -1 —
        # confirm how the slider behaves in that case.
        slider = gr.Slider(0, len(filtered_indices) - 1, value=0, step=1, label="Sample")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Original (English)")
            original_html = gr.HTML()
        with gr.Column():
            gr.Markdown("### Generated CR (French)")
            cr_html = gr.HTML()
        with gr.Column():
            gr.Markdown("### Extracted Variables")
            table_html = gr.HTML()
    # Re-render all three panels whenever the slider value changes.
    slider.change(display_sample, inputs=[slider], outputs=[original_html, cr_html, table_html])
    # Also render once on load so the panels are not empty at startup.
    demo.load(display_sample, inputs=[slider], outputs=[original_html, cr_html, table_html])

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|