rntc committed on
Commit
60291ef
·
verified ·
1 Parent(s): 1c5482c

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +43 -16
app.py CHANGED
@@ -7,8 +7,33 @@ from datasets import load_dataset
7
 
8
  # Load the dataset
9
  print("Loading dataset...")
10
- dataset = load_dataset("rntc/biomed-fr-pancreas-annotations", split="train")
11
- print(f"Loaded {len(dataset)} samples")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Colors for highlighting
14
  COLORS = [
@@ -60,16 +85,13 @@ def highlight_text(cr_text, annotation):
60
  color_idx = 0
61
 
62
  for s in filtered:
63
- # Text before span
64
  if s["start"] > pos:
65
  html.append(escape_html(cr_text[pos:s["start"]]))
66
 
67
- # Assign color
68
  if s["var"] not in color_map:
69
  color_map[s["var"]] = COLORS[color_idx % len(COLORS)]
70
  color_idx += 1
71
 
72
- # Highlighted span
73
  color = color_map[s["var"]]
74
  html.append(
75
  f'<mark style="background:{color};padding:1px 3px;border-radius:3px;" '
@@ -78,11 +100,10 @@ def highlight_text(cr_text, annotation):
78
  )
79
  pos = s["end"]
80
 
81
- # Remaining text
82
  if pos < len(cr_text):
83
  html.append(escape_html(cr_text[pos:]))
84
 
85
- return f"<pre style='white-space:pre-wrap;line-height:1.6;'>{' '.join(html)}</pre>"
86
 
87
 
88
  def format_table(annotation):
@@ -99,8 +120,10 @@ def format_table(annotation):
99
  var_label = var_name.replace("_", " ").title()
100
 
101
  if value:
102
- # Check if span is a "not found" explanation
103
- if span and ("pas de mention" in span.lower() or "not performed" in str(value).lower()):
 
 
104
  display_value = "/"
105
  display_span = ""
106
  else:
@@ -126,26 +149,30 @@ def format_table(annotation):
126
  </table>"""
127
 
128
 
129
- def display_sample(idx):
130
  """Display a sample."""
131
- idx = int(idx)
132
- if idx < 0 or idx >= len(dataset):
133
  return "Invalid index", "Invalid index"
134
 
135
- sample = dataset[idx]
 
136
  cr = sample.get("CR", "")
137
  annotation = sample.get("annotation", {})
138
 
139
- return highlight_text(cr, annotation), format_table(annotation)
 
 
 
140
 
141
 
142
  # Build UI
143
  with gr.Blocks(title="Pancreas Annotations", theme=gr.themes.Base()) as demo:
144
  gr.Markdown("# 🔬 Pancreas Cancer Annotations Explorer")
145
- gr.Markdown("Hover over highlighted text to see extracted values. `/` means not found.")
146
 
147
  with gr.Row():
148
- slider = gr.Slider(0, len(dataset) - 1, value=0, step=1, label="Sample")
149
 
150
  with gr.Row():
151
  with gr.Column():
 
7
 
8
  # Load the dataset
9
  print("Loading dataset...")
10
+ full_dataset = load_dataset("rntc/biomed-fr-pancreas-annotations", split="train")
11
+ print(f"Loaded {len(full_dataset)} samples")
12
+
13
+ # Filter: keep only samples with >= 10 real annotations
14
+ MIN_ANNOTATIONS = 10
15
+
16
def count_real_annotations(annotation):
    """Count real annotations (excluding 'not found' placeholders).

    An entry is counted when it is a dict with a truthy ``"value"``, unless
    it is a placeholder: its ``"span"`` contains "pas de mention" or its
    value, rendered as a string, contains "not performed" (both checks are
    case-insensitive).

    Args:
        annotation: Mapping of variable name to a dict holding ``"value"``
            and ``"span"``. ``None`` or any non-dict input counts as empty.

    Returns:
        The number of non-placeholder annotations as an int.
    """
    # A missing/null annotation column would otherwise crash on .values().
    if not isinstance(annotation, dict):
        return 0

    count = 0
    for var_data in annotation.values():
        if not isinstance(var_data, dict):
            continue

        value = var_data.get("value")
        if not value:
            continue

        # Span may be absent, None, or (defensively) a non-string value.
        span = var_data.get("span") or ""
        if "pas de mention" in str(span).lower():
            continue
        if "not performed" in str(value).lower():
            continue

        count += 1
    return count
30
+
31
+ # Filter dataset
32
+ filtered_indices = [
33
+ i for i, sample in enumerate(full_dataset)
34
+ if count_real_annotations(sample.get("annotation", {})) >= MIN_ANNOTATIONS
35
+ ]
36
+ print(f"Filtered to {len(filtered_indices)} samples with >= {MIN_ANNOTATIONS} annotations")
37
 
38
  # Colors for highlighting
39
  COLORS = [
 
85
  color_idx = 0
86
 
87
  for s in filtered:
 
88
  if s["start"] > pos:
89
  html.append(escape_html(cr_text[pos:s["start"]]))
90
 
 
91
  if s["var"] not in color_map:
92
  color_map[s["var"]] = COLORS[color_idx % len(COLORS)]
93
  color_idx += 1
94
 
 
95
  color = color_map[s["var"]]
96
  html.append(
97
  f'<mark style="background:{color};padding:1px 3px;border-radius:3px;" '
 
100
  )
101
  pos = s["end"]
102
 
 
103
  if pos < len(cr_text):
104
  html.append(escape_html(cr_text[pos:]))
105
 
106
+ return f"<pre style='white-space:pre-wrap;line-height:1.6;'>{''.join(html)}</pre>"
107
 
108
 
109
  def format_table(annotation):
 
120
  var_label = var_name.replace("_", " ").title()
121
 
122
  if value:
123
+ if span and "pas de mention" in span.lower():
124
+ display_value = "/"
125
+ display_span = ""
126
+ elif "not performed" in str(value).lower():
127
  display_value = "/"
128
  display_span = ""
129
  else:
 
149
  </table>"""
150
 
151
 
152
def display_sample(slider_idx):
    """Display a sample.

    Args:
        slider_idx: Position within ``filtered_indices`` (not the raw
            dataset index). Anything ``int()`` accepts is allowed.

    Returns:
        A pair ``(highlighted_html, table_html)``. The first element is a
        header naming the dataset index and the real-annotation count,
        followed by the highlighted report text. Both elements are the
        string "Invalid index" when ``slider_idx`` cannot be converted to
        an int or is out of range.
    """
    # Reuse the existing "Invalid index" response instead of raising on
    # None or non-numeric input.
    try:
        slider_idx = int(slider_idx)
    except (TypeError, ValueError):
        return "Invalid index", "Invalid index"

    if not 0 <= slider_idx < len(filtered_indices):
        return "Invalid index", "Invalid index"

    # Map the slider position back to the row in the unfiltered dataset.
    real_idx = filtered_indices[slider_idx]
    sample = full_dataset[real_idx]

    # `.get(key, default)` still yields None when the key exists with a
    # null value, so normalise explicitly.
    cr = sample.get("CR") or ""
    annotation = sample.get("annotation") or {}

    n_annotations = count_real_annotations(annotation)
    header = f"<p><b>Sample #{real_idx}</b> — {n_annotations} annotations</p>"

    return header + highlight_text(cr, annotation), format_table(annotation)
167
 
168
 
169
  # Build UI
170
  with gr.Blocks(title="Pancreas Annotations", theme=gr.themes.Base()) as demo:
171
  gr.Markdown("# 🔬 Pancreas Cancer Annotations Explorer")
172
+ gr.Markdown(f"Showing {len(filtered_indices)} samples with >= {MIN_ANNOTATIONS} annotations. Hover over highlights to see values.")
173
 
174
  with gr.Row():
175
+ slider = gr.Slider(0, len(filtered_indices) - 1, value=0, step=1, label="Sample")
176
 
177
  with gr.Row():
178
  with gr.Column():