Rodrigo Batista committed on
Commit
ae6a44f
·
1 Parent(s): 93a2c69

first commit

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +560 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,562 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
import os

# Force Hugging Face cache to local folder with permissions.
# NOTE: this has to run BEFORE `transformers` is imported. The Hugging Face
# libraries resolve HF_HOME / TRANSFORMERS_CACHE while they are being imported,
# so exporting the variables afterwards leaves the default cache path in use.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.makedirs("/app/hf_cache", exist_ok=True)

import streamlit as st  # noqa: E402
import streamlit.components.v1 as components  # noqa: E402
import torch  # noqa: E402
from transformers import AutoTokenizer, AutoModelForTokenClassification  # noqa: E402

# Optional: chunker. When the helper package cannot be imported the app falls
# back to running the model once over the whole input (see extract_metadata).
try:
    from predictor.document_chunker import DocumentChunker
except Exception:
    DocumentChunker = None

# ============================================================
# PAGE CONFIG
# ============================================================
st.set_page_config(
    page_title="MiNER - Stage 2: Metadata Extraction",
    page_icon="🏷️",
    layout="wide",
    initial_sidebar_state="expanded"
)
28
+
29
+ # ============================================================
30
+ # LABEL STYLES
31
+ # ============================================================
32
# Display metadata for each canonical (English) entity label:
#   "class" - CSS class used to highlight the entity (rules live in the
#             stylesheet injected below),
#   "name"  - human-readable name shown in the legend and in tooltips.
LABEL_STYLES = {
    "DATE": {"class": "highlight-date", "name": "📅 Date"},
    "START-TIME": {"class": "highlight-start-time", "name": "🕐 Start time"},
    "END-TIME": {"class": "highlight-end-time", "name": "🕑 End time"},
    "LOCATION": {"class": "highlight-location", "name": "📍 Location"},
    "MINUTE-ID": {"class": "highlight-minute-id", "name": "📋 Minute ID"},
    "MEETING-TYPE": {"class": "highlight-meeting-type", "name": "📌 Meeting type"},

    # President variants
    "PRESIDENT-PRESENT": {"class": "highlight-president", "name": "👔 President (Present)"},
    "PRESIDENT-ABSENT": {"class": "highlight-president-absent", "name": "🚫 President (Absent)"},
    "PRESIDENT-SUBSTITUTED": {"class": "highlight-president-substituted", "name": "🔄 President (Substituted)"},

    # Councilor variants
    "COUNCILOR-PRESENT": {"class": "highlight-councilor", "name": "👥 Councilor (Present)"},
    "COUNCILOR-ABSENT": {"class": "highlight-councilor-absent", "name": "🚫 Councilor (Absent)"},
    "COUNCILOR-SUBSTITUTED": {"class": "highlight-councilor-substituted", "name": "🔄 Councilor (Substituted)"},
}



# Defaults used by extract_metadata: chunker window/overlap (units are defined
# by DocumentChunker - TODO confirm whether characters or tokens) and the
# per-span score aggregation mode passed to predict_spans_chunk.
DEFAULT_CHUNK_SIZE = 800
DEFAULT_OVERLAP = 200
DEFAULT_AGGREGATION = "average"
56
+
57
+ # ============================================================
58
+ # CUSTOM CSS (adapted from Stage 1)
59
+ # ============================================================
60
+ st.markdown("""
61
+ <style>
62
+ .main-header {
63
+ font-size: 2.5rem;
64
+ font-weight: bold;
65
+ color: #1f77b4;
66
+ text-align: center;
67
+ margin-bottom: 1rem;
68
+ }
69
+
70
+ .subtitle {
71
+ text-align: center;
72
+ color: #555;
73
+ font-size: 1rem;
74
+ margin-bottom: 1.5rem;
75
+ }
76
+
77
+ /* ======= Entity Highlight Styles ======= */
78
+ .highlight-date {
79
+ background-color: #4CAF50;
80
+ border-bottom: 3px solid #2e7d32;
81
+ padding: 2px 4px;
82
+ border-radius: 3px;
83
+ color: #000;
84
+ }
85
+ .highlight-start-time {
86
+ background-color: #2196F3;
87
+ border-bottom: 3px solid #0d47a1;
88
+ padding: 2px 4px;
89
+ border-radius: 3px;
90
+ color: #000;
91
+ }
92
+ .highlight-end-time {
93
+ background-color: #3F51B5;
94
+ border-bottom: 3px solid #1a237e;
95
+ padding: 2px 4px;
96
+ border-radius: 3px;
97
+ color: #000;
98
+ }
99
+ .highlight-location {
100
+ background-color: #FF9800;
101
+ border-bottom: 3px solid #e65100;
102
+ padding: 2px 4px;
103
+ border-radius: 3px;
104
+ color: #000;
105
+ }
106
+ .highlight-minute-id {
107
+ background-color: #9C27B0;
108
+ border-bottom: 3px solid #4a148c;
109
+ padding: 2px 4px;
110
+ border-radius: 3px;
111
+ color: #000;
112
+ }
113
+ .highlight-meeting-type {
114
+ background-color: #607D8B;
115
+ border-bottom: 3px solid #37474f;
116
+ padding: 2px 4px;
117
+ border-radius: 3px;
118
+ color: #000;
119
+ }
120
+ .highlight-president {
121
+ background-color: #f27e91;
122
+ border-bottom: 3px solid #ad1457;
123
+ padding: 2px 4px;
124
+ border-radius: 3px;
125
+ color: #000;
126
+ }
127
+ .highlight-councilor {
128
+ background-color: #F44336;
129
+ border-bottom: 3px solid #b71c1c;
130
+ padding: 2px 4px;
131
+ border-radius: 3px;
132
+ color: #000;
133
+ }
134
+
135
+ .annotation-box {
136
+ padding: 1rem;
137
+ margin: 0.75rem 0;
138
+ border-radius: 0.5rem;
139
+ background-color: #f0f2f6;
140
+ white-space: pre-wrap;
141
+ line-height: 1.8;
142
+ font-family: 'Segoe UI', Roboto, monospace;
143
+ }
144
+
145
+ .legend-item {
146
+ display: inline-block;
147
+ padding: 4px 8px;
148
+ margin: 3px;
149
+ border-radius: 4px;
150
+ font-size: 0.8rem;
151
+ font-weight: 500;
152
+ }
153
+
154
+ .highlight-president-absent {
155
+ background-color: #f27e91 !important;
156
+ border: 2px solid #050505 !important;
157
+ color: #fff !important;
158
+ }
159
+
160
+ .highlight-councilor-absent {
161
+ background-color: #F44336 !important;
162
+ border: 2px solid #050505 !important;
163
+ color: #fff !important;
164
+ }
165
+ .highlight-president-substituted {
166
+ background-color: #f27e91 !important;
167
+ border: 2px solid #7b69c7 !important;
168
+ color: #fff !important;
169
+ }
170
+ .highlight-councilor-substituted {
171
+ background-color: #F44336 !important;
172
+ border: 2px solid #7b69c7 !important;
173
+ color: #fff !important;
174
+ }
175
+
176
+
177
+ </style>
178
+ """, unsafe_allow_html=True)
179
+
180
+
181
# Maps Portuguese entity label names to the canonical English keys of
# LABEL_STYLES. render_html resolves every predicted label through this table
# before looking up its style; labels not listed here are used unchanged.
# NOTE(review): presumably these are the labels emitted by the Portuguese
# model - confirm against its id2label config.
LABEL_ALIASES = {
    "NUMERO-ATA": "MINUTE-ID",
    "DATA": "DATE",
    "TIPO-REUNIAO": "MEETING-TYPE",
    "TIPO-REUNIAO-ORDINARIA": "MEETING-TYPE",
    "TIPO-REUNIAO-EXTRAORDINARIA": "MEETING-TYPE",
    "LOCAL": "LOCATION",
    "HORARIO-INICIO": "START-TIME",
    "HORARIO-FIM": "END-TIME",
    "PARTICIPANTE-PRESIDENTE-PRESENTE": "PRESIDENT-PRESENT",
    "PARTICIPANTE-PRESIDENTE-AUSENTE": "PRESIDENT-ABSENT",
    "PARTICIPANTE-PRESIDENTE-SUBSTITUIDO": "PRESIDENT-SUBSTITUTED",
    "PARTICIPANTE-VEREADOR-PRESENTE": "COUNCILOR-PRESENT",
    "PARTICIPANTE-VEREADOR-AUSENTE": "COUNCILOR-ABSENT",
    "PARTICIPANTE-VEREADOR-SUBSTITUIDO": "COUNCILOR-SUBSTITUTED",
}
197
+
198
+
199
+ # ============================================================
200
+ # RENDER FUNCTIONS
201
+ # ============================================================
202
+
203
def _style_for_label(raw_label: str):
    """Return the style dict for a given label.

    The label is first resolved through ``LABEL_ALIASES`` (so Portuguese label
    names map to their canonical English key) and then looked up in
    ``LABEL_STYLES``.

    Args:
        raw_label: Entity label as predicted by the model, without BIO prefix.

    Returns:
        The matching ``LABEL_STYLES`` entry. For unknown labels a neutral
        fallback is returned that has the same ``"class"``/``"name"`` keys as
        regular entries (plus the legacy ``"bg"``/``"fg"`` colors), so callers
        can read ``style["class"]`` without special-casing unknown labels.
    """
    canonical = LABEL_ALIASES.get(raw_label, raw_label)
    fallback = {"class": "", "bg": "#ddd", "fg": "#000", "name": raw_label}
    return LABEL_STYLES.get(canonical, fallback)
206
+
207
+
208
def render_html(base_text: str, spans: list, scores: list = None):
    """Render annotated text with colored entity highlights (VotIE-style).

    Args:
        base_text: The original document text.
        spans: ``(start, end, label)`` tuples with character offsets into
            ``base_text``, expected in ascending, non-overlapping order.
        scores: Optional confidence per span, aligned with ``spans``. Spans
            without a matching score are rendered without one.

    Returns:
        An HTML string: the escaped text inside a ``div.annotation-box`` with
        every entity wrapped in a ``<span>`` carrying its CSS class and a
        tooltip (``title``) with the entity name and, if known, its score.
    """
    out, cur = [], 0
    for i, (s, e, lbl) in enumerate(spans):
        # Ignore empty spans and spans starting inside text that was already
        # emitted; rendering them would duplicate characters in the output.
        if e <= s or s < cur:
            continue
        if cur < s:
            out.append(html.escape(base_text[cur:s]))
        norm_lbl = LABEL_ALIASES.get(lbl, lbl)
        style = LABEL_STYLES.get(norm_lbl, {"class": "", "name": lbl})
        name = style.get("name", lbl)
        if scores and i < len(scores):
            title = f"{name} ({scores[i]:.2f})"
        else:
            title = name
        # Attribute values are single-quoted, so they must be escaped with
        # quote=True; unknown labels come straight from the model config.
        css_class = html.escape(style.get("class", ""), quote=True)
        safe_title = html.escape(title, quote=True)
        out.append(
            f"<span class='{css_class}' title='{safe_title}'>{html.escape(base_text[s:e])}</span>"
        )
        cur = e
    if cur < len(base_text):
        out.append(html.escape(base_text[cur:]))
    return "<div class='annotation-box'>" + "".join(out) + "</div>"
225
+
226
+
227
def render_legend():
    """Render entity legend with same colors as highlight classes.

    Returns:
        An HTML string with one ``span.legend-item`` per ``LABEL_STYLES``
        entry (in dictionary order), each carrying the entry's highlight
        class and showing its display name.
    """
    header = """
    <div class='legend-box'>
    <strong>🔖 Entity Legend:</strong><br><br>
    """
    # Build all items in one pass instead of repeated string concatenation.
    # Each item keeps its trailing space so the markup is unchanged.
    items = "".join(
        f"<span class='legend-item {info.get('class', '')}'>{info.get('name', label)}</span> "
        for label, info in LABEL_STYLES.items()
    )
    return header + items + "</div>"
241
+
242
+
243
+
244
+
245
+
246
+ # ============================================================
247
+ # MODEL LOADING
248
+ # ============================================================
249
+
250
@st.cache_resource
def _load_ner_model_or_raise(model_name: str):
    """Load tokenizer and model from Hugging Face, raising on failure.

    Only successful loads are cached: ``st.cache_resource`` stores return
    values, so letting the exception propagate keeps a failed attempt out of
    the cache.
    """
    cache_dir = "/app/hf_cache"  # safe writable path
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_name, cache_dir=cache_dir)
    model.eval()
    device = "cpu"
    model.to(device)
    return tokenizer, model, device, model.config.id2label


def load_ner_model(model_name: str):
    """Load NER model from Hugging Face.

    Args:
        model_name: Hugging Face model repository id.

    Returns:
        ``(tokenizer, model, device, id2label, error)``. On success ``error``
        is ``None``; on failure the first four items are ``None`` and
        ``error`` is a human-readable message.

    The error tuple is deliberately produced outside the cached function.
    Previously the failure result itself was cached, so one transient error
    (e.g. a network hiccup during download) made every later call return the
    same error until the process restarted.
    """
    try:
        tokenizer, model, device, id2label = _load_ner_model_or_raise(model_name)
    except Exception as e:
        return None, None, None, None, f"❌ Error loading model: {str(e)}"
    return tokenizer, model, device, id2label, None
264
+
265
+
266
+ # ============================================================
267
+ # INFERENCE FUNCTIONS
268
+ # ============================================================
269
+
270
def predict_spans_chunk(text_chunk: str, tokenizer, model, device, id2label, aggregation: str = "average"):
    """Predict entity spans for a single chunk.

    The chunk is tokenized, classified token by token, and word-level
    predictions are merged into entity spans following the BIO scheme. Only
    the first sub-token of each word decides the word's label and score.

    Args:
        text_chunk: Text to analyze.
        tokenizer: Fast tokenizer (must support offsets and ``word_ids()``).
        model: Token-classification model returning ``.logits``.
        device: Device the input tensors are moved to.
        id2label: Mapping from class id to label string (``"O"``,
            ``"B-X"``, ``"I-X"``; a bare ``"X"`` is treated like ``"I-X"``).
        aggregation: ``"average"`` averages the word scores of a span; any
            other value takes their maximum.

    Returns:
        A list of dicts with keys ``label``, ``start``, ``end`` (character
        offsets relative to ``text_chunk``) and ``score``.

    Note:
        The tokenizer is called with ``truncation=True``, so text beyond the
        tokenizer's length limit is not analyzed.
    """
    enc = tokenizer(text_chunk, return_offsets_mapping=True, return_tensors="pt", truncation=True)
    # word_ids() is a method of the tokenizer's encoding object; read it
    # before the encoding is replaced by a plain dict of tensors.
    word_ids = enc.word_ids()
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**{k: v for k, v in enc.items() if k in {"input_ids", "attention_mask"}}).logits
        probs = torch.softmax(logits, dim=-1)

    pred_ids = probs.argmax(-1).squeeze(0).tolist()
    offsets = enc["offset_mapping"].squeeze(0).tolist()

    spans = []
    current = None  # the span being built: label, start, end, scores

    def close_span():
        """Finalize the open span (if any) and append it to ``spans``."""
        nonlocal current
        if current is None:
            return
        word_scores = current["scores"]
        if current["end"] > current["start"]:
            if not word_scores:
                score = 0.0
            elif aggregation == "average":
                score = sum(word_scores) / len(word_scores)
            else:
                score = max(word_scores)
            spans.append({
                "label": current["label"],
                "start": current["start"],
                "end": current["end"],
                "score": float(score),
            })
        current = None

    for idx, (pid, (start, end)) in enumerate(zip(pred_ids, offsets)):
        wid = word_ids[idx]
        # Skip special tokens (no word / empty offsets) and every sub-token
        # after the first one of a word.
        if start == end or wid is None or (idx > 0 and wid == word_ids[idx - 1]):
            continue

        label = id2label.get(pid, "O")
        if label == "O":
            close_span()
            continue

        # The word ends where its last sub-token ends.
        last_idx = idx
        for j in range(idx + 1, len(word_ids)):
            if word_ids[j] != wid:
                break
            last_idx = j
        word_end = offsets[last_idx][1]
        word_score = float(probs[0, idx, pid].item())

        # Split the label into BIO prefix and entity type. A label without a
        # prefix is treated as a continuation ("I-"), which is also how the
        # Transformers token-classification pipeline parses tags; previously
        # such labels were ignored and left the open span dangling.
        if label.startswith("B-"):
            begins, ent = True, label[2:]
        elif label.startswith("I-"):
            begins, ent = False, label[2:]
        else:
            begins, ent = False, label

        if not begins and current is not None and current["label"] == ent:
            current["end"] = word_end
            current["scores"].append(word_score)
        else:
            # "B-" always opens a new span; an "I-" that does not continue the
            # open span is accepted leniently as the start of a new one.
            close_span()
            current = {"label": ent, "start": start, "end": word_end, "scores": [word_score]}

    close_span()
    return spans
357
+
358
+
359
def extract_metadata(base_text: str, tokenizer, model, device, id2label):
    """Main extraction function.

    Runs the NER model over ``base_text`` (chunk by chunk when
    ``DocumentChunker`` is available, otherwise in a single pass), maps the
    chunk-relative spans back to document offsets and removes overlaps.

    Args:
        base_text: Document text to analyze.
        tokenizer, model, device, id2label: As returned by ``load_ner_model``.

    Returns:
        ``(spans, scores, error)``. On success ``spans`` is a list of
        ``(start, end, label)`` tuples sorted by position, ``scores`` the
        aligned confidences and ``error`` is ``None``. If the text is missing
        or shorter than 10 non-blank characters, ``(None, None, message)``.
    """
    if not base_text or len(base_text.strip()) < 10:
        return None, None, "Please enter text to analyze (minimum 10 characters)."

    # Collect (document offset, chunk text) pairs.
    if DocumentChunker is None:
        located_chunks = [(0, base_text)]
    else:
        chunker = DocumentChunker(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_OVERLAP)
        located_chunks = []
        cursor = 0
        for ch in chunker.chunk_document(base_text):
            # Chunks overlap, so search from just after the previous chunk's
            # start; fall back to the cursor if the chunk text is not found
            # verbatim (best effort).
            start_ch = base_text.find(ch, cursor)
            if start_ch == -1:
                start_ch = cursor
            located_chunks.append((start_ch, ch))
            cursor = start_ch + 1

    spans_all, scores_all = [], []
    for start_ch, ch in located_chunks:
        for sp in predict_spans_chunk(ch, tokenizer, model, device, id2label, DEFAULT_AGGREGATION):
            spans_all.append((sp["start"] + start_ch, sp["end"] + start_ch, sp["label"]))
            scores_all.append(sp["score"])

    # Sort and remove overlaps. Among spans starting at the same position the
    # longest comes first and therefore wins: an entity cut off at the end of
    # one chunk must not shadow the complete entity seen by the next chunk.
    paired = sorted(zip(spans_all, scores_all), key=lambda x: (x[0][0], -x[0][1]))
    kept, kept_scores, last_e = [], [], -1
    for (s, e, lbl), sc in paired:
        if s >= last_e:
            kept.append((s, e, lbl))
            kept_scores.append(sc)
            last_e = e

    return kept, kept_scores, None
394
+
395
+
396
+ # ============================================================
397
+ # EXAMPLE TEXTS
398
+ # ============================================================
399
# Sample excerpts offered in the sidebar selector: the opening and the closing
# passage of a set of minutes, in Portuguese and in English.
EXAMPLE_TEXT_PT_INTRO = "Câmara Municipal de Guimarães.\n ATA Nº 10 Fls. __14__ REUNIÃO ORDINÁRIA DE 25 DE MAIO DE 2023 \nATA\nAos vinte e cinco dias do mês de maio do ano de dois mil e vinte e três, no Edifício dos Paços do Concelho, na Sala de Reuniões, compareceram os Excelentíssimos Senhores – Presidente da Câmara – Domingos Bragança Salgado – e os Vereadores Adelina Paula Mendes Pinto, Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, Nelson José Guimarães Felgueiras, Ana Maria Prego de Faria Berkeley Cotter, Ricardo José Machado Pereira da Silva Araújo, Vânia Carvalho Dias da Silva de Antas de Barros, Hugo Miguel Alves Ribeiro e Eduardo Miguel Teixeira Fernandes. \nO Vereador Bruno Alberto Vieira Fernandes solicitou a sua substituição na presente reunião, nos termos do art.º 78.º da Lei nº 169/99, de 18 de setembro, na sua redação atual. Nesta sequência, a cidadã imediatamente a seguir na ordem da lista da Coligação Juntos por Guimarães pelo PPD/PSD, Emília Rosa Leite Pereira Lemos, manifestou impossibilidade em estar presente na reunião, pelo que foi substituída pelo cidadão imediatamente a seguir na ordem da referida lista, Eduardo Miguel Teixeira Fernandes, nos termos do nº 7, do art.º 77º, do mesmo diploma legal, tendo o Presidente da Câmara verificado a conformidade formal do processo eleitoral com a identidade do eleito. \nNão compareceu a Vereadora Alice Sofia de Freitas Soares Ferreira Fernandes, cuja falta foi considerada justificada. \nSecretariou a Diretora Municipal ***********************, em regime de substituição, **************************************. \nPelas 10.00 horas foi declarada aberta a reunião. \n \nORDEM DO DIA \nINFORMAÇÕES\n"
EXAMPLE_TEXT_PT_CLOSING = "PELAS ONZE HORAS E CINQUENTA MINUTOS O PRESIDENTE DA CÂMARA DEU POR ENCERRADA A REUNIÃO, DE QUE, PARA CONSTAR, SE LAVROU A PRESENTE ATA. \n"
EXAMPLE_TEXT_EN_INTRO = "Guimarães City Council.\n MINUTES NO. 1 FLS. __10__ ORDINARY MEETING OF JANUARY 13, 2022 \nMINUTES\nOn the thirteenth day of January in the year two thousand and twenty-two, in the Town Hall Building, in the Meeting Room, the following Gentlemen attended: Mayor – Domingos Bragança Salgado and Councilors – Adelina Paula Mendes Pinto, Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, Nelson José Guimarães Felgueiras, Alice Sofia de Freitas Soares Ferreira Fernandes, Ana Maria Prego de Faria Berkeley Cotter, Bruno Alberto Vieira Fernandes, Ricardo José Machado Pereira da Silva Araújo, Vânia Carvalho Dias da Silva de Antas de Barros and Hugo Miguel Alves Ribeiro. \nSecretary the Director ***************, **************************************. \nAt 10.10 am the meeting was declared open. \n \nAGENDA \nINFORMATION\n"
EXAMPLE_TEXT_EN_CLOSING = "AT ELEVEN O'CLOCK AND FIFTY MINUTES, THE PRESIDENT OF THE CHAMBER CLOSED THE MEETING, OF WHICH, FOR THE RECORD, THESE MINUTES WERE DRAWN UP. \n"

# Pre-load models
# Both models are loaded while the script is first executed, so the cached
# instances are already available when main() requests one of them. The return
# values (including a possible error message) are discarded here.
load_ner_model("anonymous13542/BERTimbau-large-metadata-council-pt")
load_ner_model("anonymous13542/XLMR-large-metadata-council-en")
407
+
408
+
409
+ # ============================================================
410
+ # MAIN APP
411
+ # ============================================================
412
+
413
def main():
    """Build the Streamlit page.

    Renders the header, the sidebar (example selector, model status, about
    box, legend, resources), the two-column input/results area, and the
    explanatory sections at the bottom. The model is chosen from the selected
    example: the Portuguese model for the Portuguese examples, the
    English/multilingual model otherwise (including custom text).
    """
    # Header
    st.markdown('<h1 class="main-header">🏷️ MiNER — Stage 2: Metadata Extraction Demo</h1>', unsafe_allow_html=True)
    st.markdown("""
    <p class="subtitle">
        Automatic extraction of structured metadata from municipal meeting minutes
    </p>
    """, unsafe_allow_html=True)

    # Sidebar
    st.sidebar.header("⚙️ Configuration")

    # Selector entry -> text pre-filled into the input box.
    examples = {
        "Portuguese Example - Intro": EXAMPLE_TEXT_PT_INTRO,
        "Portuguese Example - Closing": EXAMPLE_TEXT_PT_CLOSING,
        "English Example - Intro": EXAMPLE_TEXT_EN_INTRO,
        "English Example - Closing": EXAMPLE_TEXT_EN_CLOSING,
    }

    example_choice = st.sidebar.selectbox(
        "Choose an example or enter your own text:",
        ["Custom Text", *examples],
        index=0
    )

    # Auto-select model based on example (English/multilingual as default)
    if "Portuguese" in example_choice:
        model_name = "anonymous13542/BERTimbau-large-metadata-council-pt"
    else:
        model_name = "anonymous13542/XLMR-large-metadata-council-en"

    tokenizer, model, device, id2label, error = load_ner_model(model_name)

    if error or model is None:
        # The loader's message already starts with "❌ Error loading model:",
        # so it is shown as-is instead of being prefixed a second time. It
        # embeds arbitrary exception text and must be escaped before it is
        # injected as raw HTML.
        message = html.escape(str(error)) if error else "❌ Error loading model"
        st.sidebar.markdown(f"<div class='status-box error'>{message}</div>",
                            unsafe_allow_html=True)
    else:
        st.sidebar.markdown(
            f"<div class='status-box success'>✅ Loaded automatically: <strong>{model_name.split('/')[-1]}</strong></div>",
            unsafe_allow_html=True)

    st.sidebar.markdown("---")
    st.sidebar.markdown("### 📊 About")
    st.sidebar.info("""
    **MiNER Stage 2** uses Named Entity Recognition models to automatically extract metadata from meeting minutes.

    - **Model**: BERTimbau / XLM-RoBERTa fine-tuned
    - **Languages**: Portuguese and English
    - **Method**: Token Classification (NER) with BIO tagging
    """)

    st.sidebar.markdown("---")
    st.sidebar.markdown(render_legend(), unsafe_allow_html=True)

    st.sidebar.markdown("---")
    st.sidebar.markdown("### 🔗 Resources")
    st.sidebar.markdown("""
    - [📖 Model Card (PT)](https://huggingface.co/anonymous13542/BERTimbau-large-metadata-council-pt)
    - [📖 Model Card (EN)](https://huggingface.co/anonymous13542/XLMR-large-metadata-council-en)
    """)

    # Main layout
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📄 Input Document")

        if example_choice in examples:
            input_text = st.text_area("Input", value=examples[example_choice], height=400,
                                      label_visibility="collapsed")
        else:
            input_text = st.text_area("Input", placeholder="Paste your document text here…", height=400,
                                      label_visibility="collapsed")

        extract_button = st.button("🏷️ Extract Metadata", type="primary", use_container_width=True)

    with col2:
        st.subheader("📊 Extraction Results")

        results_placeholder = st.empty()

        if extract_button:
            with st.spinner("🔄 Analyzing document..."):
                if model is None:
                    results_placeholder.warning("⚠️ Model could not be loaded. Please refresh and try again.")
                elif not input_text or len(input_text.strip()) < 10:
                    results_placeholder.warning("⚠️ Please enter a longer text (minimum 10 characters).")
                else:
                    spans, scores, err = extract_metadata(input_text, tokenizer, model, device, id2label)
                    if err:
                        results_placeholder.error(f"❌ {err}")
                    elif not spans:
                        results_placeholder.info("ℹ️ No entities found in the text.")
                    else:
                        with results_placeholder.container():
                            st.markdown(
                                f"**Found {len(spans)} entities** with average confidence: **{sum(scores) / len(scores):.2%}**")
                            st.markdown("---")
                            st.markdown("**📝 Annotated Text**")
                            annotated_html = render_html(input_text, spans, scores)
                            st.markdown(annotated_html, unsafe_allow_html=True)
        else:
            results_placeholder.info("👈 Enter text in the input box and click 'Extract Metadata' to begin.")

    # How it works
    st.markdown("---")
    st.subheader("🎯 How It Works")
    st.markdown("""
    The model analyzes the **meeting minutes** to automatically extract **structured metadata** using a *Named Entity Recognition (NER)* approach.

    Each token in the document is classified, identifying information such as:
    - 📅 **Date**
    - 🕐 **Start / End time**
    - 📍 **Location**
    - 📋 **Minute ID**
    - 📌 **Meeting type**
    - 👔 **President** (present / absent / substituted)
    - 👥 **Councilors** (present / absent / substituted)

    The model uses the **BIO tagging scheme** (*Begin, Inside, Outside*) to mark entity boundaries, and the final spans are reconstructed from token-level predictions.
    """)

    st.markdown("**Example:**")
    # The listed entities are taken from the quoted text itself; the previous
    # list showed values from a different document (other date, names, place).
    st.code("""
    Introduction text: "Câmara Municipal de Guimarães. ATA Nº 10 Fls. __14__ REUNIÃO ORDINÁRIA DE 25 DE MAIO DE 2023 ATA Aos vinte e cinco dias do mês de maio do ano de dois mil e vinte e três, no Edifício dos Paços do Concelho, na Sala de Reuniões, compareceram os Excelentíssimos Senhores – Presidente da Câmara – Domingos Bragança Salgado – e os Vereadores Adelina Paula Mendes Pinto, Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, Nelson José Guimarães Felgueiras, Ana Maria Prego de Faria Berkeley Cotter, Ricardo José Machado Pereira da Silva Araújo, Vânia Carvalho Dias da Silva de Antas de Barros, Hugo Miguel Alves Ribeiro e Eduardo Miguel Teixeira Fernandes. O Vereador Bruno Alberto Vieira Fernandes solicitou a sua substituição na presente reunião, nos termos do art.º 78.º da Lei nº 169/99, de 18 de setembro, na sua redação atual. Nesta sequência, a cidadã imediatamente a seguir na ordem da lista da Coligação Juntos por Guimarães pelo PPD/PSD, Emília Rosa Leite Pereira Lemos, manifestou impossibilidade em estar presente na reunião, pelo que foi substituída pelo cidadão imediatamente a seguir na ordem da referida lista, Eduardo Miguel Teixeira Fernandes, nos termos do nº 7, do art.º 77º, do mesmo diploma legal, tendo o Presidente da Câmara verificado a conformidade formal do processo eleitoral com a identidade do eleito. Não compareceu a Vereadora Alice Sofia de Freitas Soares Ferreira Fernandes, cuja falta foi considerada justificada. Secretariou a Diretora Municipal ***********************, em regime de substituição, **************************************. Pelas 10.00 horas foi declarada aberta a reunião. ORDEM DO DIA INFORMAÇÕES "

    Predicted entities:
    - MINUTE-ID → "10"
    - MEETING-TYPE → "ORDINÁRIA"
    - DATE → "25 DE MAIO DE 2023"
    - PRESIDENT-PRESENT → "Domingos Bragança Salgado"
    - COUNCILOR-PRESENT → "Adelina Paula Mendes Pinto"
    - LOCATION → "Edifício dos Paços do Concelho"
    - START-TIME → "10.00 horas"
    """)

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align:center; color:#64748b; font-size:0.875rem; padding:2rem 0;'>
    <p style='margin:0 0 8px 0;'><strong style='color:#667eea;'>MiNER</strong> — Municipal Information Extraction & Recognition</p>
    <p style='margin:0;'>Anonymous research demo for Stage 2 (Metadata Extraction via NER)</p>
    <p style='margin:12px 0 0 0; opacity:0.7;'>Built with ❤️ using Streamlit & Transformers</p>
    </div>
    """, unsafe_allow_html=True)


if __name__ == "__main__":
    main()