Spaces:
Runtime error
Runtime error
Rodrigo Batista
commited on
Commit
·
ae6a44f
1
Parent(s):
93a2c69
first commit
Browse files- src/streamlit_app.py +560 -38
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,562 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
}
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os

# Force the Hugging Face caches into a local folder the app can write to.
# This has to happen BEFORE `transformers` (and, through it, `huggingface_hub`)
# is imported: those libraries resolve their default cache locations when they
# are first imported, so assigning the variables afterwards has no effect on
# those defaults.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.makedirs("/app/hf_cache", exist_ok=True)

import html  # noqa: E402  (imports intentionally follow the cache setup)

import streamlit as st  # noqa: E402
import streamlit.components.v1 as components  # noqa: E402
import torch  # noqa: E402
from transformers import AutoTokenizer, AutoModelForTokenClassification  # noqa: E402

# Optional: document chunker. When the helper package cannot be imported the
# app falls back to running the model on the whole text in a single pass
# (see extract_metadata), so any import-time failure is tolerated here.
try:
    from predictor.document_chunker import DocumentChunker
except Exception:
    DocumentChunker = None
|
| 18 |
+
|
| 19 |
+
# ============================================================
# PAGE CONFIG
# ============================================================
# Sets the page title and icon and selects the wide layout with the sidebar
# expanded on first load.
st.set_page_config(
    page_title="MiNER - Stage 2: Metadata Extraction",
    page_icon="🏷️",
    layout="wide",
    initial_sidebar_state="expanded"
)
|
| 28 |
+
|
| 29 |
+
# ============================================================
# LABEL STYLES
# ============================================================
# Maps each (English) entity label to:
#   "class" - the CSS class used to highlight the entity in the annotated text
#             and in the sidebar legend;
#   "name"  - the human-readable name shown in the legend and in tooltips.
LABEL_STYLES = {
    "DATE": {"class": "highlight-date", "name": "📅 Date"},
    "START-TIME": {"class": "highlight-start-time", "name": "🕐 Start time"},
    "END-TIME": {"class": "highlight-end-time", "name": "🕑 End time"},
    "LOCATION": {"class": "highlight-location", "name": "📍 Location"},
    "MINUTE-ID": {"class": "highlight-minute-id", "name": "📋 Minute ID"},
    "MEETING-TYPE": {"class": "highlight-meeting-type", "name": "📌 Meeting type"},

    # President variants
    "PRESIDENT-PRESENT": {"class": "highlight-president", "name": "👔 President (Present)"},
    "PRESIDENT-ABSENT": {"class": "highlight-president-absent", "name": "🚫 President (Absent)"},
    "PRESIDENT-SUBSTITUTED": {"class": "highlight-president-substituted", "name": "🔄 President (Substituted)"},

    # Councilor variants
    "COUNCILOR-PRESENT": {"class": "highlight-councilor", "name": "👥 Councilor (Present)"},
    "COUNCILOR-ABSENT": {"class": "highlight-councilor-absent", "name": "🚫 Councilor (Absent)"},
    "COUNCILOR-SUBSTITUTED": {"class": "highlight-councilor-substituted", "name": "🔄 Councilor (Substituted)"},
}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Defaults used by extract_metadata.
# Passed to DocumentChunker as chunk_size / chunk_overlap.
# NOTE(review): the unit (characters vs. tokens) is defined by DocumentChunker,
# which is not visible in this file - confirm.
DEFAULT_CHUNK_SIZE = 800
DEFAULT_OVERLAP = 200
# How per-token probabilities are combined into one span score in
# predict_spans_chunk: "average" takes the mean, any other value takes the max.
DEFAULT_AGGREGATION = "average"
|
| 56 |
+
|
| 57 |
+
# ============================================================
|
| 58 |
+
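# CUSTOM CSS (adapted from Stage 1)
# NOTE(review): the markup built elsewhere in this file also uses the classes
# 'legend-box' and 'status-box' ('error' / 'success'), for which this
# stylesheet defines no rule - confirm whether those rules were dropped.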
|
| 59 |
+
# ============================================================
|
| 60 |
+
st.markdown("""
|
| 61 |
+
<style>
|
| 62 |
+
.main-header {
|
| 63 |
+
font-size: 2.5rem;
|
| 64 |
+
font-weight: bold;
|
| 65 |
+
color: #1f77b4;
|
| 66 |
+
text-align: center;
|
| 67 |
+
margin-bottom: 1rem;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.subtitle {
|
| 71 |
+
text-align: center;
|
| 72 |
+
color: #555;
|
| 73 |
+
font-size: 1rem;
|
| 74 |
+
margin-bottom: 1.5rem;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/* ======= Entity Highlight Styles ======= */
|
| 78 |
+
.highlight-date {
|
| 79 |
+
background-color: #4CAF50;
|
| 80 |
+
border-bottom: 3px solid #2e7d32;
|
| 81 |
+
padding: 2px 4px;
|
| 82 |
+
border-radius: 3px;
|
| 83 |
+
color: #000;
|
| 84 |
+
}
|
| 85 |
+
.highlight-start-time {
|
| 86 |
+
background-color: #2196F3;
|
| 87 |
+
border-bottom: 3px solid #0d47a1;
|
| 88 |
+
padding: 2px 4px;
|
| 89 |
+
border-radius: 3px;
|
| 90 |
+
color: #000;
|
| 91 |
+
}
|
| 92 |
+
.highlight-end-time {
|
| 93 |
+
background-color: #3F51B5;
|
| 94 |
+
border-bottom: 3px solid #1a237e;
|
| 95 |
+
padding: 2px 4px;
|
| 96 |
+
border-radius: 3px;
|
| 97 |
+
color: #000;
|
| 98 |
+
}
|
| 99 |
+
.highlight-location {
|
| 100 |
+
background-color: #FF9800;
|
| 101 |
+
border-bottom: 3px solid #e65100;
|
| 102 |
+
padding: 2px 4px;
|
| 103 |
+
border-radius: 3px;
|
| 104 |
+
color: #000;
|
| 105 |
+
}
|
| 106 |
+
.highlight-minute-id {
|
| 107 |
+
background-color: #9C27B0;
|
| 108 |
+
border-bottom: 3px solid #4a148c;
|
| 109 |
+
padding: 2px 4px;
|
| 110 |
+
border-radius: 3px;
|
| 111 |
+
color: #000;
|
| 112 |
+
}
|
| 113 |
+
.highlight-meeting-type {
|
| 114 |
+
background-color: #607D8B;
|
| 115 |
+
border-bottom: 3px solid #37474f;
|
| 116 |
+
padding: 2px 4px;
|
| 117 |
+
border-radius: 3px;
|
| 118 |
+
color: #000;
|
| 119 |
+
}
|
| 120 |
+
.highlight-president {
|
| 121 |
+
background-color: #f27e91;
|
| 122 |
+
border-bottom: 3px solid #ad1457;
|
| 123 |
+
padding: 2px 4px;
|
| 124 |
+
border-radius: 3px;
|
| 125 |
+
color: #000;
|
| 126 |
+
}
|
| 127 |
+
.highlight-councilor {
|
| 128 |
+
background-color: #F44336;
|
| 129 |
+
border-bottom: 3px solid #b71c1c;
|
| 130 |
+
padding: 2px 4px;
|
| 131 |
+
border-radius: 3px;
|
| 132 |
+
color: #000;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
.annotation-box {
|
| 136 |
+
padding: 1rem;
|
| 137 |
+
margin: 0.75rem 0;
|
| 138 |
+
border-radius: 0.5rem;
|
| 139 |
+
background-color: #f0f2f6;
|
| 140 |
+
white-space: pre-wrap;
|
| 141 |
+
line-height: 1.8;
|
| 142 |
+
font-family: 'Segoe UI', Roboto, monospace;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
.legend-item {
|
| 146 |
+
display: inline-block;
|
| 147 |
+
padding: 4px 8px;
|
| 148 |
+
margin: 3px;
|
| 149 |
+
border-radius: 4px;
|
| 150 |
+
font-size: 0.8rem;
|
| 151 |
+
font-weight: 500;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
.highlight-president-absent {
|
| 155 |
+
background-color: #f27e91 !important;
|
| 156 |
+
border: 2px solid #050505 !important;
|
| 157 |
+
color: #fff !important;
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
.highlight-councilor-absent {
|
| 161 |
+
background-color: #F44336 !important;
|
| 162 |
+
border: 2px solid #050505 !important;
|
| 163 |
+
color: #fff !important;
|
| 164 |
+
}
|
| 165 |
+
.highlight-president-substituted {
|
| 166 |
+
background-color: #f27e91 !important;
|
| 167 |
+
border: 2px solid #7b69c7 !important;
|
| 168 |
+
color: #fff !important;
|
| 169 |
+
}
|
| 170 |
+
.highlight-councilor-substituted {
|
| 171 |
+
background-color: #F44336 !important;
|
| 172 |
+
border: 2px solid #7b69c7 !important;
|
| 173 |
+
color: #fff !important;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
</style>
|
| 178 |
+
""", unsafe_allow_html=True)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# Maps Portuguese entity label names to the English keys of LABEL_STYLES.
# render_html applies this mapping before looking up a label's style, so both
# naming schemes end up with the same highlight class.
# NOTE(review): presumably these are the label names emitted by the Portuguese
# model - confirm against the model's id2label.
LABEL_ALIASES = {
    "NUMERO-ATA": "MINUTE-ID",
    "DATA": "DATE",
    "TIPO-REUNIAO": "MEETING-TYPE",
    "TIPO-REUNIAO-ORDINARIA": "MEETING-TYPE",
    "TIPO-REUNIAO-EXTRAORDINARIA": "MEETING-TYPE",
    "LOCAL": "LOCATION",
    "HORARIO-INICIO": "START-TIME",
    "HORARIO-FIM": "END-TIME",
    "PARTICIPANTE-PRESIDENTE-PRESENTE": "PRESIDENT-PRESENT",
    "PARTICIPANTE-PRESIDENTE-AUSENTE": "PRESIDENT-ABSENT",
    "PARTICIPANTE-PRESIDENTE-SUBSTITUIDO": "PRESIDENT-SUBSTITUTED",
    "PARTICIPANTE-VEREADOR-PRESENTE": "COUNCILOR-PRESENT",
    "PARTICIPANTE-VEREADOR-AUSENTE": "COUNCILOR-ABSENT",
    "PARTICIPANTE-VEREADOR-SUBSTITUIDO": "COUNCILOR-SUBSTITUTED",
}
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# ============================================================
|
| 200 |
+
# RENDER FUNCTIONS
|
| 201 |
+
# ============================================================
|
| 202 |
+
|
| 203 |
+
def _style_for_label(raw_label: str):
    """Return the style dict for a label.

    The label is first translated through LABEL_ALIASES (so the Portuguese
    label names resolve to the same style as their English counterparts) and
    then looked up in LABEL_STYLES.

    Unknown labels get a neutral fallback. The fallback carries the same
    "class" / "name" keys as every LABEL_STYLES entry, so callers can read
    those keys unconditionally; the legacy "bg" / "fg" keys are kept for
    backward compatibility.
    """
    label = LABEL_ALIASES.get(raw_label, raw_label)
    fallback = {"class": "", "bg": "#ddd", "fg": "#000", "name": raw_label}
    return LABEL_STYLES.get(label, fallback)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def render_html(base_text: str, spans: list, scores: list = None):
    """Render annotated text with colored entity highlights (VotIE-style).

    Args:
        base_text: The full document text.
        spans: ``(start, end, label)`` tuples with character offsets into
            ``base_text``, expected in ascending, non-overlapping order.
        scores: Optional confidence per span, aligned with ``spans``. When
            given, the score is appended to the tooltip.

    Returns:
        An HTML string: the escaped text wrapped in a ``div.annotation-box``,
        with each span wrapped in a ``<span>`` carrying the label's CSS class
        and a ``title`` tooltip.
    """
    out, cur = [], 0
    for i, (s, e, lbl) in enumerate(spans):
        # Never step backwards: a span that overlaps the previous one is
        # clamped so already-emitted text is not repeated.
        s = max(s, cur)
        if e <= s:
            continue
        if cur < s:
            out.append(html.escape(base_text[cur:s]))

        norm_lbl = LABEL_ALIASES.get(lbl, lbl)
        style = LABEL_STYLES.get(norm_lbl, {"class": "", "name": lbl})

        title = str(style["name"])
        # Tolerate a scores list that is shorter than spans.
        if scores and i < len(scores):
            title = f"{title} ({scores[i]:.2f})"

        # Both attribute values are escaped (quotes included): the fallback
        # name is the raw model label and must not break out of the attribute.
        css_class = html.escape(style.get("class", ""), quote=True)
        title = html.escape(title, quote=True)
        out.append(
            f"<span class='{css_class}' title='{title}'>{html.escape(base_text[s:e])}</span>"
        )
        cur = e

    if cur < len(base_text):
        out.append(html.escape(base_text[cur:]))
    return "<div class='annotation-box'>" + "".join(out) + "</div>"
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def render_legend():
    """Build the HTML legend listing every entity type.

    One ``legend-item`` chip is produced per LABEL_STYLES entry, reusing the
    entry's highlight CSS class so legend colors match the annotated text.
    """
    chips = [
        f"<span class='legend-item {info.get('class', '')}'>{info.get('name', label)}</span> "
        for label, info in LABEL_STYLES.items()
    ]
    opening = "\n<div class='legend-box'>\n<strong>🔖 Entity Legend:</strong><br><br>\n"
    return opening + "".join(chips) + "</div>"
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
# ============================================================
|
| 247 |
+
# MODEL LOADING
|
| 248 |
+
# ============================================================
|
| 249 |
+
|
| 250 |
+
@st.cache_resource
def _load_ner_model_cached(model_name: str):
    """Load tokenizer and model once per model name; raises on failure.

    Failures propagate as exceptions instead of being returned, so that
    st.cache_resource only ever stores successfully loaded models.
    """
    cache_dir = "/app/hf_cache"  # safe writable path
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_name, cache_dir=cache_dir)
    model.eval()
    device = "cpu"
    model.to(device)
    return tokenizer, model, device, model.config.id2label


def load_ner_model(model_name: str):
    """Load NER model from Hugging Face.

    Args:
        model_name: Hugging Face model identifier.

    Returns:
        ``(tokenizer, model, device, id2label, error)``. On success ``error``
        is ``None``; on failure the first four items are ``None`` and
        ``error`` is a user-facing message.

    The error tuple is built outside the cached loader on purpose: returning
    it from inside the cached function would cache the failure, and a
    transient problem (e.g. a network error during download) would then be
    replayed until the process restarts.
    """
    try:
        tokenizer, model, device, id2label = _load_ner_model_cached(model_name)
    except Exception as e:
        return None, None, None, None, f"❌ Error loading model: {str(e)}"
    return tokenizer, model, device, id2label, None
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# ============================================================
|
| 267 |
+
# INFERENCE FUNCTIONS
|
| 268 |
+
# ============================================================
|
| 269 |
+
|
| 270 |
+
def _word_end_offsets(word_ids, offsets):
    """Map the index of each word's first token to the end offset of its last token.

    A "word" is a run of consecutive tokens sharing the same word id.
    """
    ends = {}
    first_idx = 0
    for idx, wid in enumerate(word_ids):
        if idx == 0 or wid != word_ids[idx - 1]:
            first_idx = idx
        ends[first_idx] = offsets[idx][1]
    return ends


def predict_spans_chunk(text_chunk: str, tokenizer, model, device, id2label, aggregation: str = "average"):
    """Predict entity spans for a single chunk.

    Args:
        text_chunk: Text to tag. The tokenizer is called with
            ``truncation=True``, so tokens beyond its limit are dropped.
        tokenizer: Callable tokenizer whose result provides ``word_ids()``
            and an ``offset_mapping`` entry.
        model: Token-classification model returning an object with ``logits``.
        device: Device the encoded tensors are moved to.
        id2label: Mapping from class id to BIO label; unknown ids count as "O".
        aggregation: ``"average"`` scores a span with the mean of its word
            probabilities; any other value uses the maximum.

    Returns:
        A list of dicts with keys ``label`` (BIO prefix stripped), ``start``,
        ``end`` (character offsets into ``text_chunk``) and ``score``.

    Only the first sub-token of each word is classified; the span is extended
    to the end of the word's last sub-token.
    """
    enc = tokenizer(text_chunk, return_offsets_mapping=True, return_tensors="pt", truncation=True)
    # Read word ids before `enc` is replaced by a plain dict below.
    word_ids = enc.word_ids()
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**{k: v for k, v in enc.items() if k in {"input_ids", "attention_mask"}}).logits
        probs = torch.softmax(logits, dim=-1)

    # Winning class and its probability for every token, extracted in one go
    # instead of one `.item()` call per token.
    top_probs, top_ids = probs[0].max(dim=-1)
    pred_ids = top_ids.tolist()
    pred_scores = top_probs.tolist()
    offsets = enc["offset_mapping"].squeeze(0).tolist()
    word_ends = _word_end_offsets(word_ids, offsets)

    spans = []
    current = None  # the open span: {"label", "start", "end", "scores"}

    def flush():
        """Store the open span (if it is non-empty) and reset the state."""
        nonlocal current
        if current is None:
            return
        if current["end"] > current["start"]:
            word_scores = current["scores"]
            if aggregation == "average":
                score = sum(word_scores) / len(word_scores)
            else:
                score = max(word_scores)
            spans.append({
                "label": current["label"],
                "start": current["start"],
                "end": current["end"],
                "score": float(score),
            })
        current = None

    for idx, (pid, (start, end)) in enumerate(zip(pred_ids, offsets)):
        wid = word_ids[idx]
        # Skip empty offsets, special tokens and continuation sub-tokens.
        if start == end or wid is None or (idx > 0 and wid == word_ids[idx - 1]):
            continue

        label = id2label.get(pid, "O")
        if label == "O":
            flush()
            continue

        prefix, entity = label[:2], label[2:]
        if prefix not in ("B-", "I-"):
            # Labels without a BIO prefix neither open nor close a span.
            continue

        if prefix == "I-" and current is not None and current["label"] == entity:
            # Continuation of the open entity: extend it over this word.
            current["end"] = word_ends[idx]
            current["scores"].append(pred_scores[idx])
        else:
            # "B-", or an "I-" that does not match the open span: start anew.
            flush()
            current = {
                "label": entity,
                "start": start,
                "end": word_ends[idx],
                "scores": [pred_scores[idx]],
            }

    flush()
    return spans
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def extract_metadata(base_text: str, tokenizer, model, device, id2label):
    """Run entity extraction over a whole document.

    Returns ``(spans, scores, error)``. On success ``spans`` is a list of
    ``(start, end, label)`` tuples in ascending, non-overlapping order,
    ``scores`` holds the matching confidences and ``error`` is ``None``.
    For text shorter than 10 non-blank characters the result is
    ``(None, None, message)``.
    """
    if not base_text or len(base_text.strip()) < 10:
        return None, None, "Please enter text to analyze (minimum 10 characters)."

    # Collect (offset, text) pieces: the whole document when no chunker is
    # available, otherwise every chunk together with its position.
    if DocumentChunker is None:
        pieces = [(0, base_text)]
    else:
        chunker = DocumentChunker(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_OVERLAP)
        pieces = []
        search_from = 0
        for chunk in chunker.chunk_document(base_text):
            # NOTE(review): assumes each chunk is a verbatim substring of the
            # document; when it is not found the search position is used.
            offset = base_text.find(chunk, search_from)
            if offset == -1:
                offset = search_from
            pieces.append((offset, chunk))
            # Advance by one only, so overlapping chunks are still located.
            search_from = offset + 1

    # Tag every piece and shift its spans to document coordinates.
    found = []
    for offset, chunk in pieces:
        for sp in predict_spans_chunk(chunk, tokenizer, model, device, id2label, DEFAULT_AGGREGATION):
            found.append(((sp["start"] + offset, sp["end"] + offset, sp["label"]), sp["score"]))

    # Sort and remove overlaps: walking in (start, end) order, a span is kept
    # only if it begins at or after the end of the last kept span.
    found.sort(key=lambda item: (item[0][0], item[0][1]))
    kept, kept_scores = [], []
    last_end = -1
    for span, score in found:
        if span[0] < last_end:
            continue
        kept.append(span)
        kept_scores.append(score)
        last_end = span[1]

    return kept, kept_scores, None
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
# ============================================================
|
| 397 |
+
# EXAMPLE TEXTS
|
| 398 |
+
# ============================================================
|
| 399 |
+
EXAMPLE_TEXT_PT_INTRO = "Câmara Municipal de Guimarães.\n ATA Nº 10 Fls. __14__ REUNIÃO ORDINÁRIA DE 25 DE MAIO DE 2023 \nATA\nAos vinte e cinco dias do mês de maio do ano de dois mil e vinte e três, no Edifício dos Paços do Concelho, na Sala de Reuniões, compareceram os Excelentíssimos Senhores – Presidente da Câmara – Domingos Bragança Salgado – e os Vereadores Adelina Paula Mendes Pinto, Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, Nelson José Guimarães Felgueiras, Ana Maria Prego de Faria Berkeley Cotter, Ricardo José Machado Pereira da Silva Araújo, Vânia Carvalho Dias da Silva de Antas de Barros, Hugo Miguel Alves Ribeiro e Eduardo Miguel Teixeira Fernandes. \nO Vereador Bruno Alberto Vieira Fernandes solicitou a sua substituição na presente reunião, nos termos do art.º 78.º da Lei nº 169/99, de 18 de setembro, na sua redação atual. Nesta sequência, a cidadã imediatamente a seguir na ordem da lista da Coligação Juntos por Guimarães pelo PPD/PSD, Emília Rosa Leite Pereira Lemos, manifestou impossibilidade em estar presente na reunião, pelo que foi substituída pelo cidadão imediatamente a seguir na ordem da referida lista, Eduardo Miguel Teixeira Fernandes, nos termos do nº 7, do art.º 77º, do mesmo diploma legal, tendo o Presidente da Câmara verificado a conformidade formal do processo eleitoral com a identidade do eleito. \nNão compareceu a Vereadora Alice Sofia de Freitas Soares Ferreira Fernandes, cuja falta foi considerada justificada. \nSecretariou a Diretora Municipal ***********************, em regime de substituição, **************************************. \nPelas 10.00 horas foi declarada aberta a reunião. \n \nORDEM DO DIA \nINFORMAÇÕES\n"
|
| 400 |
+
EXAMPLE_TEXT_PT_CLOSING = "PELAS ONZE HORAS E CINQUENTA MINUTOS O PRESIDENTE DA CÂMARA DEU POR ENCERRADA A REUNIÃO, DE QUE, PARA CONSTAR, SE LAVROU A PRESENTE ATA. \n"
|
| 401 |
+
EXAMPLE_TEXT_EN_INTRO = "Guimarães City Council.\n MINUTES NO. 1 FLS. __10__ ORDINARY MEETING OF JANUARY 13, 2022 \nMINUTES\nOn the thirteenth day of January in the year two thousand and twenty-two, in the Town Hall Building, in the Meeting Room, the following Gentlemen attended: Mayor – Domingos Bragança Salgado and Councilors – Adelina Paula Mendes Pinto, Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, Nelson José Guimarães Felgueiras, Alice Sofia de Freitas Soares Ferreira Fernandes, Ana Maria Prego de Faria Berkeley Cotter, Bruno Alberto Vieira Fernandes, Ricardo José Machado Pereira da Silva Araújo, Vânia Carvalho Dias da Silva de Antas de Barros and Hugo Miguel Alves Ribeiro. \nSecretary the Director ***************, **************************************. \nAt 10.10 am the meeting was declared open. \n \nAGENDA \nINFORMATION\n"
|
| 402 |
+
EXAMPLE_TEXT_EN_CLOSING = "AT ELEVEN O'CLOCK AND FIFTY MINUTES, THE PRESIDENT OF THE CHAMBER CLOSED THE MEETING, OF WHICH, FOR THE RECORD, THESE MINUTES WERE DRAWN UP. \n"
|
| 403 |
+
|
| 404 |
+
# Pre-load models
# Both checkpoints are requested when the script is first executed so that the
# cached result is already available when main() asks for a model. The
# returned tuples (including any error message) are discarded here.
load_ner_model("anonymous13542/BERTimbau-large-metadata-council-pt")
load_ner_model("anonymous13542/XLMR-large-metadata-council-en")
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
# ============================================================
|
| 410 |
+
# MAIN APP
|
| 411 |
+
# ============================================================
|
| 412 |
+
|
| 413 |
+
def main():
    """Build the page: header, sidebar, input / result columns, help and footer."""
    # Header
    st.markdown('<h1 class="main-header">🏷️ MiNER — Stage 2: Metadata Extraction Demo</h1>', unsafe_allow_html=True)
    st.markdown("""
<p class="subtitle">
Automatic extraction of structured metadata from municipal meeting minutes
</p>
""", unsafe_allow_html=True)

    # Sidebar
    st.sidebar.header("⚙️ Configuration")

    # Selectable examples, in display order, mapped to their text.
    examples = {
        "Portuguese Example - Intro": EXAMPLE_TEXT_PT_INTRO,
        "Portuguese Example - Closing": EXAMPLE_TEXT_PT_CLOSING,
        "English Example - Intro": EXAMPLE_TEXT_EN_INTRO,
        "English Example - Closing": EXAMPLE_TEXT_EN_CLOSING,
    }
    example_choice = st.sidebar.selectbox(
        "Choose an example or enter your own text:",
        ["Custom Text", *examples],
        index=0
    )

    # Auto-select model based on example (English/multilingual as default)
    if "Portuguese" in example_choice:
        model_name = "anonymous13542/BERTimbau-large-metadata-council-pt"
    else:
        model_name = "anonymous13542/XLMR-large-metadata-council-en"

    tokenizer, model, device, id2label, error = load_ner_model(model_name)

    if error or model is None:
        # The message contains raw exception text; escape it because it is
        # rendered with unsafe_allow_html.
        safe_error = html.escape(str(error))
        st.sidebar.markdown(f"<div class='status-box error'>❌ Error loading model:<br>{safe_error}</div>",
                            unsafe_allow_html=True)
    else:
        st.sidebar.markdown(
            f"<div class='status-box success'>✅ Loaded automatically: <strong>{model_name.split('/')[-1]}</strong></div>",
            unsafe_allow_html=True)

    st.sidebar.markdown("---")
    st.sidebar.markdown("### 📊 About")
    st.sidebar.info("""
**MiNER Stage 2** uses Named Entity Recognition models to automatically extract metadata from meeting minutes.

- **Model**: BERTimbau / XLM-RoBERTa fine-tuned
- **Languages**: Portuguese and English
- **Method**: Token Classification (NER) with BIO tagging
""")

    st.sidebar.markdown("---")
    st.sidebar.markdown(render_legend(), unsafe_allow_html=True)

    st.sidebar.markdown("---")
    st.sidebar.markdown("### 🔗 Resources")
    st.sidebar.markdown("""
- [📖 Model Card (PT)](https://huggingface.co/anonymous13542/BERTimbau-large-metadata-council-pt)
- [📖 Model Card (EN)](https://huggingface.co/anonymous13542/XLMR-large-metadata-council-en)
""")

    # Main layout
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📄 Input Document")

        if example_choice in examples:
            # Pre-fill the box with the selected example.
            input_text = st.text_area("Input", value=examples[example_choice], height=400,
                                      label_visibility="collapsed")
        else:
            input_text = st.text_area("Input", placeholder="Paste your document text here…", height=400,
                                      label_visibility="collapsed")

        extract_button = st.button("🏷️ Extract Metadata", type="primary", use_container_width=True)

    with col2:
        st.subheader("📊 Extraction Results")

        # Single slot that holds either a hint, a warning/error or the results.
        results_placeholder = st.empty()

        if extract_button:
            with st.spinner("🔄 Analyzing document..."):
                if model is None:
                    results_placeholder.warning("⚠️ Model could not be loaded. Please refresh and try again.")
                elif not input_text or len(input_text.strip()) < 10:
                    results_placeholder.warning("⚠️ Please enter a longer text (minimum 10 characters).")
                else:
                    spans, scores, err = extract_metadata(input_text, tokenizer, model, device, id2label)
                    if err:
                        results_placeholder.error(f"❌ {err}")
                    elif not spans:
                        results_placeholder.info("ℹ️ No entities found in the text.")
                    else:
                        with results_placeholder.container():
                            st.markdown(
                                f"**Found {len(spans)} entities** with average confidence: **{sum(scores) / len(scores):.2%}**")
                            st.markdown("---")
                            st.markdown("**📝 Annotated Text**")
                            annotated_html = render_html(input_text, spans, scores)
                            st.markdown(annotated_html, unsafe_allow_html=True)
        else:
            results_placeholder.info("👈 Enter text in the input box and click 'Extract Metadata' to begin.")

    # How it works
    st.markdown("---")
    st.subheader("🎯 How It Works")
    st.markdown("""
The model analyzes the **meeting minutes** to automatically extract **structured metadata** using a *Named Entity Recognition (NER)* approach.

Each token in the document is classified, identifying information such as:
- 📅 **Date**
- 🕐 **Start / End time**
- 📍 **Location**
- 📋 **Minute ID**
- 📌 **Meeting type**
- 👔 **President** (present / absent / substituted)
- 👥 **Councilors** (present / absent / substituted)

The model uses the **BIO tagging scheme** (*Begin, Inside, Outside*) to mark entity boundaries, and the final spans are reconstructed from token-level predictions.
""")

    st.markdown("**Example:**")
    # NOTE(review): the "Predicted entities" listed below (date, president,
    # councilor, location, start time) do not occur in the introduction text
    # shown above them - the sample output appears to come from a different
    # document. Confirm and align the two.
    st.code("""
Introduction text: "Câmara Municipal de Guimarães. ATA Nº 10 Fls. __14__ REUNIÃO ORDINÁRIA DE 25 DE MAIO DE 2023 ATA Aos vinte e cinco dias do mês de maio do ano de dois mil e vinte e três, no Edifício dos Paços do Concelho, na Sala de Reuniões, compareceram os Excelentíssimos Senhores – Presidente da Câmara – Domingos Bragança Salgado – e os Vereadores Adelina Paula Mendes Pinto, Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, Nelson José Guimarães Felgueiras, Ana Maria Prego de Faria Berkeley Cotter, Ricardo José Machado Pereira da Silva Araújo, Vânia Carvalho Dias da Silva de Antas de Barros, Hugo Miguel Alves Ribeiro e Eduardo Miguel Teixeira Fernandes. O Vereador Bruno Alberto Vieira Fernandes solicitou a sua substituição na presente reunião, nos termos do art.º 78.º da Lei nº 169/99, de 18 de setembro, na sua redação atual. Nesta sequência, a cidadã imediatamente a seguir na ordem da lista da Coligação Juntos por Guimarães pelo PPD/PSD, Emília Rosa Leite Pereira Lemos, manifestou impossibilidade em estar presente na reunião, pelo que foi substituída pelo cidadão imediatamente a seguir na ordem da referida lista, Eduardo Miguel Teixeira Fernandes, nos termos do nº 7, do art.º 77º, do mesmo diploma legal, tendo o Presidente da Câmara verificado a conformidade formal do processo eleitoral com a identidade do eleito. Não compareceu a Vereadora Alice Sofia de Freitas Soares Ferreira Fernandes, cuja falta foi considerada justificada. Secretariou a Diretora Municipal ***********************, em regime de substituição, **************************************. Pelas 10.00 horas foi declarada aberta a reunião. ORDEM DO DIA INFORMAÇÕES "

Predicted entities:
- MINUTE-ID → "10"
- MEETING-TYPE → "ORDINÁRIA"
- DATE → "11/09/2024"
- PRESIDENT-PRESENT → "João Maria Aranha Grilo"
- COUNCILOR-PRESENT → "Paulo Jorge da Silva Gonçalves"
- LOCATION → "Edifício Sede do Município de Alandroal"
- START-TIME → "15 horas e 30 minutos"
""")

    # Footer
    st.markdown("---")
    st.markdown("""
<div style='text-align:center; color:#64748b; font-size:0.875rem; padding:2rem 0;'>
<p style='margin:0 0 8px 0;'><strong style='color:#667eea;'>MiNER</strong> — Municipal Information Extraction & Recognition</p>
<p style='margin:0;'>Anonymous research demo for Stage 2 (Metadata Extraction via NER)</p>
<p style='margin:12px 0 0 0; opacity:0.7;'>Built with ❤️ using Streamlit & Transformers</p>
</div>
""", unsafe_allow_html=True)
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    main()
|