"""Streamlit demo for MiNER Stage 2: metadata extraction from meeting minutes.

The app loads a token-classification model, runs it on a user-supplied
document, rebuilds entity spans from the BIO tags, and renders the grouped
entities as colored badges together with a JSON export.
"""

import html
import json

import streamlit as st
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Page configuration
st.set_page_config(
    page_title="MiNER - Stage 2: Metadata Extraction",
    page_icon="🏷️",
    layout="wide",
)

# Custom CSS for improved appearance
st.markdown(""" """, unsafe_allow_html=True)


# ==================== CONSTANTS ====================

MODEL_NAME = "liaad/Citilink-BERTimbau-large-metadata-pt-baseline"

# Label prefixes for which only the earliest occurrence is kept.
UNIQUE_ENTITY_PREFIXES = ("HORARIO", "DATA", "NUMERO-ATA", "LOCAL", "TIPO-REUNIAO")

CUSTOM_TEXT_OPTION = "Custom Text"

EXAMPLE_ALANDROAL = (
    "CÂMARA MUNICIPAL DE ALANDROAL ATA N.º 21 REUNIÃO ORDINÁRIA 11/09/2024 "
    "Presidiu o Senhor João Maria Aranha Grilo, Presidente da Câmara Municipal "
    "de Alandroal Vereadores Paulo Jorge da Silva Gonçalves Fernanda Manuela "
    "Brites Romão Elisabete de Jesus dos Passos Galhardas Faltou João Carlos "
    "Camões Roma Balsante Secretariou a Reunião "
    "**************************************** "
    "No Edifício Sede do Município de Alandroal, o Senhor Presidente da Câmara "
    "Municipal, João Maria Aranha Grilo, declarou aberta a reunião, eram 15 "
    "horas e 30 minutos."
)

EXAMPLE_CAMPO_MAIOR = (
    "ATA Nº 1 REUNIÃO ORDINÁRIA DA CÂMARA MUNICIPAL DE CAMPO MAIOR, REALIZADA "
    "EM 5 DE JANEIRO DE 2022. Aos cinco dias do mês de janeiro do ano de dois "
    "mil e vinte e dois, no Edifício dos Paços do Concelho, nesta Vila, "
    "realizou-se, pelas nove horas e trinta minutos, a reunião Ordinária da "
    "Câmara Municipal, comparecendo os Excelentíssimos Senhores Luís Fernando "
    "Martins Rosinha, Paulo Ivo Sabino Martins de Almeida, Paulo Jorge Furtado "
    "Pinheiro, Maria da Encarnação Grifo Silveirinha (videoconferência) e "
    "Fátima do Rosário Pingo Vitorino Pereira, respetivamente, Presidente e "
    "Vereadores efetivos deste Órgão Autárquico. -Verificada a presença dos "
    "respectivos membros, o Senhor Presidente declarou aberta a reunião: "
    "-Estava presente o Chefe **************************************, Dr. "
    "*********************************** e a Assistente Técnico "
    "**************************************. \n"
    "-Depois de todos terem ocupado os seus lugares, o Senhor Presidente "
    "declarou aberta a reunião eram nove horas e trinta minutos."
)

EXAMPLE_COVILHA = (
    "-- -- -- -- CÂMARA MUNICIPAL DA COVILHÃ TEXTO DEFINITIVO DA ATA Nº 02/2023 "
    "Da reunião ordinária privada realizada no dia 03 de fevereiro de 2023, "
    "iniciada às 09:05 horas e concluída às 10:15 horas. "
    "------------------------------------ Sumário: 01 "
    "------------------------------- ---- Abertura 02 Período Antes da Ordem "
    "do Dia 05 Período da Ordem do Dia 06 Agenda 06 Aprovação de Atas 06 "
    "Balancete 07 Despachos 07 DAGCJ 10 DFMA 17 DOP 19 DECAD 29 DU 38 "
    "Aprovação em minuta 42 Votação das deliberações 42 Montante Global de "
    "Encargos 42 Encerramento 42 ------------------------------------ "
    "ABERTURA ATA Nº 02/2023 Aos três dias do mês de fevereiro do ano de dois "
    "mil e vinte e três, na Sala de Reuniões dos Paços do Concelho, na "
    "Covilhã, realizou-se a reunião ordinária privada da Câmara Municipal da "
    "Covilhã sob a presidência do Senhor Presidente da Câmara, Vítor Manuel "
    "Pinheiro Pereira, estando presentes o Senhor Vice-Presidente José "
    "Armando Serra dos Reis e os Senhores Vereadores Pedro Miguel Santos "
    "Farromba, Maria Regina Gomes Gouveia, Jorge Humberto Martins Simões (em "
    "substituição de Ricardo Miguel Correia Leitão Ferreira da Silva), José "
    "Miguel Ribeiro Oliveira e Marta Maria Tomaz Gomes Morais Alçada Bom "
    "Jesus. A reunião foi secretariada pela Senhora Dr.ª "
    "********************************, Diretora "
    "*************************************************************. E, pelas "
    "09:05 horas, o Senhor Presidente da Câmara deu início aos trabalhos da "
    "presente reunião com a seguinte Ordem de Trabalhos:"
)

EXAMPLE_FUNDAO = (
    " 14/02/2022 ATA DA REUNIÃO DE 14/02/2022 CÂMARA MUNICIPAL DO FUNDÃO Texto "
    "definitivo da ata n.º 2/2022 da reunião ordinária realizada no dia 14 de "
    "fevereiro de 2022, iniciada às 17:00 horas e concluída às 19:00. \n"
    "ATA N.º 2/2022 Aos catorze dias do mês fevereiro do ano dois mil e vinte "
    "e dois, realizou-se por videoconferência, a reunião ordinária privada da "
    "Câmara Municipal do Fundão, sob a presidência do Senhor Presidente da "
    "Câmara, Dr. Paulo Alexandre Bernardo Fernandes, com a participação do "
    "Senhor Vice-presidente, Dr. Luís Miguel Roque Tarouca Duarte Gavinhos e "
    "dos Senhores Vereadores, Dra. Joana Morgadinho Bento, Dra. Maria Alcina "
    "Domingues Cerdeira, Dr. Pedro Manuel Figueiredo Neto, Prof. Sérgio "
    "Miguel Cardoso Mendes e Dra. Ana Paula Coelho Duarte. A reunião foi "
    "secretariada pela Dra. ****************************, Diretora "
    "*******************************************."
)

EXAMPLE_GUIMARAES = (
    "Câmara Municipal de Guimarães. ATA Nº 1 Fls. __10__ REUNIÃO ORDINÁRIA DE "
    "13 DE JANEIRO DE 2022 ATA Aos treze dias do mês de janeiro do ano de "
    "dois mil e vinte e dois, no Edifício dos Paços do Concelho, na Sala de "
    "Reuniões, compareceram os Excelentíssimos Senhores: Presidente da Câmara "
    "– Domingos Bragança Salgado e Vereadores – Adelina Paula Mendes Pinto, "
    "Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, "
    "Nelson José Guimarães Felgueiras, Alice Sofia de Freitas Soares Ferreira "
    "Fernandes, Ana Maria Prego de Faria Berkeley Cotter, Bruno Alberto "
    "Vieira Fernandes, Ricardo José Machado Pereira da Silva Araújo, Vânia "
    "Carvalho Dias da Silva de Antas de Barros e Hugo Miguel Alves Ribeiro. "
    "Secretariou a Diretora ***************, "
    "**************************************. \n"
    "Pelas 10.10 horas foi declarada aberta a reunião."
)

EXAMPLE_PORTO = (
    "2.ª REUNIÃO PÚBLICA, DA CÂMARA MUNICIPAL DO PORTO REALIZADA EM 8 DE "
    "NOVEMBRO DE 2021 ÀS 10 HORAS PRESENTES: - Rui de Carvalho de Araújo "
    "Moreira - Filipe Manuel Ventura Camões de Almeida Araújo - Ana Catarina "
    "da Rocha Araújo - Ricardo Miguel Araújo Cardoso Valente - Albino Pedro "
    "Pereira Baganha - Cristina Mafalda Nieto Guimarães Pimentel - Tiago "
    "Barbosa Ribeiro - Maria do Rosário Gambôa Lopes de Carvalho - Catarina "
    "Maria da Costa Santos Cunha Pereira de Abreu - Vladimiro Mota Cardoso "
    "Feliz - Alberto Amaro Guedes Machado - Maria Ilda da Costa Figueiredo - "
    "Sérgio Augusto Leite Aires Secretariou a reunião a Técnica ********, "
    "***************."
)

# Sidebar option -> pre-filled document text (insertion order drives the UI).
EXAMPLES = {
    "Example 1 - Alandroal": EXAMPLE_ALANDROAL,
    "Example 2 - Campo Maior": EXAMPLE_CAMPO_MAIOR,
    "Example 3 - Covilhã": EXAMPLE_COVILHA,
    "Example 4 - Fundão": EXAMPLE_FUNDAO,
    "Example 5 - Guimarães": EXAMPLE_GUIMARAES,
    "Example 6 - Porto": EXAMPLE_PORTO,
}

ABOUT_MARKDOWN = """
**MiNER Stage 2** uses Named Entity Recognition models to automatically extract metadata from meeting minutes.

- **Model**: BERTimbau fine-tuned
- **Languages**: Portuguese
- **Method**: Token Classification (NER) with BIO tagging
"""

RESOURCES_MARKDOWN = """
- [GitHub Repository](https://github.com/LIAAD/MiNER)
- [Model](https://huggingface.co/liaad/Citilink-BERTimbau-large-metadata-pt-baseline)
"""

HOW_IT_WORKS_MARKDOWN = """
The model analyzes the **meeting minutes** to automatically extract **structured metadata** using a *Named Entity Recognition (NER)* approach.

**What information is extracted:**

Each token in the document is classified, identifying information such as:

- 📅 **Date**
- 🕐 **Start / End time**
- 📍 **Location**
- 📋 **Minute ID**
- 📌 **Meeting type**
- 👔 **President** (present / absent / substituted)
- 👥 **Councilors** (present / absent / substituted)

**Technical approach:**

The model uses the **BIO tagging scheme** (*Begin, Inside, Outside*) to mark entity boundaries, and the final spans are reconstructed from token-level predictions.

---

### 📖 Complete Example

**Input Document:**
"""

EXPECTED_OUTPUT_MARKDOWN = """
**Expected Output (Extracted Entities):**

```
📅 DATE
• 11/09/2024

🕐 TIME
• 15 horas e 30 minutos

📍 LOCATION
• No Edifício Sede do Município de Alandroal

📋 MINUTES NUMBER
• 21

📌 MEETING TYPE
• ORDINÁRIA

👔 PRESIDENT - PRESENT
• João Maria Aranha Grilo

👥 COUNCILOR - PRESENT
• Paulo Jorge da Silva Gonçalves
• Fernanda Manuela Brites Romão
• Elisabete de Jesus dos Passos Galhardas

👥 COUNCILOR - ABSENT
• João Carlos Camões Roma Balsante
```
"""


# ==================== MODEL LOADING ====================

@st.cache_resource
def _load_model_cached():
    """Load tokenizer and model for MODEL_NAME; raises if loading fails."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
    model.eval()
    return tokenizer, model


def load_model():
    """Return ``(tokenizer, model)``, or ``(None, None)`` if loading fails.

    The cached work lives in ``_load_model_cached``. The failure result is
    produced here, outside the cached function, so a ``(None, None)`` pair
    is never what gets stored as the cached resource.
    """
    try:
        return _load_model_cached()
    except Exception as e:  # UI boundary: report any load failure to the user
        st.error(f"Error loading model: {e}")
        return None, None


# Keep the cache-clearing hook reachable under the public name.
load_model.clear = _load_model_cached.clear


# ==================== ENTITY EXTRACTION ====================

def _reconstruct_entities(pred_labels, word_ids, offsets):
    """Merge per-token BIO labels into ``{"label", "start", "end"}`` spans.

    Only the first sub-token of each word decides the word's label; later
    sub-tokens of the same word merely stretch the span's end offset.
    """
    entities = []
    current = None
    prev_word_idx = None
    # True only when the word being scanned was added to `current`.
    word_in_entity = False

    for label, word_idx, (start, end) in zip(pred_labels, word_ids, offsets):
        # Special tokens (CLS, SEP, PAD) carry no word index.
        if word_idx is None:
            continue

        if word_idx == prev_word_idx:
            # Continuation sub-token: extend the span only if this word
            # belongs to it (not after an ignored, stray "I-" tag).
            if current and word_in_entity:
                current["end"] = end
            continue

        prev_word_idx = word_idx
        word_in_entity = False

        if label.startswith("B-"):
            if current:
                entities.append(current)
            current = {"label": label[2:], "start": start, "end": end}
            word_in_entity = True
        elif label.startswith("I-"):
            # A loose or mismatched "I-" neither opens nor closes an entity.
            if current and current["label"] == label[2:]:
                current["end"] = end
                word_in_entity = True
        else:  # label == "O"
            if current:
                entities.append(current)
                current = None

    if current:
        entities.append(current)
    return entities


def extract_entities(text, tokenizer, model):
    """Extract entities from ``text`` with a token-classification model.

    Args:
        text: Document to analyse.
        tokenizer: Tokenizer callable that supports ``return_offsets_mapping``
            and whose result exposes ``word_ids``.
        model: Token-classification model exposing ``config.id2label``.

    Returns:
        A dict mapping each entity label to a list of
        ``{"text", "start", "end"}`` dicts with character offsets into
        ``text``. Empty or whitespace-only input yields ``{}``. Labels that
        start with one of ``UNIQUE_ENTITY_PREFIXES`` keep only their earliest
        occurrence.
    """
    if not text or not text.strip():
        return {}

    # Input is truncated to 512 tokens, so anything beyond that is ignored.
    encoding = tokenizer(
        text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True,
        max_length=512,
    )
    offsets = encoding["offset_mapping"][0].tolist()
    word_ids = encoding.word_ids(batch_index=0)

    with torch.no_grad():
        outputs = model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
        )

    pred_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()
    pred_labels = [model.config.id2label[i] for i in pred_ids]

    # Group the reconstructed spans by label, attaching the covered text.
    grouped_entities = {}
    for ent in _reconstruct_entities(pred_labels, word_ids, offsets):
        grouped_entities.setdefault(ent["label"], []).append({
            "text": text[ent["start"]:ent["end"]],
            "start": ent["start"],
            "end": ent["end"],
        })

    # Post-processing: single-valued metadata keeps its first occurrence.
    return {
        label: (
            [min(items, key=lambda item: item["start"])]
            if label.startswith(UNIQUE_ENTITY_PREFIXES)
            else items
        )
        for label, items in grouped_entities.items()
    }


# ==================== UTILITY FUNCTIONS ====================

def translate_entity_label(label):
    """Return ``(emoji, english_name)`` for a Portuguese entity label.

    Exact matches win. Otherwise the first table entry sharing the label's
    leading category is used, and unknown labels come back as ``("", label)``.
    """
    translations = {
        "NUMERO-ATA": ("📋", "MINUTES NUMBER"),
        "DATA": ("📅", "DATE"),
        "LOCAL": ("📍", "LOCATION"),
        "TIPO-REUNIAO": ("📌", "MEETING TYPE"),
        "HORARIO-INICIO": ("🕐", "BEGIN TIME"),
        "HORARIO-FIM": ("🕐", "END TIME"),
        "PARTICIPANTE-PRESIDENTE-PRESENTE": ("👔", "PRESIDENT - PRESENT"),
        "PARTICIPANTE-PRESIDENTE-AUSENTE": ("👔", "PRESIDENT - ABSENT"),
        "PARTICIPANTE-PRESIDENTE-SUBSTITUIDO": ("👔", "PRESIDENT - SUBSTITUTED"),
        "PARTICIPANTE-VEREADOR-PRESENTE": ("👥", "COUNCILOR - PRESENT"),
        "PARTICIPANTE-VEREADOR-AUSENTE": ("👥", "COUNCILOR - ABSENT"),
        "PARTICIPANTE-VEREADOR-SUBSTITUIDO": ("👥", "COUNCILOR - SUBSTITUTED"),
    }

    if label in translations:
        return translations[label]

    # NOTE(review): this fallback matches on the first "-" segment only, so an
    # unknown "PARTICIPANTE-*" label resolves to the first PARTICIPANTE entry.
    for key, value in translations.items():
        if label.startswith(key.split("-")[0]):
            return value

    return ("", label)


def get_entity_style(label):
    """Return ``(color, border_style, border_width)`` for an entity label.

    Participants are colored by role and bordered by attendance state
    (solid = present, dashed = absent, dotted = substituted). Other labels
    take the color of the table key they equal or extend with a ``-`` suffix;
    anything else falls back to grey with a solid border.
    """
    colors = {
        "NUMERO-ATA": "#E74C3C",  # Red
        "DATA": "#16A085",  # Teal
        "LOCAL": "#2980B9",  # Blue
        "TIPO-REUNIAO": "#E67E22",  # Orange
        "HORARIO": "#F39C12",  # Yellow-orange
        "PARTICIPANTE-PRESIDENTE": "#8E44AD",  # Purple for President
        "PARTICIPANTE-VEREADOR": "#27AE60",  # Green for Councilors
    }
    # Checked in this order; the first state found in the label wins.
    border_styles = {
        "PRESENTE": "solid",
        "AUSENTE": "dashed",
        "SUBSTITUIDO": "dotted",
    }

    color = "#7F8C8D"  # Default color
    border_style = "solid"
    border_width = "2px"

    for role in ("PARTICIPANTE-PRESIDENTE", "PARTICIPANTE-VEREADOR"):
        if role in label:
            color = colors[role]
            for state, style in border_styles.items():
                if state in label:
                    border_style = style
                    break
            return color, border_style, border_width

    # Match whole keys so multi-segment keys such as "NUMERO-ATA" resolve;
    # looking up only the first "-" segment would miss them.
    for key, value in colors.items():
        if label == key or label.startswith(key + "-"):
            color = value
            break

    return color, border_style, border_width


def display_entities_compact(entities_dict):
    """Render grouped entities as a titled row of colored badges per label.

    Entity text comes from the analysed document and labels come from the
    model config, so both are HTML-escaped before being emitted with
    ``unsafe_allow_html=True``.
    """
    if not entities_dict:
        st.info("No entities detected.")
        return

    for label, entities in sorted(entities_dict.items()):
        emoji, translated_label = translate_entity_label(label)
        color, border_style, border_width = get_entity_style(label)

        # Group title with emoji
        title_html = f"\n\n{emoji} {html.escape(translated_label)}\n\n"
        st.markdown(title_html, unsafe_allow_html=True)

        # One styled badge per entity, separated by a space.
        badge_style = (
            f"background-color: {color}; color: white; "
            f"border: {border_width} {border_style} rgba(0,0,0,0.2);"
        )
        badges = " ".join(
            f'<span style="{badge_style}">{html.escape(ent["text"])}</span>'
            for ent in entities
        )
        st.markdown(f"\n\n{badges}\n\n", unsafe_allow_html=True)


# ==================== SIDEBAR ====================

with st.sidebar:
    st.markdown("### ⚙️ Configuration")
    st.markdown("Choose an example or enter your own text:")

    selected_example = st.selectbox(
        "Select an example",
        [CUSTOM_TEXT_OPTION, *EXAMPLES],
    )

    st.markdown("---")

    # About Section
    st.markdown("### 📋 About")
    st.markdown(ABOUT_MARKDOWN)

    st.markdown("---")

    # Resources Section
    st.markdown("### 🔗 Resources")
    st.markdown(RESOURCES_MARKDOWN)


# ==================== MAIN CONTENT ====================

# Header
st.markdown(
    "\n\n🏷️ MiNER — Stage 2: Metadata Extraction Demo\n\n",
    unsafe_allow_html=True,
)
st.markdown(
    "\n\nAutomatic extraction of structured metadata from municipal meeting minutes\n\n",
    unsafe_allow_html=True,
)

# ==================== HOW IT WORKS (MOVED TO TOP) ====================

with st.expander("🎯 How It Works", expanded=False):
    st.markdown(HOW_IT_WORKS_MARKDOWN)

    # Input example in code block
    st.code(EXAMPLE_ALANDROAL, language=None)

    st.markdown(EXPECTED_OUTPUT_MARKDOWN)

# Load model
with st.spinner("Loading model..."):
    tokenizer, model = load_model()

if tokenizer is None or model is None:
    st.error("❌ Failed to load model. Please check if the model path is correct.")
    st.stop()

# Main layout with two columns
col1, col2 = st.columns([1, 1])

# ==================== LEFT COLUMN - INPUT ====================

with col1:
    st.markdown("### 📝 Input Document")

    if selected_example == CUSTOM_TEXT_OPTION:
        example_text = ""
    else:
        example_text = EXAMPLES.get(selected_example, "Add your text here")

    st.markdown(f"**Example:** {selected_example}")

    text_input = st.text_area(
        "Type or paste the text here:",
        value=example_text,
        height=400,
        placeholder="Enter the meeting minutes or administrative document text...",
    )

    # Segmentation button
    process = st.button("🔍 Segment Document")

# ==================== RIGHT COLUMN - RESULTS ====================

with col2:
    st.markdown("### 📊 Segmentation Results")

    if not process:
        st.info("👈 Enter text in the input box and click 'Segment Document' to begin.")
    elif not text_input.strip():
        st.warning("⚠️ Please enter some text to process.")
    else:
        with st.spinner("Processing text..."):
            # Extract entities using the model
            entities = extract_entities(text_input, tokenizer, model)

        if entities:
            st.markdown("#### Detected Entities:")
            display_entities_compact(entities)

            # ==================== JSON EXPORT ====================
            st.markdown("---")

            # Expander with JSON visualization
            with st.expander("📄 View complete JSON"):
                st.json(entities)

            # JSON download button
            json_str = json.dumps(entities, ensure_ascii=False, indent=2)
            st.download_button(
                label="⬇️ Download JSON",
                data=json_str,
                file_name="extracted_entities.json",
                mime="application/json",
            )
        else:
            st.warning("⚠️ No entities were detected in the text.")