# Source: Hugging Face Space "liaad/Metadata-Identification-Demo" (status shown on the page: Sleeping).
# File size: 22,537 bytes. Commit hashes listed on the page: c951ae3, a955397.

import streamlit as st
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Page configuration: sets the browser-tab title and icon and selects the wide
# layout used by the two-column input/results view further down.
# NOTE(review): Streamlit documents set_page_config as a call to make before
# other page-rendering commands — keep it ahead of the st.markdown calls.
st.set_page_config(
    page_title="MiNER - Stage 2: Metadata Extraction",
    page_icon="🏷️",
    layout="wide"
)

# Custom CSS for the page. unsafe_allow_html=True is passed so the <style>
# element is emitted as raw HTML instead of being shown as text.
# Classes defined here and where this file uses them:
#   .main-header / .sub-header  -> the page title and subtitle <div>s
#   .entity-group, .entity-group-title, .entity-badge
#                               -> the HTML built by display_entities_compact()
#   .stButton>button            -> presumably Streamlit's own button markup (verify
#                                  against the Streamlit version in use)
#   .example-section            -> not referenced anywhere in the visible code
st.markdown("""
    <style>
    .main-header {
        font-size: 2.5rem;
        color: #4A90E2;
        text-align: center;
        margin-bottom: 0.5rem;
    }
    .sub-header {
        text-align: center;
        color: #666;
        margin-bottom: 2rem;
    }
    .stButton>button {
        width: 100%;
        background-color: #FF6B6B;
        color: white;
        font-size: 1.1rem;
        padding: 0.75rem;
        border-radius: 8px;
        border: none;
    }
    .stButton>button:hover {
        background-color: #FF5252;
    }
    .entity-group {
        padding: 0.5rem 0;
        margin-bottom: 0.5rem;
    }
    .entity-group-title {
        font-weight: 600;
        color: #555;
        margin-bottom: 0.4rem;
        font-size: 0.8rem;
        text-transform: uppercase;
        letter-spacing: 0.5px;
    }
    .entity-badge {
        display: inline-block;
        padding: 0.3rem 0.7rem;
        border-radius: 5px;
        margin: 0.2rem 0.3rem 0.2rem 0;
        font-size: 0.9rem;
        font-weight: 500;
        box-shadow: 0 1px 3px rgba(0,0,0,0.12);
    }
    .example-section {
        background-color: #2d3436;
        color: #dfe6e9;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }
    </style>
""", unsafe_allow_html=True)

# ==================== MODEL LOADING ====================
@st.cache_resource
def load_model():
    """Return the ``(tokenizer, model)`` pair for the metadata NER checkpoint.

    The function is wrapped in ``st.cache_resource``. The model is put in
    evaluation mode before it is handed back. If loading raises, the error is
    reported through ``st.error`` and ``(None, None)`` is returned instead of
    propagating the exception, so callers must check for ``None``.
    """
    checkpoint = "liaad/Citilink-BERTimbau-large-metadata-pt-baseline"

    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        loaded_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
        loaded_model.eval()
    except Exception as e:
        # Surface the failure in the UI; the caller decides whether to stop.
        st.error(f"Error loading model: {e}")
        return None, None

    return loaded_tokenizer, loaded_model

# ==================== ENTITY EXTRACTION ====================
def extract_entities(text, tokenizer, model):
    """
    Extract entities from ``text`` with a BIO token-classification model.

    Args:
        text: Raw document text. ``None``, empty and whitespace-only input
            yield an empty result without calling the tokenizer.
        tokenizer: Callable tokenizer that supports ``return_offsets_mapping``
            and whose result exposes ``word_ids(batch_index=0)``.
        model: Token-classification model; ``model.config.id2label`` maps the
            predicted class ids to ``O`` / ``B-<LABEL>`` / ``I-<LABEL>`` tags.

    Returns:
        dict mapping each entity label (without the ``B-``/``I-`` prefix) to a
        list of ``{"text", "start", "end"}`` dicts, where ``start``/``end`` are
        character offsets into ``text``. Labels starting with HORARIO, DATA,
        NUMERO-ATA, LOCAL or TIPO-REUNIAO keep only their earliest occurrence.

    Notes:
        The input is truncated to 512 tokens, so text beyond that limit is
        never analysed.
    """
    if not text or not text.strip():
        return {}

    # Tokenization + character offsets for every token
    encoding = tokenizer(
        text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True,
        max_length=512,
    )

    offsets = encoding["offset_mapping"][0].tolist()
    word_ids = encoding.word_ids(batch_index=0)

    inputs = {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
    }

    # Prediction (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(**inputs)

    pred_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()
    id2label = model.config.id2label

    # Entity reconstruction from word-level BIO tags
    entities = []
    current = None          # entity currently being built, if any
    prev_word_idx = None    # word index of the previous non-special token
    word_in_entity = False  # does the word being scanned belong to `current`?

    for (start, end), word_idx, pred_id in zip(offsets, word_ids, pred_ids):
        # Special tokens (CLS, SEP, PAD) carry no word index
        if word_idx is None:
            continue

        # Only the FIRST sub-token of a word decides the word's label.
        # Later sub-tokens just stretch the entity over the rest of the word,
        # and only when that word was actually accepted into the entity.
        # Without this check, a word whose first sub-token was an ignored
        # (mismatched) I- tag would still be absorbed into the open entity.
        if word_idx == prev_word_idx:
            if current is not None and word_in_entity:
                current["end"] = end
            continue

        prev_word_idx = word_idx
        label = id2label[pred_id]

        if label.startswith("B-"):
            # A new entity begins; flush the one in progress first
            if current is not None:
                entities.append(current)
            current = {"label": label[2:], "start": start, "end": end}
            word_in_entity = True

        elif label.startswith("I-"):
            if current is not None and current["label"] == label[2:]:
                # Proper continuation of the open entity
                current["end"] = end
                word_in_entity = True
            else:
                # Loose or mismatched I-: no entity is created and the word is
                # not attached to the open one
                word_in_entity = False

        else:  # label == "O"
            if current is not None:
                entities.append(current)
                current = None
            word_in_entity = False

    # Flush the entity still open at the end of the sequence
    if current is not None:
        entities.append(current)

    # Group entities by label, attaching the covered source text
    grouped_entities = {}
    for ent in entities:
        grouped_entities.setdefault(ent["label"], []).append({
            "text": text[ent["start"]:ent["end"]],
            "start": ent["start"],
            "end": ent["end"],
        })

    # ==================== POST-PROCESSING ====================
    # For certain metadata, keep only the first occurrence (smallest start)
    unique_prefixes = ("HORARIO", "DATA", "NUMERO-ATA", "LOCAL", "TIPO-REUNIAO")

    for label, spans in grouped_entities.items():
        if label.startswith(unique_prefixes):
            grouped_entities[label] = [min(spans, key=lambda span: span["start"])]

    return grouped_entities

# ==================== UTILITY FUNCTIONS ====================
def translate_entity_label(label):
    """Return an ``(emoji, English name)`` pair for a Portuguese entity label.

    Known labels are looked up directly. Otherwise the first table entry whose
    leading segment (the text before its first ``-``) is a prefix of ``label``
    is used. When nothing matches, ``("", label)`` is returned.
    """
    emoji_and_name = {
        "NUMERO-ATA": ("📋", "MINUTES NUMBER"),
        "DATA": ("📅", "DATE"),
        "LOCAL": ("📍", "LOCATION"),
        "TIPO-REUNIAO": ("📌", "MEETING TYPE"),
        "HORARIO-INICIO": ("🕐", "BEGIN TIME"),
        "HORARIO-FIM": ("🕐", "END TIME"),
        "PARTICIPANTE-PRESIDENTE-PRESENTE": ("👔", "PRESIDENT - PRESENT"),
        "PARTICIPANTE-PRESIDENTE-AUSENTE": ("👔", "PRESIDENT - ABSENT"),
        "PARTICIPANTE-PRESIDENTE-SUBSTITUIDO": ("👔", "PRESIDENT - SUBSTITUTED"),
        "PARTICIPANTE-VEREADOR-PRESENTE": ("👥", "COUNCILOR - PRESENT"),
        "PARTICIPANTE-VEREADOR-AUSENTE": ("👥", "COUNCILOR - ABSENT"),
        "PARTICIPANTE-VEREADOR-SUBSTITUIDO": ("👥", "COUNCILOR - SUBSTITUTED"),
    }

    # Exact hit wins
    exact = emoji_and_name.get(label)
    if exact is not None:
        return exact

    # Otherwise fall back to the first entry sharing the base category,
    # or echo the raw label with no emoji
    return next(
        (
            pair
            for key, pair in emoji_and_name.items()
            if label.startswith(key.partition("-")[0])
        ),
        ("", label),
    )

def get_entity_style(label):
    """Return ``(color, border_style, border_width)`` for an entity label.

    Participant labels (containing ``PARTICIPANTE-PRESIDENTE`` or
    ``PARTICIPANTE-VEREADOR``) get the role colour, and the border style
    encodes attendance: solid = present, dashed = absent, dotted = substituted.

    Every other label gets the colour of the longest key in the colour table
    that equals the label or is a ``-``-delimited prefix of it. For example
    ``HORARIO-INICIO`` maps to ``HORARIO``. Looking up only the first segment
    would miss hyphenated keys such as ``NUMERO-ATA`` and ``TIPO-REUNIAO`` and
    leave them on the default gray. Unknown labels use the default gray with a
    solid border.
    """
    # Base colors
    colors = {
        "NUMERO-ATA": "#E74C3C",               # Red
        "DATA": "#16A085",                     # Teal
        "LOCAL": "#2980B9",                    # Blue
        "TIPO-REUNIAO": "#E67E22",             # Orange
        "HORARIO": "#F39C12",                  # Yellow-orange
        "PARTICIPANTE-PRESIDENTE": "#8E44AD",  # Purple for President
        "PARTICIPANTE-VEREADOR": "#27AE60",    # Green for Councilors
    }

    # Border styles for the attendance states, checked in this order
    border_styles = {
        "PRESENTE": "solid",
        "AUSENTE": "dashed",
        "SUBSTITUIDO": "dotted",
    }

    default_color = "#7F8C8D"
    border_style = "solid"
    border_width = "2px"

    # President is checked before Councilor, as both share the same treatment
    participant_key = next(
        (
            key
            for key in ("PARTICIPANTE-PRESIDENTE", "PARTICIPANTE-VEREADOR")
            if key in label
        ),
        None,
    )
    if participant_key is not None:
        for state, style in border_styles.items():
            if state in label:
                border_style = style
                break
        return colors[participant_key], border_style, border_width

    # Other entities: exact key or longest "-"-delimited key prefix
    matching_keys = [
        key for key in colors if label == key or label.startswith(key + "-")
    ]
    color = colors[max(matching_keys, key=len)] if matching_keys else default_color

    return color, border_style, border_width

def display_entities_compact(entities_dict):
    """Render extracted entities as coloured badges grouped by label.

    Args:
        entities_dict: Mapping of entity label to a list of dicts that each
            carry at least a ``"text"`` key (the shape returned by
            ``extract_entities``). A falsy value shows an info message instead.

    Groups are rendered in sorted label order. Each group gets a title built
    from ``translate_entity_label`` and badges styled via ``get_entity_style``.
    """
    # Local import keeps this block self-contained
    from html import escape

    if not entities_dict:
        st.info("No entities detected.")
        return

    for label, entities in sorted(entities_dict.items()):
        emoji, translated_label = translate_entity_label(label)
        color, border_style, border_width = get_entity_style(label)

        # Group title with emoji
        title_html = f'<div class="entity-group-title">{emoji} {translated_label}</div>'
        st.markdown(title_html, unsafe_allow_html=True)

        # All entities of this type share one badge style
        badge_style = f"background-color: {color}; color: white; border: {border_width} {border_style} rgba(0,0,0,0.2);"

        # Entity text is sliced from user-supplied input and rendered with
        # unsafe_allow_html=True, so it must be HTML-escaped to prevent markup
        # or script injection and broken layout.
        badges = "".join(
            f'<span class="entity-badge" style="{badge_style}">{escape(ent["text"])}</span>'
            for ent in entities
        )
        badges_html = f'<div class="entity-group">{badges}</div>'

        st.markdown(badges_html, unsafe_allow_html=True)

# ==================== SIDEBAR ====================
# Sidebar: example picker plus static "About" and "Resources" text.
with st.sidebar:
    st.markdown("### ⚙️ Configuration")
    st.markdown("Choose an example or enter your own text:")
    
    # The chosen option string is compared verbatim in the input column to pick
    # the sample that pre-fills the text area, so these labels must stay in sync
    # with that if/elif chain.
    selected_example = st.selectbox(
        "Select an example",
        ["Custom Text", "Example 1 - Alandroal", "Example 2 - Campo Maior", "Example 3 - Covilhã", "Example 4 - Fundão", "Example 5 - Guimarães", "Example 6 - Porto"]
    )
    
    st.markdown("---")
    
    # About section (static description of the demo)
    st.markdown("### 📋 About")
    st.markdown("""
    **MiNER Stage 2** uses Named Entity Recognition models to automatically extract metadata from meeting minutes.

    - **Model**: BERTimbau fine-tuned
    - **Languages**: Portuguese
    - **Method**: Token Classification (NER) with BIO tagging
    """)
    
    st.markdown("---")
    
    # Resources section (external links; the model link points at the same
    # checkpoint name that load_model() loads)
    st.markdown("### 🔗 Resources")
    st.markdown("""
    - [GitHub Repository](https://github.com/LIAAD/MiNER)
    - [Model](https://huggingface.co/liaad/Citilink-BERTimbau-large-metadata-pt-baseline)
    """)

# ==================== MAIN CONTENT ====================

# Header: title and subtitle rendered as raw HTML so they pick up the
# .main-header / .sub-header classes from the CSS injected at the top.
st.markdown('<div class="main-header">🏷️ MiNER — Stage 2: Metadata Extraction Demo</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Automatic extraction of structured metadata from municipal meeting minutes</div>', unsafe_allow_html=True)

# ==================== HOW IT WORKS ====================
# Collapsed-by-default explainer. It is static documentation only: nothing in
# this expander calls the model.
with st.expander("🎯 How It Works", expanded=False):
    st.markdown("""
    The model analyzes the **meeting minutes** to automatically extract **structured metadata** using a *Named Entity Recognition (NER)* approach.
    
    **What information is extracted:**
    
    Each token in the document is classified, identifying information such as:
    - 📅 **Date**
    - 🕐 **Start / End time**
    - 📍 **Location**
    - 📋 **Minute ID**
    - 📌 **Meeting type**
    - 👔 **President** (present / absent / substituted)
    - 👥 **Councilors** (present / absent / substituted)
    
    **Technical approach:**
    
    The model uses the **BIO tagging scheme** (*Begin, Inside, Outside*) to mark entity boundaries, and the final spans are reconstructed from token-level predictions.
    
    ---
    
    ### 📖 Complete Example
    
    **Input Document:**
    """)
    
    # Sample input shown verbatim in a code block (language=None: no syntax
    # highlighting). The text is the same as "Example 1 - Alandroal" below.
    st.code("""CÂMARA MUNICIPAL DE ALANDROAL
ATA N.º 21
REUNIÃO ORDINÁRIA 11/09/2024
Presidiu o Senhor João Maria Aranha Grilo, Presidente da Câmara Municipal de Alandroal
Vereadores Paulo Jorge da Silva Gonçalves
Fernanda Manuela Brites Romão
Elisabete de Jesus dos Passos Galhardas
Faltou João Carlos Camões Roma Balsante
Secretariou a Reunião ****************************************
No Edifício Sede do Município de Alandroal, o Senhor Presidente da Câmara Municipal, João Maria Aranha Grilo, declarou aberta a reunião, eram 15 horas e 30 minutos.""", language=None)
    
    # Hand-written illustration of the expected result; it is not produced by
    # running extract_entities().
    st.markdown("""
    **Expected Output (Extracted Entities):**
    
    ```
    📅 DATE
       • 11/09/2024
    
    🕐 TIME
       • 15 horas e 30 minutos
    
    📍 LOCATION
       • No Edifício Sede do Município de Alandroal
    
    📋 MINUTES NUMBER
       • 21
    
    📌 MEETING TYPE
       • ORDINÁRIA
    
    👔 PRESIDENT - PRESENT
       • João Maria Aranha Grilo
    
    👥 COUNCILOR - PRESENT
       • Paulo Jorge da Silva Gonçalves
       • Fernanda Manuela Brites Romão
       • Elisabete de Jesus dos Passos Galhardas
    
    👥 COUNCILOR - ABSENT
       • João Carlos Camões Roma Balsante
    ```
    """)

# Load model (load_model is decorated with st.cache_resource); the spinner is
# shown while the call is in progress.
with st.spinner("Loading model..."):
    tokenizer, model = load_model()

# load_model() returns (None, None) when loading raised. In that case report
# the failure and call st.stop() so the input/results columns below, which
# need the model, are not executed.
if tokenizer is None or model is None:
    st.error("❌ Failed to load model. Please check if the model path is correct.")
    st.stop()

# Main layout with two equal-width columns: input (left) and results (right)
col1, col2 = st.columns([1, 1])

# ==================== LEFT COLUMN - INPUT ====================
with col1:
    st.markdown("### 📝 Input Document")
    
    # Choose the text that pre-fills the text area from the sidebar selection.
    # The option strings must match the selectbox entries exactly.
    # NOTE(review): runs of '*' in the samples sit where a name or title would
    # be — presumably anonymised in the source documents; confirm.
    if selected_example == "Custom Text":
        example_text = ""
    elif selected_example == "Example 1 - Alandroal":
        # Same text as the sample shown in the "How It Works" expander
        example_text = """CÂMARA MUNICIPAL DE ALANDROAL
ATA N.º 21
REUNIÃO ORDINÁRIA 11/09/2024
Presidiu o Senhor João Maria Aranha Grilo, Presidente da Câmara Municipal de Alandroal
Vereadores Paulo Jorge da Silva Gonçalves
Fernanda Manuela Brites Romão
Elisabete de Jesus dos Passos Galhardas
Faltou João Carlos Camões Roma Balsante
Secretariou a Reunião ****************************************
No Edifício Sede do Município de Alandroal, o Senhor Presidente da Câmara Municipal, João Maria Aranha Grilo, declarou aberta a reunião, eram 15 horas e 30 minutos."""    
    elif selected_example == "Example 2 - Campo Maior":
        example_text = """ATA Nº 1 REUNIÃO ORDINÁRIA DA CÂMARA MUNICIPAL DE CAMPO MAIOR, REALIZADA EM 5 DE JANEIRO DE 2022.
Aos cinco dias do mês de janeiro do ano de dois mil e vinte e dois, no Edifício dos Paços do Concelho, nesta Vila, realizou-se, pelas nove horas e trinta minutos, a reunião Ordinária da Câmara Municipal, comparecendo os Excelentíssimos Senhores Luís Fernando Martins Rosinha, Paulo Ivo Sabino Martins de Almeida, Paulo Jorge Furtado Pinheiro, Maria da Encarnação Grifo Silveirinha (videoconferência) e Fátima do Rosário Pingo Vitorino Pereira, respetivamente, Presidente e Vereadores efetivos deste Órgão Autárquico.
-Verificada a presença dos respectivos membros, o Senhor Presidente declarou aberta a reunião:
-Estava presente o Chefe **************************************, Dr. *********************************** e a Assistente Técnico **************************************.
-Depois de todos terem ocupado os seus lugares, o Senhor Presidente declarou aberta a reunião eram nove horas e trinta minutos."""
    elif selected_example == "Example 3 - Covilhã":
        example_text = """-- --

-- --

CÂMARA MUNICIPAL

DA

COVILHÃ

TEXTO DEFINITIVO DA ATA Nº 02/2023

Da reunião ordinária privada realizada no dia 03 de fevereiro de 2023, iniciada às 09:05 horas e concluída às 10:15 horas.

------------------------------------ Sumário:                        01 ------------------------------- ---- Abertura                        02

Período Antes da Ordem do Dia   05

Período da Ordem do Dia         06

Agenda                          06

Aprovação de Atas               06

Balancete                       07

Despachos                       07

DAGCJ                           10

DFMA                            17

DOP                             19

DECAD                           29

DU                              38

Aprovação em minuta             42 Votação das deliberações        42 Montante Global de Encargos     42 Encerramento                    42

------------------------------------

ABERTURA

ATA Nº 02/2023

Aos três dias do mês de fevereiro do ano de dois mil e vinte e três, na Sala de Reuniões dos Paços do Concelho, na Covilhã, realizou-se a reunião ordinária privada da Câmara Municipal da Covilhã sob a presidência do Senhor Presidente da Câmara, Vítor Manuel Pinheiro Pereira, estando presentes o Senhor Vice-Presidente José Armando Serra dos Reis e os Senhores Vereadores Pedro Miguel Santos Farromba, Maria Regina Gomes Gouveia, Jorge Humberto Martins Simões (em substituição de Ricardo Miguel Correia Leitão Ferreira da Silva), José Miguel Ribeiro Oliveira e Marta Maria Tomaz Gomes Morais Alçada Bom Jesus.

A reunião foi secretariada pela Senhora Dr.ª ********************************, Diretora *************************************************************.

E, pelas 09:05 horas, o Senhor Presidente da Câmara deu início aos trabalhos da presente reunião com a seguinte Ordem de Trabalhos:"""
    elif selected_example == "Example 4 - Fundão":
        example_text = """ 14/02/2022 ATA DA REUNIÃO DE 14/02/2022 CÂMARA MUNICIPAL
DO
FUNDÃO
Texto definitivo da ata n.º 2/2022 da reunião ordinária realizada no dia 14 de fevereiro de 2022, iniciada às 17:00 horas e concluída às 19:00.
ATA N.º 2/2022 Aos catorze dias do mês fevereiro do ano dois mil e vinte e dois, realizou-se por videoconferência, a reunião ordinária privada da Câmara Municipal do Fundão, sob a presidência do Senhor Presidente da Câmara, Dr. Paulo Alexandre Bernardo Fernandes, com a participação do Senhor Vice-presidente, Dr. Luís Miguel Roque Tarouca Duarte Gavinhos e dos Senhores Vereadores, Dra. Joana Morgadinho Bento, Dra. Maria Alcina Domingues Cerdeira, Dr. Pedro Manuel Figueiredo Neto, Prof. Sérgio Miguel Cardoso Mendes e Dra. Ana Paula Coelho Duarte.
A reunião foi secretariada pela Dra. ****************************, Diretora *******************************************."""
    elif selected_example == "Example 5 - Guimarães":
        example_text = """Câmara Municipal de Guimarães.
 ATA Nº 1 Fls. __10__ REUNIÃO ORDINÁRIA DE 13 DE JANEIRO DE 2022 
ATA
Aos treze dias do mês de janeiro do ano de dois mil e vinte e dois, no Edifício dos Paços do Concelho, na Sala de Reuniões, compareceram os Excelentíssimos Senhores: Presidente da Câmara – Domingos Bragança Salgado e Vereadores – Adelina Paula Mendes Pinto, Paulo Rui Lopes Pereira da Silva, Paula Cristina dos Santos Oliveira, Nelson José Guimarães Felgueiras, Alice Sofia de Freitas Soares Ferreira Fernandes, Ana Maria Prego de Faria Berkeley Cotter, Bruno Alberto Vieira Fernandes, Ricardo José Machado Pereira da Silva Araújo, Vânia Carvalho Dias da Silva de Antas de Barros e Hugo Miguel Alves Ribeiro. 
Secretariou a Diretora ***************, **************************************. 
Pelas 10.10 horas foi declarada aberta a reunião."""
    elif selected_example == "Example 6 - Porto":
        example_text = """2.ª REUNIÃO PÚBLICA,
DA CÂMARA MUNICIPAL DO PORTO
REALIZADA EM 8 DE NOVEMBRO DE 2021
ÀS 10 HORAS
PRESENTES:
-   Rui de Carvalho de Araújo Moreira
-   Filipe Manuel Ventura Camões de Almeida Araújo
-   Ana Catarina da Rocha Araújo
-   Ricardo Miguel Araújo Cardoso Valente
-   Albino Pedro Pereira Baganha
-   Cristina Mafalda Nieto Guimarães Pimentel
-   Tiago Barbosa Ribeiro
-   Maria do Rosário Gambôa Lopes de Carvalho
-   Catarina Maria da Costa Santos Cunha Pereira de Abreu
-   Vladimiro Mota Cardoso Feliz
-   Alberto Amaro Guedes Machado
-   Maria Ilda da Costa Figueiredo
-   Sérgio Augusto Leite Aires
Secretariou a reunião a Técnica ********, ***************."""
    else:
        # Fallback for an option string not handled above; none of the
        # options currently offered by the sidebar selectbox reach this branch.
        example_text = "Add your text here"
    
    st.markdown(f"**Example:** {selected_example}")
    
    # The text area's return value (not example_text) is what gets processed,
    # so the user may edit a pre-filled sample before running extraction.
    text_input = st.text_area(
        "Type or paste the text here:",
        value=example_text,
        height=400,
        placeholder="Enter the meeting minutes or administrative document text..."
    )
    
    # The button's return value gates the extraction in the results column
    process = st.button("🔍 Segment Document")

# ==================== RIGHT COLUMN - RESULTS ====================
with col2:
    st.markdown("### 📊 Segmentation Results")

    if not process:
        # Button not pressed on this run: show the usage hint only
        st.info("👈 Enter text in the input box and click 'Segment Document' to begin.")
    elif not text_input.strip():
        # Button pressed but the text area is empty or whitespace-only
        st.warning("⚠️ Please enter some text to process.")
    else:
        with st.spinner("Processing text..."):
            # Run the model on the current contents of the text area
            entities = extract_entities(text_input, tokenizer, model)

            if not entities:
                st.warning("⚠️ No entities were detected in the text.")
            else:
                st.markdown("#### Detected Entities:")
                display_entities_compact(entities)

                # ==================== JSON EXPORT ====================
                st.markdown("---")

                # Collapsible raw view of the grouped entities
                with st.expander("📄 View complete JSON"):
                    st.json(entities)

                # Same data offered as a downloadable file; ensure_ascii=False
                # keeps accented characters readable in the output
                json_str = json.dumps(entities, ensure_ascii=False, indent=2)
                st.download_button(
                    label="⬇️ Download JSON",
                    data=json_str,
                    file_name="extracted_entities.json",
                    mime="application/json",
                )