Spaces:

Quivara
/

alisto-project

Running

File size: 4,206 Bytes

bdb271a

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re

# Load the NER model once at import time so every call to extract_entities
# reuses the same tokenizer, model and pipeline objects.
model_name = "Davlan/xlm-roberta-base-ner-hrl"
# loads the tokenizer for the multilingual NER model
# NOTE(review): force_download=True asks from_pretrained to fetch the files
# again instead of reusing a cached copy, so every import hits the network --
# confirm this is intentional (e.g. a workaround for a corrupted cache).
tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True)
# loads the pre-trained token classification model (the NER engine)
model = AutoModelForTokenClassification.from_pretrained(model_name, force_download=True)
# creates a Hugging Face pipeline for easy entity recognition;
# extract_entities reads the 'entity_group' and 'word' keys of each result
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Pattern for a name following a contact/name label. The lazy capture stops at
# the next field label, a mobile-number prefix (09xx), a newline or end of text.
_CONTACT_NAME_RE = re.compile(
    r'(?:contact\s*person|contact\s*name|contact|name):\s*(.*?)(?=\s*(?:Mobile|Number|Needs|09\d{2}|[A-Z]{3,}:|\n|$))',
    re.IGNORECASE | re.DOTALL
)
# Trailing parenthetical such as "(Ria)" that follows the actual name.
_TRAILING_PAREN_RE = re.compile(r'\s*\(.*\)\s*$')


# function to extract the contact person's name using Regex
def extract_contact_person_name(text):
    """Return the contact person's name found after a contact/name label.

    Args:
        text: Free-form message text. Falsy values (``None``, ``""``) are
            accepted and yield ``None``.

    Returns:
        The name with a trailing parenthetical nickname and trailing periods
        removed, or ``None`` when no label is present or nothing usable
        follows it (e.g. ``"Contact: 0917..."`` where the label is followed
        directly by a number).
    """
    if not text:
        return None

    match = _CONTACT_NAME_RE.search(text)
    if match is None:
        return None

    # Drop a trailing period first so that "Maria (Ria)." still ends with the
    # parenthetical and the nickname pattern below can anchor on it.
    name = match.group(1).strip().rstrip('.').rstrip()
    name = _TRAILING_PAREN_RE.sub('', name).strip().rstrip('.').strip()

    # An empty capture means "no name", not an empty name.
    return name or None

# Phone-number pattern. Alternatives are tried in order at each position:
#   1. international mobile with optional space/dash separators
#      (+63 917 123 4567, 63-917-123-4567, +639171234567)
#   2. local mobile with optional separators (0917 123 4567, 0917-123-4567)
#   3. any run of 10 digits, optionally prefixed by +63 / 63
_PHONE_RE = re.compile(
    r'(?:\+63|63)\s?-?\s?9\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}'
    r'|09\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}'
    r'|(?:\+63|63)?\d{10}'
)


# function to extract mobile or landline phone numbers using Regex
def extract_contact_number(text):
    """Return the first phone number found in ``text``.

    Args:
        text: Free-form message text. Falsy values (``None``, ``""``) are
            accepted and yield ``None``.

    Returns:
        The matched number exactly as written in the text (separators are
        kept), or ``None`` when no number is found.
    """
    if not text:
        return None

    match = _PHONE_RE.search(text)
    if match is None:
        return None
    return match.group(0).strip()

# Upper-case keywords often incorrectly classified as locations by the NER
# model; extract_entities drops any LOC entity whose text matches one of them.
NON_LOC_KEYWORDS = ["S.O.S", "PLS", "HELP", "URGENT", "ALERT", "LOCATION", "ADDRESS", "ASAP"]

# Pattern for address text following a location label. The lazy capture stops
# at a newline, an opening parenthesis, a following field label, a filler word
# or end of text. Stop words are wrapped in \b so they only match as whole
# words -- otherwise "kami" would truncate a place name such as "Kamias".
_ADDRESS_LINE_RE = re.compile(
    r'(?:location|address|loc|near):\s*(.*?)'
    r'(?=\s*\n|\s*\(|\s*\b(?:Contact|Mobile|Number|kami|yung|bahay|kmi)\b|\s*\bnear\s|$)',
    re.IGNORECASE
)


# function to extract the detailed address line by prioritizing explicit labels
def extract_address_line(text):
    """Return the address text that follows an explicit location label.

    Recognised labels are ``location:``, ``address:``, ``loc:`` and ``near:``
    (case-insensitive).

    Args:
        text: Free-form message text. Falsy values (``None``, ``""``) are
            accepted and yield ``None``.

    Returns:
        The captured address with surrounding whitespace and trailing periods
        removed, or ``None`` when no label is found or nothing follows it.
    """
    if not text:
        return None

    match = _ADDRESS_LINE_RE.search(text)
    if match is None:
        return None

    address = match.group(1).strip().rstrip('.')
    # A label with nothing after it is "no address", not an empty address.
    return address or None

# main function that orchestrates all entity extraction and filtering
def extract_entities(text):
    """Extract locations, contact number and contact person name from a message.

    Combines the regex helpers with the module-level NER pipeline ``nlp``.

    Args:
        text: Free-form message text passed to the regex helpers and to the
            NER pipeline.

    Returns:
        A dict with the keys:
            ``"locations"``: de-duplicated list of location strings; the
                explicitly labelled address (if any) is always first.
            ``"contact"``: result of ``extract_contact_number``.
            ``"contact_person_name"``: result of ``extract_contact_person_name``.
            ``"raw"``: the unmodified output of the NER pipeline.
    """
    # 1. Attempt to extract high-confidence address line (Regex)
    explicit_address = extract_address_line(text)

    # 2. Run general NER extraction
    results = nlp(text)

    # Candidates are compared upper-cased with dots removed, so the keywords
    # must be normalised the same way; otherwise a dotted entry such as
    # "S.O.S" could never match its candidate form "SOS".
    noise_keywords = {kw.upper().replace('.', '') for kw in NON_LOC_KEYWORDS}

    locations = []
    for entity in results:
        # only entities the pipeline grouped under the 'LOC' label are kept
        if entity['entity_group'] != 'LOC':
            continue

        clean_loc = entity['word'].replace("##", "").strip()

        # skips locations that are too short to be meaningful
        if len(clean_loc) <= 2:
            continue

        # skips locations matching known non-location keywords (e.g., S.O.S.)
        if clean_loc.upper().replace('.', '') in noise_keywords:
            continue

        if clean_loc not in locations:
            locations.append(clean_loc)

    # 3. Prioritize the explicit address if found (always index 0), without
    # listing it twice when the NER model returned the identical string.
    if explicit_address:
        if explicit_address in locations:
            locations.remove(explicit_address)
        locations.insert(0, explicit_address)

    # 4. Extract Contact Number
    contact_number = extract_contact_number(text)

    # 5. Extract Contact Person Name
    contact_name = extract_contact_person_name(text)

    # returns all extracted data in a dictionary format
    return {
        "locations": locations,
        "contact": contact_number,
        "contact_person_name": contact_name,
        "raw": results
    }