"""Extract locations, a contact number and a contact name from free text.

Locations come from two sources: a regex that looks for an explicitly
labelled address line ("Address: ...", "Location: ...") and a Hugging Face
token-classification (NER) pipeline.  Contact details are regex-only.

The NER model is loaded lazily, on the first call that needs it, and then
cached for the life of the process.  Importing this module therefore does
not download anything, and the regex helpers work even when
``transformers`` is not installed.  The names ``nlp``, ``model`` and
``tokenizer`` remain available as module attributes (``module.nlp`` or
``from module import nlp``); accessing one of them triggers the load.
Code inside this module must call ``_load_ner_components()`` instead,
because module ``__getattr__`` does not apply to bare global lookups.
"""

import re
from functools import lru_cache
from typing import Any, Callable, Dict, List, Optional

# Hugging Face checkpoint used for multilingual NER.
model_name = "Davlan/xlm-roberta-base-ner-hrl"

# Keywords often incorrectly classified as locations by the NER model.
# Compared case-insensitively and with periods ignored (see
# _normalize_keyword), so "S.O.S", "s.o.s." and "SOS" are all filtered.
NON_LOC_KEYWORDS = ["S.O.S", "PLS", "HELP", "URGENT", "ALERT", "LOCATION", "ADDRESS", "ASAP"]

# Name following a contact/name label, ending at the next field label,
# a phone number, a newline or the end of the text.
_NAME_PATTERN = re.compile(
    r'(?:contact\s*person|contact\s*name|contact|name):\s*(.*?)'
    r'(?=\s*(?:Mobile|Number|Needs|09\d{2}|[A-Z]{3,}:|\n|$))',
    re.IGNORECASE | re.DOTALL,
)

# Trailing parenthetical, e.g. the nickname in "Juan Dela Cruz (Jun)".
_TRAILING_PAREN_PATTERN = re.compile(r'\s*\(.*\)\s*$')

# Phone numbers, in priority order at any given position:
#   1. local mobile, optionally separated:       0917 123 4567 / 0917-123-4567
#   2. international mobile, optionally separated: +63 917 123 4567
#   3. any run of 10 digits with an optional +63/63 prefix
_PHONE_PATTERN = re.compile(
    r'(09\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}'
    r'|\+?63\s?-?\s?9\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}'
    r'|(?:\+63|63)?\d{10})'
)

# Address text following a location label.  The capture ends at a newline,
# an opening parenthesis, the next field label, or one of a few filler
# words.  The leading \b keeps those stop-words from firing in the middle
# of a place name (e.g. "yung" inside "Payungan").
_ADDRESS_PATTERN = re.compile(
    r'(?:location|address|loc|near):\s*(.*?)'
    r'(?=\s*\n|\s*\(|\s*\b(?:Contact|Mobile|Number|kami|yung|bahay|kmi|near\s)|$)',
    re.IGNORECASE,
)

# Position of each lazily provided module attribute in the tuple returned
# by _load_ner_components().
_LAZY_NER_ATTRIBUTES = {"tokenizer": 0, "model": 1, "nlp": 2}


@lru_cache(maxsize=None)
def _load_ner_components():
    """Load and cache the tokenizer, model and NER pipeline.

    The import is local so that the rest of the module stays usable without
    ``transformers``.  ``force_download`` is deliberately not set: with it,
    every process start re-downloaded the full checkpoint even when a valid
    copy was already in the local cache.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple.
    """
    from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

    ner_tokenizer = AutoTokenizer.from_pretrained(model_name)
    ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner_pipeline = pipeline(
        "ner",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple",
    )
    return ner_tokenizer, ner_model, ner_pipeline


def __getattr__(name):
    """Provide ``nlp``, ``model`` and ``tokenizer`` on first access (PEP 562)."""
    index = _LAZY_NER_ATTRIBUTES.get(name)
    if index is None:
        raise AttributeError(
            "module {!r} has no attribute {!r}".format(__name__, name)
        )
    return _load_ner_components()[index]


def _normalize_keyword(value):
    """Return *value* upper-cased with periods removed, for noise matching."""
    return value.upper().replace('.', '')


def extract_contact_person_name(text: str) -> Optional[str]:
    """Return the contact person's name found after a contact/name label.

    Labels are tried in order of appearance.  A label whose value is empty
    (for example "Contact: 0917..." where a number follows directly) is
    skipped so that a later "Name: ..." can still be picked up.

    Args:
        text: The message to search.

    Returns:
        The name with a trailing parenthetical and trailing periods removed,
        or ``None`` when no label carries a non-empty name.
    """
    for match in _NAME_PATTERN.finditer(text):
        name = match.group(1).strip()
        name = _TRAILING_PAREN_PATTERN.sub('', name).strip().rstrip('.')
        if name:
            return name
    return None


def extract_contact_number(text: str) -> Optional[str]:
    """Return the first phone number found in *text*, exactly as written.

    Args:
        text: The message to search.

    Returns:
        The matched number (separators preserved), or ``None``.
    """
    match = _PHONE_PATTERN.search(text)
    if match:
        return match.group(0).strip()
    return None


def extract_address_line(text: str) -> Optional[str]:
    """Return the address text that follows an explicit location label.

    Recognised labels are "location:", "address:", "loc:" and "near:".
    Labels with an empty value are skipped in favour of a later one.

    Args:
        text: The message to search.

    Returns:
        The address with surrounding whitespace and trailing periods
        removed, or ``None`` when no label carries a non-empty value.
    """
    for match in _ADDRESS_PATTERN.finditer(text):
        address = match.group(1).strip().rstrip('.')
        if address:
            return address
    return None


def extract_entities(
    text: str,
    *,
    ner_pipeline: Optional[Callable[[str], List[Dict[str, Any]]]] = None
) -> Dict[str, Any]:
    """Extract locations and contact details from *text*.

    Args:
        text: The message to analyse.
        ner_pipeline: Optional callable that takes the text and returns a
            list of entity dicts with at least ``'entity_group'`` and
            ``'word'`` keys.  Defaults to the module's cached Hugging Face
            pipeline, which is loaded on first use.

    Returns:
        A dict with:
            ``"locations"``: unique location strings; the explicitly
                labelled address, when present, is always first.
            ``"contact"``: phone number or ``None``.
            ``"contact_person_name"``: contact name or ``None``.
            ``"raw"``: the unmodified output of the NER pipeline.
    """
    # 1. High-confidence address line from an explicit label (regex).
    explicit_address = extract_address_line(text)

    # 2. General NER extraction.
    if ner_pipeline is None:
        ner_pipeline = _load_ner_components()[2]
    results = ner_pipeline(text)

    # Built per call so that runtime edits to NON_LOC_KEYWORDS are honoured.
    noise = {_normalize_keyword(keyword) for keyword in NON_LOC_KEYWORDS}

    locations = []
    for entity in results:
        # 'LOC' is the label for locations in this NER model.
        if entity['entity_group'] != 'LOC':
            continue
        clean_loc = entity['word'].replace("##", "").strip()
        # Skip fragments that are too short to be a useful place name.
        if len(clean_loc) <= 2:
            continue
        # Skip generic distress/label words mis-tagged as locations.
        if _normalize_keyword(clean_loc) in noise:
            continue
        if clean_loc not in locations:
            locations.append(clean_loc)

    # 3. The explicit address is always the primary location; drop any
    #    identical NER hit so it is not listed twice.
    if explicit_address:
        locations = [explicit_address] + [
            loc for loc in locations if loc != explicit_address
        ]

    # 4. Contact number and 5. contact person name (regex only).
    contact_number = extract_contact_number(text)
    contact_name = extract_contact_person_name(text)

    return {
        "locations": locations,
        "contact": contact_number,
        "contact_person_name": contact_name,
        "raw": results,
    }