# NOTE: this file was extracted from a hosted file viewer. The page header
# originally read "Spaces: Running", "File size: 4,206 Bytes", and the marker
# "bdb271a" (presumably the revision id), followed by the viewer's line-number
# gutter (1-105). None of that text is part of the program.
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
# Load the NER model once, at import time, so every call to the extraction
# functions below reuses the same pipeline object.
model_name = "Davlan/xlm-roberta-base-ner-hrl"
# Tokenizer matching the checkpoint above.
# NOTE(review): force_download=True asks the library to fetch the files again
# instead of reusing a cached copy, so every import pays the download cost —
# confirm this is intentional (e.g. a workaround for a corrupted cache).
tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True)
# Pre-trained token-classification model (the NER engine); same
# force_download caveat as the tokenizer.
model = AutoModelForTokenClassification.from_pretrained(model_name, force_download=True)
# Hugging Face "ner" pipeline built from the model and tokenizer.
# extract_entities() reads 'entity_group' and 'word' from each result item.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Labelled-name pattern: a "contact person:", "contact name:", "contact:" or
# "name:" label (case-insensitive), then the shortest run of text up to a stop
# marker: another field keyword, the start of an 09xx mobile number, a label of
# three or more letters followed by ":", a newline, or the end of the text.
_CONTACT_NAME_PATTERN = re.compile(
    r'(?:contact\s*person|contact\s*name|contact|name):\s*(.*?)(?=\s*(?:Mobile|Number|Needs|09\d{2}|[A-Z]{3,}:|\n|$))',
    re.IGNORECASE | re.DOTALL
)
# A parenthetical (e.g. a nickname) sitting at the very end of the captured name.
_TRAILING_PARENTHETICAL = re.compile(r'\s*\(.*\)\s*$')


def extract_contact_person_name(text):
    """Return the contact person's name found after a name/contact label.

    Every labelled occurrence is tried in order and the first one that yields
    a non-empty name wins, so a label that is immediately followed by a phone
    number (``"Contact: 0917..."``) does not hide a later ``"Name: ..."``.

    Args:
        text: Free-form message text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned name (trailing periods and a trailing parenthetical
        removed), or ``None`` when no label produces a non-empty name.
    """
    if not text:
        return None
    for match in _CONTACT_NAME_PATTERN.finditer(text):
        # Drop trailing periods *before* looking for the parenthetical:
        # otherwise "Maria (Marie)." keeps its nickname, because the pattern
        # is anchored to the end of the string and the "." gets in the way.
        name = match.group(1).strip().rstrip('.').rstrip()
        name = _TRAILING_PARENTHETICAL.sub('', name).strip().rstrip('.').rstrip()
        if name:
            return name
    return None
# Phone-number pattern, tried left to right at each position:
#   1. local mobile format 09XX XXX XXXX, with optional single spaces/hyphens;
#   2. international mobile format +63 / 63 followed by 9XX XXX XXXX, with the
#      same optional separators (must not be glued to a preceding digit);
#   3. fallback: an optional +63 / 63 prefix followed by 10 consecutive digits.
_PHONE_PATTERN = re.compile(
    r'09\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}'
    r'|(?<!\d)\+?63\s?-?\s?9\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}'
    r'|(?:\+63|63)?\d{10}'
)


def extract_contact_number(text):
    """Return the first phone number found in ``text``.

    Recognises Philippine mobile numbers written locally (``0917-123-4567``)
    or internationally (``+639171234567``, ``+63 917 123 4567``), and falls
    back to any run of 10 consecutive digits.

    Args:
        text: Free-form message text. ``None`` or an empty string is accepted.

    Returns:
        The matched number exactly as written in the text (separators kept),
        or ``None`` when nothing matches.
    """
    if not text:
        return None
    match = _PHONE_PATTERN.search(text)
    return match.group(0).strip() if match else None
# Upper-case keywords that the NER model often tags as locations even though
# they are not places. extract_entities() upper-cases a candidate and strips
# its periods before the membership test, so only period-free spellings can
# ever match: "SOS" is the form that is actually compared, while "S.O.S" is
# kept for anything that reads this list directly.
NON_LOC_KEYWORDS = ["S.O.S", "SOS", "PLS", "HELP", "URGENT", "ALERT", "LOCATION", "ADDRESS", "ASAP"]
# Labelled-address pattern: a "location:", "address:", "loc:" or "near:" label
# (case-insensitive), then the shortest run of text up to a stop marker: a
# newline, an opening parenthesis, one of the listed field/filler words, a
# following " near ", or the end of the text.
_ADDRESS_PATTERN = re.compile(
    r'(?:location|address|loc|near):\s*(.*?)(?=\s*\n|\s*\(|\s*Contact|\s*Mobile|\s*Number|\s*kami|\s*yung|\s*bahay|\s*kmi|\s*near\s|$)',
    re.IGNORECASE
)


def extract_address_line(text):
    """Return the address text that follows an explicit location label.

    Every labelled occurrence is tried in order and the first one that yields
    a non-empty address wins, so a label followed directly by a stop marker
    (``"Location: (flooded)"``) does not hide a later ``"Address: ..."``.

    Args:
        text: Free-form message text. ``None`` or an empty string is accepted.

    Returns:
        The address with surrounding whitespace and trailing periods removed,
        or ``None`` when no label produces a non-empty address.
    """
    if not text:
        return None
    for match in _ADDRESS_PATTERN.finditer(text):
        address = match.group(1).strip().rstrip('.').rstrip()
        if address:
            return address
    return None
def extract_entities(text):
    """Extract locations, a contact number and a contact name from ``text``.

    Combines the regex helpers in this module with the module-level NER
    pipeline ``nlp``.

    Args:
        text: Free-form message text passed to the regex helpers and to
            ``nlp``.

    Returns:
        A dict with four keys:

        * ``"locations"``: list of unique location strings; the regex-found
          address, when there is one, is always first.
        * ``"contact"``: result of ``extract_contact_number(text)``.
        * ``"contact_person_name"``: result of
          ``extract_contact_person_name(text)``.
        * ``"raw"``: the unmodified output of ``nlp(text)``.
    """
    # 1. High-confidence address taken from an explicit label (regex).
    explicit_address = extract_address_line(text)

    # 2. General NER pass over the whole text.
    results = nlp(text)

    # Normalise the keywords exactly like the candidates below (upper-case,
    # periods removed); otherwise an entry such as "S.O.S" can never match.
    blocked = {keyword.upper().replace('.', '') for keyword in NON_LOC_KEYWORDS}

    locations = []
    for entity in results:
        # Only entity groups labelled 'LOC' are treated as locations.
        if entity['entity_group'] != 'LOC':
            continue
        # Remove any '##' sub-word markers left in the aggregated word.
        clean_loc = entity['word'].replace("##", "").strip()
        # Skip fragments that are too short to be a useful place name.
        if len(clean_loc) <= 2:
            continue
        # Skip generic noise keywords (e.g. "S.O.S.", "HELP").
        if clean_loc.upper().replace('.', '') in blocked:
            continue
        if clean_loc not in locations:
            locations.append(clean_loc)

    # 3. The explicit address is always the primary (first) location. If NER
    #    produced the identical string, move it instead of duplicating it.
    if explicit_address:
        if explicit_address in locations:
            locations.remove(explicit_address)
        locations.insert(0, explicit_address)

    # 4-5. Contact number and contact person name (regex).
    contact_number = extract_contact_number(text)
    contact_name = extract_contact_person_name(text)

    return {
        "locations": locations,
        "contact": contact_number,
        "contact_person_name": contact_name,
        "raw": results,
    }