Spaces:
Running
Running
| from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
| import re | |
# Load the NER stack once at import time so every request reuses it.
# Hub id of the checkpoint shared by the tokenizer and the model below.
model_name = "Davlan/xlm-roberta-base-ner-hrl"

# NOTE(review): force_download=True is passed to both loaders; per the
# transformers docs this re-fetches the files instead of reusing the local
# cache -- confirm that re-downloading on every start is intended.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    force_download=True,
)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    force_download=True,
)

# Module-level NER pipeline used by extract_entities(); each result is read
# there through its 'entity_group' and 'word' keys.
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
| # function to extract the contact person's name using Regex | |
# A contact/name label, a colon, then the lazily captured name. The capture
# stops before a phone-ish token, the next "LABEL:" or the end of the line.
_CONTACT_NAME_PATTERN = re.compile(
    r'(?:contact\s*person|contact\s*name|contact|name):\s*(.*?)(?=\s*(?:Mobile|Number|Needs|09\d{2}|[A-Z]{3,}:|\n|$))',
    re.IGNORECASE | re.DOTALL,
)
# A parenthetical (e.g. a nickname) sitting at the very end of the name.
_TRAILING_PARENTHETICAL = re.compile(r'\s*\(.*\)\s*$')


def extract_contact_person_name(text):
    """Return the contact person's name found after a contact/name label.

    Labels recognised (case-insensitively) are ``contact person:``,
    ``contact name:``, ``contact:`` and ``name:``. Every labelled occurrence
    is tried in order and the first one that yields a non-empty name wins, so
    ``"Contact: 0917...\\nName: Juan"`` resolves to ``"Juan"`` instead of an
    empty string.

    Args:
        text: Free-form message text. Non-string or empty input is accepted.

    Returns:
        The cleaned name (trailing periods and a trailing parenthetical
        removed), or ``None`` when no labelled, non-empty name is present.
    """
    if not isinstance(text, str) or not text:
        return None

    for match in _CONTACT_NAME_PATTERN.finditer(text):
        # Drop sentence-ending periods *before* looking for the parenthetical;
        # otherwise "Maria (Mia)." keeps its nickname because ")" is not last.
        name = match.group(1).strip().rstrip('.').rstrip()
        name = _TRAILING_PARENTHETICAL.sub('', name).strip().rstrip('.').rstrip()
        if name:
            return name
    return None
| # function to extract mobile or landline phone numbers using Regex | |
# Alternatives are tried in this order at each position, so anything the first
# two (original) forms matched is still returned unchanged.
_PHONE_PATTERN = re.compile(
    # Local mobile "09XX XXX XXXX" with optional spaces/hyphens between groups.
    r'09\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}'
    # Ten unbroken digits, optionally prefixed by the +63 / 63 country code.
    r'|(?:\+63|63)?\d{10}'
    # Country code followed by a separated mobile number: "+63 917 123 4567".
    r'|\+?63[\s-]?9\d{2}[\s-]?\d{3}[\s-]?\d{4}'
)


def extract_contact_number(text):
    """Return the first Philippine-style phone number found in ``text``.

    Args:
        text: Free-form message text. Non-string or empty input is accepted.

    Returns:
        The matched number exactly as written in the text (separators kept),
        or ``None`` when nothing matches.
    """
    if not isinstance(text, str) or not text:
        return None

    match = _PHONE_PATTERN.search(text)
    if match is None:
        return None
    return match.group(0).strip()
| # list of keywords often incorrectly classified as locations by the NER model | |
# extract_entities() upper-cases a candidate and strips its periods before the
# membership test, so the undotted "SOS" is listed alongside "S.O.S".
NON_LOC_KEYWORDS = [
    "S.O.S",
    "SOS",
    "PLS",
    "HELP",
    "URGENT",
    "ALERT",
    "LOCATION",
    "ADDRESS",
    "ASAP",
]
| # function to extract the detailed address line by prioritizing explicit labels | |
# A location label, a colon, then the lazily captured address. The capture
# stops before a newline, an opening parenthesis, a contact label, one of the
# listed filler words, or the end of the text.
_ADDRESS_PATTERN = re.compile(
    r'(?:location|address|loc|near):\s*(.*?)(?=\s*\n|\s*\(|\s*Contact|\s*Mobile|\s*Number|\s*kami|\s*yung|\s*bahay|\s*kmi|\s*near\s|$)',
    re.IGNORECASE,
)


def extract_address_line(text):
    """Return the address written after an explicit location label.

    Labels recognised (case-insensitively) are ``location:``, ``address:``,
    ``loc:`` and ``near:``. Every labelled occurrence is tried in order and
    the first non-empty one is returned, so a label with nothing after it
    (``"Address:\\nContact: ..."``) no longer yields an empty string.

    Args:
        text: Free-form message text. Non-string or empty input is accepted.

    Returns:
        The address with surrounding whitespace and trailing periods removed,
        or ``None`` when no labelled, non-empty address is present.
    """
    if not isinstance(text, str) or not text:
        return None

    for match in _ADDRESS_PATTERN.finditer(text):
        address = match.group(1).strip().rstrip('.')
        if address:
            return address
    return None
| # main function that orchestrates all entity extraction and filtering | |
def extract_entities(text):
    """Extract locations, a contact number and a contact name from ``text``.

    Combines the regex helpers in this module with the module-level ``nlp``
    NER pipeline. An explicitly labelled address, when present, is always the
    first entry of ``locations``.

    Args:
        text: Free-form message text. Non-string or blank input short-circuits
            to an empty result without invoking the model.

    Returns:
        A dict with the keys:
            ``locations``: de-duplicated location strings, explicit address
                first.
            ``contact``: result of ``extract_contact_number`` (or ``None``).
            ``contact_person_name``: result of ``extract_contact_person_name``
                (or ``None``).
            ``raw``: the unmodified output of the NER pipeline.
    """
    if not isinstance(text, str) or not text.strip():
        # Nothing to analyse; skip the regex helpers and the model call.
        return {
            "locations": [],
            "contact": None,
            "contact_person_name": None,
            "raw": [],
        }

    # 1. High-confidence address taken from an explicit label.
    explicit_address = extract_address_line(text)

    # 2. General NER pass.
    results = nlp(text)

    # Normalise the keywords the same way as the candidates (upper-case, no
    # periods); otherwise a dotted entry such as "S.O.S" can never match.
    blocked = {kw.upper().replace('.', '') for kw in NON_LOC_KEYWORDS}

    locations = []
    for entity in results:
        if entity['entity_group'] != 'LOC':
            continue
        clean_loc = entity['word'].replace("##", "").strip()
        # Skip fragments of one or two characters.
        if len(clean_loc) <= 2:
            continue
        # Skip known noise words that the model tags as locations.
        if clean_loc.upper().replace('.', '') in blocked:
            continue
        if clean_loc not in locations:
            locations.append(clean_loc)

    # 3. The labelled address always leads; drop an identical NER hit so the
    #    same string is not listed twice.
    if explicit_address:
        if explicit_address in locations:
            locations.remove(explicit_address)
        locations.insert(0, explicit_address)

    # 4./5. Contact number and contact person name.
    contact_number = extract_contact_number(text)
    contact_name = extract_contact_person_name(text)

    return {
        "locations": locations,
        "contact": contact_number,
        "contact_person_name": contact_name,
        "raw": results,
    }