Spaces:

Quivara
/

alisto-project

Running

App Files Files Community

alisto-project / alisto_project /backend /ner_extractor.py

Quivara

Fresh upload with LFS

bdb271a 2 days ago

raw

history blame contribute delete

4.21 kB

	from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
	import re

	# Load NER model once, at import time; the module-level `nlp` pipeline built
	# here is what extract_entities() calls for every message.
	model_name = "Davlan/xlm-roberta-base-ner-hrl"
	# loads the tokenizer for the multilingual NER model
	# NOTE(review): force_download=True presumably bypasses any locally cached
	# copy and re-fetches the files on every import -- confirm this is intended,
	# since it would make start-up depend on network access.
	tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True)
	# loads the pre-trained token classification model (the NER engine)
	model = AutoModelForTokenClassification.from_pretrained(model_name, force_download=True)
	# creates a Hugging Face pipeline for easy entity recognition; the results
	# are consumed in extract_entities() through their 'entity_group' and 'word'
	# keys (presumably produced by aggregation_strategy="simple" -- verify
	# against the transformers pipeline documentation)
	nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

	# function to extract the contact person's name using Regex
	def extract_contact_person_name(text):
	    """Return the contact person's name found after a contact/name label.

	    The name is the text following ``contact person:``, ``contact name:``,
	    ``contact:`` or ``name:`` (case-insensitive), up to the first stop
	    marker: ``Mobile``, ``Number``, ``Needs``, a number starting with
	    ``09`` plus two digits, another ``WORD:`` style label, a newline, or
	    the end of the text.

	    Args:
	        text: The raw message to search. ``None`` or an empty string is
	            treated as "nothing to extract".

	    Returns:
	        The cleaned name, or ``None`` when no label is present or the
	        label is followed by no name at all.
	    """
	    if not text:
	        return None

	    # The alternation bars must be real regex operators ("|"), and the
	    # capture must be lazy "(.*?)" so it grows only until the lookahead
	    # finds one of the stop markers.
	    name_pattern = re.compile(
	        r'(?:contact\sperson|contact\sname|contact|name):\s*(.*?)'
	        r'(?=\s*(?:Mobile|Number|Needs|09\d{2}|[A-Z]{3,}:|\n|$))',
	        re.IGNORECASE | re.DOTALL
	    )

	    match = name_pattern.search(text)
	    if not match:
	        return None

	    name = match.group(1).strip()
	    # Drop a trailing parenthetical nickname such as "(Mar)", together with
	    # any periods or whitespace that follow it.
	    name = re.sub(r'\s*\([^)]*\)[\s.]*$', '', name)
	    name = name.strip().rstrip('.')
	    # A label followed directly by a stop marker yields an empty capture;
	    # report that as "no name" rather than as an empty string.
	    return name or None

	# function to extract a phone number using Regex
	def extract_contact_number(text):
	    """Return the first phone number found in ``text``.

	    Two shapes are recognised:

	    * ``09XX`` followed by groups of 3 and 4 digits, where the groups may
	      be separated by a space and/or a hyphen (``0917 123 4567``,
	      ``0917-123-4567``, ``09171234567``);
	    * ten consecutive digits, optionally prefixed with ``+63`` or ``63``.

	    Args:
	        text: The raw message to search. ``None`` or an empty string is
	            treated as "nothing to extract".

	    Returns:
	        The matched number exactly as written in the text (surrounding
	        whitespace removed), or ``None`` when nothing matches.
	    """
	    if not text:
	        return None

	    # "|" must be an unescaped alternation operator; an escaped "\|" would
	    # demand a literal pipe character and never match a real number.
	    phone_pattern = re.compile(
	        r'(09\d{2}\s?-?\s?\d{3}\s?-?\s?\d{4}|(?:\+63|63)?\d{10})'
	    )

	    match = phone_pattern.search(text)
	    return match.group(0).strip() if match else None

	# list of keywords often incorrectly classified as locations by the NER model.
	# extract_entities() upper-cases a candidate and strips its periods before
	# testing membership here, so the period-free spelling "SOS" has to be listed
	# as well -- "S.O.S" on its own can never equal a period-stripped string.
	NON_LOC_KEYWORDS = [
	    "S.O.S", "SOS", "PLS", "HELP", "URGENT", "ALERT", "LOCATION", "ADDRESS", "ASAP",
	]

	# function to extract the detailed address line by prioritizing explicit labels
	def extract_address_line(text):
	    """Return the address written after an explicit location label.

	    The address is the text following ``location:``, ``address:``,
	    ``loc:`` or ``near:`` (case-insensitive) on the same line, cut at the
	    first stop marker: a newline, an opening parenthesis, one of the words
	    ``Contact``, ``Mobile``, ``Number``, ``kami``, ``yung``, ``bahay``,
	    ``kmi``, a following ``near``, or the end of the text.

	    Args:
	        text: The raw message to search. ``None`` or an empty string is
	            treated as "nothing to extract".

	    Returns:
	        The address with surrounding whitespace and trailing periods
	        removed, or ``None`` when no label is present or nothing follows it.
	    """
	    if not text:
	        return None

	    # The alternation bars must be real "|" operators, the capture must be
	    # the lazy "(.*?)", and every stop marker is preceded by "\s*" so the
	    # whitespace in front of the marker is left out of the captured address.
	    address_pattern = re.compile(
	        r'(?:location|address|loc|near):\s*(.*?)'
	        r'(?=\s*\n|\s*\(|\s*Contact|\s*Mobile|\s*Number'
	        r'|\s*kami|\s*yung|\s*bahay|\s*kmi|\s*near\s|$)',
	        re.IGNORECASE
	    )

	    match = address_pattern.search(text)
	    if not match:
	        return None

	    address = match.group(1).strip().rstrip('.')
	    # An empty capture (label immediately followed by a stop marker) is
	    # reported as "no address".
	    return address or None

	# main function that orchestrates all entity extraction and filtering
	def extract_entities(text):
	    """Extract locations, a contact number and a contact name from ``text``.

	    Steps:
	      1. Look for an explicitly labelled address with extract_address_line().
	      2. Run the module-level ``nlp`` NER pipeline and keep the entities
	         whose ``entity_group`` is ``'LOC'``, skipping very short strings and
	         known noise keywords, without duplicates.
	      3. Put the explicit address (if any) first in the list.
	      4. Extract the contact number and the contact person's name.

	    Args:
	        text: The raw message to analyse.

	    Returns:
	        A dict with the keys ``"locations"`` (list of strings, explicit
	        address first), ``"contact"`` (phone number or ``None``),
	        ``"contact_person_name"`` (name or ``None``) and ``"raw"`` (the
	        unmodified output of the NER pipeline).
	    """
	    # 1. Attempt to extract high-confidence address line (Regex)
	    explicit_address = extract_address_line(text)

	    # 2. Run general NER extraction through the Hugging Face pipeline
	    results = nlp(text)

	    # Candidates are compared upper-cased and with periods removed, so the
	    # keywords are normalised the same way; otherwise an entry written with
	    # periods (e.g. "S.O.S") could never match.
	    noise_keywords = {
	        keyword.upper().replace('.', '') for keyword in NON_LOC_KEYWORDS
	    }

	    locations = []
	    for entity in results:
	        # 'LOC' is the label for Locations in XLM-R NER
	        if entity['entity_group'] != 'LOC':
	            continue

	        clean_loc = entity['word'].replace("##", "").strip()

	        # skips locations that are too short
	        if len(clean_loc) <= 2:
	            continue

	        # skips locations matching known non-location keywords (e.g., S.O.S.)
	        if clean_loc.upper().replace('.', '') in noise_keywords:
	            continue

	        if clean_loc not in locations:
	            locations.append(clean_loc)

	    # 3. Prioritize the explicit address if found: it always ends up at
	    # index 0, and a copy already found by NER is removed first so the
	    # same string is not listed twice.
	    if explicit_address:
	        if explicit_address in locations:
	            locations.remove(explicit_address)
	        locations.insert(0, explicit_address)

	    # 4. Extract Contact Number
	    contact_number = extract_contact_number(text)

	    # 5. Extract Contact Person Name
	    contact_name = extract_contact_person_name(text)

	    # returns all extracted data in a dictionary format
	    return {
	        "locations": locations,
	        "contact": contact_number,
	        "contact_person_name": contact_name,
	        "raw": results,
	    }