| """Rule-based post-processing for entity refinement."""
|
|
|
| import re
|
|
|
| from address_parser.postprocessing.gazetteer import DelhiGazetteer
|
| from address_parser.schemas import AddressEntity
|
|
|
|
|
class RuleBasedRefiner:
    """
    Post-processing rules for refining NER predictions.

    Handles:
    - Pattern-based entity detection (pincodes, khasra numbers)
    - Entity boundary correction using gazetteer
    - Entity merging for fragmented predictions
    - Confidence adjustment
    - Validation and filtering
    """

    # Regexes for entities with a recognizable surface form.  HOUSE_NUMBER is
    # deliberately broad (every prefix part is optional), so it is used only
    # for confidence adjustment (via fullmatch) and bulk extraction -- never
    # to create new entities.
    PATTERNS = {
        "PINCODE": re.compile(r'\b[1-9]\d{5}\b'),
        "KHASRA": re.compile(
            r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b',
            re.IGNORECASE
        ),
        "HOUSE_NUMBER": re.compile(
            r'\b(?:H\.?\s*(?:NO\.?)?\s*|HOUSE\s*(?:NO\.?)?\s*|PLOT\s*(?:NO\.?)?\s*)?[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
            re.IGNORECASE
        ),
        "FLOOR": re.compile(
            r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|FIFTH|1ST|2ND|3RD|4TH|5TH|GF|FF|SF|TF)?\s*(?:FLOOR|FLR)\b',
            re.IGNORECASE
        ),
        "BLOCK": re.compile(
            r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b',
            re.IGNORECASE
        ),
        "SECTOR": re.compile(
            r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b',
            re.IGNORECASE
        ),
        "GALI": re.compile(
            r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b',
            re.IGNORECASE
        ),
    }

    # District-style area names.  ORDER MATTERS: compound directions must
    # precede single directions -- patterns are tried in order, and e.g.
    # r'\bWEST\s+DELHI\b' also matches inside "NORTH WEST DELHI"; if tried
    # first it would shadow the longer, correct match.
    AREA_PATTERNS = [
        (re.compile(r'\bSOUTH\s+WEST\s+DELHI\b', re.IGNORECASE), "SOUTH WEST DELHI"),
        (re.compile(r'\bNORTH\s+WEST\s+DELHI\b', re.IGNORECASE), "NORTH WEST DELHI"),
        (re.compile(r'\bNORTH\s+EAST\s+DELHI\b', re.IGNORECASE), "NORTH EAST DELHI"),
        (re.compile(r'\bSOUTH\s+EAST\s+DELHI\b', re.IGNORECASE), "SOUTH EAST DELHI"),
        (re.compile(r'\bSOUTH\s+DELHI\b', re.IGNORECASE), "SOUTH DELHI"),
        (re.compile(r'\bNORTH\s+DELHI\b', re.IGNORECASE), "NORTH DELHI"),
        (re.compile(r'\bEAST\s+DELHI\b', re.IGNORECASE), "EAST DELHI"),
        (re.compile(r'\bWEST\s+DELHI\b', re.IGNORECASE), "WEST DELHI"),
        (re.compile(r'\bCENTRAL\s+DELHI\b', re.IGNORECASE), "CENTRAL DELHI"),
        (re.compile(r'\bOUTER\s+DELHI\b', re.IGNORECASE), "OUTER DELHI"),
    ]

    # City names; the first two entries are the Delhi family and are relied
    # on positionally by _add_pattern_entities (NEW DELHI before DELHI so the
    # longer name wins).
    CITY_PATTERNS = [
        (re.compile(r'\bNEW\s+DELHI\b', re.IGNORECASE), "NEW DELHI"),
        (re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
        (re.compile(r'\bNOIDA\b', re.IGNORECASE), "NOIDA"),
        (re.compile(r'\bGURUGRAM\b', re.IGNORECASE), "GURUGRAM"),
        (re.compile(r'\bGURGAON\b', re.IGNORECASE), "GURGAON"),
        (re.compile(r'\bFARIDABAD\b', re.IGNORECASE), "FARIDABAD"),
        (re.compile(r'\bGHAZIABAD\b', re.IGNORECASE), "GHAZIABAD"),
    ]

    # NOTE: the bare "U.P." abbreviation pattern is intentionally
    # case-sensitive so it does not fire on the English word "up".
    STATE_PATTERNS = [
        (re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
        (re.compile(r'\bHARYANA\b', re.IGNORECASE), "HARYANA"),
        (re.compile(r'\bUTTAR\s+PRADESH\b', re.IGNORECASE), "UTTAR PRADESH"),
        (re.compile(r'\bU\.?\s*P\.?\b'), "UTTAR PRADESH"),
    ]

    # Suffixes that mark a plausible colony/locality name when deciding
    # whether fragmented entities may be merged.
    COLONY_SUFFIXES = [
        "NAGAR", "VIHAR", "COLONY", "ENCLAVE", "PARK", "GARDEN",
        "PURI", "BAGH", "KUNJ", "EXTENSION", "EXTN", "PHASE",
    ]

    # High-precision whitelist of Delhi localities, matched as whole words
    # (uppercase, single-space separated).
    KNOWN_LOCALITIES = [
        "LAJPAT NAGAR", "MALVIYA NAGAR", "KAROL BAGH", "HAUZ KHAS",
        "GREEN PARK", "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION",
        "CHITTARANJAN PARK", "NEHRU PLACE", "SARITA VIHAR", "VASANT KUNJ",
        "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
        "ASHOK VIHAR", "SHALIMAR BAGH", "PREET VIHAR", "MAYUR VIHAR",
        "LAKSHMI NAGAR", "GANDHI NAGAR", "DILSHAD GARDEN", "ANAND VIHAR",
        "UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH",
        "PASCHIM VIHAR", "CONNAUGHT PLACE", "RAJENDER NAGAR", "PATEL NAGAR",
        "KIRTI NAGAR", "LODHI ROAD", "GOLF LINKS", "SANGAM VIHAR",
        "GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "KAUNWAR SINGH NAGAR",
        "BABA HARI DAS COLONY", "SWARN PARK", "CHANCHAL PARK", "DURGA PARK",
        "RAJ NAGAR", "SADH NAGAR", "VIJAY ENCLAVE", "PALAM COLONY",
    ]

    def __init__(self, use_gazetteer: bool = True):
        """
        Initialize refiner.

        Args:
            use_gazetteer: Use gazetteer for validation/correction
        """
        self.gazetteer = DelhiGazetteer() if use_gazetteer else None

    def refine(
        self,
        text: str,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """
        Refine entity predictions.

        Args:
            text: Original address text
            entities: Predicted entities from NER model

        Returns:
            Refined list of entities
        """
        refined = list(entities)
        # Pipeline order matters: localities/areas are pinned first, pattern
        # entities are added, boundaries fixed, fragments merged, and only
        # then do we score, de-overlap and filter.
        refined = self._fix_known_localities(text, refined)
        refined = self._add_pattern_entities(text, refined)
        refined = self._add_area_patterns(text, refined)
        refined = self._correct_boundaries(text, refined)
        refined = self._merge_fragmented_entities(text, refined)
        refined = self._adjust_confidence(text, refined)
        refined = self._remove_overlaps(refined)
        refined = self._validate_entities(refined)
        return refined

    @staticmethod
    def _is_word_bounded(text: str, start: int, end: int) -> bool:
        """Return True if text[start:end] is not glued to adjacent word characters."""
        if start > 0 and text[start - 1].isalnum():
            return False
        if end < len(text) and text[end].isalnum():
            return False
        return True

    def _find_known_locality_spans(self, text: str) -> list[tuple[int, int]]:
        """
        Find (start, end) spans of KNOWN_LOCALITIES occurrences in *text*.

        Matching is case-insensitive and restricted to whole words so that
        e.g. "RAJ NAGAR" does not fire inside "MAHARAJ NAGAR".
        """
        text_upper = text.upper()
        spans: list[tuple[int, int]] = []
        for locality in self.KNOWN_LOCALITIES:
            idx = 0
            while (pos := text_upper.find(locality, idx)) != -1:
                end = pos + len(locality)
                if self._is_word_bounded(text_upper, pos, end):
                    spans.append((pos, end))
                idx = end
        return spans

    def _fix_known_localities(
        self,
        text: str,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """
        Pin known localities and Delhi areas found in the raw text.

        Model entities of locality-like labels that overlap a pinned span are
        dropped in favour of the high-confidence dictionary match.
        """
        used_ranges = self._find_known_locality_spans(text)
        locality_entities = [
            AddressEntity(
                label="SUBAREA",
                value=text[start:end],
                start=start,
                end=end,
                confidence=0.95
            )
            for start, end in used_ranges
        ]

        # Area patterns (tried in AREA_PATTERNS order; compound names first).
        for pattern, area_name in self.AREA_PATTERNS:
            match = pattern.search(text)
            if match:
                start, end = match.start(), match.end()
                overlaps = any(
                    not (end <= s or start >= e)
                    for s, e in used_ranges
                )
                if not overlaps:
                    locality_entities.append(AddressEntity(
                        label="AREA",
                        value=area_name,
                        start=start,
                        end=end,
                        confidence=0.95
                    ))
                    used_ranges.append((start, end))

        # Keep model entities unless a pinned span supersedes them.
        result = []
        for entity in entities:
            overlaps_locality = any(
                not (entity.end <= start or entity.start >= end)
                for start, end in used_ranges
            )
            if overlaps_locality and entity.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
                continue
            result.append(entity)

        result.extend(locality_entities)
        return result

    def _add_area_patterns(
        self,
        text: str,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """No-op kept for pipeline symmetry (areas are handled in _fix_known_localities)."""
        return entities

    def _merge_fragmented_entities(
        self,
        text: str,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """
        Merge adjacent locality-type entities that form one valid name.

        Example: AREA("LAJPAT") + AREA("NAGAR") -> AREA("LAJPAT NAGAR"),
        provided the joined text passes _is_valid_merge.  The merged entity
        keeps the label of the first fragment and the max confidence seen.
        """
        if len(entities) < 2:
            return entities

        mergeable = ("AREA", "SUBAREA", "COLONY", "CITY")
        ordered = sorted(entities, key=lambda e: e.start)
        merged: list = []
        i = 0

        while i < len(ordered):
            current = ordered[i]
            if current.label in mergeable:
                span_end = current.end
                best_conf = current.confidence
                j = i + 1
                while j < len(ordered):
                    candidate = ordered[j]
                    # Allow at most 2 separator chars (space/comma) between pieces.
                    if candidate.start - span_end <= 2 and candidate.label in mergeable:
                        joined = text[current.start:candidate.end].strip()
                        if self._is_valid_merge(joined):
                            span_end = candidate.end
                            best_conf = max(best_conf, candidate.confidence)
                            j += 1
                            continue
                    break
                if j > i + 1:
                    merged.append(AddressEntity(
                        label=current.label,
                        value=text[current.start:span_end].strip(),
                        start=current.start,
                        end=span_end,
                        confidence=best_conf
                    ))
                    i = j
                    continue
            merged.append(current)
            i += 1

        return merged

    def _is_valid_merge(self, text: str) -> bool:
        """Check if merged text forms a valid locality name."""
        text_upper = text.upper().strip()

        # Exact whitelist hit.
        if text_upper in self.KNOWN_LOCALITIES:
            return True

        # Fuzzy gazetteer hit.
        if self.gazetteer and self.gazetteer.is_known_locality(text_upper, threshold=80):
            return True

        # Plausible colony-style name (ends with NAGAR/VIHAR/...).
        return text_upper.endswith(tuple(self.COLONY_SUFFIXES))

    def _add_pattern_entities(
        self,
        text: str,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """Add regex-detected PINCODE, CITY and STATE entities the model missed."""
        result = list(entities)
        existing_spans = {(e.start, e.end) for e in entities}

        # PINCODE: only when the model produced none.
        if not any(e.label == "PINCODE" for e in entities):
            match = self.PATTERNS["PINCODE"].search(text)
            if match and (match.start(), match.end()) not in existing_spans:
                result.append(AddressEntity(
                    label="PINCODE",
                    value=match.group(0),
                    start=match.start(),
                    end=match.end(),
                    confidence=1.0
                ))

        # CITY fallback.  Try the Delhi family first (NEW DELHI before DELHI,
        # last occurrence since the city usually trails the address); if
        # neither matches as a whole word, fall through to the other cities.
        if not any(e.label == "CITY" for e in result):
            delhi_match = None
            delhi_name = None
            for pattern, city_name in self.CITY_PATTERNS[:2]:
                occurrences = list(pattern.finditer(text))
                if occurrences:
                    delhi_match, delhi_name = occurrences[-1], city_name
                    break
            if delhi_match is not None:
                result.append(AddressEntity(
                    label="CITY",
                    value=delhi_name,
                    start=delhi_match.start(),
                    end=delhi_match.end(),
                    confidence=0.90
                ))
            else:
                for pattern, city_name in self.CITY_PATTERNS[2:]:
                    match = pattern.search(text)
                    if match and (match.start(), match.end()) not in existing_spans:
                        result.append(AddressEntity(
                            label="CITY",
                            value=city_name,
                            start=match.start(),
                            end=match.end(),
                            confidence=0.95
                        ))
                        break

        # STATE fallback.
        if not any(e.label == "STATE" for e in entities):
            for pattern, state_name in self.STATE_PATTERNS:
                match = pattern.search(text)
                if match and (match.start(), match.end()) not in existing_spans:
                    # Don't duplicate DELHI as state when already a city.
                    if state_name == "DELHI" and any(
                        e.label == "CITY" and "DELHI" in e.value.upper() for e in result
                    ):
                        continue
                    result.append(AddressEntity(
                        label="STATE",
                        value=state_name,
                        start=match.start(),
                        end=match.end(),
                        confidence=0.90
                    ))
                    break

        return result

    def _correct_boundaries(
        self,
        text: str,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """
        Snap KHASRA/BLOCK/FLOOR entities to the full regex match overlapping
        them, and strip surrounding whitespace from every entity value.

        Only an *overlapping* match is used: snapping to the first match
        anywhere in the text would relocate unrelated entities (and collapse
        duplicates) when the pattern occurs more than once.
        """
        result = []
        for entity in entities:
            updates: dict[str, object] = {}

            if entity.label in ("KHASRA", "BLOCK", "FLOOR"):
                pattern = self.PATTERNS[entity.label]
                for match in pattern.finditer(text):
                    if match.start() < entity.end and match.end() > entity.start:
                        updates = {
                            "value": match.group(0),
                            "start": match.start(),
                            "end": match.end(),
                        }
                        break

            # Always normalize whitespace in the final value.
            final_value = (updates.get("value") or entity.value).strip()
            if final_value != entity.value or updates:
                updates["value"] = final_value

            result.append(entity.model_copy(update=updates) if updates else entity)

        return result

    def _adjust_confidence(
        self,
        text: str,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """Adjust confidence scores based on patterns and gazetteer."""
        result = []
        for entity in entities:
            new_confidence = entity.confidence

            # Exact pattern match boosts confidence.
            if entity.label in self.PATTERNS:
                if self.PATTERNS[entity.label].fullmatch(entity.value):
                    new_confidence = min(1.0, new_confidence + 0.1)

            # Gazetteer-confirmed locality boosts confidence.
            if self.gazetteer and entity.label in ("AREA", "SUBAREA", "COLONY"):
                if self.gazetteer.is_known_locality(entity.value):
                    new_confidence = min(1.0, new_confidence + 0.15)

            # Very short values are suspicious.
            if len(entity.value) < 3:
                new_confidence = max(0.0, new_confidence - 0.2)

            if new_confidence != entity.confidence:
                result.append(entity.model_copy(update={"confidence": new_confidence}))
            else:
                result.append(entity)

        return result

    def _remove_overlaps(
        self,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """
        Remove overlapping entities, keeping higher-confidence ones.

        CITY/PINCODE/STATE are exempt from the overlap filter: they may
        legitimately share spans with locality entities (e.g. DELHI as both
        area suffix and city) and are kept unconditionally.
        """
        if not entities:
            return entities

        preserved_labels = {"CITY", "PINCODE", "STATE"}
        preserved = [e for e in entities if e.label in preserved_labels]
        contenders = [e for e in entities if e.label not in preserved_labels]

        # Greedy selection by descending confidence, earliest-start tiebreak.
        kept: list = []
        used_ranges: list[tuple[int, int]] = []
        for entity in sorted(contenders, key=lambda e: (-e.confidence, e.start)):
            clashes = any(
                not (entity.end <= start or entity.start >= end)
                for start, end in used_ranges
            )
            if not clashes:
                kept.append(entity)
                used_ranges.append((entity.start, entity.end))

        kept.extend(preserved)
        return sorted(kept, key=lambda e: e.start)

    def _validate_entities(
        self,
        entities: "list[AddressEntity]"
    ) -> "list[AddressEntity]":
        """Validate and filter entities (drop empties, low confidence, bad pincodes)."""
        result = []
        for entity in entities:
            if not entity.value.strip():
                continue
            if entity.confidence < 0.3:
                continue

            if entity.label == "PINCODE":
                # Must be exactly six digits, not starting with 0.
                if not re.fullmatch(r'[1-9]\d{5}', entity.value):
                    continue
                # Unknown-to-gazetteer pincodes are kept but penalized.
                if self.gazetteer and not self.gazetteer.validate_pincode(entity.value):
                    entity = entity.model_copy(update={"confidence": entity.confidence * 0.7})

            result.append(entity)

        return result

    def extract_all_patterns(self, text: str) -> dict[str, list[str]]:
        """
        Extract all pattern-based entities from text.

        Returns dict of label -> list of matched values.
        """
        results = {}
        for label, pattern in self.PATTERNS.items():
            matches = pattern.findall(text)
            if matches:
                results[label] = matches
        return results
|
|
|