Spaces:

Raj718
/

Voucher-Bot

Sleeping

App Files Files Community

Voucher-Bot / fixed_address_extraction.py

Raj718

Initial commit: NYC Voucher Housing Navigator

dbaeeae 7 months ago

raw

history blame contribute delete

14.1 kB

	#!/usr/bin/env python3
	"""
	Fixed Address Extraction - Prioritizes Real Address Sources
	Based on debug findings: .mapaddress and JSON structured data contain the real addresses
	"""

	def fixed_address_extraction() -> str:
	    """Return the JavaScript snippet that extracts a listing's address.

	    The returned source is meant to be passed to a WebDriver
	    ``execute_script`` call (it ends with a top-level ``return``).  In the
	    browser it gathers address candidates from four places, in this order:

	    1. JSON ``<script>`` blocks containing ``streetAddress`` objects,
	    2. ``.mapaddress`` elements,
	    3. address-shaped text inside the posting body,
	    4. a parenthesised location in the posting title.

	    Each candidate is scored by ``scoreAddress`` (content quality plus a
	    per-source bonus, minus a penalty for title-like text), de-duplicated,
	    and sorted best-first.

	    Returns:
	        JavaScript source whose evaluation yields an object with keys
	        ``address`` (best candidate, or ``null`` when the best quality is
	        not above 3), ``debug`` and ``all_candidates``.
	    """
	    # Raw string: the JavaScript below contains regex escapes (\d, \s, \w, \()
	    # that must reach the browser verbatim and are not valid Python escapes.
	    return r"""
	    function extractRealAddress() {
	        let candidates = [];
	        let debug = { sources: {}, title_avoided: false };

	        // Function to score address quality
	        function scoreAddress(addr, source) {
	            if (!addr || addr.length < 3) return 0;

	            let score = 0;
	            let text = addr.toLowerCase().trim();

	            // Boost score based on reliable source
	            let sourceBonus = 0;
	            if (source === 'structured_data') sourceBonus = 5;
	            else if (source === 'mapaddress') sourceBonus = 4;
	            else if (source === 'body_text') sourceBonus = 2;
	            else if (source === 'title') sourceBonus = -10; // AVOID TITLES
	            // NOTE(review): with a -10 bonus and a maximum content score of 10,
	            // 'title' candidates always end at 0 and are filtered out later.

	            // Score the content quality (most specific pattern first)
	            if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)\s*,?\s*ny\s+\d{5}/.test(text)) {
	                // number + street + borough + "ny" + ZIP
	                score = 10 + sourceBonus;
	            }
	            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text)) {
	                // number + street + borough
	                score = 9 + sourceBonus;
	            }
	            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(text)) {
	                // number + street
	                score = 8 + sourceBonus;
	            }
	            else if (/[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text)) {
	                // street + borough, no house number
	                score = 6 + sourceBonus;
	            }
	            else if (text.includes('near') && /(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(text)) {
	                // "near <street>" style hint
	                score = 5 + sourceBonus;
	            }
	            else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text) &&
	                     !text.includes('all ') && !text.includes('newly renovated') &&
	                     !text.includes('bedroom') && text.length > 8 && text.length < 60) {
	                // short text that at least names a borough
	                score = 4 + sourceBonus;
	            }

	            // Penalty for title-like content
	            if (text.includes('br apt') || text.includes('bedroom') || text.includes('renovated') ||
	                text.includes('$') || text.includes('/') || text.includes('newly')) {
	                score -= 15;
	            }

	            return Math.max(0, score);
	        }

	        // Strategy 1: Extract from JSON-LD structured data (highest priority)
	        function extractFromStructuredData() {
	            let found = [];
	            let scripts = document.querySelectorAll('script[type*="json"]');

	            for (let script of scripts) {
	                try {
	                    let data = JSON.parse(script.textContent);

	                    // Look for address objects
	                    function findAddresses(obj) {
	                        if (typeof obj !== 'object' || obj === null) return;

	                        if (obj.streetAddress) {
	                            let addr = obj.streetAddress;
	                            if (obj.addressLocality) addr += ', ' + obj.addressLocality;
	                            if (obj.addressRegion) addr += ', ' + obj.addressRegion;
	                            if (obj.postalCode) addr += ' ' + obj.postalCode;

	                            found.push({
	                                address: addr.trim(),
	                                source: 'structured_data',
	                                quality: scoreAddress(addr, 'structured_data')
	                            });
	                        }

	                        // Recursively search nested objects
	                        for (let key in obj) {
	                            if (typeof obj[key] === 'object') {
	                                findAddresses(obj[key]);
	                            }
	                        }
	                    }

	                    findAddresses(data);
	                } catch (e) {
	                    // Invalid JSON (or unexpected value shape), skip this script
	                }
	            }

	            return found;
	        }

	        // Strategy 2: Extract from mapaddress element (second highest priority)
	        function extractFromMapAddress() {
	            let found = [];
	            // The selectors overlap; duplicates are removed after collection.
	            let mapSelectors = [
	                '.mapaddress',
	                '[class*="mapaddress"]',
	                '.postingtitle .mapaddress'
	            ];

	            for (let selector of mapSelectors) {
	                let elements = document.querySelectorAll(selector);
	                for (let el of elements) {
	                    if (el.textContent && el.textContent.trim()) {
	                        let addr = el.textContent.trim();
	                        found.push({
	                            address: addr,
	                            source: 'mapaddress',
	                            quality: scoreAddress(addr, 'mapaddress')
	                        });
	                    }
	                }
	            }

	            return found;
	        }

	        // Strategy 3: Extract from body text (careful to avoid title contamination)
	        function extractFromBodyText() {
	            let found = [];
	            let bodySelectors = ['#postingbody', '.postingbody', '.section-content'];

	            for (let selector of bodySelectors) {
	                let elements = document.querySelectorAll(selector);
	                for (let el of elements) {
	                    if (el.textContent && el.textContent.trim()) {
	                        let text = el.textContent;

	                        // Look for address patterns
	                        let patterns = [
	                            /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)\s*,?\s*NY\s*\d{0,5}/gi,
	                            /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi,
	                            /(?:Near|At|On)\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)\s(?:and|&|near)\s[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)/gi
	                        ];

	                        for (let pattern of patterns) {
	                            let matches = text.match(pattern);
	                            if (matches) {
	                                matches.forEach(addr => {
	                                    found.push({
	                                        address: addr.trim(),
	                                        source: 'body_text',
	                                        quality: scoreAddress(addr, 'body_text')
	                                    });
	                                });
	                            }
	                        }
	                    }
	                }
	            }

	            return found;
	        }

	        // Strategy 4: Extract from title ONLY as last resort (with penalties)
	        function extractFromTitle() {
	            let found = [];
	            let titleEl = document.querySelector('.postingtitle') ||
	                          document.querySelector('#titletextonly');

	            if (titleEl && titleEl.textContent) {
	                let titleText = titleEl.textContent;

	                // Look for parenthetical location info that names a borough,
	                // e.g. "(Fordham, Bronx)"
	                let locMatch = titleText.match(/\(([^)]+(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)[^)]*)\)/i);
	                if (locMatch) {
	                    let location = locMatch[1].trim();
	                    if (!location.includes('bedroom') && !location.includes('br ') &&
	                        !location.includes('renovated') && location.length > 5) {
	                        found.push({
	                            address: location,
	                            source: 'title_location',
	                            quality: scoreAddress(location, 'title')
	                        });
	                    }
	                }

	                // Avoid extracting the main title as address
	                debug.title_avoided = true;
	            }

	            return found;
	        }

	        // Execute strategies in priority order
	        candidates = candidates.concat(extractFromStructuredData());
	        candidates = candidates.concat(extractFromMapAddress());
	        candidates = candidates.concat(extractFromBodyText());
	        candidates = candidates.concat(extractFromTitle());

	        // Remove duplicates and filter out poor quality
	        let uniqueCandidates = [];
	        let seen = new Set();

	        for (let candidate of candidates) {
	            // Compare case-insensitively and ignoring punctuation
	            let normalized = candidate.address.toLowerCase().replace(/[^\w\s]/g, '');
	            if (!seen.has(normalized) && candidate.quality > 0) {
	                seen.add(normalized);
	                uniqueCandidates.push(candidate);
	            }
	        }

	        // Sort by quality (highest first)
	        uniqueCandidates.sort((a, b) => b.quality - a.quality);

	        debug.total_candidates = uniqueCandidates.length;
	        debug.candidates = uniqueCandidates;
	        debug.best_quality = uniqueCandidates.length > 0 ? uniqueCandidates[0].quality : 0;

	        // Select best address (only when its quality is above 3)
	        let bestAddress = null;
	        if (uniqueCandidates.length > 0 && uniqueCandidates[0].quality > 3) {
	            bestAddress = uniqueCandidates[0].address;

	            // Clean up the address: drop a leading "Near"/"At"/"On"
	            bestAddress = bestAddress.replace(/^(Near|At|On)\s+/i, '');
	            bestAddress = bestAddress.trim();
	        }

	        return {
	            address: bestAddress,
	            debug: debug,
	            all_candidates: uniqueCandidates
	        };
	    }

	    return extractRealAddress();
	    """

	def apply_fixed_extraction():
	    """Monkey-patch ``browser_agent`` to use the fixed address extraction.

	    Replaces ``browser_agent._get_detailed_data_with_enhanced_address`` with
	    a wrapper that runs the script from ``fixed_address_extraction()`` in the
	    current helium browser.  The previous implementation is kept and used as
	    a fallback whenever the wrapper raises.

	    Note: calling this more than once wraps the already-patched function
	    again, because the "original" is read from ``browser_agent`` at call time.
	    """
	    import browser_agent

	    # Captured before patching so the wrapper can fall back to it.
	    original_function = browser_agent._get_detailed_data_with_enhanced_address

	    def fixed_extraction(url):
	        """Load *url* and return listing details with the extracted address.

	        Returns a dict with keys ``address``, ``price``, ``description``,
	        ``title``, ``debug`` and ``all_candidates``.  ``address`` is ``'N/A'``
	        when the script finds no acceptable candidate.  On any exception the
	        result of the previously installed function is returned instead.
	        """
	        try:
	            import helium

	            print(f"🔧 Fixed address extraction for {url}")
	            helium.go_to(url)
	            # presumably a randomized pause between 2 and 3 seconds to let the
	            # page settle -- TODO confirm against browser_agent._smart_delay
	            browser_agent._smart_delay(2, 3)

	            # Use fixed extraction script
	            extraction_script = fixed_address_extraction()
	            result = helium.get_driver().execute_script(extraction_script)

	            # Get additional data; each field falls back to a stub object so
	            # that `.textContent.trim()` yields 'N/A' when no element matches.
	            additional_script = """
	            return {
	                price: (document.querySelector('.price') ||
	                        document.querySelector('[class*="price"]') ||
	                        {textContent: 'N/A'}).textContent.trim(),
	                description: (document.querySelector('#postingbody') ||
	                              document.querySelector('.postingbody') ||
	                              {textContent: 'N/A'}).textContent.trim(),
	                title: (document.querySelector('.postingtitle') ||
	                        {textContent: 'N/A'}).textContent.trim()
	            };
	            """
	            additional_data = helium.get_driver().execute_script(additional_script)

	            # Process results
	            address = result.get('address')
	            if address:
	                # Light normalization
	                address = browser_agent._normalize_address(address)
	                print(f"📍 Found address: {address}")
	            else:
	                address = 'N/A'
	                print("❌ No address found")

	            final_result = {
	                'address': address,
	                'price': additional_data.get('price', 'N/A'),
	                'description': additional_data.get('description', 'N/A'),
	                'title': additional_data.get('title', 'N/A'),
	                'debug': result.get('debug', {}),
	                'all_candidates': result.get('all_candidates', [])
	            }

	            # Enhanced logging
	            if final_result.get('debug'):
	                debug = final_result['debug']
	                print(f"📊 Found {debug.get('total_candidates', 0)} address candidates")
	                print(f"🏆 Best quality: {debug.get('best_quality', 0)}/10")
	                print(f"🚫 Title avoided: {debug.get('title_avoided', False)}")

	                if debug.get('candidates'):
	                    print("🎯 Top candidates:")
	                    # Only the three best candidates are shown.
	                    for i, candidate in enumerate(debug['candidates'][:3], 1):
	                        print(f" {i}. {candidate['address']} (Q:{candidate['quality']}, {candidate['source']})")

	            return final_result

	        except Exception as e:
	            # Deliberate best-effort: any failure (navigation, script error,
	            # unexpected result shape) defers to the previous implementation.
	            print(f"Fixed extraction failed for {url}: {e}")
	            return original_function(url)

	    browser_agent._get_detailed_data_with_enhanced_address = fixed_extraction
	    print("✅ Applied fixed address extraction to browser agent")

	if __name__ == "__main__":
	    # Running the file directly only prints a short banner; nothing is patched.
	    for banner_line in (
	        "🔧 Fixed Address Extraction",
	        "Prioritizes mapaddress and structured data, avoids title contamination",
	    ):
	        print(banner_line)