Spaces:

Raj718
/

Voucher-Bot

Sleeping

App Files Files Community

Voucher-Bot / balanced_address_fix.py

Raj718

Initial commit: NYC Voucher Housing Navigator

dbaeeae 8 months ago

raw

history blame contribute delete

13.8 kB

	#!/usr/bin/env python3
	"""
	Balanced Address Extraction Fix
	Shows the best available location information to users
	Prioritizes complete addresses but falls back to useful approximations
	"""

	def balanced_address_extraction():
	    """
	    Return the JavaScript source used to pull the best location string from a listing page.

	    The returned text is a script body that ends in a top-level ``return``, so it is
	    meant to be run through a WebDriver-style ``execute_script`` call rather than
	    loaded as a standalone file.  When executed in the page it yields an object:

	    * ``location``       - best candidate string, prefixed with ``~`` when it is only
	                           approximate (quality below 6), or ``null`` if nothing was found
	    * ``debug``          - ``strategies``, ``fallbacks``, ``total_found``, ``best_quality``
	    * ``all_candidates`` - every de-duplicated candidate, best quality first

	    Returns:
	        str: the JavaScript source.
	    """
	    # Raw string: the JavaScript regexes rely on \d, \s and \w, which are invalid
	    # escape sequences in an ordinary Python string literal.
	    return r"""
	    function extractBestLocationInfo() {
	        const debug = { strategies: [], fallbacks: [] };

	        // "All <borough> areas" placeholders carry no usable location information.
	        const GENERIC_AREA = /all (?:bronx|brooklyn|manhattan|queens|staten island) areas/;

	        // Score location usefulness: 0 = useless ... 10 = full street address with ZIP.
	        function scoreLocation(location) {
	            if (!location || location.length < 3) return 0;

	            const addr = location.toLowerCase();

	            // Perfect: house number + street + borough + state + ZIP
	            if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)\s*,?\s*ny\s+\d{5}/.test(addr)) {
	                return 10;
	            }
	            // Excellent: house number + street + borough
	            if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
	                return 9;
	            }
	            // Very good: house number + street (borough missing)
	            if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
	                return 8;
	            }
	            // Good: intersection naming at least one street
	            if ((addr.includes('near') || addr.includes('&') || addr.includes(' and ')) &&
	                /(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
	                return 7;
	            }
	            // Fair: street name + borough (no house number)
	            if (/[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
	                return 6;
	            }

	            const hasBorough = /(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr);

	            // Useful: neighborhood/area + borough
	            if (hasBorough && !GENERIC_AREA.test(addr) && addr.length > 10 && addr.length < 100) {
	                return 5;
	            }
	            // Basic: just an intersection-style description
	            if (addr.includes('near') && addr.length > 8) {
	                return 4;
	            }
	            // Minimal: borough-only mention (better than nothing)
	            if (hasBorough && addr.length > 5) {
	                return 3;
	            }
	            return 0;
	        }

	        // Run one global regex over a text and record every hit as a candidate.
	        function collect(found, text, pattern, source, clean) {
	            const matches = text.match(pattern);
	            if (!matches) return;
	            for (const raw of matches) {
	                const location = clean ? clean(raw) : raw.trim();
	                if (location === null) continue;
	                found.push({ location: location, source: source, quality: scoreLocation(location) });
	            }
	        }

	        // Strategy 1: scan every text region that might mention a location.
	        function findAllLocationMentions() {
	            const found = [];
	            const searchTexts = [];

	            // Map address first: equal-quality candidates keep this order after sorting.
	            const mapEl = document.querySelector('.mapaddress') ||
	                document.querySelector('[class*="map-address"]');
	            if (mapEl) {
	                searchTexts.push(mapEl.textContent);
	            }

	            // Main content areas
	            const contentAreas = [
	                document.querySelector('#postingbody'),
	                document.querySelector('.postingbody'),
	                document.querySelector('.section-content'),
	                document.querySelector('.postingtitle'),
	                document.querySelector('#titletextonly')
	            ];
	            for (const area of contentAreas) {
	                if (area && area.textContent) {
	                    searchTexts.push(area.textContent);
	                }
	            }

	            // Attribute groups
	            for (const group of document.querySelectorAll('.attrgroup')) {
	                if (group.textContent) {
	                    searchTexts.push(group.textContent);
	                }
	            }

	            for (const text of searchTexts) {
	                if (!text) continue;

	                // Pattern 1: complete addresses
	                collect(found, text,
	                    /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)[^,]*,?\s*NY\s*\d{0,5}/gi,
	                    'complete_address');

	                // Pattern 2: partial addresses
	                collect(found, text,
	                    /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi,
	                    'partial_address');

	                // Pattern 3: street intersections
	                collect(found, text,
	                    /[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)\s+(?:near|and|&)\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)/gi,
	                    'intersection');

	                // Pattern 4: neighborhood mentions (leading preposition stripped)
	                collect(found, text,
	                    /(?:near|in|around|at)\s+[A-Za-z\s]{3,30}(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi,
	                    'neighborhood',
	                    function (raw) {
	                        const cleaned = raw.replace(/^(?:near|in|around|at)\s+/i, '').trim();
	                        return cleaned.length > 8 ? cleaned : null;
	                    });
	            }

	            return found;
	        }

	        // Strategy 2: embedded map iframes whose URL carries a q= query.
	        function findExternalLocationSources() {
	            const found = [];

	            for (const iframe of document.querySelectorAll('iframe')) {
	                if (!iframe.src || !iframe.src.includes('maps')) continue;

	                const urlMatch = iframe.src.match(/q=([^&]+)/);
	                if (!urlMatch) continue;

	                let addr;
	                try {
	                    // '+' encodes a space in query strings; decodeURIComponent leaves it alone.
	                    addr = decodeURIComponent(urlMatch[1].replace(/\+/g, ' '));
	                } catch (err) {
	                    // Malformed percent-encoding: record it and keep the other candidates.
	                    debug.fallbacks.push({ source: 'google_maps', error: String(err) });
	                    continue;
	                }
	                found.push({ location: addr, source: 'google_maps', quality: scoreLocation(addr) });
	            }

	            return found;
	        }

	        // Execute all strategies
	        const allLocations = findAllLocationMentions().concat(findExternalLocationSources());

	        // Remove duplicates, zero-quality hits and generic placeholders
	        const uniqueLocations = [];
	        const seen = new Set();

	        for (const loc of allLocations) {
	            const lowered = loc.location.toLowerCase();
	            const normalized = lowered.replace(/[^\w\s]/g, '').trim();

	            if (seen.has(normalized) || loc.quality <= 0 || loc.location.length <= 3) continue;
	            if (GENERIC_AREA.test(lowered)) continue;

	            seen.add(normalized);
	            uniqueLocations.push(loc);
	        }

	        // Sort by quality (best first)
	        uniqueLocations.sort((a, b) => b.quality - a.quality);

	        debug.strategies = uniqueLocations;
	        debug.total_found = uniqueLocations.length;
	        debug.best_quality = uniqueLocations.length > 0 ? uniqueLocations[0].quality : 0;

	        // Select best location; anything weaker than "street + borough" is flagged approximate.
	        let bestLocation = null;
	        if (uniqueLocations.length > 0) {
	            const best = uniqueLocations[0];
	            bestLocation = best.quality >= 6 ? best.location : `~${best.location}`;
	        }

	        return {
	            location: bestLocation,
	            debug: debug,
	            all_candidates: uniqueLocations
	        };
	    }

	    return extractBestLocationInfo();
	    """

	def apply_balanced_extraction():
	    """
	    Replace ``browser_agent._get_detailed_data_with_enhanced_address`` with a
	    wrapper that runs the balanced location-extraction script.

	    The previous function is kept and is called as a fallback whenever the
	    wrapper raises.  Side effect: mutates the ``browser_agent`` module and
	    prints a confirmation line.
	    """
	    import browser_agent

	    original_function = browser_agent._get_detailed_data_with_enhanced_address

	    def balanced_extraction(url):
	        """
	        Load ``url`` in the helium-controlled browser and extract listing data.

	        Returns a dict with ``address``, ``price``, ``description``, ``title``,
	        ``debug`` and ``all_candidates``; ``address`` is ``'N/A'`` when the
	        page script found no location.  On any exception the result of the
	        original (pre-patch) function is returned instead.
	        """
	        try:
	            import helium

	            print(f"🎯 Balanced location extraction for {url}")
	            helium.go_to(url)
	            # NOTE(review): presumably a randomized wait for page load - confirm in browser_agent.
	            browser_agent._smart_delay(2, 3)

	            driver = helium.get_driver()

	            # Use balanced extraction
	            result = driver.execute_script(balanced_address_extraction())

	            # Get additional data; a stub object stands in for a missing element
	            # so that .textContent is always readable.
	            additional_script = """
	            return {
	                price: (document.querySelector('.price') ||
	                        document.querySelector('[class*="price"]') ||
	                        {textContent: 'N/A'}).textContent.trim(),
	                description: (document.querySelector('#postingbody') ||
	                              document.querySelector('.postingbody') ||
	                              {textContent: 'N/A'}).textContent.trim(),
	                title: (document.querySelector('.postingtitle') ||
	                        {textContent: 'N/A'}).textContent.trim()
	            };
	            """
	            additional_data = driver.execute_script(additional_script)

	            # Process results
	            location = result.get('location')
	            if location:
	                # Apply light normalization (don't be too aggressive)
	                location = browser_agent._normalize_address(location)
	                print(f"📍 Found location: {location}")
	            else:
	                location = 'N/A'
	                print("❌ No location information found")

	            final_result = {
	                'address': location,
	                'price': additional_data.get('price', 'N/A'),
	                'description': additional_data.get('description', 'N/A'),
	                'title': additional_data.get('title', 'N/A'),
	                'debug': result.get('debug', {}),
	                'all_candidates': result.get('all_candidates', []),
	            }

	            # Enhanced logging
	            debug = final_result['debug']
	            if debug:
	                print(f"📊 Found {debug.get('total_found', 0)} location candidates")
	                print(f"🏆 Best quality: {debug.get('best_quality', 0)}/10")

	                if debug.get('strategies'):
	                    print("🎯 Top candidates:")
	                    for i, candidate in enumerate(debug['strategies'][:3], 1):
	                        print(f" {i}. {candidate['location']} (Q:{candidate['quality']}, {candidate['source']})")

	            return final_result

	        except Exception as e:
	            # Deliberate best-effort: any failure falls back to the pre-patch implementation.
	            print(f"Balanced extraction failed for {url}: {e}")
	            return original_function(url)

	    browser_agent._get_detailed_data_with_enhanced_address = balanced_extraction
	    print("✅ Applied balanced address extraction to browser agent")

	if __name__ == "__main__":
	    # Direct execution only prints a two-line banner; nothing is patched here.
	    print(
	        "🎯 Balanced Address Extraction Fix",
	        "Shows users the best available location information, even if approximate",
	        sep="\n",
	    )