Voucher-Bot / balanced_address_fix.py
Raj718's picture
Initial commit: NYC Voucher Housing Navigator
dbaeeae
#!/usr/bin/env python3
"""
Balanced Address Extraction Fix
Shows the best available location information to users
Prioritizes complete addresses but falls back to useful approximations
"""
def balanced_address_extraction():
"""
Balanced extraction that shows users the best available location info.
Never returns N/A if there's any useful location information.
"""
return """
function extractBestLocationInfo() {
let allLocations = [];
let debug = { strategies: [], fallbacks: [] };
// Function to score location usefulness (more permissive than before)
function scoreLocation(location) {
if (!location || location.length < 3) return 0;
let score = 0;
let addr = location.toLowerCase();
// Perfect: Full address with house number + street + borough + zip
if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)\s*,?\s*ny\s+\d{5}/.test(addr)) {
score = 10;
}
// Excellent: Partial address with house number + street + borough
else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
score = 9;
}
// Very Good: Street with house number (missing borough)
else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
score = 8;
}
// Good: Intersection with specific streets
else if ((addr.includes('near') || addr.includes('&') || addr.includes(' and ')) &&
/(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
score = 7;
}
// Fair: Street name + borough (no house number)
else if (/[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
score = 6;
}
// Useful: Neighborhood/area + borough
else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr) &&
!/all (bronx|brooklyn|manhattan|queens|staten island) areas/.test(addr) &&
addr.length > 10 && addr.length < 100) {
score = 5;
}
// Basic: Just intersection description
else if (addr.includes('near') && addr.length > 8) {
score = 4;
}
// Minimal: Borough-specific area (better than nothing)
else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr) && addr.length > 5) {
score = 3;
}
return score;
}
// Strategy 1: Look for ALL text that might contain location info
function findAllLocationMentions() {
let found = [];
let searchTexts = [];
// Get main content areas
let contentAreas = [
document.querySelector('#postingbody'),
document.querySelector('.postingbody'),
document.querySelector('.section-content'),
document.querySelector('.postingtitle'),
document.querySelector('#titletextonly')
];
// Get map address (often most reliable)
let mapEl = document.querySelector('.mapaddress') ||
document.querySelector('[class*="map-address"]');
if (mapEl) {
searchTexts.push(mapEl.textContent);
}
// Get all text content
for (let area of contentAreas) {
if (area && area.textContent) {
searchTexts.push(area.textContent);
}
}
// Get attribute groups
let attrGroups = document.querySelectorAll('.attrgroup');
for (let group of attrGroups) {
if (group.textContent) {
searchTexts.push(group.textContent);
}
}
// Extract location info from all text
for (let text of searchTexts) {
if (!text) continue;
// Pattern 1: Complete addresses
let completeMatches = text.match(/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)[^,]*,?\s*NY\s*\d{0,5}/gi);
if (completeMatches) {
completeMatches.forEach(addr => {
found.push({
location: addr.trim(),
source: 'complete_address',
quality: scoreLocation(addr)
});
});
}
// Pattern 2: Partial addresses
let partialMatches = text.match(/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi);
if (partialMatches) {
partialMatches.forEach(addr => {
found.push({
location: addr.trim(),
source: 'partial_address',
quality: scoreLocation(addr)
});
});
}
// Pattern 3: Street intersections
let intersectionMatches = text.match(/[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)\s+(?:near|and|&)\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)/gi);
if (intersectionMatches) {
intersectionMatches.forEach(addr => {
found.push({
location: addr.trim(),
source: 'intersection',
quality: scoreLocation(addr)
});
});
}
// Pattern 4: Neighborhood mentions
let neighborhoodMatches = text.match(/(?:near|in|around|at)\s+[A-Za-z\s]{3,30}(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi);
if (neighborhoodMatches) {
neighborhoodMatches.forEach(addr => {
let cleaned = addr.replace(/^(?:near|in|around|at)\s+/i, '').trim();
if (cleaned.length > 8) {
found.push({
location: cleaned,
source: 'neighborhood',
quality: scoreLocation(cleaned)
});
}
});
}
}
return found;
}
// Strategy 2: Check for Google Maps or other external location sources
function findExternalLocationSources() {
let found = [];
// Check iframes for maps
let iframes = document.querySelectorAll('iframe');
for (let iframe of iframes) {
if (iframe.src && iframe.src.includes('maps')) {
let urlMatch = iframe.src.match(/q=([^&]+)/);
if (urlMatch) {
let addr = decodeURIComponent(urlMatch[1]);
found.push({
location: addr,
source: 'google_maps',
quality: scoreLocation(addr)
});
}
}
}
return found;
}
// Execute all strategies
allLocations = allLocations.concat(findAllLocationMentions());
allLocations = allLocations.concat(findExternalLocationSources());
// Remove duplicates and very poor quality locations
let uniqueLocations = [];
let seen = new Set();
for (let loc of allLocations) {
let normalized = loc.location.toLowerCase().replace(/[^\w\s]/g, '').trim();
if (!seen.has(normalized) && loc.quality > 0 && loc.location.length > 3) {
// Skip overly generic entries
if (!loc.location.toLowerCase().includes('all bronx areas') &&
!loc.location.toLowerCase().includes('all brooklyn areas') &&
!loc.location.toLowerCase().includes('all manhattan areas') &&
!loc.location.toLowerCase().includes('all queens areas')) {
seen.add(normalized);
uniqueLocations.push(loc);
}
}
}
// Sort by quality (best first)
uniqueLocations.sort((a, b) => b.quality - a.quality);
debug.strategies = uniqueLocations;
debug.total_found = uniqueLocations.length;
debug.best_quality = uniqueLocations.length > 0 ? uniqueLocations[0].quality : 0;
// Select best location
let bestLocation = null;
if (uniqueLocations.length > 0) {
bestLocation = uniqueLocations[0].location;
// Add quality indicator for user
let quality = uniqueLocations[0].quality;
if (quality >= 8) {
// Complete address - no indicator needed
bestLocation = bestLocation;
} else if (quality >= 6) {
// Good partial address
bestLocation = bestLocation;
} else if (quality >= 4) {
// Approximate location
bestLocation = `~${bestLocation}`;
}
}
return {
location: bestLocation,
debug: debug,
all_candidates: uniqueLocations
};
}
return extractBestLocationInfo();
"""
def apply_balanced_extraction():
"""Apply balanced address extraction to browser agent."""
import browser_agent
original_function = browser_agent._get_detailed_data_with_enhanced_address
def balanced_extraction(url):
"""Balanced version that shows best available location info."""
try:
import helium
print(f"🎯 Balanced location extraction for {url}")
helium.go_to(url)
browser_agent._smart_delay(2, 3)
# Use balanced extraction
extraction_script = balanced_address_extraction()
result = helium.get_driver().execute_script(extraction_script)
# Get additional data
additional_script = """
return {
price: (document.querySelector('.price') ||
document.querySelector('[class*="price"]') ||
{textContent: 'N/A'}).textContent.trim(),
description: (document.querySelector('#postingbody') ||
document.querySelector('.postingbody') ||
{textContent: 'N/A'}).textContent.trim(),
title: (document.querySelector('.postingtitle') ||
{textContent: 'N/A'}).textContent.trim()
};
"""
additional_data = helium.get_driver().execute_script(additional_script)
# Process results
location = result.get('location')
if location:
# Apply light normalization (don't be too aggressive)
location = browser_agent._normalize_address(location)
print(f"πŸ“ Found location: {location}")
else:
location = 'N/A'
print(f"❌ No location information found")
final_result = {
'address': location,
'price': additional_data.get('price', 'N/A'),
'description': additional_data.get('description', 'N/A'),
'title': additional_data.get('title', 'N/A'),
'debug': result.get('debug', {}),
'all_candidates': result.get('all_candidates', [])
}
# Enhanced logging
if final_result.get('debug'):
debug = final_result['debug']
print(f"πŸ“Š Found {debug.get('total_found', 0)} location candidates")
print(f"πŸ† Best quality: {debug.get('best_quality', 0)}/10")
if debug.get('strategies'):
print(f"🎯 Top candidates:")
for i, candidate in enumerate(debug['strategies'][:3], 1):
print(f" {i}. {candidate['location']} (Q:{candidate['quality']}, {candidate['source']})")
return final_result
except Exception as e:
print(f"Balanced extraction failed for {url}: {e}")
return original_function(url)
browser_agent._get_detailed_data_with_enhanced_address = balanced_extraction
print("βœ… Applied balanced address extraction to browser agent")
if __name__ == "__main__":
print("🎯 Balanced Address Extraction Fix")
print("Shows users the best available location information, even if approximate")