Voucher-Bot / fixed_address_extraction.py
Raj718's picture
Initial commit: NYC Voucher Housing Navigator
dbaeeae
#!/usr/bin/env python3
"""
Fixed Address Extraction - Prioritizes Real Address Sources
Based on debug findings: .mapaddress and JSON structured data contain the real addresses
"""
def fixed_address_extraction() -> str:
    """Return the JavaScript source used to extract a listing's address.

    The returned script defines ``extractRealAddress()`` and ends with
    ``return extractRealAddress();`` so it can be handed to a browser
    driver's script-execution call as-is.

    The script gathers address candidates from four places, in this order:

    1. ``<script type*="json">`` blocks: any nested object that has a
       ``streetAddress`` property (source ``structured_data``).
    2. Elements matching ``.mapaddress`` selectors (source ``mapaddress``).
    3. Regex matches inside ``#postingbody`` / ``.postingbody`` /
       ``.section-content`` (source ``body_text``).
    4. A parenthetical in the posting title that names a borough
       (source ``title_location``, scored with the ``title`` penalty).

    Each candidate is scored by ``scoreAddress`` (content pattern plus a
    per-source bonus, minus a penalty for title-like text, floored at 0).
    Candidates scoring 0 and duplicates are dropped, the rest are sorted by
    score, and the best one is used only if its score is greater than 3.

    Returns:
        JavaScript source text. When executed, the script returns an object
        with keys ``address`` (string or ``null``), ``debug`` and
        ``all_candidates``.
    """
    # Raw string: the JavaScript regexes contain sequences such as \d, \s,
    # \w, \( and \) which are invalid escapes in a normal Python literal
    # (SyntaxWarning on Python 3.12+). The literal contains no valid Python
    # escapes, so the raw form yields exactly the same JavaScript.
    return r"""
    function extractRealAddress() {
        let candidates = [];
        let debug = { sources: {}, title_avoided: false };
        // Function to score address quality
        function scoreAddress(addr, source) {
            if (!addr || addr.length < 3) return 0;
            let score = 0;
            let text = addr.toLowerCase().trim();
            // Boost score based on reliable source
            let sourceBonus = 0;
            if (source === 'structured_data') sourceBonus = 5;
            else if (source === 'mapaddress') sourceBonus = 4;
            else if (source === 'body_text') sourceBonus = 2;
            else if (source === 'title') sourceBonus = -10; // AVOID TITLES
            // Score the content quality
            if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)\s*,?\s*ny\s+\d{5}/.test(text)) {
                score = 10 + sourceBonus;
            }
            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text)) {
                score = 9 + sourceBonus;
            }
            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(text)) {
                score = 8 + sourceBonus;
            }
            else if (/[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text)) {
                score = 6 + sourceBonus;
            }
            else if (text.includes('near') && /(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(text)) {
                score = 5 + sourceBonus;
            }
            else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text) &&
                     !text.includes('all ') && !text.includes('newly renovated') &&
                     !text.includes('bedroom') && text.length > 8 && text.length < 60) {
                score = 4 + sourceBonus;
            }
            // Penalty for title-like content
            if (text.includes('br apt') || text.includes('bedroom') || text.includes('renovated') ||
                text.includes('$') || text.includes('/') || text.includes('newly')) {
                score -= 15;
            }
            return Math.max(0, score);
        }
        // Strategy 1: Extract from JSON-LD structured data (highest priority)
        function extractFromStructuredData() {
            let found = [];
            let scripts = document.querySelectorAll('script[type*="json"]');
            for (let script of scripts) {
                try {
                    let data = JSON.parse(script.textContent);
                    // Look for address objects
                    function findAddresses(obj) {
                        if (typeof obj !== 'object' || obj === null) return;
                        if (obj.streetAddress) {
                            let addr = obj.streetAddress;
                            if (obj.addressLocality) addr += ', ' + obj.addressLocality;
                            if (obj.addressRegion) addr += ', ' + obj.addressRegion;
                            if (obj.postalCode) addr += ' ' + obj.postalCode;
                            found.push({
                                address: addr.trim(),
                                source: 'structured_data',
                                quality: scoreAddress(addr, 'structured_data')
                            });
                        }
                        // Recursively search nested objects
                        for (let key in obj) {
                            if (typeof obj[key] === 'object') {
                                findAddresses(obj[key]);
                            }
                        }
                    }
                    findAddresses(data);
                } catch (e) {
                    // Invalid JSON, skip
                }
            }
            return found;
        }
        // Strategy 2: Extract from mapaddress element (second highest priority)
        function extractFromMapAddress() {
            let found = [];
            let mapSelectors = [
                '.mapaddress',
                '[class*="mapaddress"]',
                '.postingtitle .mapaddress'
            ];
            for (let selector of mapSelectors) {
                let elements = document.querySelectorAll(selector);
                for (let el of elements) {
                    if (el.textContent && el.textContent.trim()) {
                        let addr = el.textContent.trim();
                        found.push({
                            address: addr,
                            source: 'mapaddress',
                            quality: scoreAddress(addr, 'mapaddress')
                        });
                    }
                }
            }
            return found;
        }
        // Strategy 3: Extract from body text (careful to avoid title contamination)
        function extractFromBodyText() {
            let found = [];
            let bodySelectors = ['#postingbody', '.postingbody', '.section-content'];
            for (let selector of bodySelectors) {
                let elements = document.querySelectorAll(selector);
                for (let el of elements) {
                    if (el.textContent && el.textContent.trim()) {
                        let text = el.textContent;
                        // Look for address patterns
                        let patterns = [
                            /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)\s*,?\s*NY\s*\d{0,5}/gi,
                            /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi,
                            /(?:Near|At|On)\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)\s*(?:and|&|near)\s*[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)/gi
                        ];
                        for (let pattern of patterns) {
                            let matches = text.match(pattern);
                            if (matches) {
                                matches.forEach(addr => {
                                    found.push({
                                        address: addr.trim(),
                                        source: 'body_text',
                                        quality: scoreAddress(addr, 'body_text')
                                    });
                                });
                            }
                        }
                    }
                }
            }
            return found;
        }
        // Strategy 4: Extract from title ONLY as last resort (with penalties)
        function extractFromTitle() {
            let found = [];
            let titleEl = document.querySelector('.postingtitle') ||
                          document.querySelector('#titletextonly');
            if (titleEl && titleEl.textContent) {
                let titleText = titleEl.textContent;
                // Look for parenthetical location info like "(Fordham Vicinity)"
                let locMatch = titleText.match(/\(([^)]+(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)[^)]*)\)/i);
                if (locMatch) {
                    let location = locMatch[1].trim();
                    if (!location.includes('bedroom') && !location.includes('br ') &&
                        !location.includes('renovated') && location.length > 5) {
                        found.push({
                            address: location,
                            source: 'title_location',
                            quality: scoreAddress(location, 'title')
                        });
                    }
                }
                // Avoid extracting the main title as address
                debug.title_avoided = true;
            }
            return found;
        }
        // Execute strategies in priority order
        candidates = candidates.concat(extractFromStructuredData());
        candidates = candidates.concat(extractFromMapAddress());
        candidates = candidates.concat(extractFromBodyText());
        candidates = candidates.concat(extractFromTitle());
        // Remove duplicates and filter out poor quality
        let uniqueCandidates = [];
        let seen = new Set();
        for (let candidate of candidates) {
            let normalized = candidate.address.toLowerCase().replace(/[^\w\s]/g, '');
            if (!seen.has(normalized) && candidate.quality > 0) {
                seen.add(normalized);
                uniqueCandidates.push(candidate);
            }
        }
        // Sort by quality (highest first)
        uniqueCandidates.sort((a, b) => b.quality - a.quality);
        debug.total_candidates = uniqueCandidates.length;
        debug.candidates = uniqueCandidates;
        debug.best_quality = uniqueCandidates.length > 0 ? uniqueCandidates[0].quality : 0;
        // Select best address
        let bestAddress = null;
        if (uniqueCandidates.length > 0 && uniqueCandidates[0].quality > 3) {
            bestAddress = uniqueCandidates[0].address;
            // Clean up the address
            bestAddress = bestAddress.replace(/^(Near|At|On)\s+/i, '');
            bestAddress = bestAddress.trim();
        }
        return {
            address: bestAddress,
            debug: debug,
            all_candidates: uniqueCandidates
        };
    }
    return extractRealAddress();
    """
def apply_fixed_extraction():
    """Apply the fixed address extraction to browser agent.

    Monkey-patches ``browser_agent._get_detailed_data_with_enhanced_address``
    with a wrapper that navigates to the listing URL, runs the script from
    ``fixed_address_extraction()`` in the page, and collects price,
    description and title with a second script. The previously installed
    function is kept and used as a fallback whenever the wrapper raises.
    """
    import browser_agent

    # Keep the implementation being replaced so the wrapper can fall back to it.
    original_function = browser_agent._get_detailed_data_with_enhanced_address

    def fixed_extraction(url):
        """Fixed version that finds real addresses and avoids title contamination.

        Args:
            url: Listing URL to open in the browser.

        Returns:
            A dict with keys ``address``, ``price``, ``description``,
            ``title``, ``debug`` and ``all_candidates``; ``address`` is
            ``'N/A'`` when the script found nothing usable. If any step
            raises, the result of the original function is returned instead.
        """
        try:
            # Imported inside the try so a missing helium install also falls
            # back to the original implementation.
            import helium

            print(f"🔧 Fixed address extraction for {url}")
            helium.go_to(url)
            browser_agent._smart_delay(2, 3)

            # Use fixed extraction script
            extraction_script = fixed_address_extraction()
            result = helium.get_driver().execute_script(extraction_script)

            # Get additional data; each lookup falls back to a stub object so
            # a missing element yields 'N/A' instead of a script error.
            additional_script = """
            return {
                price: (document.querySelector('.price') ||
                        document.querySelector('[class*="price"]') ||
                        {textContent: 'N/A'}).textContent.trim(),
                description: (document.querySelector('#postingbody') ||
                              document.querySelector('.postingbody') ||
                              {textContent: 'N/A'}).textContent.trim(),
                title: (document.querySelector('.postingtitle') ||
                        {textContent: 'N/A'}).textContent.trim()
            };
            """
            additional_data = helium.get_driver().execute_script(additional_script)

            # Process results
            address = result.get('address')
            if address:
                # Light normalization
                address = browser_agent._normalize_address(address)
                print(f"📍 Found address: {address}")
            else:
                address = 'N/A'
                print("❌ No address found")

            final_result = {
                'address': address,
                'price': additional_data.get('price', 'N/A'),
                'description': additional_data.get('description', 'N/A'),
                'title': additional_data.get('title', 'N/A'),
                'debug': result.get('debug', {}),
                'all_candidates': result.get('all_candidates', []),
            }

            # Enhanced logging
            if final_result.get('debug'):
                debug = final_result['debug']
                print(f"📊 Found {debug.get('total_candidates', 0)} address candidates")
                # The extraction script's scoreAddress tops out at 15
                # (base 10 plus the structured_data bonus of 5), not 10.
                print(f"🏆 Best quality: {debug.get('best_quality', 0)}/15")
                print(f"🚫 Title avoided: {debug.get('title_avoided', False)}")
                if debug.get('candidates'):
                    print("🎯 Top candidates:")
                    for i, candidate in enumerate(debug['candidates'][:3], 1):
                        print(f" {i}. {candidate['address']} (Q:{candidate['quality']}, {candidate['source']})")
            return final_result
        except Exception as e:
            # Deliberate best-effort boundary: any failure (browser error,
            # unexpected script result, missing dependency) is logged and the
            # previous implementation handles the URL instead.
            print(f"Fixed extraction failed for {url}: {e}")
            return original_function(url)

    browser_agent._get_detailed_data_with_enhanced_address = fixed_extraction
    print("✅ Applied fixed address extraction to browser agent")
if __name__ == "__main__":
    # Running the module directly only prints a short description; it does
    # not patch browser_agent or open a browser.
    print("🔧 Fixed Address Extraction")
    print("Prioritizes mapaddress and structured data, avoids title contamination")