#!/usr/bin/env python3 """ Fixed Address Extraction - Prioritizes Real Address Sources Based on debug findings: .mapaddress and JSON structured data contain the real addresses """ def fixed_address_extraction(): """ Fixed extraction that finds real addresses from proper sources. Avoids title contamination by prioritizing mapaddress and structured data. """ return """ function extractRealAddress() { let candidates = []; let debug = { sources: {}, title_avoided: false }; // Function to score address quality function scoreAddress(addr, source) { if (!addr || addr.length < 3) return 0; let score = 0; let text = addr.toLowerCase().trim(); // Boost score based on reliable source let sourceBonus = 0; if (source === 'structured_data') sourceBonus = 5; else if (source === 'mapaddress') sourceBonus = 4; else if (source === 'body_text') sourceBonus = 2; else if (source === 'title') sourceBonus = -10; // AVOID TITLES // Score the content quality if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)\s*,?\s*ny\s+\d{5}/.test(text)) { score = 10 + sourceBonus; } else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text)) { score = 9 + sourceBonus; } else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(text)) { score = 8 + sourceBonus; } else if (/[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text)) { score = 6 + sourceBonus; } else if (text.includes('near') && /(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(text)) { score = 5 + sourceBonus; } else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(text) && !text.includes('all ') && !text.includes('newly renovated') && !text.includes('bedroom') && text.length > 8 && text.length < 60) { score = 4 + sourceBonus; } // Penalty for title-like content if (text.includes('br apt') || text.includes('bedroom') || text.includes('renovated') || text.includes('$') || text.includes('/') || text.includes('newly')) { score -= 15; } return Math.max(0, score); } // Strategy 1: Extract from JSON-LD structured data (highest priority) function extractFromStructuredData() { let found = []; let scripts = document.querySelectorAll('script[type*="json"]'); for (let script of scripts) { try { let data = JSON.parse(script.textContent); // Look for address objects function findAddresses(obj) { if (typeof obj !== 'object' || obj === null) return; if (obj.streetAddress) { let addr = obj.streetAddress; if (obj.addressLocality) addr += ', ' + obj.addressLocality; if (obj.addressRegion) addr += ', ' + obj.addressRegion; if (obj.postalCode) addr += ' ' + obj.postalCode; found.push({ address: addr.trim(), source: 'structured_data', quality: scoreAddress(addr, 'structured_data') }); } // Recursively search nested objects for (let key in obj) { if (typeof obj[key] === 'object') { findAddresses(obj[key]); } } } findAddresses(data); } catch (e) { // Invalid JSON, skip } } return found; } // Strategy 2: Extract from mapaddress element (second highest priority) function extractFromMapAddress() { let found = []; let mapSelectors = [ '.mapaddress', '[class*="mapaddress"]', '.postingtitle .mapaddress' ]; for (let selector of mapSelectors) { let elements = document.querySelectorAll(selector); for (let el of elements) { if (el.textContent && el.textContent.trim()) { let addr = el.textContent.trim(); found.push({ address: addr, source: 'mapaddress', quality: scoreAddress(addr, 'mapaddress') }); } } } return found; } // Strategy 3: Extract from body text (careful to avoid title contamination) function extractFromBodyText() { let found = []; let bodySelectors = ['#postingbody', '.postingbody', '.section-content']; for (let selector of bodySelectors) { let elements = document.querySelectorAll(selector); for (let el of elements) { if (el.textContent && el.textContent.trim()) { let text = el.textContent; // Look for address patterns let patterns = [ /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)\s*,?\s*NY\s*\d{0,5}/gi, /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi, /(?:Near|At|On)\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)\s*(?:and|&|near)\s*[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)/gi ]; for (let pattern of patterns) { let matches = text.match(pattern); if (matches) { matches.forEach(addr => { found.push({ address: addr.trim(), source: 'body_text', quality: scoreAddress(addr, 'body_text') }); }); } } } } } return found; } // Strategy 4: Extract from title ONLY as last resort (with penalties) function extractFromTitle() { let found = []; let titleEl = document.querySelector('.postingtitle') || document.querySelector('#titletextonly'); if (titleEl && titleEl.textContent) { let titleText = titleEl.textContent; // Look for parenthetical location info like "(Fordham Vicinity)" let locMatch = titleText.match(/\(([^)]+(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)[^)]*)\)/i); if (locMatch) { let location = locMatch[1].trim(); if (!location.includes('bedroom') && !location.includes('br ') && !location.includes('renovated') && location.length > 5) { found.push({ address: location, source: 'title_location', quality: scoreAddress(location, 'title') }); } } // Avoid extracting the main title as address debug.title_avoided = true; } return found; } // Execute strategies in priority order candidates = candidates.concat(extractFromStructuredData()); candidates = candidates.concat(extractFromMapAddress()); candidates = candidates.concat(extractFromBodyText()); candidates = candidates.concat(extractFromTitle()); // Remove duplicates and filter out poor quality let uniqueCandidates = []; let seen = new Set(); for (let candidate of candidates) { let normalized = candidate.address.toLowerCase().replace(/[^\w\s]/g, ''); if (!seen.has(normalized) && candidate.quality > 0) { seen.add(normalized); uniqueCandidates.push(candidate); } } // Sort by quality (highest first) uniqueCandidates.sort((a, b) => b.quality - a.quality); debug.total_candidates = uniqueCandidates.length; debug.candidates = uniqueCandidates; debug.best_quality = uniqueCandidates.length > 0 ? uniqueCandidates[0].quality : 0; // Select best address let bestAddress = null; if (uniqueCandidates.length > 0 && uniqueCandidates[0].quality > 3) { bestAddress = uniqueCandidates[0].address; // Clean up the address bestAddress = bestAddress.replace(/^(Near|At|On)\s+/i, ''); bestAddress = bestAddress.trim(); } return { address: bestAddress, debug: debug, all_candidates: uniqueCandidates }; } return extractRealAddress(); """ def apply_fixed_extraction(): """Apply the fixed address extraction to browser agent.""" import browser_agent original_function = browser_agent._get_detailed_data_with_enhanced_address def fixed_extraction(url): """Fixed version that finds real addresses and avoids title contamination.""" try: import helium print(f"🔧 Fixed address extraction for {url}") helium.go_to(url) browser_agent._smart_delay(2, 3) # Use fixed extraction script extraction_script = fixed_address_extraction() result = helium.get_driver().execute_script(extraction_script) # Get additional data additional_script = """ return { price: (document.querySelector('.price') || document.querySelector('[class*="price"]') || {textContent: 'N/A'}).textContent.trim(), description: (document.querySelector('#postingbody') || document.querySelector('.postingbody') || {textContent: 'N/A'}).textContent.trim(), title: (document.querySelector('.postingtitle') || {textContent: 'N/A'}).textContent.trim() }; """ additional_data = helium.get_driver().execute_script(additional_script) # Process results address = result.get('address') if address: # Light normalization address = browser_agent._normalize_address(address) print(f"📍 Found address: {address}") else: address = 'N/A' print(f"❌ No address found") final_result = { 'address': address, 'price': additional_data.get('price', 'N/A'), 'description': additional_data.get('description', 'N/A'), 'title': additional_data.get('title', 'N/A'), 'debug': result.get('debug', {}), 'all_candidates': result.get('all_candidates', []) } # Enhanced logging if final_result.get('debug'): debug = final_result['debug'] print(f"📊 Found {debug.get('total_candidates', 0)} address candidates") print(f"🏆 Best quality: {debug.get('best_quality', 0)}/10") print(f"🚫 Title avoided: {debug.get('title_avoided', False)}") if debug.get('candidates'): print(f"🎯 Top candidates:") for i, candidate in enumerate(debug['candidates'][:3], 1): print(f" {i}. {candidate['address']} (Q:{candidate['quality']}, {candidate['source']})") return final_result except Exception as e: print(f"Fixed extraction failed for {url}: {e}") return original_function(url) browser_agent._get_detailed_data_with_enhanced_address = fixed_extraction print("✅ Applied fixed address extraction to browser agent") if __name__ == "__main__": print("🔧 Fixed Address Extraction") print("Prioritizes mapaddress and structured data, avoids title contamination")