Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Comprehensive Address Extraction Fix | |
| Handles Google Maps, JavaScript content, and all address sources | |
| """ | |
| def comprehensive_address_extraction(): | |
| """ | |
| Most comprehensive address extraction script that checks ALL possible sources. | |
| """ | |
| return """ | |
| function extractAllAddresses() { | |
| let allAddresses = []; | |
| let debug = { sources: {}, raw_content: {} }; | |
| // Function to score address quality | |
| function scoreAddress(addr) { | |
| if (!addr || addr.length < 5) return 0; | |
| let score = 0; | |
| // Full address with house number + street + borough + state + zip | |
| if (/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)\s*,?\s*NY\s+\d{5}/.test(addr)) { | |
| score = 10; | |
| } | |
| // Partial address with house number + street + borough | |
| else if (/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/.test(addr)) { | |
| score = 8; | |
| } | |
| // Street with house number | |
| else if (/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)/.test(addr)) { | |
| score = 6; | |
| } | |
| // Intersection | |
| else if (addr.includes('near') || addr.includes('&') || addr.includes(' and ')) { | |
| score = 4; | |
| } | |
| // Generic area | |
| else if (/bronx|brooklyn|manhattan|queens|staten/i.test(addr)) { | |
| score = 2; | |
| } | |
| return score; | |
| } | |
| // 1. Check all text elements for addresses | |
| function scanAllTextElements() { | |
| let found = []; | |
| let allElements = document.querySelectorAll('*'); | |
| for (let el of allElements) { | |
| if (el.children.length === 0 && el.textContent.trim()) { | |
| let text = el.textContent.trim(); | |
| // Full address patterns | |
| let fullMatches = text.match(/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)\s*,?\s*NY\s*\d{5}?/gi); | |
| if (fullMatches) { | |
| fullMatches.forEach(addr => { | |
| found.push({ | |
| address: addr.trim(), | |
| source: 'text_scan_full', | |
| element: el.tagName.toLowerCase(), | |
| quality: scoreAddress(addr) | |
| }); | |
| }); | |
| } | |
| // Partial address patterns | |
| let partialMatches = text.match(/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi); | |
| if (partialMatches) { | |
| partialMatches.forEach(addr => { | |
| found.push({ | |
| address: addr.trim(), | |
| source: 'text_scan_partial', | |
| element: el.tagName.toLowerCase(), | |
| quality: scoreAddress(addr) | |
| }); | |
| }); | |
| } | |
| } | |
| } | |
| return found; | |
| } | |
| // 2. Check all data attributes and hidden content | |
| function scanDataAttributes() { | |
| let found = []; | |
| let allElements = document.querySelectorAll('*'); | |
| for (let el of allElements) { | |
| // Check all attributes | |
| for (let attr of el.attributes || []) { | |
| if (attr.value && attr.value.length > 10) { | |
| let matches = attr.value.match(/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi); | |
| if (matches) { | |
| matches.forEach(addr => { | |
| found.push({ | |
| address: addr.trim(), | |
| source: 'data_attribute', | |
| attribute: attr.name, | |
| quality: scoreAddress(addr) | |
| }); | |
| }); | |
| } | |
| } | |
| } | |
| } | |
| return found; | |
| } | |
| // 3. Check iframe content (Google Maps) | |
| function scanIframes() { | |
| let found = []; | |
| let iframes = document.querySelectorAll('iframe'); | |
| for (let iframe of iframes) { | |
| if (iframe.src && (iframe.src.includes('maps') || iframe.src.includes('google'))) { | |
| // Extract from Google Maps URL parameters | |
| let url = iframe.src; | |
| // Look for address in URL parameters | |
| let addressMatch = url.match(/q=([^&]+)/); | |
| if (addressMatch) { | |
| let addr = decodeURIComponent(addressMatch[1]); | |
| if (scoreAddress(addr) > 0) { | |
| found.push({ | |
| address: addr, | |
| source: 'google_maps_url', | |
| quality: scoreAddress(addr) | |
| }); | |
| } | |
| } | |
| // Look for coordinates that might be converted | |
| let coordMatch = url.match(/[@!](-?\d+\.\d+),(-?\d+\.\d+)/); | |
| if (coordMatch) { | |
| found.push({ | |
| address: `Coordinates: ${coordMatch[1]}, ${coordMatch[2]}`, | |
| source: 'google_maps_coords', | |
| quality: 3 | |
| }); | |
| } | |
| } | |
| } | |
| return found; | |
| } | |
| // 4. Check meta tags and structured data | |
| function scanMetaData() { | |
| let found = []; | |
| // Check meta tags | |
| let metaTags = document.querySelectorAll('meta[property], meta[name]'); | |
| for (let meta of metaTags) { | |
| if (meta.content && meta.content.length > 10) { | |
| let matches = meta.content.match(/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi); | |
| if (matches) { | |
| matches.forEach(addr => { | |
| found.push({ | |
| address: addr.trim(), | |
| source: 'meta_tag', | |
| property: meta.getAttribute('property') || meta.getAttribute('name'), | |
| quality: scoreAddress(addr) | |
| }); | |
| }); | |
| } | |
| } | |
| } | |
| // Check JSON-LD structured data | |
| let scripts = document.querySelectorAll('script[type="application/ld+json"]'); | |
| for (let script of scripts) { | |
| try { | |
| let data = JSON.parse(script.textContent); | |
| let dataStr = JSON.stringify(data); | |
| let matches = dataStr.match(/\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)\s*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi); | |
| if (matches) { | |
| matches.forEach(addr => { | |
| found.push({ | |
| address: addr.trim(), | |
| source: 'structured_data', | |
| quality: scoreAddress(addr) | |
| }); | |
| }); | |
| } | |
| } catch (e) { | |
| // Invalid JSON, skip | |
| } | |
| } | |
| return found; | |
| } | |
| // 5. Wait for and check dynamic content | |
| function scanDynamicContent() { | |
| return new Promise((resolve) => { | |
| let found = []; | |
| let checkCount = 0; | |
| let maxChecks = 10; | |
| function checkForNewAddresses() { | |
| checkCount++; | |
| // Look for any new address-containing elements | |
| let newElements = document.querySelectorAll('[data-address], .address, .location, .geo'); | |
| for (let el of newElements) { | |
| if (el.textContent && el.textContent.trim()) { | |
| let addr = el.textContent.trim(); | |
| if (scoreAddress(addr) > 0) { | |
| found.push({ | |
| address: addr, | |
| source: 'dynamic_content', | |
| quality: scoreAddress(addr) | |
| }); | |
| } | |
| } | |
| } | |
| if (checkCount < maxChecks) { | |
| setTimeout(checkForNewAddresses, 200); | |
| } else { | |
| resolve(found); | |
| } | |
| } | |
| checkForNewAddresses(); | |
| }); | |
| } | |
| // Execute all scanning methods | |
| try { | |
| // Immediate scans | |
| allAddresses = allAddresses.concat(scanAllTextElements()); | |
| allAddresses = allAddresses.concat(scanDataAttributes()); | |
| allAddresses = allAddresses.concat(scanIframes()); | |
| allAddresses = allAddresses.concat(scanMetaData()); | |
| // Store debug info | |
| debug.sources = { | |
| text_scan: allAddresses.filter(a => a.source.includes('text_scan')).length, | |
| data_attributes: allAddresses.filter(a => a.source === 'data_attribute').length, | |
| google_maps: allAddresses.filter(a => a.source.includes('google_maps')).length, | |
| meta_data: allAddresses.filter(a => a.source.includes('meta')).length | |
| }; | |
| // Remove duplicates and sort by quality | |
| let uniqueAddresses = []; | |
| let seen = new Set(); | |
| for (let addr of allAddresses) { | |
| let normalized = addr.address.toLowerCase().replace(/[^\w\s]/g, ''); | |
| if (!seen.has(normalized) && addr.address.length > 5) { | |
| seen.add(normalized); | |
| uniqueAddresses.push(addr); | |
| } | |
| } | |
| uniqueAddresses.sort((a, b) => b.quality - a.quality); | |
| debug.total_candidates = uniqueAddresses.length; | |
| debug.best_quality = uniqueAddresses.length > 0 ? uniqueAddresses[0].quality : 0; | |
| debug.all_candidates = uniqueAddresses; | |
| let bestAddress = uniqueAddresses.length > 0 ? uniqueAddresses[0].address : null; | |
| return { | |
| address: bestAddress, | |
| debug: debug, | |
| all_candidates: uniqueAddresses | |
| }; | |
| } catch (error) { | |
| debug.error = error.toString(); | |
| return { | |
| address: null, | |
| debug: debug, | |
| all_candidates: [] | |
| }; | |
| } | |
| } | |
| return extractAllAddresses(); | |
| """ | |
| def apply_comprehensive_extraction(): | |
| """Apply comprehensive address extraction to browser agent.""" | |
| import browser_agent | |
| original_function = browser_agent._get_detailed_data_with_enhanced_address | |
| def comprehensive_extraction(url): | |
| """Enhanced version with comprehensive address extraction.""" | |
| try: | |
| import helium | |
| print(f"π Comprehensive address extraction for {url}") | |
| helium.go_to(url) | |
| browser_agent._smart_delay(3, 4) # Wait longer for dynamic content | |
| # Use comprehensive extraction | |
| extraction_script = comprehensive_address_extraction() | |
| result = helium.get_driver().execute_script(extraction_script) | |
| # Get additional data | |
| additional_script = """ | |
| return { | |
| price: (document.querySelector('.price') || | |
| document.querySelector('[class*="price"]') || | |
| {textContent: 'N/A'}).textContent.trim(), | |
| description: (document.querySelector('#postingbody') || | |
| document.querySelector('.postingbody') || | |
| {textContent: 'N/A'}).textContent.trim(), | |
| title: (document.querySelector('.postingtitle') || | |
| {textContent: 'N/A'}).textContent.trim() | |
| }; | |
| """ | |
| additional_data = helium.get_driver().execute_script(additional_script) | |
| # Combine results | |
| final_result = { | |
| 'address': result.get('address') or 'N/A', | |
| 'price': additional_data.get('price', 'N/A'), | |
| 'description': additional_data.get('description', 'N/A'), | |
| 'title': additional_data.get('title', 'N/A'), | |
| 'debug': result.get('debug', {}), | |
| 'all_candidates': result.get('all_candidates', []) | |
| } | |
| # Enhanced logging | |
| if final_result.get('debug'): | |
| debug = final_result['debug'] | |
| print(f"π Comprehensive scan found {debug.get('total_candidates', 0)} total candidates") | |
| print(f"π Sources: {debug.get('sources', {})}") | |
| print(f"π Best quality: {debug.get('best_quality', 0)}") | |
| if debug.get('all_candidates'): | |
| print(f"π― Top 5 candidates:") | |
| for i, candidate in enumerate(debug['all_candidates'][:5], 1): | |
| print(f" {i}. {candidate['address']} (Q:{candidate['quality']}, {candidate['source']})") | |
| # Validate best address | |
| if final_result.get('address') and final_result['address'] != 'N/A': | |
| final_result['address'] = browser_agent._normalize_address(final_result['address']) | |
| if browser_agent._validate_address(final_result['address']): | |
| print(f"β Best address: {final_result['address']}") | |
| else: | |
| print(f"β Address validation failed: {final_result['address']}") | |
| final_result['address'] = 'N/A' | |
| return final_result | |
| except Exception as e: | |
| print(f"Comprehensive extraction failed for {url}: {e}") | |
| return original_function(url) | |
| browser_agent._get_detailed_data_with_enhanced_address = comprehensive_extraction | |
| print("β Applied comprehensive address extraction to browser agent") | |
| if __name__ == "__main__": | |
| print("π§ Comprehensive Address Extraction Fix") | |
| print("Scans ALL possible address sources including Google Maps and dynamic content") |