#!/usr/bin/env python3
"""
Balanced Address Extraction Fix.

Shows the best available location information to users.
Prioritizes complete addresses but falls back to useful approximations.
"""
def balanced_address_extraction():
    """Return the JavaScript source that extracts a listing's best location.

    The script is intended for the WebDriver's ``execute_script`` (it ends
    with a top-level ``return``). When run in the page it yields an object
    with three keys:

    * ``location`` - the highest-scoring candidate, prefixed with ``~`` when
      its score is below 6 (approximate), or ``null`` when nothing was found.
    * ``debug`` - ``strategies`` (the ranked candidates), ``fallbacks``
      (always empty), ``total_found`` and ``best_quality``.
    * ``all_candidates`` - the de-duplicated candidates, best first.

    The literal is a raw string so that the regex backslash escapes reach the
    browser unchanged without tripping Python's invalid-escape-sequence
    warning.
    """
    return r"""
    function extractBestLocationInfo() {
        const debug = { strategies: [], fallbacks: [] };

        // Borough-wide service-area labels say nothing about where the listing is.
        const GENERIC_AREA = /all (?:bronx|brooklyn|manhattan|queens|staten island) areas/;

        // Rate a location string: 0 is unusable, 10 is a full street address.
        function scoreLocation(location) {
            if (!location || location.length < 3) return 0;
            let score = 0;
            const addr = location.toLowerCase();
            // Perfect: house number + street + borough + zip
            if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)\s*,?\s*ny\s+\d{5}/.test(addr)) {
                score = 10;
            }
            // Excellent: house number + street + borough
            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
                score = 9;
            }
            // Very good: house number + street, borough missing
            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
                score = 8;
            }
            // Good: intersection naming specific streets
            else if ((addr.includes('near') || addr.includes('&') || addr.includes(' and ')) &&
                     /(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
                score = 7;
            }
            // Fair: street name + borough, no house number
            else if (/[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
                score = 6;
            }
            // Useful: neighborhood or area + borough
            else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr) &&
                     !GENERIC_AREA.test(addr) &&
                     addr.length > 10 && addr.length < 100) {
                score = 5;
            }
            // Basic: bare intersection description
            else if (addr.includes('near') && addr.length > 8) {
                score = 4;
            }
            // Minimal: borough mention only, still better than nothing
            else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr) && addr.length > 5) {
                score = 3;
            }
            return score;
        }

        // Strategy 1: scan the page text for anything that looks like a location.
        function findAllLocationMentions() {
            const found = [];
            const searchTexts = [];

            // The map address, when present, is searched first.
            const mapEl = document.querySelector('.mapaddress') ||
                          document.querySelector('[class*="map-address"]');
            if (mapEl) {
                searchTexts.push(mapEl.textContent);
            }

            // Main content areas of the posting.
            const contentAreas = [
                document.querySelector('#postingbody'),
                document.querySelector('.postingbody'),
                document.querySelector('.section-content'),
                document.querySelector('.postingtitle'),
                document.querySelector('#titletextonly')
            ];
            for (const area of contentAreas) {
                if (area && area.textContent) {
                    searchTexts.push(area.textContent);
                }
            }

            // Attribute groups.
            for (const group of document.querySelectorAll('.attrgroup')) {
                if (group.textContent) {
                    searchTexts.push(group.textContent);
                }
            }

            // Record every match of pattern in text; the score uses the untrimmed match.
            function collect(text, pattern, source) {
                for (const raw of text.match(pattern) || []) {
                    found.push({
                        location: raw.trim(),
                        source: source,
                        quality: scoreLocation(raw)
                    });
                }
            }

            for (const text of searchTexts) {
                if (!text) continue;
                // Pattern 1: complete addresses
                collect(text, /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)[^,]*,?\s*NY\s*\d{0,5}/gi, 'complete_address');
                // Pattern 2: partial addresses
                collect(text, /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi, 'partial_address');
                // Pattern 3: street intersections
                collect(text, /[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)\s+(?:near|and|&)\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)/gi, 'intersection');
                // Pattern 4: neighborhood mentions, with the leading preposition dropped
                for (const raw of text.match(/(?:near|in|around|at)\s+[A-Za-z\s]{3,30}(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi) || []) {
                    const cleaned = raw.replace(/^(?:near|in|around|at)\s+/i, '').trim();
                    if (cleaned.length > 8) {
                        found.push({
                            location: cleaned,
                            source: 'neighborhood',
                            quality: scoreLocation(cleaned)
                        });
                    }
                }
            }
            return found;
        }

        // Strategy 2: read the query out of embedded map iframes.
        function findExternalLocationSources() {
            const found = [];
            for (const iframe of document.querySelectorAll('iframe')) {
                if (!iframe.src || !iframe.src.includes('maps')) continue;
                const urlMatch = iframe.src.match(/q=([^&]+)/);
                if (!urlMatch) continue;
                let addr;
                try {
                    // Query strings encode spaces as plus signs, which decodeURIComponent keeps.
                    addr = decodeURIComponent(urlMatch[1].replace(/\+/g, ' '));
                } catch (e) {
                    // Malformed percent-encoding: skip this iframe, keep the other candidates.
                    continue;
                }
                found.push({
                    location: addr,
                    source: 'google_maps',
                    quality: scoreLocation(addr)
                });
            }
            return found;
        }

        // Run both strategies.
        const allLocations = findAllLocationMentions().concat(findExternalLocationSources());

        // Drop duplicates, unscored entries and generic borough-wide labels.
        const uniqueLocations = [];
        const seen = new Set();
        for (const loc of allLocations) {
            const normalized = loc.location.toLowerCase().replace(/[^\w\s]/g, '').trim();
            if (seen.has(normalized) || loc.quality <= 0 || loc.location.length <= 3) continue;
            if (GENERIC_AREA.test(loc.location.toLowerCase())) continue;
            seen.add(normalized);
            uniqueLocations.push(loc);
        }

        // Best first.
        uniqueLocations.sort((a, b) => b.quality - a.quality);
        debug.strategies = uniqueLocations;
        debug.total_found = uniqueLocations.length;
        debug.best_quality = uniqueLocations.length > 0 ? uniqueLocations[0].quality : 0;

        // Scores of 6 and above are real addresses; anything lower is shown as approximate.
        let bestLocation = null;
        if (uniqueLocations.length > 0) {
            const best = uniqueLocations[0];
            bestLocation = best.quality < 6 ? `~${best.location}` : best.location;
        }

        return {
            location: bestLocation,
            debug: debug,
            all_candidates: uniqueLocations
        };
    }
    return extractBestLocationInfo();
    """
def apply_balanced_extraction():
    """Monkey-patch ``browser_agent`` to use the balanced location extraction.

    Replaces ``browser_agent._get_detailed_data_with_enhanced_address`` with a
    wrapper that loads the URL through helium, runs the script returned by
    ``balanced_address_extraction()`` plus a small price/description/title
    script, and returns a dict with the keys ``address``, ``price``,
    ``description``, ``title``, ``debug`` and ``all_candidates``.

    If anything in the wrapper raises, the error is printed and the call is
    delegated to the function that was installed before patching.

    Calling this function again after it has been applied is a no-op, so the
    fallback always reaches the pre-patch function rather than another copy
    of this wrapper.
    """
    import browser_agent

    original_function = browser_agent._get_detailed_data_with_enhanced_address
    if getattr(original_function, "_balanced_extraction_patch", False):
        # Already patched: wrapping again would make the fallback below re-run
        # this same extraction instead of the real original.
        return

    # Secondary page data; each field falls back to 'N/A' when its element is missing.
    additional_script = """
    return {
        price: (document.querySelector('.price') ||
                document.querySelector('[class*="price"]') ||
                {textContent: 'N/A'}).textContent.trim(),
        description: (document.querySelector('#postingbody') ||
                      document.querySelector('.postingbody') ||
                      {textContent: 'N/A'}).textContent.trim(),
        title: (document.querySelector('.postingtitle') ||
                {textContent: 'N/A'}).textContent.trim()
    };
    """

    def balanced_extraction(url):
        """Balanced version that shows best available location info."""
        try:
            # Imported lazily so a missing helium install falls back to the original.
            import helium

            print(f"π― Balanced location extraction for {url}")
            helium.go_to(url)
            browser_agent._smart_delay(2, 3)

            driver = helium.get_driver()
            result = driver.execute_script(balanced_address_extraction())
            additional_data = driver.execute_script(additional_script)
            if not isinstance(result, dict) or not isinstance(additional_data, dict):
                # Surface a clear reason; the handler below falls back to the original.
                raise ValueError("extraction script did not return an object")

            location = result.get('location')
            if location:
                # Apply light normalization (don't be too aggressive)
                location = browser_agent._normalize_address(location)
                print(f"π Found location: {location}")
            else:
                location = 'N/A'
                print("β No location information found")

            final_result = {
                'address': location,
                'price': additional_data.get('price', 'N/A'),
                'description': additional_data.get('description', 'N/A'),
                'title': additional_data.get('title', 'N/A'),
                'debug': result.get('debug', {}),
                'all_candidates': result.get('all_candidates', []),
            }

            # Enhanced logging
            debug = final_result['debug']
            if debug:
                print(f"π Found {debug.get('total_found', 0)} location candidates")
                print(f"π Best quality: {debug.get('best_quality', 0)}/10")
                if debug.get('strategies'):
                    print("π― Top candidates:")
                    for i, candidate in enumerate(debug['strategies'][:3], 1):
                        print(f" {i}. {candidate['location']} (Q:{candidate['quality']}, {candidate['source']})")
            return final_result
        except Exception as e:
            # Deliberate best-effort: any failure defers to the pre-patch behavior.
            print(f"Balanced extraction failed for {url}: {e}")
            return original_function(url)

    # Marker read by the idempotence check above.
    balanced_extraction._balanced_extraction_patch = True
    browser_agent._get_detailed_data_with_enhanced_address = balanced_extraction
    print("β Applied balanced address extraction to browser agent")
if __name__ == "__main__":
    # Running the file directly only prints a banner; nothing is patched here.
    for banner_line in (
        "π― Balanced Address Extraction Fix",
        "Shows users the best available location information, even if approximate",
    ):
        print(banner_line)