#!/usr/bin/env python3
"""
Balanced address extraction fix.

Builds a browser-side JavaScript snippet that shows users the best available
location information: it prioritizes complete addresses but falls back to
useful approximations.
"""

def balanced_address_extraction() -> str:
    """
    Return the JavaScript source for the balanced location extractor.

    The script is meant to be executed in the page (e.g. through Selenium's
    ``execute_script``). It defines ``extractBestLocationInfo`` and returns
    its result, an object with three keys:

    * ``location`` - the best candidate string, prefixed with ``~`` when it
      is only approximate (quality below 6), or ``null`` when nothing useful
      was found.
    * ``debug`` - ``strategies`` (the ranked candidates), ``total_found``,
      ``best_quality`` and an unused ``fallbacks`` list.
    * ``all_candidates`` - the de-duplicated candidates, best first; each has
      ``location``, ``source`` and ``quality`` (0-10).

    The literal is a raw string: the embedded regular expressions use
    ``\\d``, ``\\s`` and ``\\w``, which are invalid escape sequences in a
    normal Python string literal.
    """
    return r"""
    function extractBestLocationInfo() {
        let allLocations = [];
        let debug = { strategies: [], fallbacks: [] };

        // Generic "All <borough> areas" labels carry no usable location detail.
        const GENERIC_AREA = /all (?:bronx|brooklyn|manhattan|queens|staten island) areas/;

        // Collapse whitespace runs: matches taken from textContent can span line breaks.
        function tidy(text) {
            return text.replace(/\s+/g, ' ').trim();
        }

        // Score location usefulness from 0 (useless) to 10 (full address).
        function scoreLocation(location) {
            if (!location || location.length < 3) return 0;

            let score = 0;
            let addr = location.toLowerCase();

            // Perfect: Full address with house number + street + borough + zip
            if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)\s*,?\s*ny\s+\d{5}/.test(addr)) {
                score = 10;
            }
            // Excellent: Partial address with house number + street + borough
            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
                score = 9;
            }
            // Very Good: Street with house number (missing borough)
            else if (/\d+\s+[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
                score = 8;
            }
            // Good: Intersection with specific streets
            else if ((addr.includes('near') || addr.includes('&') || addr.includes(' and ')) &&
                     /(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)/.test(addr)) {
                score = 7;
            }
            // Fair: Street name + borough (no house number)
            else if (/[a-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|drive|dr|place|pl|lane|ln)\s*,?\s*(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr)) {
                score = 6;
            }
            // Useful: Neighborhood/area + borough
            else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr) &&
                     !GENERIC_AREA.test(addr) &&
                     addr.length > 10 && addr.length < 100) {
                score = 5;
            }
            // Basic: Just intersection description
            else if (addr.includes('near') && addr.length > 8) {
                score = 4;
            }
            // Minimal: Borough-specific area (better than nothing)
            else if (/(?:bronx|brooklyn|manhattan|queens|staten island)/.test(addr) && addr.length > 5) {
                score = 3;
            }

            return score;
        }

        // Strategy 1: Look for ALL text that might contain location info
        function findAllLocationMentions() {
            let found = [];
            let searchTexts = [];

            // Get main content areas
            let contentAreas = [
                document.querySelector('#postingbody'),
                document.querySelector('.postingbody'),
                document.querySelector('.section-content'),
                document.querySelector('.postingtitle'),
                document.querySelector('#titletextonly')
            ];

            // Get map address (often most reliable)
            let mapEl = document.querySelector('.mapaddress') ||
                       document.querySelector('[class*="map-address"]');
            if (mapEl) {
                searchTexts.push(mapEl.textContent);
            }

            // Get all text content
            for (let area of contentAreas) {
                if (area && area.textContent) {
                    searchTexts.push(area.textContent);
                }
            }

            // Get attribute groups
            let attrGroups = document.querySelectorAll('.attrgroup');
            for (let group of attrGroups) {
                if (group.textContent) {
                    searchTexts.push(group.textContent);
                }
            }

            // Address-like patterns, tried in this order for every text.
            let addressPatterns = [
                // Pattern 1: Complete addresses
                { source: 'complete_address',
                  regex: /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)[^,]*,?\s*NY\s*\d{0,5}/gi },
                // Pattern 2: Partial addresses
                { source: 'partial_address',
                  regex: /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Place|Pl|Lane|Ln)[^,]*,?\s*(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi },
                // Pattern 3: Street intersections
                { source: 'intersection',
                  regex: /[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)\s+(?:near|and|&)\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd)/gi }
            ];

            // Extract location info from all text
            for (let text of searchTexts) {
                if (!text) continue;

                for (let pattern of addressPatterns) {
                    let matches = text.match(pattern.regex) || [];
                    for (let match of matches) {
                        let candidate = tidy(match);
                        found.push({
                            location: candidate,
                            source: pattern.source,
                            quality: scoreLocation(candidate)
                        });
                    }
                }

                // Pattern 4: Neighborhood mentions (leading preposition is stripped)
                let neighborhoodMatches = text.match(/(?:near|in|around|at)\s+[A-Za-z\s]{3,30}(?:Bronx|Brooklyn|Manhattan|Queens|Staten Island)/gi) || [];
                for (let match of neighborhoodMatches) {
                    let cleaned = tidy(match.replace(/^(?:near|in|around|at)\s+/i, ''));
                    if (cleaned.length > 8) {
                        found.push({
                            location: cleaned,
                            source: 'neighborhood',
                            quality: scoreLocation(cleaned)
                        });
                    }
                }
            }

            return found;
        }

        // Strategy 2: Check for Google Maps or other external location sources
        function findExternalLocationSources() {
            let found = [];

            // Check iframes for maps
            let iframes = document.querySelectorAll('iframe');
            for (let iframe of iframes) {
                if (!iframe.src || !iframe.src.includes('maps')) continue;

                let urlMatch = iframe.src.match(/q=([^&]+)/);
                if (!urlMatch) continue;

                let addr;
                try {
                    // Query strings encode spaces as '+', which decodeURIComponent leaves as-is.
                    addr = decodeURIComponent(urlMatch[1].replace(/\+/g, ' '));
                } catch (e) {
                    // Malformed percent-encoding: skip this iframe rather than abort the extraction.
                    continue;
                }

                found.push({
                    location: addr,
                    source: 'google_maps',
                    quality: scoreLocation(addr)
                });
            }

            return found;
        }

        // Execute all strategies
        allLocations = allLocations.concat(findAllLocationMentions());
        allLocations = allLocations.concat(findExternalLocationSources());

        // Remove duplicates and very poor quality locations
        let uniqueLocations = [];
        let seen = new Set();

        for (let loc of allLocations) {
            let lowered = loc.location.toLowerCase();
            let normalized = lowered.replace(/[^\w\s]/g, '').trim();
            if (seen.has(normalized) || loc.quality <= 0 || loc.location.length <= 3) continue;

            // Skip overly generic entries (all five boroughs)
            if (GENERIC_AREA.test(lowered)) continue;

            seen.add(normalized);
            uniqueLocations.push(loc);
        }

        // Sort by quality (best first)
        uniqueLocations.sort((a, b) => b.quality - a.quality);

        debug.strategies = uniqueLocations;
        debug.total_found = uniqueLocations.length;
        debug.best_quality = uniqueLocations.length > 0 ? uniqueLocations[0].quality : 0;

        // Select best location
        let bestLocation = null;
        if (uniqueLocations.length > 0) {
            let best = uniqueLocations[0];
            // Quality 6+ is a street-level match and is shown as-is; anything
            // weaker is only approximate and is flagged with a leading '~'.
            bestLocation = best.quality >= 6 ? best.location : `~${best.location}`;
        }

        return {
            location: bestLocation,
            debug: debug,
            all_candidates: uniqueLocations
        };
    }

    return extractBestLocationInfo();
    """

def apply_balanced_extraction():
    """Monkey-patch ``browser_agent`` to use the balanced location extractor.

    Replaces ``browser_agent._get_detailed_data_with_enhanced_address`` with a
    wrapper that drives the page through ``helium``, runs the script from
    ``balanced_address_extraction()`` and returns a dict with the keys
    ``address``, ``price``, ``description``, ``title``, ``debug`` and
    ``all_candidates``. The function that was installed before patching is
    kept and called as a fallback whenever the wrapper raises.
    """
    import browser_agent

    previous_impl = browser_agent._get_detailed_data_with_enhanced_address

    def balanced_extraction(url):
        """Fetch *url* and return its listing details with the best location found."""
        try:
            import helium

            print(f"🎯 Balanced location extraction for {url}")
            helium.go_to(url)
            browser_agent._smart_delay(2, 3)

            # Location candidates come from the balanced extraction script.
            location_script = balanced_address_extraction()
            location_payload = helium.get_driver().execute_script(location_script)

            # Price, description and title are read with a second, smaller script.
            additional_script = """
            return {
                price: (document.querySelector('.price') || 
                       document.querySelector('[class*="price"]') || 
                       {textContent: 'N/A'}).textContent.trim(),
                description: (document.querySelector('#postingbody') || 
                             document.querySelector('.postingbody') ||
                             {textContent: 'N/A'}).textContent.trim(),
                title: (document.querySelector('.postingtitle') ||
                       {textContent: 'N/A'}).textContent.trim()
            };
            """
            listing_fields = helium.get_driver().execute_script(additional_script)

            raw_location = location_payload.get('location')
            if not raw_location:
                address = 'N/A'
                print("❌ No location information found")
            else:
                # Light normalization only (don't be too aggressive)
                address = browser_agent._normalize_address(raw_location)
                print(f"📍 Found location: {address}")

            final_result = {
                'address': address,
                'price': listing_fields.get('price', 'N/A'),
                'description': listing_fields.get('description', 'N/A'),
                'title': listing_fields.get('title', 'N/A'),
                'debug': location_payload.get('debug', {}),
                'all_candidates': location_payload.get('all_candidates', []),
            }

            # Enhanced logging: summary plus the three best candidates.
            debug_info = final_result['debug']
            if debug_info:
                print(f"📊 Found {debug_info.get('total_found', 0)} location candidates")
                print(f"🏆 Best quality: {debug_info.get('best_quality', 0)}/10")

                ranked = debug_info.get('strategies')
                if ranked:
                    print("🎯 Top candidates:")
                    for rank, entry in enumerate(ranked[:3], start=1):
                        print(f"   {rank}. {entry['location']} (Q:{entry['quality']}, {entry['source']})")

            return final_result

        except Exception as e:
            # Best-effort wrapper: any failure falls back to the previous implementation.
            print(f"Balanced extraction failed for {url}: {e}")
            return previous_impl(url)

    browser_agent._get_detailed_data_with_enhanced_address = balanced_extraction
    print("✅ Applied balanced address extraction to browser agent")

if __name__ == "__main__":
    # Running the module directly only prints a short banner; nothing is patched.
    for banner_line in (
        "🎯 Balanced Address Extraction Fix",
        "Shows users the best available location information, even if approximate",
    ):
        print(banner_line)