| |
|
|
| from .model_loader import load_model |
| from geopy.geocoders import Nominatim |
| from .logging_config import logger |
| import re |
| import time |
| from typing import Dict, Any |
| from geopy.distance import geodesic |
|
|
# Shared Nominatim geocoding client used by the verify_* helpers in this
# module. Individual geocode() calls below also pass their own timeout.
geocoder = Nominatim(
    user_agent="indian_property_verifier",
    timeout=10,
)
|
|
# Substrings that suggest the text names a street, locality or building.
_ADDRESS_KEYWORDS = (
    'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
    'street', 'road', 'avenue', 'lane', 'colony', 'society', 'area', 'near',
    'apartment', 'flat', 'house', 'plot', 'block', 'sector', 'locality',
    'main', 'cross', 'circle', 'square', 'market',
)


def validate_address_format(address: str) -> bool:
    """Check whether *address* looks like a plausible free-text address.

    The check is intentionally lenient. After trimming surrounding
    whitespace, an address is accepted when it

    * is at least 5 characters long,
    * contains at least one ASCII letter or whitespace character, and
    * either mentions a common locality keyword (``road``, ``colony``,
      ``sector``, ...) or is at least 8 characters long.

    Args:
        address: Raw address text as supplied by the user.

    Returns:
        ``True`` when the address passes the checks above; ``False`` for
        empty, too short, digit-only or non-string input.
    """
    # Numbers, lists, None, ... can never be a valid address. Rejecting them
    # here avoids an AttributeError from ``.strip()`` further down.
    if not isinstance(address, str):
        return False

    trimmed = address.strip()
    if len(trimmed) < 5:
        return False

    # Search the trimmed text so that padding whitespace alone cannot
    # satisfy the "letter or whitespace" requirement.
    lowered = trimmed.lower()
    if re.search(r'[A-Za-z\s]+', lowered) is None:
        return False

    mentions_locality = any(keyword in lowered for keyword in _ADDRESS_KEYWORDS)
    return mentions_locality or len(trimmed) >= 8
|
|
def validate_postal_code(postal_code: str) -> bool:
    """Validate the format of an Indian postal (PIN) code.

    The value is converted to text, and surrounding whitespace and any
    embedded spaces (``"110 001"``) are removed before checking.

    Args:
        postal_code: PIN code as a string or integer.

    Returns:
        ``True`` when the normalised value consists of 5 or 6 ASCII digits
        and does not start with ``0``; ``False`` otherwise (including empty
        input).
    """
    if not postal_code:
        return False

    normalized = str(postal_code).strip().replace(' ', '')

    # Indian PIN codes never begin with 0 (the leading digit is the postal
    # zone). The previous first-digit test accepted every digit and so could
    # never reject anything. The 5-digit allowance is kept for backward
    # compatibility with existing callers.
    return re.fullmatch(r'[1-9][0-9]{4,5}', normalized) is not None
|
|
def validate_coordinates(latitude: str, longitude: str) -> bool:
    """Tell whether a latitude/longitude pair is numeric and inside the India box.

    Both values are converted with ``str()``, trimmed and parsed as floats.
    The pair is accepted when the latitude lies in ``[5.0, 40.0]`` and the
    longitude in ``[65.0, 100.0]`` (inclusive on both ends).

    Args:
        latitude: Latitude as text or number.
        longitude: Longitude as text or number.

    Returns:
        ``True`` for a parseable pair inside the bounding box, else ``False``.
    """
    min_lat, max_lat = 5.0, 40.0
    min_lng, max_lng = 65.0, 100.0

    try:
        lat_value = float(str(latitude).strip())
        lng_value = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    if not (min_lat <= lat_value <= max_lat):
        return False
    if not (min_lng <= lng_value <= max_lng):
        return False

    # Round-trip each value through a 4-decimal rendering and reject it if
    # the re-parsed number drifts from the original by more than 0.0001.
    for value in (lat_value, lng_value):
        if abs(float(f"{value:.4f}") - value) > 0.0001:
            return False

    return True
|
|
def verify_location_in_city(address: str, city: str) -> bool:
    """Check via geocoding whether *address* lies in *city*.

    Two strategies are tried in order:

    1. Several query strings built from the address parts and the city are
       geocoded. A result matches when its resolved address mentions the
       city (or a variant such as ``"greater <city>"``) and at least two
       administrative parts of the input address (parts containing ward,
       zone, mandal, municipal, corporation or greater) occur in the
       resolved address.
    2. Otherwise the city and the full address are geocoded separately and
       the address is accepted when it lies within a radius of the city
       that depends on the city's size (50 km for metros, 30 km for tier-2
       cities, 20 km otherwise).

    Args:
        address: Comma-separated free-text address.
        city: City the address is expected to be in.

    Returns:
        ``True`` when either strategy confirms the address; ``False`` when
        nothing matches, input is empty, or an unexpected error occurs.
    """
    if not address or not city:
        return False

    try:
        address = address.strip()
        city = city.strip()
        # Whitespace-only input carries no information to geocode.
        if not address or not city:
            return False

        parts = [part.strip() for part in address.split(',')]

        def first_part_with(keyword: str) -> str:
            """Return the first address part containing *keyword*, or ''."""
            return next((part for part in parts if keyword in part.lower()), '')

        # An address without a comma has no second part; indexing parts[1]
        # unconditionally used to raise IndexError and abort verification.
        second_part = parts[1] if len(parts) > 1 else ''
        municipal_part = first_part_with('municipal corporation')
        mandal_part = first_part_with('mandal')
        zone_part = first_part_with('zone')

        candidates = [
            (True, f"{address}, India"),
            (parts[0], f"{city}, {parts[0]}, India"),
            (second_part, f"{city}, {second_part}, India"),
            (municipal_part, f"{city}, {municipal_part}, India"),
            (mandal_part, f"{city}, {mandal_part}, India"),
            (parts[0], f"{parts[0]}, {city}, India"),
            (zone_part, f"{zone_part}, {city}, India"),
        ]
        # Skip queries whose variable part is blank and drop duplicates, so
        # no rate-limited request is spent on a degenerate or repeated query.
        queries = []
        for component, query in candidates:
            if component and query not in queries:
                queries.append(query)

        city_lower = city.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-'),
            f"{city_lower} city",
            f"{city_lower} district",
            f"{city_lower} municipal corporation",
            f"greater {city_lower}",
            f"greater {city_lower} municipal corporation"
        ]
        admin_keywords = ('ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater')
        key_components = [
            part.lower() for part in parts
            if any(keyword in part.lower() for keyword in admin_keywords)
        ]

        # Strategy 1 needs two matching key components, so it cannot succeed
        # (and is not attempted) when the address has fewer than two.
        if len(key_components) >= 2:
            for query in queries:
                try:
                    location = geocoder.geocode(query, timeout=10)
                    if location:
                        resolved = location.address.lower()
                        if any(variant in resolved for variant in city_variations):
                            resolved_parts = [piece.strip() for piece in resolved.split(',')]
                            matches = sum(
                                1 for component in key_components
                                if any(component in piece for piece in resolved_parts)
                            )
                            if matches >= 2:
                                return True
                except Exception as e:
                    logger.debug(f"Error in address verification: {str(e)}")
                    continue
                # Pause between successive geocoding requests.
                time.sleep(1)

        # Strategy 2: compare the geocoded address with the geocoded city.
        try:
            city_location = geocoder.geocode(f"{city}, India", timeout=10)
            if city_location:
                address_location = geocoder.geocode(f"{address}, {city}, India", timeout=10)
                if address_location:
                    city_coords = (city_location.latitude, city_location.longitude)
                    address_coords = (address_location.latitude, address_location.longitude)
                    distance = geodesic(city_coords, address_coords).kilometers

                    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
                    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]

                    # Larger cities sprawl further from their centre point.
                    if any(name in city_lower for name in metro_cities):
                        max_distance = 50
                    elif any(name in city_lower for name in tier2_cities):
                        max_distance = 30
                    else:
                        max_distance = 20

                    return distance <= max_distance
        except Exception as e:
            logger.debug(f"Error in reverse geocoding: {str(e)}")

        return False
    except Exception as e:
        logger.error(f"Error in location verification: {str(e)}")
        return False
|
|
def verify_city_in_state(city: str, state: str) -> bool:
    """Check via geocoding whether *city* is located in *state*.

    Up to three query strings combining the city and state are geocoded in
    turn. The city is confirmed as soon as one resolved address contains
    both the city name and the state name (each also tried with spaces
    removed or replaced by hyphens).

    Args:
        city: City name.
        state: State or union-territory name.

    Returns:
        ``True`` when a geocoding result mentions both names; ``False`` when
        none does, when either input is empty or not a string, or when every
        request fails.
    """
    if not isinstance(city, str) or not isinstance(state, str):
        return False

    # Surrounding whitespace would otherwise break the substring matching.
    city = city.strip()
    state = state.strip()
    if not city or not state:
        return False

    city_lower = city.lower()
    state_lower = state.lower()
    city_variations = (
        city_lower,
        city_lower.replace(' ', ''),
        city_lower.replace(' ', '-'),
    )
    state_variations = (
        state_lower,
        state_lower.replace(' ', ''),
        state_lower.replace(' ', '-'),
    )
    queries = (
        f"{city}, {state}, India",
        f"{state}, {city}, India",
        f"{city}, {state}",
    )

    for query in queries:
        try:
            location = geocoder.geocode(query, timeout=10)
            if location:
                resolved = location.address.lower()
                if any(variant in resolved for variant in city_variations) and \
                        any(variant in resolved for variant in state_variations):
                    return True
        except Exception as e:
            # Best effort: log and try the next query. KeyboardInterrupt and
            # SystemExit are deliberately not swallowed.
            logger.debug(f"Error in city/state verification: {str(e)}")
            continue
        # Pause between successive geocoding requests.
        time.sleep(1)

    return False
|
|
# Lower-case names of the Indian states and union territories recognised by
# verify_state_in_country().
_INDIAN_STATES_AND_UTS = frozenset({
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
    'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
    'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
    'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu',
    'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal',
    'andaman and nicobar islands', 'chandigarh', 'dadra and nagar haveli and daman and diu',
    'delhi', 'jammu and kashmir', 'ladakh', 'lakshadweep', 'puducherry',
})


def verify_state_in_country(state: str, country: str = "India") -> bool:
    """Check whether *state* is a known state or union territory of *country*.

    Only India is supported: the state is looked up in a fixed list of
    Indian states and union territories. Matching ignores case, surrounding
    whitespace and repeated inner whitespace.

    Args:
        state: State or union-territory name.
        country: Country to check against. Compared case-insensitively;
            an empty or ``None`` value is treated as the default ``"India"``.

    Returns:
        ``True`` when the country is India and the state is in the list;
        ``False`` otherwise, including for empty or non-string states.
    """
    if not isinstance(state, str) or not state:
        return False

    country_name = country.strip().lower() if isinstance(country, str) else ''
    # The lookup table only covers India, so any other country cannot be
    # confirmed. A blank country falls back to the documented default.
    if country_name and country_name != 'india':
        return False

    normalized_state = ' '.join(state.lower().split())
    return normalized_state in _INDIAN_STATES_AND_UTS
|
|
def verify_postal_code_in_city(postal_code: str, city: str) -> bool:
    """Check via geocoding whether *postal_code* belongs to *city*.

    Up to three query strings combining the postal code and city are
    geocoded in turn. The code is confirmed as soon as one resolved address
    contains the city name (also tried with spaces removed or replaced by
    hyphens).

    Args:
        postal_code: Postal (PIN) code as a string or integer.
        city: City name.

    Returns:
        ``True`` when a geocoding result mentions the city; ``False`` when
        none does, when either input is empty, when the city is not a
        string, or when every request fails.
    """
    if not postal_code or not isinstance(city, str):
        return False

    # Surrounding whitespace would otherwise break the substring matching.
    postal_code = str(postal_code).strip()
    city = city.strip()
    if not postal_code or not city:
        return False

    city_lower = city.lower()
    city_variations = (
        city_lower,
        city_lower.replace(' ', ''),
        city_lower.replace(' ', '-'),
    )
    queries = (
        f"{postal_code}, {city}, India",
        f"{city}, {postal_code}, India",
        f"{postal_code}, {city}",
    )

    for query in queries:
        try:
            location = geocoder.geocode(query, timeout=10)
            if location:
                resolved = location.address.lower()
                if any(variant in resolved for variant in city_variations):
                    return True
        except Exception as e:
            # Best effort: log and try the next query. KeyboardInterrupt and
            # SystemExit are deliberately not swallowed.
            logger.debug(f"Error in postal code verification: {str(e)}")
            continue
        # Pause between successive geocoding requests.
        time.sleep(1)

    return False
|
|
def verify_coordinates_in_city(latitude: str, longitude: str, city: str) -> bool:
    """Check whether the coordinates lie close enough to the geocoded *city*.

    The city is geocoded and the geodesic distance from the returned point
    to the given coordinates is compared with a radius that depends on the
    city's size: 50 km for metros, 30 km for tier-2 cities, 20 km otherwise.

    Args:
        latitude: Latitude as text or number.
        longitude: Longitude as text or number.
        city: City name.

    Returns:
        ``True`` when the point is within the allowed radius; ``False`` when
        it is not, when any input is missing or unparseable, when the city
        cannot be geocoded, or when geocoding fails.
    """
    # Any falsy value (including a numeric 0) is treated as missing.
    if not all([latitude, longitude, city]):
        return False

    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        # Unparseable coordinates: no need to contact the geocoder.
        return False

    try:
        city_lower = city.lower()
        metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
        tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                        "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                        "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]

        # Larger cities sprawl further from their centre point.
        if any(name in city_lower for name in metro_cities):
            max_distance = 50
        elif any(name in city_lower for name in tier2_cities):
            max_distance = 30
        else:
            max_distance = 20

        city_location = geocoder.geocode(f"{city}, India", timeout=10)
        if not city_location:
            return False

        city_coords = (city_location.latitude, city_location.longitude)
        distance = geodesic(city_coords, (lat, lng)).kilometers
        return distance <= max_distance
    except Exception as e:
        # Best effort: a failed lookup counts as "not verified".
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        logger.debug(f"Error in coordinates verification: {str(e)}")
        return False
|
|
def _landmark_categories(landmarks):
    """Return the landmark categories whose keywords occur in *landmarks*.

    Categories are listed once each, in the order they are first matched.
    """
    category_keywords = {
        'transport': ['station', 'metro', 'bus', 'railway', 'airport', 'terminal', 'depot', 'stand', 'stop'],
        'education': ['school', 'college', 'university', 'institute', 'academy', 'campus', 'library'],
        'healthcare': ['hospital', 'clinic', 'medical', 'health', 'diagnostic', 'pharmacy', 'dispensary'],
        'shopping': ['mall', 'market', 'shop', 'store', 'bazaar', 'complex', 'plaza', 'retail', 'outlet'],
        'entertainment': ['park', 'garden', 'theater', 'cinema', 'stadium', 'auditorium', 'playground'],
        'business': ['office', 'business', 'corporate', 'commercial', 'industrial', 'tech park', 'hub']
    }
    found = []
    for landmark in landmarks:
        for category, keywords in category_keywords.items():
            if category not in found and any(keyword in landmark for keyword in keywords):
                found.append(category)
    return found


def _city_tier(city):
    """Classify *city* as ``"metro"``, ``"tier2"`` or ``"tier3"`` by name."""
    city_lower = city.lower()
    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
    if any(name in city_lower for name in metro_cities):
        return "metro"
    if any(name in city_lower for name in tier2_cities):
        return "tier2"
    return "tier3"


def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
    """Run every location check on *data* and summarise the outcome.

    Args:
        data: Mapping that may contain ``address``, ``city``, ``state``,
            ``zip``, ``latitude``, ``longitude`` and ``nearby_landmarks``.
            Missing keys are treated as empty. ``nearby_landmarks`` may be a
            comma-separated string or a list/tuple/set of names. The mapping
            is not modified. A non-dict value is logged and treated as empty.

    Returns:
        A dict holding the eight boolean check results plus ``assessment``,
        ``completeness_score``, ``location_quality``, ``city_tier``,
        ``landmarks_analysis``, ``verification_status`` and
        ``formatted_address``. If anything raises, the error is logged and a
        fallback dict with ``assessment == 'error'`` and all checks ``False``
        is returned instead.
    """
    try:
        if not isinstance(data, dict):
            logger.warning(f"Input to analyze_location is not a dict: {type(data)}")
            data = {}

        # Read the fields with defaults instead of inserting missing keys
        # into the caller's dict.
        address = data.get('address', '')
        city = data.get('city', '')
        state = data.get('state', '')
        postal_code = data.get('zip', '')
        latitude = data.get('latitude', '')
        longitude = data.get('longitude', '')
        raw_landmarks = data.get('nearby_landmarks', '')

        verification_results = {
            'address_format_valid': validate_address_format(address),
            'address_in_city': verify_location_in_city(address, city),
            'city_in_state': verify_city_in_state(city, state),
            'state_in_country': verify_state_in_country(state),
            'postal_code_valid': validate_postal_code(postal_code),
            'postal_code_in_city': verify_postal_code_in_city(postal_code, city),
            'coordinates_valid': validate_coordinates(latitude, longitude),
            'coordinates_in_city': verify_coordinates_in_city(latitude, longitude, city)
        }

        weights = {
            'address_format_valid': 0.10,
            'address_in_city': 0.15,
            'city_in_state': 0.15,
            'state_in_country': 0.15,
            'postal_code_valid': 0.15,
            'postal_code_in_city': 0.10,
            'coordinates_valid': 0.10,
            'coordinates_in_city': 0.10
        }
        completeness_score = sum(
            weights[key] * 100 if result else 0
            for key, result in verification_results.items()
        )

        # "verified" needs both critical checks and at least one secondary.
        critical_checks = ['city_in_state', 'state_in_country']
        secondary_checks = ['address_format_valid', 'address_in_city', 'postal_code_valid',
                            'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']
        critical_passed = all(verification_results[check] for check in critical_checks)
        secondary_passed = sum(1 for check in secondary_checks if verification_results[check])
        location_quality = "verified" if critical_passed and secondary_passed >= 1 else "unverified"

        # Accept landmarks either as a collection of names or as one
        # comma-separated string.
        if isinstance(raw_landmarks, (list, tuple, set)):
            landmark_names = [str(name).strip().lower() for name in raw_landmarks]
        elif raw_landmarks:
            landmark_names = [name.strip() for name in raw_landmarks.lower().split(',')]
        else:
            landmark_names = []
        landmarks_analysis = {
            'provided': bool(raw_landmarks),
            'count': len(landmark_names),
            'types': _landmark_categories(landmark_names)
        }

        city_tier = _city_tier(city) if city else "unknown"

        if completeness_score >= 60:
            assessment = "complete"
        elif completeness_score >= 30:
            assessment = "partial"
        else:
            assessment = "minimal"

        # NOTE(review): this floor is applied after the assessment has been
        # derived, so a floored score of 40 is reported with "minimal".
        # Existing behaviour is preserved; confirm whether that is intended.
        if completeness_score == 0 and (city or state):
            completeness_score = 40

        return {
            **verification_results,
            'assessment': assessment,
            'completeness_score': completeness_score,
            'location_quality': location_quality,
            'city_tier': city_tier,
            'landmarks_analysis': landmarks_analysis,
            'verification_status': "verified" if location_quality == "verified" else "unverified",
            'formatted_address': f"{address}, {city}, {state}, India - {postal_code}"
        }
    except Exception as e:
        logger.error(f"Error analyzing location: {str(e)}")
        return {
            'assessment': 'error',
            'completeness_score': 30,
            'location_quality': 'error',
            'city_tier': 'unknown',
            'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
            'verification_status': 'error',
            'formatted_address': '',
            'address_format_valid': False,
            'address_in_city': False,
            'city_in_state': False,
            'state_in_country': False,
            'postal_code_valid': False,
            'postal_code_in_city': False,
            'coordinates_valid': False,
            'coordinates_in_city': False
        }
|
|
def calculate_location_completeness(data):
    """Score how many location fields are filled in, as a 0-100 percentage.

    Each non-empty field contributes a fixed share: address 25, city 20,
    state 15, zip 10, latitude 10, longitude 10, country 5 and
    nearby_landmarks 5.

    Args:
        data: Mapping of location fields. Missing keys count as empty.

    Returns:
        Integer percentage between 0 and 100. ``0`` is returned when *data*
        is not a mapping (for example ``None``).
    """
    # Integer percentages avoid float accumulation followed by int()
    # truncation, which can under-report a score by one point.
    percent_weights = {
        'address': 25,
        'city': 20,
        'state': 15,
        'country': 5,
        'zip': 10,
        'latitude': 10,
        'longitude': 10,
        'nearby_landmarks': 5,
    }

    lookup = getattr(data, 'get', None)
    if lookup is None:
        return 0

    # .get() tolerates absent keys instead of raising KeyError.
    return sum(weight for field, weight in percent_weights.items() if lookup(field))
|
|