pythonprincess committed on
Commit
45d5985
·
verified ·
1 Parent(s): 50ba4f8

Delete location_utils.py

Browse files
Files changed (1) hide show
  1. location_utils.py +0 -717
location_utils.py DELETED
@@ -1,717 +0,0 @@
1
- # app/location_utils.py
2
- """
3
- 🗺️ Penny's Location Intelligence System
4
- Handles city detection, tenant routing, and geographic data loading.
5
-
6
- MISSION: Connect residents to the right local resources, regardless of how
7
- they describe their location — whether it's "Atlanta", "ATL", "30303", or "near me".
8
-
9
- CURRENT: Rule-based city matching with 6 supported cities
10
- FUTURE: Will add ZIP→city mapping, geocoding API, and user location preferences
11
- """
12
-
13
- import re
14
- import json
15
- import os
16
- import logging
17
- from typing import Dict, Any, Optional, List, Tuple
18
- from pathlib import Path
19
- from dataclasses import dataclass
20
- from enum import Enum
21
-
22
# --- LOGGING SETUP (Azure-friendly) ---
# Module-level logger; handler and level configuration is left to the host app.
logger = logging.getLogger(__name__)

# --- BASE PATHS (OS-agnostic for Azure/Windows/Linux) ---
# BASE_DIR is the directory one level above the folder containing this file.
BASE_DIR = Path(__file__).parent.parent.resolve()
DATA_PATH = BASE_DIR / "data"
EVENTS_PATH = DATA_PATH / "events"
RESOURCES_PATH = DATA_PATH / "resources"

# Ensure critical directories exist (Azure deployment safety).
# Creation is best-effort: on a read-only or permission-restricted filesystem
# an unguarded mkdir would make importing this module fail outright. Missing
# directories are instead reported here and surface later through the
# existence checks performed by the startup/validation helpers.
for _required_dir in (DATA_PATH, EVENTS_PATH, RESOURCES_PATH):
    try:
        _required_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.warning(f"Could not create data directory {_required_dir}: {exc}")
34
-
35
-
36
- # ============================================================
37
- # CITY REGISTRY (Penny's Supported Cities)
38
- # ============================================================
39
-
40
@dataclass
class CityInfo:
    """
    Metadata record describing one city Penny supports.

    Aliases are normalized (lower-cased, surrounding whitespace removed)
    immediately after construction, so they can be compared directly
    against lower-cased user input.
    """
    # Standard format: cityname_state (e.g., "atlanta_ga")
    tenant_id: str
    # Display name: "Atlanta, GA"
    full_name: str
    # Two-letter state code
    state: str
    # Common variations users might say
    aliases: List[str]
    # IANA timezone (e.g., "America/New_York")
    timezone: str
    # Optional coordinates (for weather API fallback)
    lat: Optional[float] = None
    lon: Optional[float] = None

    def __post_init__(self):
        # Replace the supplied alias list with a normalized copy.
        cleaned_aliases = [entry.lower().strip() for entry in self.aliases]
        self.aliases = cleaned_aliases
57
-
58
-
59
class SupportedCities:
    """
    🏙️ Registry of the cities Penny supports.

    Each city is a class-level CityInfo constant. A city only becomes
    visible to lookups once it is also listed in get_all_cities().
    """

    ATLANTA = CityInfo(
        tenant_id="atlanta_ga",
        full_name="Atlanta, GA",
        state="GA",
        aliases=[
            "atlanta", "atl", "atlanta ga", "atlanta, ga",
            "city of atlanta", "hotlanta", "the atl",
        ],
        timezone="America/New_York",
        lat=33.7490,
        lon=-84.3880,
    )

    BIRMINGHAM = CityInfo(
        tenant_id="birmingham_al",
        full_name="Birmingham, AL",
        state="AL",
        aliases=[
            "birmingham", "birmingham al", "birmingham, al",
            "city of birmingham", "bham",
        ],
        timezone="America/Chicago",
        lat=33.5207,
        lon=-86.8025,
    )

    CHESTERFIELD = CityInfo(
        tenant_id="chesterfield_va",
        full_name="Chesterfield, VA",
        state="VA",
        aliases=[
            "chesterfield", "chesterfield va", "chesterfield, va",
            "chesterfield county",
        ],
        timezone="America/New_York",
        lat=37.3771,
        lon=-77.5047,
    )

    EL_PASO = CityInfo(
        tenant_id="el_paso_tx",
        full_name="El Paso, TX",
        state="TX",
        aliases=[
            "el paso", "el paso tx", "el paso, tx",
            "city of el paso", "elpaso",
        ],
        timezone="America/Denver",
        lat=31.7619,
        lon=-106.4850,
    )

    PROVIDENCE = CityInfo(
        tenant_id="providence_ri",
        full_name="Providence, RI",
        state="RI",
        aliases=[
            "providence", "providence ri", "providence, ri",
            "city of providence", "pvd",
        ],
        timezone="America/New_York",
        lat=41.8240,
        lon=-71.4128,
    )

    SEATTLE = CityInfo(
        tenant_id="seattle_wa",
        full_name="Seattle, WA",
        state="WA",
        aliases=[
            "seattle", "seattle wa", "seattle, wa",
            "city of seattle", "emerald city", "sea",
        ],
        timezone="America/Los_Angeles",
        lat=47.6062,
        lon=-122.3321,
    )

    @classmethod
    def get_all_cities(cls) -> List[CityInfo]:
        """Return every registered city, in declaration order."""
        registry = [cls.ATLANTA, cls.BIRMINGHAM, cls.CHESTERFIELD]
        registry += [cls.EL_PASO, cls.PROVIDENCE, cls.SEATTLE]
        return registry

    @classmethod
    def get_city_by_tenant_id(cls, tenant_id: str) -> Optional[CityInfo]:
        """Return the city whose tenant_id equals the argument exactly, else None."""
        candidates = (
            entry for entry in cls.get_all_cities() if entry.tenant_id == tenant_id
        )
        return next(candidates, None)
162
-
163
-
164
- # ============================================================
165
- # BUILD DYNAMIC CITY PATTERNS (from CityInfo registry)
166
- # ============================================================
167
-
168
def _build_city_patterns() -> Dict[str, str]:
    """
    Flatten the city registry into an alias -> tenant_id lookup table.

    Kept as a plain dictionary so older pattern-matching code keeps working.
    If two cities declare the same alias, the city listed later wins.
    """
    return {
        alias: city.tenant_id
        for city in SupportedCities.get_all_cities()
        for alias in city.aliases
    }


# Alias lookup table, generated once at import time from the city registry.
REAL_CITY_PATTERNS = _build_city_patterns()
182
-
183
-
184
- # ============================================================
185
- # LOCATION DETECTION ENUMS
186
- # ============================================================
187
-
188
class LocationStatus(str, Enum):
    """
    Outcome codes produced by location detection.

    Members are also plain strings, so they compare equal to their values.
    """
    # Valid city matched
    FOUND = "found"
    # ZIP code found (needs mapping)
    ZIP_DETECTED = "zip_detected"
    # "near me" detected
    USER_LOCATION_NEEDED = "user_location_needed"
    # No match found
    UNKNOWN = "unknown"
    # Multiple possible matches
    AMBIGUOUS = "ambiguous"
197
-
198
-
199
@dataclass
class LocationMatch:
    """
    Structured result from location detection.
    Includes confidence and matched patterns for debugging.
    """
    # Outcome of the detection attempt
    status: LocationStatus
    # Tenant identifier of the matched city, when one was resolved
    tenant_id: Optional[str] = None
    # Registry record for the matched city, when one was resolved
    city_info: Optional[CityInfo] = None
    # 0.0 - 1.0
    confidence: float = 0.0
    # The alias / marker that produced the match (for debugging)
    matched_pattern: Optional[str] = None
    # Other candidate tenant ids. None is accepted as input and replaced by a
    # fresh empty list in __post_init__, so instances never share one list.
    alternatives: Optional[List[str]] = None

    def __post_init__(self):
        # Normalize the "not provided" sentinel into a per-instance list.
        if self.alternatives is None:
            self.alternatives = []
215
-
216
-
217
- # ============================================================
218
- # ZIP CODE PATTERNS (for future expansion)
219
- # ============================================================
220
-
221
# Matches 12345 or 12345-6789
ZIP_PATTERN = re.compile(r"\b\d{5}(?:-\d{4})?\b")

# Future ZIP → City mapping (placeholder).
# Declared grouped by tenant and flattened into a 5-digit-ZIP -> tenant_id table.
ZIP_TO_CITY_MAP: Dict[str, str] = {
    zip_code: tenant
    for tenant, zip_codes in (
        # Atlanta metro
        ("atlanta_ga", ("30303", "30318", "30309")),
        # Birmingham metro
        ("birmingham_al", ("35203", "35233")),
        # Chesterfield County
        ("chesterfield_va", ("23832", "23838")),
        # El Paso
        ("el_paso_tx", ("79901", "79936")),
        # Providence
        ("providence_ri", ("02903", "02904")),
        # Seattle metro
        ("seattle_wa", ("98101", "98104", "98122")),
    )
    for zip_code in zip_codes
}
251
-
252
-
253
- # ============================================================
254
- # MAIN CITY EXTRACTION LOGIC (Enhanced)
255
- # ============================================================
256
-
257
def extract_city_name(text: str) -> str:
    """
    🎯 Backward-compatible location extraction that returns a plain string.

    Delegates to extract_location_detailed() and collapses its result.

    Args:
        text: User's location input (e.g., "Atlanta", "30303", "near me")

    Returns:
        The detected tenant_id (e.g., "atlanta_ga") when one is set on the
        detailed result; otherwise the status value of that result, e.g.:
        - "zip_detected" (ZIP code found, needs mapping)
        - "user_location_needed" ("near me" detected)
        - "unknown" (no match)
    """
    detection = extract_location_detailed(text)
    if detection.tenant_id:
        return detection.tenant_id
    return detection.status.value
275
-
276
-
277
def extract_location_detailed(text: str) -> LocationMatch:
    """
    🧠 Location extraction with confidence scoring.

    Detection runs in a fixed order and returns at the first step that hits:

    1. "near me"-style phrases  -> USER_LOCATION_NEEDED
    2. A ZIP code in the text   -> FOUND when its 5-digit part is in
       ZIP_TO_CITY_MAP, otherwise ZIP_DETECTED (only the first ZIP is used)
    3. City aliases             -> FOUND, or AMBIGUOUS when aliases of more
       than one city appear; the longest alias is taken as the best match

    Args:
        text: User's location input

    Returns:
        LocationMatch object with full detection details. UNKNOWN is returned
        for empty/blank input and when nothing matches.
    """
    if not text or not text.strip():
        logger.warning("Empty text provided to location extraction")
        return LocationMatch(
            status=LocationStatus.UNKNOWN,
            confidence=0.0
        )

    lowered = text.lower().strip()
    logger.debug(f"Extracting location from: '{lowered}'")

    # --- STEP 1: Check for "near me" / location services needed ---
    near_me_phrases = (
        "near me", "my area", "my city", "my neighborhood",
        "where i am", "current location", "my location",
        "around here", "locally", "in my town",
    )

    if any(phrase in lowered for phrase in near_me_phrases):
        logger.info("User location services required")
        return LocationMatch(
            status=LocationStatus.USER_LOCATION_NEEDED,
            confidence=1.0,
            matched_pattern="near_me_detected"
        )

    # --- STEP 2: Check for ZIP codes ---
    zip_found = ZIP_PATTERN.search(text)
    if zip_found:
        zip_code = zip_found.group(0)  # First ZIP if multiple

        # The mapping is keyed by 5-digit ZIPs, so a ZIP+4 such as
        # "30303-1234" must be looked up by its first five digits.
        tenant_id = ZIP_TO_CITY_MAP.get(zip_code[:5])
        if tenant_id is not None:
            city_info = SupportedCities.get_city_by_tenant_id(tenant_id)
            logger.info(f"ZIP {zip_code} mapped to {tenant_id}")
            return LocationMatch(
                status=LocationStatus.FOUND,
                tenant_id=tenant_id,
                city_info=city_info,
                confidence=0.95,
                matched_pattern=f"zip:{zip_code}"
            )

        logger.info(f"ZIP code detected but not mapped: {zip_code}")
        return LocationMatch(
            status=LocationStatus.ZIP_DETECTED,
            confidence=0.5,
            matched_pattern=f"zip:{zip_code}"
        )

    # --- STEP 3: Match against city patterns ---
    # Aliases must appear as whole words/phrases. Plain substring matching
    # produced false positives such as "sea" inside "research" or "atl"
    # inside "atlas". Lookarounds are used instead of \b so aliases that
    # contain punctuation (e.g. "atlanta, ga") are still delimited correctly.
    matches = [
        (alias, alias_tenant_id)
        for alias, alias_tenant_id in REAL_CITY_PATTERNS.items()
        if re.search(rf"(?<!\w){re.escape(alias)}(?!\w)", lowered)
    ]

    if not matches:
        logger.info(f"No city match found for: '{lowered}'")
        return LocationMatch(
            status=LocationStatus.UNKNOWN,
            confidence=0.0
        )

    # If multiple matches, pick the longest pattern (most specific)
    # Example: "atlanta" vs "city of atlanta" — pick the longer one
    matches.sort(key=lambda pair: len(pair[0]), reverse=True)
    best_pattern, best_tenant_id = matches[0]

    city_info = SupportedCities.get_city_by_tenant_id(best_tenant_id)

    # Confidence is the share of the input covered by the matched alias
    # (lowered is non-empty here because blank input returned early).
    confidence = min(len(best_pattern) / len(lowered), 1.0)

    result = LocationMatch(
        status=LocationStatus.FOUND,
        tenant_id=best_tenant_id,
        city_info=city_info,
        confidence=confidence,
        matched_pattern=best_pattern
    )

    # Check for ambiguity (multiple different cities matched)
    unique_tenant_ids = set(tid for _, tid in matches)
    if len(unique_tenant_ids) > 1:
        result.status = LocationStatus.AMBIGUOUS
        # Several aliases of one city can match at once; list each
        # alternative tenant only once, keeping best-first order.
        result.alternatives = list(dict.fromkeys(
            tid for _, tid in matches if tid != best_tenant_id
        ))
        logger.warning(f"Ambiguous location match: {unique_tenant_ids}")

    logger.info(f"Location matched: {best_tenant_id} (confidence: {confidence:.2f})")
    return result
381
-
382
-
383
- # ============================================================
384
- # DATA LOADING UTILITIES (Enhanced with error handling)
385
- # ============================================================
386
-
387
def load_city_data(directory: Path, tenant_id: str) -> Dict[str, Any]:
    """
    🗄️ Read and parse `<directory>/<tenant_id>.json`.

    Args:
        directory: Base path (EVENTS_PATH or RESOURCES_PATH)
        tenant_id: City identifier (e.g., 'atlanta_ga')

    Returns:
        Parsed JSON content as dictionary

    Raises:
        FileNotFoundError: If the JSON file doesn't exist
        json.JSONDecodeError: If the file is malformed

    Any other error raised while reading is logged and re-raised unchanged.
    """
    file_path = directory / f"{tenant_id}.json"

    file_is_missing = not file_path.exists()
    if file_is_missing:
        logger.error(f"Data file not found: {file_path}")
        raise FileNotFoundError(f"Data file not found: {file_path}")

    try:
        with file_path.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
        logger.debug(f"Loaded data from {file_path}")
        return payload
    except json.JSONDecodeError as e:
        # Malformed JSON: log the parser's message, then propagate.
        logger.error(f"Invalid JSON in {file_path}: {e}")
        raise
    except Exception as e:
        # Anything else (permissions, decoding, ...): log with traceback.
        logger.error(f"Error reading {file_path}: {e}", exc_info=True)
        raise
420
-
421
-
422
def load_city_events(tenant_id: str) -> Dict[str, Any]:
    """
    📅 Load the event data file for a city from EVENTS_PATH.

    Args:
        tenant_id: City identifier (e.g., 'atlanta_ga')

    Returns:
        Whatever load_city_data() parses from the city's events file.
        The expected layout (not validated here) is:

        {
            "city": "Atlanta, GA",
            "events": [
                {"name": "Jazz Festival", "category": "outdoor", ...},
                ...
            ]
        }
    """
    logger.info(f"Loading events for {tenant_id}")
    events_payload = load_city_data(EVENTS_PATH, tenant_id)
    return events_payload
443
-
444
-
445
def load_city_resources(tenant_id: str) -> Dict[str, Any]:
    """
    🏛️ Load the civic resource data file for a city from RESOURCES_PATH.

    Args:
        tenant_id: City identifier (e.g., 'atlanta_ga')

    Returns:
        Whatever load_city_data() parses from the city's resources file.
        The expected layout (not validated here) is:

        {
            "city": "Atlanta, GA",
            "resources": {
                "shelters": [...],
                "food_banks": [...],
                "libraries": [...]
            }
        }
    """
    logger.info(f"Loading resources for {tenant_id}")
    resources_payload = load_city_data(RESOURCES_PATH, tenant_id)
    return resources_payload
467
-
468
-
469
- # ============================================================
470
- # UTILITY FUNCTIONS
471
- # ============================================================
472
-
473
def normalize_location_name(text: str) -> str:
    """
    🧹 Collapse a location name into a compact lower-case key.

    Lower-cases the input and removes whitespace, hyphens, commas and
    periods. Other characters are kept as they are.

    Example:
        "El Paso, TX" → "elpasotx"
        "Chesterfield County" → "chesterfieldcounty"

    Returns:
        The normalized string, or "" for empty/None input.
    """
    if text:
        lowered = text.lower().strip()
        # Strip every run of whitespace, hyphen, comma or period.
        return re.sub(r"[\s\-,\.]+", "", lowered)
    return ""
488
-
489
-
490
def get_city_coordinates(tenant_id: str) -> Optional[Dict[str, float]]:
    """
    🗺️ Look up a city's coordinates in the registry.
    Useful for weather API calls.

    Args:
        tenant_id: City identifier

    Returns:
        Dictionary with "lat" and "lon" keys, or None when the tenant is
        not registered or either coordinate is unset.

    Note: The result is a dict, not a tuple. Typical use:
        coords = get_city_coordinates(tenant_id); lat, lon = coords["lat"], coords["lon"]
    """
    city = SupportedCities.get_city_by_tenant_id(tenant_id)
    if not city:
        return None
    if city.lat is None or city.lon is None:
        return None
    return {"lat": city.lat, "lon": city.lon}
508
-
509
-
510
def get_city_info(tenant_id: str) -> Optional[Dict[str, Any]]:
    """
    🏙️ Returns city information dictionary.

    Args:
        tenant_id: City identifier

    Returns:
        Dictionary with keys "tenant_id", "full_name", "state", "timezone",
        "lat", "lon" and "aliases", or None if the tenant is not registered.
        "aliases" is a copy, so callers may modify the returned dictionary
        without affecting the shared city registry.
    """
    city_info = SupportedCities.get_city_by_tenant_id(tenant_id)
    if not city_info:
        return None

    return {
        "tenant_id": city_info.tenant_id,
        "full_name": city_info.full_name,
        "state": city_info.state,
        "timezone": city_info.timezone,
        "lat": city_info.lat,
        "lon": city_info.lon,
        # Copy: handing out the registry's own list would let a caller's
        # in-place edit silently change alias matching for everyone.
        "aliases": list(city_info.aliases),
    }
532
-
533
-
534
def detect_location_from_text(text: str) -> Dict[str, Any]:
    """
    🔍 Dictionary-shaped wrapper around extract_location_detailed().

    Args:
        text: User input text

    Returns:
        Dictionary with keys:
        - found: bool (True only when the status is FOUND)
        - tenant_id: str or None
        - city_info: dict with tenant_id / full_name / state, or None
        - confidence: float (0.0-1.0)
        - status: str (the LocationStatus value)
    """
    detection = extract_location_detailed(text)
    matched_city = detection.city_info

    if matched_city:
        city_summary = {
            "tenant_id": matched_city.tenant_id,
            "full_name": matched_city.full_name,
            "state": matched_city.state
        }
    else:
        city_summary = None

    return {
        "found": detection.status == LocationStatus.FOUND,
        "tenant_id": detection.tenant_id,
        "city_info": city_summary,
        "confidence": detection.confidence,
        "status": detection.status.value
    }
561
-
562
-
563
def validate_coordinates(lat: float, lon: float) -> Tuple[bool, Optional[str]]:
    """
    ✅ Validates latitude and longitude coordinates.

    Args:
        lat: Latitude (-90 to 90)
        lon: Longitude (-180 to 180)

    Returns:
        Tuple of (is_valid, error_message)
        - is_valid: True if coordinates are valid
        - error_message: None if valid, error description if invalid

    Booleans are rejected as non-numeric. NaN fails the range comparison
    and is therefore reported as out of range.
    """
    for value in (lat, lon):
        # bool is a subclass of int, so it must be excluded explicitly;
        # otherwise True/False would pass as the coordinates 1/0.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False, "Coordinates must be numeric values"

    if not (-90 <= lat <= 90):
        return False, f"Latitude must be between -90 and 90, got {lat}"

    if not (-180 <= lon <= 180):
        return False, f"Longitude must be between -180 and 180, got {lon}"

    return True, None
586
-
587
-
588
def get_city_timezone(tenant_id: str) -> Optional[str]:
    """
    🕐 Look up the timezone recorded for a city in the registry.
    Useful for time-sensitive features (events, business hours).

    Args:
        tenant_id: City identifier

    Returns:
        IANA timezone string (e.g., "America/New_York"), or None when the
        tenant is not registered.
    """
    city = SupportedCities.get_city_by_tenant_id(tenant_id)
    if city:
        return city.timezone
    return None
601
-
602
-
603
def validate_tenant_id(tenant_id: str) -> bool:
    """
    ✅ Report whether a tenant_id belongs to a registered city.

    Args:
        tenant_id: City identifier to validate

    Returns:
        True when the registry lookup finds a city, False otherwise
    """
    return SupportedCities.get_city_by_tenant_id(tenant_id) is not None
615
-
616
-
617
def get_all_supported_cities() -> List[Dict[str, str]]:
    """
    📋 Summarize every registered city for API responses.

    Returns:
        One dictionary per city with its tenant_id, display name and state,
        in registry order.

    Example:
        [
            {"tenant_id": "atlanta_ga", "name": "Atlanta, GA", "state": "GA"},
            {"tenant_id": "seattle_wa", "name": "Seattle, WA", "state": "WA"},
            ...
        ]
    """
    return [
        dict(tenant_id=entry.tenant_id, name=entry.full_name, state=entry.state)
        for entry in SupportedCities.get_all_cities()
    ]
639
-
640
-
641
- # ============================================================
642
- # DATA VALIDATION (For startup checks)
643
- # ============================================================
644
-
645
def validate_city_data_files() -> Dict[str, Dict[str, bool]]:
    """
    🧪 Check which per-city data files are present on disk.
    Useful for startup checks and deployment verification.

    For every registered city this looks for `<tenant_id>.json` under
    EVENTS_PATH and RESOURCES_PATH and logs a warning for each missing file.

    Returns:
        Dictionary mapping tenant_id to file existence status

    Example:
        {
            "atlanta_ga": {"events": True, "resources": True},
            "seattle_wa": {"events": False, "resources": True}
        }
    """
    report = {}

    for city in SupportedCities.get_all_cities():
        tenant_id = city.tenant_id
        file_name = f"{tenant_id}.json"
        has_events = (EVENTS_PATH / file_name).exists()
        has_resources = (RESOURCES_PATH / file_name).exists()

        report[tenant_id] = {"events": has_events, "resources": has_resources}

        if not has_events:
            logger.warning(f"Missing events file for {tenant_id}")
        if not has_resources:
            logger.warning(f"Missing resources file for {tenant_id}")

    return report
677
-
678
-
679
- # ============================================================
680
- # INITIALIZATION CHECK (Call on app startup)
681
- # ============================================================
682
-
683
def initialize_location_system() -> bool:
    """
    🚀 Startup check for the location system.
    Should be called during app startup.

    Logs how many registered cities have event and resource files.
    Missing per-city files only produce a warning.

    Returns:
        False when the data directory (DATA_PATH) does not exist,
        True otherwise — even if some city data files are missing.
    """
    logger.info("🗺️ Initializing Penny's location system...")

    # Check directories exist
    data_dir_present = DATA_PATH.exists()
    if not data_dir_present:
        logger.error(f"Data directory not found: {DATA_PATH}")
        return False

    # Validate city data files
    validation = validate_city_data_files()
    statuses = list(validation.values())

    total_cities = len(SupportedCities.get_all_cities())
    cities_with_events = len([s for s in statuses if s["events"]])
    cities_with_resources = len([s for s in statuses if s["resources"]])

    logger.info(f"✅ {total_cities} cities registered")
    logger.info(f"✅ {cities_with_events}/{total_cities} cities have event data")
    logger.info(f"✅ {cities_with_resources}/{total_cities} cities have resource data")

    # Warn about missing data but don't fail
    missing_data = [
        tid
        for tid, status in validation.items()
        if not (status["events"] and status["resources"])
    ]

    if missing_data:
        logger.warning(f"⚠️ Incomplete data for cities: {missing_data}")

    logger.info("🗺️ Location system initialized successfully")
    return True