Spaces:
Sleeping
Sleeping
"""
PlaceNameExtractor - shared module that extracts place names from blog/YouTube text.

Extraction priority:
1. Place-name dictionary matching (names from trend_spots + story_spots)
2. YouTube-specific patterns (emoji marker, @, numbered lists)
3. Suffix pattern matching (cafe, restaurant, beach, etc.)
"""
| import re | |
| import logging | |
| from supabase import Client | |
| from db import get_supabase | |
| from trend_engine.spot_matcher import is_system_name | |
| logger = logging.getLogger(__name__) | |
# Suffix-based place-name patterns, used to discover places that are not in
# the known-place dictionary. Each pattern matches a run of two or more
# word characters immediately followed by one of a fixed set of category
# suffixes; there are no capture groups, so ``findall`` yields whole matches.
# NOTE(review): the non-ASCII text in these literals looks mis-encoded
# (mojibake); it is reproduced unchanged - confirm against the upstream file.
SUFFIX_PATTERNS = [
    re.compile(suffix_regex)
    for suffix_regex in (
        r"[\w๊ฐ-ํฃ]{2,}(?:์นดํ|์ปคํผ)",
        r"[\w๊ฐ-ํฃ]{2,}(?:์๋น|๋ฐฅ์ง|๊ตญ์)",
        r"[\w๊ฐ-ํฃ]{2,}(?:ํด๋ณ|ํด์|ํฌ๊ตฌ)",
        r"[\w๊ฐ-ํฃ]{2,}(?:์ค๋ฆ|๊ณต์|์ฒ)",
        r"[\w๊ฐ-ํฃ]{2,}(?:์ ๋ง๋|๋ทฐ)",
        r"[\w๊ฐ-ํฃ]{2,}(?:๋ฒ ์ด์ปค๋ฆฌ|๋ธ๋ฐ์น|๋์ ํธ)",
    )
]
# Place-introduction patterns commonly seen in YouTube video descriptions.
# Every pattern has exactly one capture group holding the candidate name:
#   0. text following an emoji marker
#   1. text following a "visited place"-style label
#   2. the token following a numbered-list prefix such as "1. "
#   3. the token following "@"
# NOTE(review): ``\s`` inside the character class of patterns 0 and 1 also
# matches newlines, so a capture may span lines - confirm this is intended.
# NOTE(review): the non-ASCII text in these literals looks mis-encoded
# (mojibake); it is reproduced unchanged - confirm against the upstream file.
YOUTUBE_PLACE_PATTERNS = [
    re.compile(youtube_regex)
    for youtube_regex in (
        r"๐\s*([\w๊ฐ-ํฃ\s]{2,20})",
        r"(?:๋ฐฉ๋ฌธ|๋ค๋ ์จ|๋ค๋ฅธ)\s*(?:๊ณณ|์ฅ์)[:\s]*([\w๊ฐ-ํฃ\s]{2,20})",
        r"(?:\d+\.\s*)([\w๊ฐ-ํฃ]{2,15})",
        r"@\s*([\w๊ฐ-ํฃ]{2,15})",
    )
]
# Overly generic words that must not be reported as place names.
# NOTE(review): the entries look mis-encoded (mojibake); they are reproduced
# unchanged - confirm against the upstream file. Group headings below are
# translated from the original comments.
STOPWORDS = frozenset(
    (
        # demonstrative + category
        "์ด์นดํ", "๊ทธ์นดํ", "์ ์นดํ",
        "์ด์๋น", "๊ทธ์๋น",
        "์ดํด๋ณ", "๊ทธํด๋ณ",
        "์ด์ค๋ฆ", "๊ทธ์ค๋ฆ",
        # adjective + category
        "์ข์์นดํ", "์์์นดํ",
        "๋ง์๋์๋น", "์ข์์๋น",
        # location / modifier + category
        "๊ทผ์ฒ์นดํ", "์ฃผ๋ณ์นดํ",
        "๋๋ค์นดํ", "์ ๋ช ์นดํ",
        "๊ฐ์ฑ์นดํ", "๋ทฐ์นดํ", "์ค์ ๋ทฐ์นดํ",
        "์ ์์นดํ", "ํ๋ฆผ์นดํ", "ํ์ฌ์นดํ",
        "์ ์ฃผ์นดํ", "์ ์ฃผ์๋น",
        "์ ์ฃผํด๋ณ", "์ ์ฃผ์ค๋ฆ",
        "์์์๋น", "์ ์ฃผ๋ง์ง", "์ ์๋ง์ง",
        # bare category nouns (not a place name when used alone)
        "์นดํ", "์ปคํผ", "์๋น",
        "๋ง์ง", "๋ฐฅ์ง", "๊ตญ์",
        "ํด๋ณ", "ํด์", "ํฌ๊ตฌ",
        "์ค๋ฆ", "๊ณต์", "์ฒ",
        "์ ๋ง๋", "๋ทฐ", "๋ฒ ์ด์ปค๋ฆฌ",
        "๋ธ๋ฐ์น", "๋์ ํธ",
        "์ฐ์ฑ ", "์ฐ์ฑ ๋ก", "๋๋ผ์ด๋ธ",
        # region + generic expression
        "์ ์ฃผ์ฌํ", "์ ์์ฌํ",
        "์ ์ฃผ๋", "์ ์", "์ ์๋ฆฌ",
    )
)
# Address-like patterns; preprocess_text() deletes every match before
# place-name extraction runs.
# NOTE(review): the non-ASCII text in these literals looks mis-encoded
# (mojibake); it is reproduced unchanged - confirm against the upstream file.
ADDRESS_PATTERNS = list(
    map(
        re.compile,
        (
            r"์ ์ฃผ\s*(?:ํน๋ณ์์น)?๋?\s*์ ์ฃผ์\s*์ ์์\s*[\w๊ฐ-ํฃ\d\-]+",
            r"์ ์์\s*[\w๊ฐ-ํฃ]+๋ฆฌ\s*[\d\-]+",
            r"์ ์ฃผ์\s*[\w๊ฐ-ํฃ]+(?:์|๋ฉด)\s*[\w๊ฐ-ํฃ]+(?:๋ฆฌ|๋ก|๊ธธ)\s*[\d\-]*",
        ),
    )
)
def preprocess_text(text: str) -> str:
    """Return ``text`` with HTML tags and address-like fragments removed.

    Tags (anything of the form ``<...>``) are deleted first, then every
    match of each pattern in ``ADDRESS_PATTERNS`` is deleted in list order.
    Matches are replaced by the empty string, so surrounding text is joined.
    """
    cleaned = re.sub(r"<[^>]+>", "", text)
    for address_re in ADDRESS_PATTERNS:
        cleaned = address_re.sub("", cleaned)
    return cleaned
class PlaceNameExtractor:
    """Pipeline that extracts place names from free text.

    On construction the instance builds ``known_places`` (name -> spot id)
    from the ``trend_spots`` and ``story_spots`` tables. ``extract`` then
    combines dictionary lookups with the module-level regex pattern lists.
    """

    def __init__(self, supabase: Client | None = None):
        """Store the client and eagerly load the place dictionary.

        Args:
            supabase: Client used for the table queries. When ``None`` the
                client returned by ``get_supabase()`` is used instead.
        """
        # Explicit None check so only a missing client triggers the fallback.
        self.supabase = supabase if supabase is not None else get_supabase()
        # {place name: spot_id} - used for exact dictionary matching.
        self.known_places: dict[str, str] = {}
        self._load_place_dictionary()

    # ------------------------------------------------------------------
    # Dictionary construction
    # ------------------------------------------------------------------
    def _load_place_dictionary(self) -> None:
        """Populate ``known_places`` from ``trend_spots`` and ``story_spots``.

        Each table is loaded on a best-effort basis: any exception is logged
        as a warning and loading continues with the next step. Names for
        which ``is_system_name`` is true are skipped.
        """
        # 1) trend_spots
        try:
            resp = self.supabase.table("trend_spots").select("id, name").execute()
            for row in resp.data or []:
                name = row["name"]
                # Skip NULL/empty names (same guard as story_spots below) so a
                # single bad row cannot abort loading the rest of the table.
                if name and not is_system_name(name):
                    self._register_name(name, row["id"])
        except Exception as e:
            logger.warning("trend_spots ๋ก๋ ์คํจ (ํ ์ด๋ธ ๋ฏธ์กด์ฌ ๊ฐ๋ฅ): %s", e)

        # 2) story_spots (names plus optional aliases)
        try:
            resp = (
                self.supabase.table("story_spots")
                .select("id, name, aliases")
                .execute()
            )
            for row in resp.data or []:
                name = row["name"]
                if name and not is_system_name(name):
                    self._register_name(name, row["id"])
                for alias in row.get("aliases") or []:
                    if alias and not is_system_name(alias):
                        self._register_name(alias, row["id"])
        except Exception as e:
            logger.warning("story_spots ๋ก๋ ์คํจ: %s", e)

        logger.info("์ฅ์๋ช ์ฌ์ ๊ตฌ์ถ ์๋ฃ: %d๊ฑด", len(self.known_places))

    def _register_name(self, name: str, spot_id: str) -> None:
        """Register ``name`` and its space-free variant in ``known_places``.

        Names shorter than two characters after stripping, and names listed
        in ``STOPWORDS``, are ignored. A later registration of the same name
        overwrites the earlier spot id.
        """
        name = name.strip()
        if len(name) < 2:
            return
        if name in STOPWORDS:
            return
        self.known_places[name] = spot_id

        # Also index the variant without spaces so both spellings match.
        no_space = name.replace(" ", "")
        if no_space != name and no_space not in STOPWORDS:
            self.known_places[no_space] = spot_id

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------
    def extract(self, text: str) -> list[dict]:
        """Extract place names from ``text``.

        Sources are applied in priority order and each name is reported at
        most once, by the first source that finds it:

        1. ``known_places`` dictionary (method ``"dictionary"``)
        2. ``YOUTUBE_PLACE_PATTERNS`` (method ``"youtube_pattern"``)
        3. ``SUFFIX_PATTERNS`` (method ``"pattern"``)

        Returns:
            [{"name": str, "spot_id": str|None,
              "method": "dictionary"|"pattern"|"youtube_pattern"}, ...]
            in discovery order; an empty list when ``text`` is empty.
        """
        if not text:
            return []

        # Preprocessing: strip HTML tags and address patterns.
        text = preprocess_text(text)

        found: list[dict] = []
        found_names: set[str] = set()

        def add(name: str, spot_id: str | None, method: str) -> None:
            found.append({"name": name, "spot_id": spot_id, "method": method})
            found_names.add(name)

        # Priority 1: dictionary matching, longest names first. Matched spans
        # are masked in a working copy so that a shorter known name occurring
        # only inside a longer matched name is not reported as well.
        remaining = text
        for name in sorted(self.known_places, key=len, reverse=True):
            if name in STOPWORDS or name not in remaining:
                continue
            add(name, self.known_places[name], "dictionary")
            # Mask with a sentinel (not "") so neighbours are not joined.
            remaining = remaining.replace(name, "\x00")

        # Priority 2: YouTube-specific patterns.
        for pattern in YOUTUBE_PLACE_PATTERNS:
            for raw in pattern.findall(text):
                candidate = raw.strip()
                # The capture class admits whitespace, so stripping can leave
                # fewer than the two characters required of a place name.
                if len(candidate) < 2:
                    continue
                if candidate in found_names or candidate in STOPWORDS:
                    continue
                add(candidate, None, "youtube_pattern")

        # Priority 3: suffix patterns.
        for pattern in SUFFIX_PATTERNS:
            for candidate in pattern.findall(text):
                if candidate in found_names or candidate in STOPWORDS:
                    continue
                add(candidate, None, "pattern")

        return found