# samchun-gemini / trend_engine / place_extractor.py
# JHyeok5's picture
# Upload folder using huggingface_hub
# 69458c9 verified
"""
PlaceNameExtractor — shared module that extracts place names from blog/YouTube text.

Extraction priority:
1. Place-name dictionary matching (names from trend_spots + story_spots)
2. YouTube-specific patterns (location-pin emoji, @, numbered lists)
3. Suffix pattern matching (cafe, restaurant, beach, etc.)
"""
import re
import logging
from supabase import Client
from db import get_supabase
from trend_engine.spot_matcher import is_system_name
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Place-name suffix patterns (used to discover new places that are not in the
# dictionary). Each pattern matches two or more word characters immediately
# followed by one of the listed category suffixes. The patterns contain only
# non-capturing groups, so ``findall`` yields the whole matched string.
# NOTE(review): the non-ASCII literals below look mis-encoded (UTF-8 text read
# under a legacy code page) — confirm against the original file before relying
# on them.
SUFFIX_PATTERNS = [
    re.compile(r"[\w๊ฐ€-ํžฃ]{2,}(?:์นดํŽ˜|์ปคํ”ผ)"),
    re.compile(r"[\w๊ฐ€-ํžฃ]{2,}(?:์‹๋‹น|๋ฐฅ์ง‘|๊ตญ์ˆ˜)"),
    re.compile(r"[\w๊ฐ€-ํžฃ]{2,}(?:ํ•ด๋ณ€|ํ•ด์•ˆ|ํฌ๊ตฌ)"),
    re.compile(r"[\w๊ฐ€-ํžฃ]{2,}(?:์˜ค๋ฆ„|๊ณต์›|์ˆฒ)"),
    re.compile(r"[\w๊ฐ€-ํžฃ]{2,}(?:์ „๋ง๋Œ€|๋ทฐ)"),
    re.compile(r"[\w๊ฐ€-ํžฃ]{2,}(?:๋ฒ ์ด์ปค๋ฆฌ|๋ธŒ๋Ÿฐ์น˜|๋””์ €ํŠธ)"),
]
# Place-introduction patterns that commonly appear in YouTube video
# descriptions. Every pattern has exactly one capturing group, so ``findall``
# yields just the captured candidate name:
#   1. text following a marker symbol (presumably a location-pin emoji — the
#      literal looks mis-encoded; TODO confirm)
#   2. text following a "visited place"-style lead-in phrase (hedged: inferred
#      from the mis-encoded literal; confirm against the original file)
#   3. text following a numbered-list prefix such as "1."
#   4. text following "@"
# The first two capture classes include ``\s``, so a capture may carry
# leading/trailing whitespace.
YOUTUBE_PLACE_PATTERNS = [
    re.compile(r"๐Ÿ“\s*([\w๊ฐ€-ํžฃ\s]{2,20})"),
    re.compile(r"(?:๋ฐฉ๋ฌธ|๋‹ค๋…€์˜จ|๋“ค๋ฅธ)\s*(?:๊ณณ|์žฅ์†Œ)[:\s]*([\w๊ฐ€-ํžฃ\s]{2,20})"),
    re.compile(r"(?:\d+\.\s*)([\w๊ฐ€-ํžฃ]{2,15})"),
    re.compile(r"@\s*([\w๊ฐ€-ํžฃ]{2,15})"),
]
# Words that are too generic to be place names — candidates equal to one of
# these entries are filtered out.
# NOTE(review): the entries look mis-encoded (UTF-8 text read under a legacy
# code page) — confirm against the original file; membership tests only work
# if the input text uses the same encoding as these literals.
STOPWORDS = frozenset({
    # Demonstrative pronoun + category
    "์ด์นดํŽ˜", "๊ทธ์นดํŽ˜", "์ €์นดํŽ˜", "์ด์‹๋‹น", "๊ทธ์‹๋‹น",
    "์ดํ•ด๋ณ€", "๊ทธํ•ด๋ณ€", "์ด์˜ค๋ฆ„", "๊ทธ์˜ค๋ฆ„",
    # Adjective + category
    "์ข‹์€์นดํŽ˜", "์˜ˆ์œ์นดํŽ˜", "๋ง›์žˆ๋Š”์‹๋‹น", "์ข‹์€์‹๋‹น",
    # Location/modifier + category
    "๊ทผ์ฒ˜์นดํŽ˜", "์ฃผ๋ณ€์นดํŽ˜", "๋™๋„ค์นดํŽ˜", "์œ ๋ช…์นดํŽ˜",
    "๊ฐ์„ฑ์นดํŽ˜", "๋ทฐ์นดํŽ˜", "์˜ค์…˜๋ทฐ์นดํŽ˜",
    "์• ์›”์นดํŽ˜", "ํ•œ๋ฆผ์นดํŽ˜", "ํ˜‘์žฌ์นดํŽ˜",
    "์ œ์ฃผ์นดํŽ˜", "์ œ์ฃผ์‹๋‹น", "์ œ์ฃผํ•ด๋ณ€", "์ œ์ฃผ์˜ค๋ฆ„",
    "์˜ˆ์œ์‹๋‹น", "์ œ์ฃผ๋ง›์ง‘", "์• ์›”๋ง›์ง‘",
    # Category common nouns (not a place name when used on their own)
    "์นดํŽ˜", "์ปคํ”ผ", "์‹๋‹น", "๋ง›์ง‘", "๋ฐฅ์ง‘", "๊ตญ์ˆ˜",
    "ํ•ด๋ณ€", "ํ•ด์•ˆ", "ํฌ๊ตฌ", "์˜ค๋ฆ„", "๊ณต์›", "์ˆฒ",
    "์ „๋ง๋Œ€", "๋ทฐ", "๋ฒ ์ด์ปค๋ฆฌ", "๋ธŒ๋Ÿฐ์น˜", "๋””์ €ํŠธ",
    "์‚ฐ์ฑ…", "์‚ฐ์ฑ…๋กœ", "๋“œ๋ผ์ด๋ธŒ",
    # Region + generic expression
    "์ œ์ฃผ์—ฌํ–‰", "์• ์›”์—ฌํ–‰", "์ œ์ฃผ๋„", "์• ์›”", "์• ์›”๋ฆฌ",
})
# Address patterns — matches are removed from the text before place-name
# extraction (see ``preprocess_text``), presumably so street-address fragments
# are not mistaken for place names.
# NOTE(review): the non-ASCII literals look mis-encoded (UTF-8 text read under
# a legacy code page) — confirm against the original file.
ADDRESS_PATTERNS = [
    re.compile(r"์ œ์ฃผ\s*(?:ํŠน๋ณ„์ž์น˜)?๋„?\s*์ œ์ฃผ์‹œ\s*์• ์›”์\s*[\w๊ฐ€-ํžฃ\d\-]+"),
    re.compile(r"์• ์›”์\s*[\w๊ฐ€-ํžฃ]+๋ฆฌ\s*[\d\-]+"),
    re.compile(r"์ œ์ฃผ์‹œ\s*[\w๊ฐ€-ํžฃ]+(?:์|๋ฉด)\s*[\w๊ฐ€-ํžฃ]+(?:๋ฆฌ|๋กœ|๊ธธ)\s*[\d\-]*"),
]
def preprocess_text(text: str) -> str:
    """Strip HTML tags and address fragments ahead of place-name extraction.

    Args:
        text: Raw input text.

    Returns:
        ``text`` with every ``<...>`` tag removed, followed by removal of
        every match of each pattern in ``ADDRESS_PATTERNS`` (applied in list
        order). Removed spans are replaced with the empty string.
    """
    cleaned = re.sub(r"<[^>]+>", "", text)
    for address_re in ADDRESS_PATTERNS:
        cleaned = address_re.sub("", cleaned)
    return cleaned
class PlaceNameExtractor:
    """Pipeline that extracts place names from free text.

    Extraction runs three passes in priority order: dictionary matching
    against known spot names, YouTube-description patterns, and
    category-suffix patterns.
    """

    # Placeholder written over text already claimed by a dictionary match so
    # that shorter known names embedded in it are not reported as well.
    _CLAIMED_MASK = "\x00"

    def __init__(self, supabase: Client | None = None):
        """Create the extractor and load the place-name dictionary.

        Args:
            supabase: Client used to read the spot tables. When falsy,
                ``get_supabase()`` provides one.
        """
        self.supabase = supabase or get_supabase()
        # {place name: spot_id} — used for exact-string matching
        self.known_places: dict[str, str] = {}
        self._load_place_dictionary()

    # ------------------------------------------------------------------
    # Dictionary construction
    # ------------------------------------------------------------------

    def _load_place_dictionary(self) -> None:
        """Build ``known_places`` from the ``trend_spots`` and ``story_spots`` tables.

        Each table is loaded best-effort: a failure is logged as a warning
        and loading continues with whatever was registered so far.
        """
        # 1) trend_spots
        try:
            resp = self.supabase.table("trend_spots").select("id, name").execute()
            for row in resp.data or []:
                name = row["name"]
                # Skip empty/NULL names exactly like the story_spots loop:
                # a None name would raise in _register_name (``.strip()``)
                # and the enclosing except would drop all remaining rows.
                if name and not is_system_name(name):
                    self._register_name(name, row["id"])
        except Exception as e:
            logger.warning("trend_spots ๋กœ๋“œ ์‹คํŒจ (ํ…Œ์ด๋ธ” ๋ฏธ์กด์žฌ ๊ฐ€๋Šฅ): %s", e)

        # 2) story_spots (name plus optional aliases)
        try:
            resp = (
                self.supabase.table("story_spots")
                .select("id, name, aliases")
                .execute()
            )
            for row in resp.data or []:
                name = row["name"]
                if name and not is_system_name(name):
                    self._register_name(name, row["id"])
                for alias in row.get("aliases") or []:
                    if alias and not is_system_name(alias):
                        self._register_name(alias, row["id"])
        except Exception as e:
            logger.warning("story_spots ๋กœ๋“œ ์‹คํŒจ: %s", e)

        logger.info("์žฅ์†Œ๋ช… ์‚ฌ์ „ ๊ตฌ์ถ• ์™„๋ฃŒ: %d๊ฑด", len(self.known_places))

    def _register_name(self, name: str, spot_id: str) -> None:
        """Register a name and its space-stripped variant in the dictionary.

        Names shorter than two characters (after trimming) and names listed
        in ``STOPWORDS`` are ignored.

        Args:
            name: Place name or alias.
            spot_id: Identifier of the spot the name refers to.
        """
        name = name.strip()
        if len(name) < 2 or name in STOPWORDS:
            return
        self.known_places[name] = spot_id
        # Also index the variant without spaces so text that omits the
        # spaces still matches.
        no_space = name.replace(" ", "")
        if no_space != name and no_space not in STOPWORDS:
            self.known_places[no_space] = spot_id

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------

    def extract(self, text: str) -> list[dict]:
        """Extract place names from ``text``.

        Args:
            text: Raw text (HTML tags and address fragments are stripped
                first via ``preprocess_text``).

        Returns:
            A list of dicts, in discovery order, each shaped as
            ``{"name": str, "spot_id": str | None,
            "method": "dictionary" | "youtube_pattern" | "pattern"}``.
            ``spot_id`` is set only for dictionary matches. Each name
            appears at most once.
        """
        if not text:
            return []

        # Pre-processing: remove address patterns and HTML tags.
        text = preprocess_text(text)

        found: list[dict] = []
        found_names: set[str] = set()

        def _record(name: str, spot_id: str | None, method: str) -> None:
            found.append({
                "name": name,
                "spot_id": spot_id,
                "method": method,
            })
            found_names.add(name)

        # Priority 1: dictionary matching, longest names first. Each matched
        # name is masked out of the working copy so that a shorter known name
        # contained in it is not reported as a partial match; the shorter
        # name still matches if it also occurs elsewhere on its own.
        unclaimed = text
        for name in sorted(self.known_places, key=len, reverse=True):
            if name in STOPWORDS or name not in unclaimed:
                continue
            _record(name, self.known_places[name], "dictionary")
            unclaimed = unclaimed.replace(name, self._CLAIMED_MASK)

        # Priority 2: YouTube-specific patterns (each yields one captured group).
        for pattern in YOUTUBE_PLACE_PATTERNS:
            for raw in pattern.findall(text):
                candidate = raw.strip()
                # The capture may be mostly whitespace; after trimming, apply
                # the same two-character minimum used for dictionary names.
                if len(candidate) < 2:
                    continue
                if candidate in found_names or candidate in STOPWORDS:
                    continue
                _record(candidate, None, "youtube_pattern")

        # Priority 3: category-suffix patterns (findall yields the full match).
        for pattern in SUFFIX_PATTERNS:
            for candidate in pattern.findall(text):
                if candidate in found_names or candidate in STOPWORDS:
                    continue
                _record(candidate, None, "pattern")

        return found