Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- requirements-trend.txt +8 -0
- scripts/run_trend_engine.py +248 -0
- trend_engine/__init__.py +1 -0
- trend_engine/collectors/__init__.py +15 -0
- trend_engine/collectors/instagram.py +277 -0
- trend_engine/collectors/kakaomap.py +309 -0
- trend_engine/collectors/naver_blog.py +330 -0
- trend_engine/collectors/naver_place.py +297 -0
- trend_engine/collectors/youtube.py +317 -0
- trend_engine/place_extractor.py +152 -0
- trend_engine/spot_matcher.py +111 -0
- trend_engine/trend_scorer.py +344 -0
requirements-trend.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
httpx>=0.27.0
|
| 2 |
+
beautifulsoup4>=4.12.0
|
| 3 |
+
lxml>=5.0.0
|
| 4 |
+
requests>=2.31.0
|
| 5 |
+
google-api-python-client>=2.100.0
|
| 6 |
+
apify-client>=1.6.0
|
| 7 |
+
supabase>=2.0.0
|
| 8 |
+
python-dotenv>=1.0.0
|
scripts/run_trend_engine.py
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RE:Play Trend Engine v3 โ ์ฃผ๊ฐ ๋ฐฐ์น ์ค์ผ์คํธ๋ ์ดํฐ
|
| 3 |
+
|
| 4 |
+
์์ฐจ ์คํ ํ์ดํ๋ผ์ธ:
|
| 5 |
+
1. ์นด์นด์ค๋งต ๊ทธ๋ฆฌ๋ ์ค์บ + ๋ฆฌ๋ทฐ ํ์ฑ (trend_spots ๋ง์คํฐ ์์ฑ)
|
| 6 |
+
2. SpotMatcher ์ด๊ธฐํ (trend_spots + story_spots ์ฌ์ ๋ก๋)
|
| 7 |
+
3. ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์์ง (URL ํ๋ณด + ํฌ๋กค๋ง + DB ์ ์ฅ)
|
| 8 |
+
4. ๋ธ๋ก๊ทธ ๋ณธ๋ฌธ โ ์ฅ์๋ช
์ถ์ถ + mention_count ์ง๊ณ
|
| 9 |
+
5. ์ ํ๋ธ API (SpotMatcher ์ฐ๋)
|
| 10 |
+
6. ์ธ์คํ๊ทธ๋จ Apify (SpotMatcher ์ฐ๋)
|
| 11 |
+
7. ์ข
ํฉ ์ค์ฝ์ด ๊ณ์ฐ + ๋ญํน ์์ฑ
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python backend/scripts/run_trend_engine.py
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import asyncio
|
| 18 |
+
import json
|
| 19 |
+
import logging
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
import sys
|
| 23 |
+
import time
|
| 24 |
+
from datetime import date, timedelta
|
| 25 |
+
|
| 26 |
+
# backend/ ๋๋ ํ ๋ฆฌ๋ฅผ import path์ ์ถ๊ฐ
|
| 27 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 28 |
+
|
| 29 |
+
# ๋ก์ปฌ ์คํ ์ .env ํ์ผ ๋ก๋
|
| 30 |
+
try:
|
| 31 |
+
from dotenv import load_dotenv
|
| 32 |
+
# ํ๋ก์ ํธ ๋ฃจํธ์ .env ํ์ผ ๋ก๋
|
| 33 |
+
env_path = os.path.join(os.path.dirname(__file__), "..", "..", ".env")
|
| 34 |
+
load_dotenv(env_path)
|
| 35 |
+
except ImportError:
|
| 36 |
+
pass # GitHub Actions ๋ฑ dotenv ์๋ ํ๊ฒฝ
|
| 37 |
+
|
| 38 |
+
from supabase import create_client
|
| 39 |
+
|
| 40 |
+
from trend_engine.collectors.naver_blog import NaverBlogCollector
|
| 41 |
+
from trend_engine.collectors.kakaomap import KakaoMapCollector
|
| 42 |
+
from trend_engine.collectors.youtube import YouTubeCollector
|
| 43 |
+
from trend_engine.collectors.instagram import InstagramCollector
|
| 44 |
+
from trend_engine.spot_matcher import SpotMatcher
|
| 45 |
+
from trend_engine.trend_scorer import generate_weekly_ranking
|
| 46 |
+
from trend_engine.place_extractor import PlaceNameExtractor
|
| 47 |
+
|
| 48 |
+
logging.basicConfig(
|
| 49 |
+
level=logging.INFO,
|
| 50 |
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
| 51 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
| 52 |
+
)
|
| 53 |
+
logger = logging.getLogger("trend_engine.orchestrator")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _get_supabase_client():
|
| 57 |
+
url = os.environ.get("SUPABASE_URL") or os.environ.get("VITE_SUPABASE_URL")
|
| 58 |
+
key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_SERVICE_KEY")
|
| 59 |
+
if not url or not key:
|
| 60 |
+
raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY must be set")
|
| 61 |
+
return create_client(url, key)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def run_step(name: str, func, results: dict):
    """Execute a single pipeline step and record its outcome.

    Records {"status", "result"/"error", "elapsed_sec"} under `name` in
    `results`. Returns the step's return value, or None when it raised.
    """
    logger.info("โโโ [START] %s โโโ", name)
    started_at = time.time()
    try:
        outcome = func()
        duration = time.time() - started_at
        results[name] = {
            "status": "ok",
            "result": _summarize(outcome),
            "elapsed_sec": round(duration, 1),
        }
        logger.info("โ [DONE] %s โ %.1f์ด", name, duration)
        return outcome
    except Exception as exc:
        # A failing step must not abort the whole pipeline: log, record, move on.
        duration = time.time() - started_at
        results[name] = {
            "status": "error",
            "error": str(exc),
            "elapsed_sec": round(duration, 1),
        }
        logger.error("โ [FAIL] %s โ %s (%.1f์ด)", name, exc, duration)
        return None
| 81 |
+
def run_async_step(name: str, coro, results: dict):
    """run_step variant that drives an asyncio coroutine to completion."""
    return run_step(name, lambda: asyncio.run(coro), results)
| 88 |
+
def _summarize(result) -> str:
|
| 89 |
+
"""๊ฒฐ๊ณผ๋ฅผ ๋ก๊ทธ์ฉ ์์ฝ ๋ฌธ์์ด๋ก ๋ณํ."""
|
| 90 |
+
if isinstance(result, dict):
|
| 91 |
+
return json.dumps(result, ensure_ascii=False, default=str)[:200]
|
| 92 |
+
return str(result)[:200]
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def main() -> None:
    """Run the weekly trend-engine batch pipeline end to end.

    Steps execute sequentially; each step's outcome (ok/error/skipped and
    elapsed seconds) is recorded in `results` and summarised at the end.
    The process exits non-zero only when every step failed.
    """
    total_start = time.time()
    results: dict = {}

    sb = _get_supabase_client()

    # โโ 1. KakaoMap grid scan (runs first โ builds the trend_spots master) โโ
    kakao = KakaoMapCollector(sb)
    run_async_step("1_kakaomap", kakao.run(), results)

    # โโ 2. SpotMatcher init (preloads trend_spots + story_spots dictionaries) โโ
    matcher = SpotMatcher(sb)
    extractor = PlaceNameExtractor(sb)
    logger.info(
        "SpotMatcher ์ค๋น ์๋ฃ โ trend_spots %d๊ฑด, story_spots %d๊ฑด",
        len(matcher.trend_spots),
        len(matcher.story_spots),
    )

    # โโ 3. YouTube API (SpotMatcher-aware) โโ
    youtube = YouTubeCollector(sb, spot_matcher=matcher)
    run_step("3_youtube", youtube.run, results)

    # โโ 4. Instagram via Apify (SpotMatcher-aware) โโ
    instagram = InstagramCollector(sb, spot_matcher=matcher)
    run_step("4_instagram", instagram.run, results)

    # โโ 5. Naver Place โ disabled (Place ID matching unavailable) โโ
    logger.info("๋ค์ด๋ฒ ํ๋ ์ด์ค: ๋นํ์ฑ (Place ID ๋งค์นญ ๋ถ๊ฐ, 2026-02)")
    results["5_naver_place"] = {
        "status": "skipped",
        "reason": "Place ID matching unavailable",
        "elapsed_sec": 0,
    }

    # โโ 6. Naver blog collection (URL discovery + parallel crawl + store) โโ
    blog = NaverBlogCollector(sb)
    run_step("6_naver_blog", blog.run, results)

    # โโ 7. Blog body โ place-name extraction + mention_count aggregation โโ
    def extract_blog_places():
        """Extract place names from pending blog posts, tally mention_count
        per matched spot, and persist the aggregates into spot_trends."""
        today = date.today()
        period_start = today - timedelta(days=7)

        # Fetch naver_blog records still marked __pending__ from spot_trends,
        # paginated because Supabase caps result sets (page_size rows per call).
        records = []
        page_size = 1000
        offset = 0
        try:
            while True:
                batch = (
                    sb.table("spot_trends")
                    .select("id, raw_data")
                    .eq("source", "naver_blog")
                    .eq("spot_id", "__pending__")
                    .range(offset, offset + page_size - 1)
                    .execute()
                )
                rows = batch.data or []
                records.extend(rows)
                # A short page means we've reached the end of the result set.
                if len(rows) < page_size:
                    break
                offset += page_size
        except Exception as e:
            logger.warning("๋ธ๋ก๊ทธ pending ๋ ์ฝ๋ ์กฐํ ์คํจ: %s", e)
            return {"error": str(e)}

        logger.info("๋ธ๋ก๊ทธ pending ๋ ์ฝ๋: %d๊ฑด ์กฐํ", len(records))
        if not records:
            return {"pending_records": 0, "places_found": 0}

        # Tally mentions per matched spot id.
        place_mentions: dict[str, int] = {}

        for record in records:
            raw = record.get("raw_data", {})
            content = raw.get("content_preview", "")
            title = raw.get("title", "")
            text = f"{title} {content}"
            text = re.sub(r"<[^>]+>", "", text)  # strip HTML tags

            places = extractor.extract(text)
            for place in places:
                # Only count places that SpotMatcher can normalise to a known spot.
                matched_id = matcher.match(place["name"])
                if matched_id:
                    place_mentions[matched_id] = place_mentions.get(matched_id, 0) + 1

        # Persist one mention_count row per spot for this weekly period.
        saved = 0
        for spot_id, count in place_mentions.items():
            try:
                sb.table("spot_trends").insert({
                    "spot_id": spot_id,
                    "source": "naver_blog",
                    "metric_type": "mention_count",
                    "metric_value": count,
                    "period_start": period_start.isoformat(),
                    "period_end": today.isoformat(),
                    "raw_data": {"aggregated_from": "blog_post_extraction"},
                }).execute()
                saved += 1
            except Exception as e:
                # Best-effort: a single failed insert shouldn't lose the rest.
                logger.warning("mention_count ์ ์ฅ ์คํจ (%s): %s", spot_id, e)

        return {
            "pending_records": len(records),
            "places_found": len(place_mentions),
            "mention_records_saved": saved,
        }

    run_step("7_blog_place_extraction", extract_blog_places, results)

    # โโ 8. Composite score calculation + ranking โโ
    def calc_scores():
        return generate_weekly_ranking(sb)

    run_step("8_score_calculation", calc_scores, results)

    # โโ Result summary โโ
    total_elapsed = time.time() - total_start
    ok_count = sum(1 for r in results.values() if r.get("status") == "ok")
    err_count = sum(1 for r in results.values() if r.get("status") == "error")
    skip_count = sum(1 for r in results.values() if r.get("status") == "skipped")

    summary = {
        "total_steps": len(results),
        "succeeded": ok_count,
        "failed": err_count,
        "skipped": skip_count,
        "total_elapsed_sec": round(total_elapsed, 1),
        "steps": {
            k: {"status": v.get("status"), "elapsed_sec": v.get("elapsed_sec", 0)}
            for k, v in results.items()
        },
    }

    logger.info("โโโ TREND ENGINE COMPLETE โโโ")
    logger.info(
        "์ฑ๊ณต: %d / ์คํจ: %d / ์คํต: %d / ์ด ์์: %.1f์ด",
        ok_count, err_count, skip_count, total_elapsed,
    )

    # JSON summary on stdout (consumed by the GitHub Actions log).
    print(json.dumps(summary, ensure_ascii=False, indent=2))

    # Exit non-zero only on total failure; partial failures are tolerated.
    if ok_count == 0:
        logger.error("๋ชจ๋ ๋จ๊ณ๊ฐ ์คํจํ์ต๋๋ค.")
        sys.exit(1)
| 247 |
+
# Script entry point: run the full weekly trend-engine pipeline.
if __name__ == "__main__":
    main()
trend_engine/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""RE:Play Trend Engine v3 โ ํธ๋ ๋ ๋ฐ์ดํฐ ์์ง ํ์ดํ๋ผ์ธ"""
|
trend_engine/collectors/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Trend Engine data collectors โ ์ฑ๋๋ณ ์์ง๊ธฐ"""
|
| 2 |
+
|
| 3 |
+
from .naver_blog import NaverBlogCollector
|
| 4 |
+
from .naver_place import NaverPlaceCollector
|
| 5 |
+
from .kakaomap import KakaoMapCollector
|
| 6 |
+
from .youtube import YouTubeCollector
|
| 7 |
+
from .instagram import InstagramCollector
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"NaverBlogCollector",
|
| 11 |
+
"NaverPlaceCollector",
|
| 12 |
+
"KakaoMapCollector",
|
| 13 |
+
"YouTubeCollector",
|
| 14 |
+
"InstagramCollector",
|
| 15 |
+
]
|
trend_engine/collectors/instagram.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Instagram Collector โ Apify SaaS (Instagram Hashtag Scraper)
|
| 3 |
+
|
| 4 |
+
์์ง ๋ฐฉ์:
|
| 5 |
+
1. Apify์ instagram-hashtag-scraper Actor๋ก ํด์ํ๊ทธ๋ณ ๊ฒ์๋ฌผ ์์ง
|
| 6 |
+
2. ์์น๋ณ ํ๊ท ์ฐธ์ฌ๋(์ข์์+๋๊ธ) ์ง๊ณ
|
| 7 |
+
3. spot_trends ํ
์ด๋ธ์ ์ ์ฅ
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import logging
|
| 12 |
+
from datetime import date
|
| 13 |
+
|
| 14 |
+
from apify_client import ApifyClient
|
| 15 |
+
|
| 16 |
+
from trend_engine.place_extractor import PlaceNameExtractor
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 21 |
+
# ์ค์
|
| 22 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 23 |
+
|
| 24 |
+
INSTAGRAM_HASHTAGS = [
|
| 25 |
+
"์ ์์นดํ",
|
| 26 |
+
"์ ์๋ง์ง",
|
| 27 |
+
"์ ์๊ฐ๋ณผ๋งํ๊ณณ",
|
| 28 |
+
"์ ์ํด์",
|
| 29 |
+
"์ ์์ฌํ",
|
| 30 |
+
"์ ์ฃผ์ ์",
|
| 31 |
+
"์ ์๊ฐ์ฑ",
|
| 32 |
+
"ํ๋ดํด์",
|
| 33 |
+
"๊ณฝ์งํด๋ณ",
|
| 34 |
+
"์ ์ํซํ",
|
| 35 |
+
"์ ์๋์ ํธ",
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
RESULTS_LIMIT_PER_HASHTAG = 50
|
| 39 |
+
|
| 40 |
+
ACTOR_ID = "apify/instagram-hashtag-scraper"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class InstagramCollector:
    """Collector backed by the Apify "Instagram Hashtag Scraper" actor.

    Pipeline: scrape posts per configured hashtag, aggregate per-location
    engagement metrics, then persist them into the `spot_trends` table.
    """

    def __init__(self, supabase_client, spot_matcher=None):
        """
        Args:
            supabase_client: initialised Supabase client used for persistence.
            spot_matcher: optional SpotMatcher; when set, location names are
                normalised to trend_spots ids and unmatched locations are
                skipped at save time.
        """
        self.supabase = supabase_client
        # APIFY_API_TOKEN must be present in the environment (KeyError otherwise).
        self.apify = ApifyClient(os.environ["APIFY_API_TOKEN"])
        self.spot_matcher = spot_matcher
        self.extractor = PlaceNameExtractor(supabase_client)

    # ------------------------------------------------------------------
    # Hashtag post collection
    # ------------------------------------------------------------------

    def collect_hashtag_posts(self) -> list[dict]:
        """Run the Apify actor and collect posts for every configured hashtag.

        Returns:
            [{hashtag, location_name, likes_count, comments_count, caption,
              timestamp, url}, ...] โ empty list when the actor run fails.
        """
        run_input = {
            "hashtags": INSTAGRAM_HASHTAGS,
            "resultsLimit": RESULTS_LIMIT_PER_HASHTAG,
        }

        logger.info(
            "Apify Actor ์คํ ์์: %d๊ฐ ํด์ํ๊ทธ, ํด์ํ๊ทธ๋น %d๊ฑด",
            len(INSTAGRAM_HASHTAGS), RESULTS_LIMIT_PER_HASHTAG,
        )

        try:
            run = self.apify.actor(ACTOR_ID).call(run_input=run_input)
        except Exception as e:
            # Graceful degradation: a failed actor run yields an empty batch.
            logger.error("Apify Actor ์คํ ์คํจ: %s", e)
            return []

        all_posts: list[dict] = []
        dataset_id = run["defaultDatasetId"]

        for item in self.apify.dataset(dataset_id).iterate_items():
            all_posts.append({
                "hashtag": item.get("hashtag", ""),
                "location_name": item.get("locationName", ""),
                "likes_count": item.get("likesCount", 0),
                "comments_count": item.get("commentsCount", 0),
                "caption": item.get("caption", ""),
                "timestamp": item.get("timestamp", ""),
                "url": item.get("url", ""),
            })

        logger.info("๊ฒ์๋ฌผ ์์ง ์๋ฃ: %d๊ฑด", len(all_posts))
        return all_posts

    # ------------------------------------------------------------------
    # Per-location metric aggregation
    # ------------------------------------------------------------------

    @staticmethod
    def _bump_location(location_metrics: dict[str, dict], loc: str, post: dict) -> None:
        """Accumulate one post's counters into the metrics bucket for `loc`.

        (Extracted helper: the original duplicated this bucket init/increment
        logic verbatim in both aggregation passes.)
        """
        metrics = location_metrics.setdefault(loc, {
            "post_count": 0,
            "total_likes": 0,
            "total_comments": 0,
            "hashtags": set(),
        })
        metrics["post_count"] += 1
        metrics["total_likes"] += post["likes_count"]
        metrics["total_comments"] += post["comments_count"]
        if post["hashtag"]:
            metrics["hashtags"].add(post["hashtag"])

    def aggregate_location_metrics(self, posts: list[dict]) -> dict[str, dict]:
        """Aggregate popularity metrics per location.

        Pass 1: group by the post's location tag (location_name).
        Pass 2: for posts without a location tag, extract a place name from
        the caption โ at most ONE place is counted per such post.

        Returns:
            {location_name: {post_count, total_likes, total_comments,
                             avg_engagement, hashtags}}
        """
        location_metrics: dict[str, dict] = {}
        untagged_posts: list[dict] = []

        for post in posts:
            loc = post["location_name"]
            if not loc:
                untagged_posts.append(post)
                continue
            self._bump_location(location_metrics, loc, post)

        # Pass 2: caption-based place extraction for untagged posts.
        caption_extracted = 0
        for post in untagged_posts:
            caption = post.get("caption", "")
            # Very short captions carry no usable place information.
            if not caption or len(caption) < 5:
                continue

            for place in self.extractor.extract(caption):
                self._bump_location(location_metrics, place["name"], post)
                caption_extracted += 1
                break  # count only one place per post

        # Finalise: average engagement + deterministic (sorted) hashtag list.
        for loc, metrics in location_metrics.items():
            count = max(metrics["post_count"], 1)
            metrics["avg_engagement"] = (
                metrics["total_likes"] + metrics["total_comments"]
            ) / count
            metrics["hashtags"] = sorted(metrics["hashtags"])

        logger.info(
            "์์น๋ณ ์ง๊ณ ์๋ฃ: %d๊ฐ ์์น (์์นํ๊ทธ %d๊ฑด, ์บก์ ์ถ์ถ %d๊ฑด, ๋ฏธ์๋ณ %d๊ฑด)",
            len(location_metrics),
            sum(1 for p in posts if p["location_name"]),
            caption_extracted,
            len(untagged_posts) - caption_extracted,
        )
        return location_metrics

    # ------------------------------------------------------------------
    # DB persistence
    # ------------------------------------------------------------------

    def _save_to_db(
        self,
        location_metrics: dict[str, dict],
        total_posts: int,
    ) -> int:
        """Persist aggregated metrics into the `spot_trends` table.

        When a SpotMatcher is configured, location names are normalised to
        trend_spots ids and unmatched locations are skipped; otherwise a
        synthetic "ig_<name>" id is used.

        Args:
            location_metrics: output of aggregate_location_metrics.
            total_posts: total scraped post count. NOTE(review): currently
                unused; kept for interface stability / future ratio metrics.

        Returns:
            Number of post_count rows successfully inserted.
        """
        today = date.today()
        saved = 0
        skipped = 0

        for loc_name, metrics in location_metrics.items():
            # Normalise the location to a spot_id via SpotMatcher when available.
            if self.spot_matcher:
                spot_id = self.spot_matcher.match(loc_name)
                if not spot_id:
                    skipped += 1
                    continue
            else:
                spot_id = f"ig_{loc_name}"

            # post_count metric row (carries the full raw aggregate).
            try:
                self.supabase.table("spot_trends").insert({
                    "spot_id": spot_id,
                    "source": "instagram",
                    "metric_type": "post_count",
                    "metric_value": metrics["post_count"],
                    "period_start": today.isoformat(),
                    "period_end": today.isoformat(),
                    "raw_data": {
                        "location_name": loc_name,
                        "total_likes": metrics["total_likes"],
                        "total_comments": metrics["total_comments"],
                        "avg_engagement": metrics["avg_engagement"],
                        "hashtags": metrics["hashtags"],
                    },
                }).execute()
                saved += 1
            except Exception as e:
                logger.warning("spot_trends insert ์คํจ (instagram post_count, %s): %s", spot_id, e)

            # avg_engagement metric row โ only positive values (filters the
            # sentinel values Apify returns for unfetched posts).
            avg_eng = metrics["avg_engagement"]
            if avg_eng > 0:
                try:
                    self.supabase.table("spot_trends").insert({
                        "spot_id": spot_id,
                        "source": "instagram",
                        "metric_type": "avg_engagement",
                        "metric_value": int(round(avg_eng)),
                        "period_start": today.isoformat(),
                        "period_end": today.isoformat(),
                        "raw_data": {"location_name": loc_name},
                    }).execute()
                except Exception as e:
                    logger.warning("spot_trends insert ์คํจ (instagram avg_engagement, %s): %s", spot_id, e)

        if skipped:
            logger.info("Instagram ์ฅ์ ๋งค์นญ ์คํจ๋ก %d๊ฑด ์คํต", skipped)
        logger.info("Instagram DB ์ ์ฅ ์๋ฃ: %d๊ฑด", saved)
        return saved

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------

    def run(self) -> dict:
        """Execute the full Instagram collection pipeline.

        Returns:
            {"total_posts": int, "locations_found": int, "saved": int}
        """
        logger.info("=== Instagram ์์ง ์์ ===")

        # Step 1: scrape hashtag posts via Apify.
        posts = self.collect_hashtag_posts()

        if not posts:
            logger.warning("์์ง๋ ๊ฒ์๋ฌผ ์์ โ ์ข๋ฃ")
            return {"total_posts": 0, "locations_found": 0, "saved": 0}

        # Step 2: aggregate metrics per location.
        location_metrics = self.aggregate_location_metrics(posts)

        # Step 3: persist to the database.
        saved = self._save_to_db(location_metrics, len(posts))

        result = {
            "total_posts": len(posts),
            "locations_found": len(location_metrics),
            "saved": saved,
        }
        logger.info("=== Instagram ์์ง ์๋ฃ: %s ===", result)
        return result
trend_engine/collectors/kakaomap.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
KakaoMap Collector โ ๊ทธ๋ฆฌ๋ ๋ถํ ์นดํ
๊ณ ๋ฆฌ ์ค์บ + ๋ฆฌ๋ทฐ ์ ํ์ฑ
|
| 3 |
+
|
| 4 |
+
์์ง ๋ฐฉ์:
|
| 5 |
+
1. ์ ์ ์ง์ญ์ 2kmร2km ๊ทธ๋ฆฌ๋๋ก ๋ถํ
|
| 6 |
+
2. ๊ฐ ์
ร 4๊ฐ ์นดํ
๊ณ ๋ฆฌ(CE7/FD6/AT4/CT1) ร 3ํ์ด์ง ์กฐํ
|
| 7 |
+
3. ์์ 100๊ฐ ์ฅ์์ ๋ฆฌ๋ทฐ ์๋ฅผ ๋น๊ณต์ API๋ก ํ์ฑ (graceful degradation)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import logging
|
| 12 |
+
from datetime import date, datetime
|
| 13 |
+
|
| 14 |
+
import httpx
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 19 |
+
# ์ ์ ๊ทธ๋ฆฌ๋ ์ค์
|
| 20 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 21 |
+
|
| 22 |
+
# ์ ์๋ฆฌ ์ค์ฌ ๊ทธ๋ฆฌ๋ (์ ์ํญ~ํ๋ดํด์~๊ณฝ์งํด๋ณ ์ปค๋ฒ)
|
| 23 |
+
AEWOL_GRID = {
|
| 24 |
+
"west": 126.30,
|
| 25 |
+
"east": 126.36,
|
| 26 |
+
"south": 33.44,
|
| 27 |
+
"north": 33.47,
|
| 28 |
+
"cell_lng": 0.023, # ~2km ๊ฒฝ๋
|
| 29 |
+
"cell_lat": 0.018, # ~2km ์๋
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
CATEGORY_CODES = {
|
| 33 |
+
"CE7": "์นดํ",
|
| 34 |
+
"FD6": "์์์ ",
|
| 35 |
+
"AT4": "๊ด๊ด๋ช
์",
|
| 36 |
+
"CT1": "๋ฌธํ์์ค",
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
KAKAO_CATEGORY_URL = "https://dapi.kakao.com/v2/local/search/category"
|
| 40 |
+
KAKAO_PLACE_DETAIL_URL = "https://place.map.kakao.com/main/v/{place_id}"
|
| 41 |
+
|
| 42 |
+
REVIEW_PARSE_LIMIT = 100 # ๋ฆฌ๋ทฐ ์ ํ์ฑ ๋์ ์์ N๊ฐ
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class KakaoMapCollector:
|
| 46 |
+
"""์นด์นด์ค๋งต ๊ทธ๋ฆฌ๋ ๋ถํ ์ค์บ + ๋ฆฌ๋ทฐ ์ ํ์ฑ ์์ง๊ธฐ."""
|
| 47 |
+
|
| 48 |
+
    def __init__(self, supabase_client):
        """
        Args:
            supabase_client: initialised Supabase client used for persistence.
        """
        self.supabase = supabase_client
        # KAKAO_REST_API_KEY must be present in the environment (KeyError otherwise).
        self.api_key = os.environ["KAKAO_REST_API_KEY"]
| 51 |
+
|
| 52 |
+
# ------------------------------------------------------------------
|
| 53 |
+
# ๊ทธ๋ฆฌ๋ ์์ฑ
|
| 54 |
+
# ------------------------------------------------------------------
|
| 55 |
+
|
| 56 |
+
def generate_grid_cells(self) -> list[dict]:
|
| 57 |
+
"""์ ์ ์ง์ญ์ 2kmร2km ์
๋ก ๋ถํ ํ์ฌ rect ํ๋ผ๋ฏธํฐ ๋ชฉ๋ก์ ์์ฑํ๋ค."""
|
| 58 |
+
cells = []
|
| 59 |
+
lng = AEWOL_GRID["west"]
|
| 60 |
+
while lng < AEWOL_GRID["east"]:
|
| 61 |
+
lat = AEWOL_GRID["south"]
|
| 62 |
+
while lat < AEWOL_GRID["north"]:
|
| 63 |
+
cells.append({
|
| 64 |
+
"x1": lng,
|
| 65 |
+
"y1": lat,
|
| 66 |
+
"x2": min(lng + AEWOL_GRID["cell_lng"], AEWOL_GRID["east"]),
|
| 67 |
+
"y2": min(lat + AEWOL_GRID["cell_lat"], AEWOL_GRID["north"]),
|
| 68 |
+
})
|
| 69 |
+
lat += AEWOL_GRID["cell_lat"]
|
| 70 |
+
lng += AEWOL_GRID["cell_lng"]
|
| 71 |
+
|
| 72 |
+
logger.info("๊ทธ๋ฆฌ๋ ์
%d๊ฐ ์์ฑ ์๋ฃ", len(cells))
|
| 73 |
+
return cells
|
| 74 |
+
|
| 75 |
+
# ------------------------------------------------------------------
|
| 76 |
+
# ์นดํ
๊ณ ๋ฆฌ ์ค์บ
|
| 77 |
+
# ------------------------------------------------------------------
|
| 78 |
+
|
| 79 |
+
def collect_spots(self) -> dict[str, dict]:
|
| 80 |
+
"""
|
| 81 |
+
์นด์นด์ค๋งต ์นดํ
๊ณ ๋ฆฌ ๊ฒ์ API๋ก ์ ์ ์ ์ญ ์ฅ์๋ฅผ ์์งํ๋ค.
|
| 82 |
+
|
| 83 |
+
Returns:
|
| 84 |
+
{place_id: {kakao_id, name, category, lat, lng, address, place_url, phone, search_rank}}
|
| 85 |
+
"""
|
| 86 |
+
all_spots: dict[str, dict] = {}
|
| 87 |
+
grid_cells = self.generate_grid_cells()
|
| 88 |
+
headers = {"Authorization": f"KakaoAK {self.api_key}"}
|
| 89 |
+
api_calls = 0
|
| 90 |
+
|
| 91 |
+
for cell in grid_cells:
|
| 92 |
+
rect_str = f"{cell['x1']},{cell['y1']},{cell['x2']},{cell['y2']}"
|
| 93 |
+
|
| 94 |
+
for code, category_name in CATEGORY_CODES.items():
|
| 95 |
+
for page in range(1, 4): # ์ต๋ 3ํ์ด์ง
|
| 96 |
+
params = {
|
| 97 |
+
"category_group_code": code,
|
| 98 |
+
"rect": rect_str,
|
| 99 |
+
"page": page,
|
| 100 |
+
"size": 15,
|
| 101 |
+
"sort": "accuracy",
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
try:
|
| 105 |
+
resp = httpx.get(
|
| 106 |
+
KAKAO_CATEGORY_URL,
|
| 107 |
+
headers=headers,
|
| 108 |
+
params=params,
|
| 109 |
+
timeout=10,
|
| 110 |
+
)
|
| 111 |
+
resp.raise_for_status()
|
| 112 |
+
data = resp.json()
|
| 113 |
+
api_calls += 1
|
| 114 |
+
except Exception as e:
|
| 115 |
+
logger.warning(
|
| 116 |
+
"์นด์นด์ค๋งต API ํธ์ถ ์คํจ (cell=%s, code=%s, page=%d): %s",
|
| 117 |
+
rect_str, code, page, e,
|
| 118 |
+
)
|
| 119 |
+
break
|
| 120 |
+
|
| 121 |
+
for place in data.get("documents", []):
|
| 122 |
+
place_id = place["id"]
|
| 123 |
+
if place_id not in all_spots:
|
| 124 |
+
all_spots[place_id] = {
|
| 125 |
+
"kakao_id": place_id,
|
| 126 |
+
"name": place["place_name"],
|
| 127 |
+
"category": category_name,
|
| 128 |
+
"lat": float(place["y"]),
|
| 129 |
+
"lng": float(place["x"]),
|
| 130 |
+
"address": place["address_name"],
|
| 131 |
+
"place_url": place.get("place_url", ""),
|
| 132 |
+
"phone": place.get("phone", ""),
|
| 133 |
+
"search_rank": len(all_spots) + 1,
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
# ๋ง์ง๋ง ํ์ด์ง๋ฉด ๋ค์ ์นดํ
๊ณ ๋ฆฌ๋ก
|
| 137 |
+
if data.get("meta", {}).get("is_end", True):
|
| 138 |
+
break
|
| 139 |
+
|
| 140 |
+
logger.info(
|
| 141 |
+
"์นด์นด์ค๋งต ์ค์บ ์๋ฃ: %d๊ฐ ์ฅ์ ์์ง (API ํธ์ถ %dํ)",
|
| 142 |
+
len(all_spots), api_calls,
|
| 143 |
+
)
|
| 144 |
+
return all_spots
|
| 145 |
+
|
| 146 |
+
# ------------------------------------------------------------------
|
| 147 |
+
# ๋ฆฌ๋ทฐ ์ ํ์ฑ (๋น๊ณต์ API)
|
| 148 |
+
# ------------------------------------------------------------------
|
| 149 |
+
|
| 150 |
+
async def fetch_review_counts(
|
| 151 |
+
self, spots: dict[str, dict], limit: int = REVIEW_PARSE_LIMIT
|
| 152 |
+
) -> dict[str, dict]:
|
| 153 |
+
"""
|
| 154 |
+
์์ limit๊ฐ ์ฅ์์ ๋ฆฌ๋ทฐ ์๋ฅผ ์นด์นด์ค๋งต ๋น๊ณต์ API๋ก ํ์ฑํ๋ค.
|
| 155 |
+
|
| 156 |
+
์คํจ ์ review_count=0 ์ฒ๋ฆฌ (graceful degradation).
|
| 157 |
+
"""
|
| 158 |
+
# search_rank ๊ธฐ์ค ์์ N๊ฐ๋ง ํ์ฑ
|
| 159 |
+
sorted_spots = sorted(spots.values(), key=lambda s: s["search_rank"])[:limit]
|
| 160 |
+
results: dict[str, dict] = {}
|
| 161 |
+
success_count = 0
|
| 162 |
+
fail_count = 0
|
| 163 |
+
|
| 164 |
+
headers = {
|
| 165 |
+
"User-Agent": (
|
| 166 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
| 167 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
| 168 |
+
"Chrome/120.0.0.0 Safari/537.36"
|
| 169 |
+
),
|
| 170 |
+
"Referer": "https://map.kakao.com/",
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
async with httpx.AsyncClient(headers=headers, timeout=10) as client:
|
| 174 |
+
for spot in sorted_spots:
|
| 175 |
+
place_id = spot["kakao_id"]
|
| 176 |
+
url = KAKAO_PLACE_DETAIL_URL.format(place_id=place_id)
|
| 177 |
+
|
| 178 |
+
try:
|
| 179 |
+
resp = await client.get(url)
|
| 180 |
+
data = resp.json()
|
| 181 |
+
|
| 182 |
+
results[place_id] = {
|
| 183 |
+
"review_count": data.get("comment", {}).get("scorecnt", 0),
|
| 184 |
+
"avg_rating": data.get("comment", {}).get("scoretotalavgstar", 0),
|
| 185 |
+
"blog_review_count": data.get("blogReview", {}).get("blogrvwcnt", 0),
|
| 186 |
+
"success": True,
|
| 187 |
+
}
|
| 188 |
+
success_count += 1
|
| 189 |
+
except Exception as e:
|
| 190 |
+
results[place_id] = {
|
| 191 |
+
"review_count": 0,
|
| 192 |
+
"avg_rating": 0,
|
| 193 |
+
"blog_review_count": 0,
|
| 194 |
+
"success": False,
|
| 195 |
+
"error": str(e),
|
| 196 |
+
}
|
| 197 |
+
fail_count += 1
|
| 198 |
+
|
| 199 |
+
total = success_count + fail_count
|
| 200 |
+
fail_rate = fail_count / max(total, 1)
|
| 201 |
+
logger.info(
|
| 202 |
+
"๋ฆฌ๋ทฐ ํ์ฑ ์๋ฃ: ์ฑ๊ณต %d, ์คํจ %d (์คํจ์จ %.1f%%)",
|
| 203 |
+
success_count, fail_count, fail_rate * 100,
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
if fail_rate > 0.5:
|
| 207 |
+
logger.warning(
|
| 208 |
+
"๋ฆฌ๋ทฐ ํ์ฑ ์คํจ์จ 50%% ์ด๊ณผ โ ์ด์ ๋ฐ์ดํฐ ์ ์ง ๊ถ์ฅ"
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
return results
|
| 212 |
+
|
| 213 |
+
# ------------------------------------------------------------------
|
| 214 |
+
# DB ์ ์ฅ
|
| 215 |
+
# ------------------------------------------------------------------
|
| 216 |
+
|
| 217 |
+
def _save_to_db(self, spots: dict[str, dict], reviews: dict[str, dict]) -> int:
|
| 218 |
+
"""์์ง๋ ์ฅ์+๋ฆฌ๋ทฐ๋ฅผ trend_spots / spot_trends ํ
์ด๋ธ์ ์ ์ฅํ๋ค."""
|
| 219 |
+
today = date.today()
|
| 220 |
+
saved = 0
|
| 221 |
+
|
| 222 |
+
for place_id, spot in spots.items():
|
| 223 |
+
spot_id = f"trend_{spot['name']}_{place_id[:8]}"
|
| 224 |
+
|
| 225 |
+
# trend_spots upsert
|
| 226 |
+
try:
|
| 227 |
+
self.supabase.table("trend_spots").upsert(
|
| 228 |
+
{
|
| 229 |
+
"id": spot_id,
|
| 230 |
+
"name": spot["name"],
|
| 231 |
+
"category": spot["category"],
|
| 232 |
+
"lat": spot["lat"],
|
| 233 |
+
"lng": spot["lng"],
|
| 234 |
+
"address": spot["address"],
|
| 235 |
+
"source_ids": {"kakaomap": place_id},
|
| 236 |
+
},
|
| 237 |
+
on_conflict="id",
|
| 238 |
+
).execute()
|
| 239 |
+
except Exception as e:
|
| 240 |
+
logger.warning("trend_spots upsert ์คํจ (%s): %s", spot_id, e)
|
| 241 |
+
continue
|
| 242 |
+
|
| 243 |
+
# spot_trends โ search_rank ๋ฉํธ๋ฆญ
|
| 244 |
+
try:
|
| 245 |
+
self.supabase.table("spot_trends").insert({
|
| 246 |
+
"spot_id": spot_id,
|
| 247 |
+
"source": "kakaomap",
|
| 248 |
+
"metric_type": "search_rank",
|
| 249 |
+
"metric_value": spot["search_rank"],
|
| 250 |
+
"period_start": today.isoformat(),
|
| 251 |
+
"period_end": today.isoformat(),
|
| 252 |
+
"raw_data": {"kakao_id": place_id, "address": spot["address"]},
|
| 253 |
+
}).execute()
|
| 254 |
+
except Exception as e:
|
| 255 |
+
logger.warning("spot_trends insert ์คํจ (search_rank, %s): %s", spot_id, e)
|
| 256 |
+
|
| 257 |
+
# spot_trends โ review_count ๋ฉํธ๋ฆญ (ํ์ฑ ์ฑ๊ณต ์)
|
| 258 |
+
review = reviews.get(place_id, {})
|
| 259 |
+
if review.get("success"):
|
| 260 |
+
try:
|
| 261 |
+
self.supabase.table("spot_trends").insert({
|
| 262 |
+
"spot_id": spot_id,
|
| 263 |
+
"source": "kakaomap",
|
| 264 |
+
"metric_type": "review_count",
|
| 265 |
+
"metric_value": review["review_count"],
|
| 266 |
+
"period_start": today.isoformat(),
|
| 267 |
+
"period_end": today.isoformat(),
|
| 268 |
+
"raw_data": {
|
| 269 |
+
"avg_rating": review["avg_rating"],
|
| 270 |
+
"blog_review_count": review["blog_review_count"],
|
| 271 |
+
},
|
| 272 |
+
}).execute()
|
| 273 |
+
except Exception as e:
|
| 274 |
+
logger.warning("spot_trends insert ์คํจ (review_count, %s): %s", spot_id, e)
|
| 275 |
+
|
| 276 |
+
saved += 1
|
| 277 |
+
|
| 278 |
+
logger.info("DB ์ ์ฅ ์๋ฃ: %d๊ฑด", saved)
|
| 279 |
+
return saved
|
| 280 |
+
|
| 281 |
+
# ------------------------------------------------------------------
|
| 282 |
+
# ๋ฉ์ธ ์คํ
|
| 283 |
+
# ------------------------------------------------------------------
|
| 284 |
+
|
| 285 |
+
async def run(self) -> dict:
|
| 286 |
+
"""
|
| 287 |
+
์นด์นด์ค๋งต ์์ง ํ์ดํ๋ผ์ธ ์ ์ฒด ์คํ.
|
| 288 |
+
|
| 289 |
+
Returns:
|
| 290 |
+
{"spots_count": int, "reviews_parsed": int, "saved": int}
|
| 291 |
+
"""
|
| 292 |
+
logger.info("=== ์นด์นด์ค๋งต ์์ง ์์ ===")
|
| 293 |
+
|
| 294 |
+
# 1๋จ๊ณ: ๊ทธ๋ฆฌ๋ ์ค์บ์ผ๋ก ์ฅ์ ์์ง
|
| 295 |
+
spots = self.collect_spots()
|
| 296 |
+
|
| 297 |
+
# 2๋จ๊ณ: ์์ ์ฅ์ ๋ฆฌ๋ทฐ ์ ํ์ฑ
|
| 298 |
+
reviews = await self.fetch_review_counts(spots)
|
| 299 |
+
|
| 300 |
+
# 3๋จ๊ณ: DB ์ ์ฅ
|
| 301 |
+
saved = self._save_to_db(spots, reviews)
|
| 302 |
+
|
| 303 |
+
result = {
|
| 304 |
+
"spots_count": len(spots),
|
| 305 |
+
"reviews_parsed": len(reviews),
|
| 306 |
+
"saved": saved,
|
| 307 |
+
}
|
| 308 |
+
logger.info("=== ์นด์นด์ค๋งต ์์ง ์๋ฃ: %s ===", result)
|
| 309 |
+
return result
|
trend_engine/collectors/naver_blog.py
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
๋ค์ด๋ฒ ๋ธ๋ก๊ทธ 2๋จ๊ณ ์์ง๊ธฐ (Naver Blog 2-Stage Collector)
|
| 3 |
+
|
| 4 |
+
1๋จ๊ณ: ๋ค์ด๋ฒ ๊ฒ์ API๋ก ๋ธ๋ก๊ทธ URL ๋ชฉ๋ก ํ๋ณด
|
| 5 |
+
2๋จ๊ณ: ๋ชจ๋ฐ์ผ URL ํฌ๋กค๋ง์ผ๋ก ๋ณธ๋ฌธ ์ ์ฒด ์์ง
|
| 6 |
+
|
| 7 |
+
์์ง๋ ๋ณธ๋ฌธ์ spot_trends ํ
์ด๋ธ์ ์ ์ฅ๋๋ฉฐ,
|
| 8 |
+
์ฅ์๋ช
์ถ์ถ์ ๋ณ๋ PlaceNameExtractor๊ฐ ๋ด๋นํ๋ค.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
import time
|
| 14 |
+
import threading
|
| 15 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 16 |
+
from datetime import date, datetime, timedelta
|
| 17 |
+
|
| 18 |
+
import requests
|
| 19 |
+
from bs4 import BeautifulSoup
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
# Search keyword list (focused on the ์ ์ / ์ ์๋ฆฌ area).
# NOTE(review): several keyword strings below appear to have lost bytes in
# an encoding round-trip — verify each literal against the original file.
NAVER_BLOG_KEYWORDS = [
    "์ ์ ์นดํ", "์ ์ ๋ง์ง", "์ ์ ๊ฐ๋ณผ๋งํ๊ณณ", "์ ์ ์ฐ์ฑ",
    "์ ์ ํด์", "์ ์ ๋ทฐ", "์ ์ ๋์ ํธ", "์ ์ ๋ธ๋ฐ์น",
    "์ ์ ๊ฐ์ฑ", "์ ์ ์จ์ ๋ช์", "์ ์ ์ค์๋ทฐ",
    "์ ์๋ฆฌ ์นดํ", "์ ์๋ฆฌ ๋ง์ง",
    "ํ๋ด ํด์ ์ฐ์ฑ๋ก", "๊ณฝ์ง ํด๋ณ",
]

# Mobile User-Agent used when crawling m.blog.naver.com — the mobile pages
# expose the post body directly instead of the PC iframe layout.
MOBILE_USER_AGENT = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/16.0 Mobile/15E148 Safari/604.1"
)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class NaverBlogCollector:
|
| 41 |
+
"""๋ค์ด๋ฒ ๋ธ๋ก๊ทธ 2๋จ๊ณ ์์ง๊ธฐ."""
|
| 42 |
+
|
| 43 |
+
def __init__(self, supabase_client):
|
| 44 |
+
self.supabase = supabase_client
|
| 45 |
+
self.client_id = os.environ["NAVER_CLIENT_ID"]
|
| 46 |
+
self.client_secret = os.environ["NAVER_CLIENT_SECRET"]
|
| 47 |
+
self.keywords = NAVER_BLOG_KEYWORDS
|
| 48 |
+
self._stats = {
|
| 49 |
+
"api_calls": 0,
|
| 50 |
+
"urls_found": 0,
|
| 51 |
+
"urls_unique": 0,
|
| 52 |
+
"crawl_success": 0,
|
| 53 |
+
"crawl_fail": 0,
|
| 54 |
+
"saved": 0,
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# โโ 1๋จ๊ณ: ๋ค์ด๋ฒ ๊ฒ์ API๋ก URL ๋ชฉ๋ก ํ๋ณด โโโโโโโโโโโโโโโโโโโโ
|
| 58 |
+
|
| 59 |
+
def collect_blog_urls(self) -> list[dict]:
|
| 60 |
+
"""๋ค์ด๋ฒ ๊ฒ์ API๋ก ๋ธ๋ก๊ทธ ํฌ์คํธ URL ๋ชฉ๋ก ํ๋ณด.
|
| 61 |
+
|
| 62 |
+
ํค์๋๋น ์ต๋ 1,100๊ฑด(start 1~1000, display 100) ์กฐํ ๊ฐ๋ฅํ๋,
|
| 63 |
+
์ต๊ทผ 1์ฃผ ํฌ์คํ
๊ธฐ์ค์ผ๋ก ์ค์ 100๊ฑด ๋ฏธ๋ง์ด ๋๋ถ๋ถ์ด๋ค.
|
| 64 |
+
|
| 65 |
+
Returns:
|
| 66 |
+
URL ๊ธฐ์ค ์ค๋ณต ์ ๊ฑฐ๋ ํฌ์คํธ ๋ชฉ๋ก
|
| 67 |
+
"""
|
| 68 |
+
all_posts: list[dict] = []
|
| 69 |
+
headers = {
|
| 70 |
+
"X-Naver-Client-Id": self.client_id,
|
| 71 |
+
"X-Naver-Client-Secret": self.client_secret,
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
for keyword in self.keywords:
|
| 75 |
+
keyword_count = 0
|
| 76 |
+
for start in range(1, 1001, 100):
|
| 77 |
+
params = {
|
| 78 |
+
"query": keyword,
|
| 79 |
+
"display": 100,
|
| 80 |
+
"start": start,
|
| 81 |
+
"sort": "date",
|
| 82 |
+
}
|
| 83 |
+
try:
|
| 84 |
+
resp = requests.get(
|
| 85 |
+
"https://openapi.naver.com/v1/search/blog",
|
| 86 |
+
headers=headers,
|
| 87 |
+
params=params,
|
| 88 |
+
timeout=10,
|
| 89 |
+
)
|
| 90 |
+
resp.raise_for_status()
|
| 91 |
+
data = resp.json()
|
| 92 |
+
self._stats["api_calls"] += 1
|
| 93 |
+
except requests.RequestException as e:
|
| 94 |
+
logger.warning("๊ฒ์ API ์คํจ [%s start=%d]: %s", keyword, start, e)
|
| 95 |
+
break
|
| 96 |
+
|
| 97 |
+
items = data.get("items", [])
|
| 98 |
+
for item in items:
|
| 99 |
+
all_posts.append({
|
| 100 |
+
"title": item["title"],
|
| 101 |
+
"link": item["link"],
|
| 102 |
+
"bloggername": item["bloggername"],
|
| 103 |
+
"postdate": item["postdate"], # YYYYMMDD
|
| 104 |
+
"keyword": keyword,
|
| 105 |
+
})
|
| 106 |
+
keyword_count += 1
|
| 107 |
+
|
| 108 |
+
# ๊ฒฐ๊ณผ๊ฐ 100๊ฑด ๋ฏธ๋ง์ด๋ฉด ๋ ์ด์ ๋ฐ์ดํฐ ์์
|
| 109 |
+
if len(items) < 100:
|
| 110 |
+
break
|
| 111 |
+
|
| 112 |
+
time.sleep(0.1) # API ๋ถํ ๋ฐฉ์ง
|
| 113 |
+
|
| 114 |
+
logger.info("ํค์๋ [%s]: %d๊ฑด ์์ง", keyword, keyword_count)
|
| 115 |
+
|
| 116 |
+
self._stats["urls_found"] = len(all_posts)
|
| 117 |
+
|
| 118 |
+
# URL ๊ธฐ์ค ์ค๋ณต ์ ๊ฑฐ
|
| 119 |
+
seen_urls: set[str] = set()
|
| 120 |
+
unique_posts: list[dict] = []
|
| 121 |
+
for post in all_posts:
|
| 122 |
+
if post["link"] not in seen_urls:
|
| 123 |
+
seen_urls.add(post["link"])
|
| 124 |
+
unique_posts.append(post)
|
| 125 |
+
|
| 126 |
+
self._stats["urls_unique"] = len(unique_posts)
|
| 127 |
+
logger.info(
|
| 128 |
+
"1๋จ๊ณ ์๋ฃ: ์ด %d๊ฑด โ ์ค๋ณต ์ ๊ฑฐ ํ %d๊ฑด (API ํธ์ถ %dํ)",
|
| 129 |
+
self._stats["urls_found"],
|
| 130 |
+
self._stats["urls_unique"],
|
| 131 |
+
self._stats["api_calls"],
|
| 132 |
+
)
|
| 133 |
+
return unique_posts
|
| 134 |
+
|
| 135 |
+
# โโ 2๋จ๊ณ: ๋ชจ๋ฐ์ผ ํฌ๋กค๋ง์ผ๋ก ๋ณธ๋ฌธ ์ ์ฒด ์์ง โโโโโโโโโโโโโโโโโโ
|
| 136 |
+
|
| 137 |
+
def _convert_to_mobile_url(self, url: str) -> str:
|
| 138 |
+
"""PC ๋ธ๋ก๊ทธ URL์ ๏ฟฝ๏ฟฝ๋ฐ์ผ URL๋ก ๋ณํ."""
|
| 139 |
+
url = url.replace("https://blog.naver.com", "https://m.blog.naver.com")
|
| 140 |
+
url = url.replace("http://blog.naver.com", "https://m.blog.naver.com")
|
| 141 |
+
return url
|
| 142 |
+
|
| 143 |
+
def _crawl_single_blog(self, url: str) -> str | None:
|
| 144 |
+
"""๋จ์ผ ๋ธ๋ก๊ทธ ๋ชจ๋ฐ์ผ ํ์ด์ง์์ ๋ณธ๋ฌธ ํ
์คํธ ์ถ์ถ."""
|
| 145 |
+
m_url = self._convert_to_mobile_url(url)
|
| 146 |
+
headers = {"User-Agent": MOBILE_USER_AGENT}
|
| 147 |
+
|
| 148 |
+
try:
|
| 149 |
+
resp = requests.get(m_url, headers=headers, timeout=10)
|
| 150 |
+
resp.raise_for_status()
|
| 151 |
+
except requests.RequestException as e:
|
| 152 |
+
logger.debug("ํฌ๋กค๋ง HTTP ์คํจ: %s โ %s", m_url, e)
|
| 153 |
+
return None
|
| 154 |
+
|
| 155 |
+
soup = BeautifulSoup(resp.text, "lxml")
|
| 156 |
+
|
| 157 |
+
# ์
๋ ํฐ ์ฐ์ ์์: ์ค๋งํธ์๋ํฐ3 > ๊ตฌํ ์๋ํฐ > ๊ธฐํ
|
| 158 |
+
content_div = (
|
| 159 |
+
soup.find("div", {"class": "se-main-container"})
|
| 160 |
+
or soup.find("div", {"id": "postViewArea"})
|
| 161 |
+
or soup.find("div", {"class": "post_ct"})
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
if content_div:
|
| 165 |
+
return content_div.get_text(separator=" ", strip=True)
|
| 166 |
+
|
| 167 |
+
return None
|
| 168 |
+
|
| 169 |
+
def crawl_blog_contents(
|
| 170 |
+
self, posts: list[dict], max_workers: int = 5
|
| 171 |
+
) -> list[dict]:
|
| 172 |
+
"""๋ธ๋ก๊ทธ URL ๋ชฉ๋ก์ ๋ฐ์ ๋ณธ๋ฌธ ๋ณ๋ ฌ ํฌ๋กค๋ง.
|
| 173 |
+
|
| 174 |
+
ThreadPoolExecutor๋ก max_workers๊ฐ ๋์ ์์ฒญ.
|
| 175 |
+
๋ค์ด๋ฒ ์๋ฒ ๋ถํ ๋ฐฉ์ง๋ฅผ ์ํด ์์ปค ๊ฐ 0.2์ด ๊ฐ๊ฒฉ ์ ์ง.
|
| 176 |
+
|
| 177 |
+
Args:
|
| 178 |
+
posts: collect_blog_urls()์ ๋ฐํ๊ฐ
|
| 179 |
+
max_workers: ๋์ ํฌ๋กค๋ง ์์ปค ์ (๊ธฐ๋ณธ 5)
|
| 180 |
+
|
| 181 |
+
Returns:
|
| 182 |
+
full_content ํ๋๊ฐ ์ถ๊ฐ๋ ํฌ์คํธ ๋ชฉ๋ก (ํฌ๋กค๋ง ์ฑ๊ณต๋ถ๋ง)
|
| 183 |
+
"""
|
| 184 |
+
results: list[dict] = []
|
| 185 |
+
total = len(posts)
|
| 186 |
+
lock = threading.Lock()
|
| 187 |
+
|
| 188 |
+
def crawl_one(post: dict) -> tuple[dict, str | None]:
|
| 189 |
+
content = self._crawl_single_blog(post["link"])
|
| 190 |
+
return post, content
|
| 191 |
+
|
| 192 |
+
logger.info(
|
| 193 |
+
"๋ณ๋ ฌ ํฌ๋กค๋ง ์์: %d๊ฑด (์์ปค %d๊ฐ)", total, max_workers
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 197 |
+
# ์์ปค ๊ฐ 0.2์ด ๊ฐ๊ฒฉ์ผ๋ก submit (์ด๊ธฐ burst ๋ฐฉ์ง)
|
| 198 |
+
futures = []
|
| 199 |
+
for i, post in enumerate(posts):
|
| 200 |
+
futures.append(executor.submit(crawl_one, post))
|
| 201 |
+
if (i + 1) % max_workers == 0:
|
| 202 |
+
time.sleep(0.2)
|
| 203 |
+
|
| 204 |
+
done_count = 0
|
| 205 |
+
for future in as_completed(futures):
|
| 206 |
+
try:
|
| 207 |
+
post, content = future.result()
|
| 208 |
+
except Exception as e:
|
| 209 |
+
logger.debug("ํฌ๋กค๋ง ์์ธ: %s", e)
|
| 210 |
+
with lock:
|
| 211 |
+
self._stats["crawl_fail"] += 1
|
| 212 |
+
done_count += 1
|
| 213 |
+
continue
|
| 214 |
+
|
| 215 |
+
with lock:
|
| 216 |
+
if content:
|
| 217 |
+
post["full_content"] = content
|
| 218 |
+
results.append(post)
|
| 219 |
+
self._stats["crawl_success"] += 1
|
| 220 |
+
else:
|
| 221 |
+
self._stats["crawl_fail"] += 1
|
| 222 |
+
|
| 223 |
+
done_count += 1
|
| 224 |
+
if done_count % 500 == 0 or done_count == total:
|
| 225 |
+
logger.info(
|
| 226 |
+
"ํฌ๋กค๋ง ์งํ: %d/%d (์ฑ๊ณต: %d, ์คํจ: %d)",
|
| 227 |
+
done_count, total,
|
| 228 |
+
self._stats["crawl_success"],
|
| 229 |
+
self._stats["crawl_fail"],
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
logger.info(
|
| 233 |
+
"2๋จ๊ณ ์๋ฃ: %d๊ฑด ํฌ๋กค๋ง โ ์ฑ๊ณต %d๊ฑด, ์คํจ %d๊ฑด",
|
| 234 |
+
total,
|
| 235 |
+
self._stats["crawl_success"],
|
| 236 |
+
self._stats["crawl_fail"],
|
| 237 |
+
)
|
| 238 |
+
return results
|
| 239 |
+
|
| 240 |
+
# โโ DB ์ ์ฅ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 241 |
+
|
| 242 |
+
def save_to_db(self, posts: list[dict]) -> int:
|
| 243 |
+
"""ํฌ๋กค๋ง ๊ฒฐ๊ณผ๋ฅผ spot_trends ํ
์ด๋ธ์ ์ ์ฅ.
|
| 244 |
+
|
| 245 |
+
source='naver_blog', metric_type='blog_post'๋ก ์๋ณธ ๋ฐ์ดํฐ ๋ณด๊ด.
|
| 246 |
+
์ฅ์๋ช
์ถ์ถ ๋ฐ mention_count ์ง๊ณ๋ ๋ณ๋ ํ์ดํ๋ผ์ธ์์ ์ฒ๋ฆฌํ๋ค.
|
| 247 |
+
|
| 248 |
+
Args:
|
| 249 |
+
posts: crawl_blog_contents()์ ๋ฐํ๊ฐ (full_content ํฌํจ)
|
| 250 |
+
|
| 251 |
+
Returns:
|
| 252 |
+
์ ์ฅ๋ ๋ ์ฝ๋ ์
|
| 253 |
+
"""
|
| 254 |
+
today = date.today()
|
| 255 |
+
period_start = today - timedelta(days=7)
|
| 256 |
+
saved_count = 0
|
| 257 |
+
|
| 258 |
+
# ๋ฐฐ์น insert๋ฅผ ์ํ rows ์์ง
|
| 259 |
+
rows: list[dict] = []
|
| 260 |
+
for post in posts:
|
| 261 |
+
rows.append({
|
| 262 |
+
"spot_id": "__pending__", # ์ฅ์๋ช
์ถ์ถ ์ ์ด๋ฏ๋ก placeholder
|
| 263 |
+
"source": "naver_blog",
|
| 264 |
+
"metric_type": "blog_post",
|
| 265 |
+
"metric_value": 1,
|
| 266 |
+
"period_start": period_start.isoformat(),
|
| 267 |
+
"period_end": today.isoformat(),
|
| 268 |
+
"raw_data": {
|
| 269 |
+
"title": post["title"],
|
| 270 |
+
"link": post["link"],
|
| 271 |
+
"bloggername": post["bloggername"],
|
| 272 |
+
"postdate": post["postdate"],
|
| 273 |
+
"keyword": post["keyword"],
|
| 274 |
+
"content_length": len(post.get("full_content", "")),
|
| 275 |
+
"content_preview": post.get("full_content", "")[:1500],
|
| 276 |
+
},
|
| 277 |
+
})
|
| 278 |
+
|
| 279 |
+
# Supabase bulk insert (1000๊ฑด์ฉ ๋ฐฐ์น)
|
| 280 |
+
batch_size = 1000
|
| 281 |
+
for i in range(0, len(rows), batch_size):
|
| 282 |
+
batch = rows[i:i + batch_size]
|
| 283 |
+
try:
|
| 284 |
+
result = self.supabase.table("spot_trends").insert(batch).execute()
|
| 285 |
+
saved_count += len(result.data) if result.data else 0
|
| 286 |
+
except Exception as e:
|
| 287 |
+
logger.error("spot_trends ์ ์ฅ ์คํจ (batch %d): %s", i // batch_size, e)
|
| 288 |
+
|
| 289 |
+
self._stats["saved"] = saved_count
|
| 290 |
+
logger.info("DB ์ ์ฅ ์๋ฃ: %d๊ฑด", saved_count)
|
| 291 |
+
return saved_count
|
| 292 |
+
|
| 293 |
+
# โโ ์ ์ฒด ํ์ดํ๋ผ์ธ ์คํ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 294 |
+
|
| 295 |
+
def run(self) -> dict:
|
| 296 |
+
"""์ ์ฒด ์์ง ํ์ดํ๋ผ์ธ ์คํ.
|
| 297 |
+
|
| 298 |
+
1๋จ๊ณ: ๊ฒ์ API๋ก URL ํ๋ณด
|
| 299 |
+
2๋จ๊ณ: ๋ชจ๋ฐ์ผ ํฌ๋กค๋ง์ผ๋ก ๋ณธ๋ฌธ ์์ง
|
| 300 |
+
3๋จ๊ณ: DB ์ ์ฅ (์๋ณธ ๋ณด๊ด)
|
| 301 |
+
|
| 302 |
+
Returns:
|
| 303 |
+
์์ง ํต๊ณ dict
|
| 304 |
+
"""
|
| 305 |
+
logger.info("=== ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์์ง ์์ ===")
|
| 306 |
+
start_time = datetime.now()
|
| 307 |
+
|
| 308 |
+
# 1๋จ๊ณ
|
| 309 |
+
posts = self.collect_blog_urls()
|
| 310 |
+
if not posts:
|
| 311 |
+
logger.warning("๊ฒ์ ๊ฒฐ๊ณผ ์์ โ ์์ง ์ข
๋ฃ")
|
| 312 |
+
return {**self._stats, "duration_seconds": 0}
|
| 313 |
+
|
| 314 |
+
# 2๋จ๊ณ
|
| 315 |
+
crawled = self.crawl_blog_contents(posts)
|
| 316 |
+
if not crawled:
|
| 317 |
+
logger.warning("ํฌ๋กค๋ง ๊ฒฐ๊ณผ ์์ โ ์์ง ์ข
๋ฃ")
|
| 318 |
+
return {**self._stats, "duration_seconds": 0}
|
| 319 |
+
|
| 320 |
+
# 3๋จ๊ณ
|
| 321 |
+
self.save_to_db(crawled)
|
| 322 |
+
|
| 323 |
+
duration = (datetime.now() - start_time).total_seconds()
|
| 324 |
+
self._stats["duration_seconds"] = duration
|
| 325 |
+
logger.info(
|
| 326 |
+
"=== ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์์ง ์๋ฃ (%.1f์ด) === %s",
|
| 327 |
+
duration,
|
| 328 |
+
self._stats,
|
| 329 |
+
)
|
| 330 |
+
return self._stats
|
trend_engine/collectors/naver_place.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
๋ค์ด๋ฒ ํ๋ ์ด์ค ๋ณด์กฐ ์์ง๊ธฐ (Naver Place Auxiliary Collector)
|
| 3 |
+
|
| 4 |
+
๋ณด์กฐ ์ฑ๋ (๊ฐ์ค์น 5%): ์นด์นด์ค๋งต ์ค์บ ๊ฒฐ๊ณผ ๊ธฐ๋ฐ์ผ๋ก ๋ค์ด๋ฒ ํ๋ ์ด์ค ๋ฆฌ๋ทฐ ์๋ง ์ถ๊ฐ ์์ง.
|
| 5 |
+
๋น๊ณต์ API ์์กด โ Graceful Degradation ์ค๊ณ.
|
| 6 |
+
|
| 7 |
+
์๋ ค์ง ์ ์ฝ (2026-02):
|
| 8 |
+
- ๋ค์ด๋ฒ ์ง์ญ ๊ฒ์ API link ํ๋: ์ธ๋ถ URL๋ง ๋ฐํ (Place ID ๋ฏธํฌํจ)
|
| 9 |
+
- ๋ค์ด๋ฒ ๋งต ๋ด๋ถ ๊ฒ์ API: ๋ด ํธ๋ํฝ CAPTCHA ์ฐจ๋จ
|
| 10 |
+
- ๊ฒฐ๊ณผ: Place ID ๋งค์นญ ๋ถ๊ฐ โ ๊ฐ์ค์น ์ฌ๋ถ๋ฐฐ (trend_scorer์์ None ์ฒ๋ฆฌ)
|
| 11 |
+
|
| 12 |
+
- ๋ค์ด๋ฒ ์ง์ญ ๊ฒ์ API: ์ฅ์๋ช
โํ๋ ์ด์ค ID ๋งค์นญ (display ์ต๋ 5๊ฑด)
|
| 13 |
+
- ๋น๊ณต์ API: https://map.naver.com/p/api/place/summary/{place_id} โ ๋ฆฌ๋ทฐ ์
|
| 14 |
+
- ์คํจ์จ 50% ์ด์ ์ ํด๋น ์ฃผ๊ธฐ ๊ฑด๋๋ฐ๊ธฐ
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import logging
|
| 18 |
+
import os
|
| 19 |
+
import re
|
| 20 |
+
from datetime import date, datetime, timedelta
|
| 21 |
+
|
| 22 |
+
import httpx
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _is_similar_address(naver_addr: str, kakao_addr: str) -> bool:
|
| 28 |
+
"""๋ค์ด๋ฒ/์นด์นด์ค ์ฃผ์ ์ ์ฌ๋ ๋น๊ต.
|
| 29 |
+
|
| 30 |
+
์ ํํ ์ฃผ์ ์ผ์น๊ฐ ์๋๋ผ ํต์ฌ ๊ตฌ์ฑ ์์(์/๋ฉด/๋, ๋ฒ์ง/๋๋ก๋ช
)๊ฐ
|
| 31 |
+
๊ฒน์น๋์ง ํ์ธํ๋ค.
|
| 32 |
+
"""
|
| 33 |
+
if not naver_addr or not kakao_addr:
|
| 34 |
+
return False
|
| 35 |
+
|
| 36 |
+
# HTML ํ๊ทธ ์ ๊ฑฐ (๋ค์ด๋ฒ ๊ฒ์ ๊ฒฐ๊ณผ์ <b> ํ๊ทธ๊ฐ ํฌํจ๋ ์ ์์)
|
| 37 |
+
naver_clean = re.sub(r"<[^>]+>", "", naver_addr).strip()
|
| 38 |
+
kakao_clean = kakao_addr.strip()
|
| 39 |
+
|
| 40 |
+
# ๊ณต๋ฐฑ/ํน์๋ฌธ์ ์ ๊ทํ
|
| 41 |
+
naver_tokens = set(re.findall(r"[\w๊ฐ-ํฃ]+", naver_clean))
|
| 42 |
+
kakao_tokens = set(re.findall(r"[\w๊ฐ-ํฃ]+", kakao_clean))
|
| 43 |
+
|
| 44 |
+
# ํต์ฌ ํ ํฐ(์๋ฉด๋ ์ดํ) ๊ฒน์นจ ๋น์จ ํ์ธ
|
| 45 |
+
overlap = naver_tokens & kakao_tokens
|
| 46 |
+
if not kakao_tokens:
|
| 47 |
+
return False
|
| 48 |
+
|
| 49 |
+
return len(overlap) / len(kakao_tokens) >= 0.4
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class NaverPlaceCollector:
    """Naver Place auxiliary collector (low-weight channel).

    Piggybacks on the Kakao Map scan results: matches each Kakao place to
    a Naver place id, then fetches review counts from an unofficial
    endpoint. Designed for graceful degradation — per the module docstring,
    id matching may be impossible when the search API returns no place id.
    """

    def __init__(self, supabase_client):
        self.supabase = supabase_client
        # Fail fast (KeyError) when the Naver OpenAPI credentials are missing.
        self.client_id = os.environ["NAVER_CLIENT_ID"]
        self.client_secret = os.environ["NAVER_CLIENT_SECRET"]
        # Per-run statistics; mutated during run() and returned from it.
        self._stats = {
            "total_spots": 0,
            "matched": 0,
            "match_failed": 0,
            "review_success": 0,
            "review_failed": 0,
            "saved": 0,
            "skipped_high_failure": False,
        }

    # -- Naver place-id matching --------------------------------------------

    async def match_naver_place_id(self, name: str, address: str) -> str | None:
        """Match a Kakao place name/address to a Naver place id.

        Queries the Naver local search API and picks the first result whose
        address passes _is_similar_address(). The API returns at most 5
        results (display=5), so precise matching matters.

        Args:
            name: place name from Kakao Map
            address: address from Kakao Map

        Returns:
            Naver place id string, or None when no match is found.
        """
        headers = {
            "X-Naver-Client-Id": self.client_id,
            "X-Naver-Client-Secret": self.client_secret,
        }
        params = {
            "query": f"์ ์ {name}",
            "display": 5,
            "start": 1,
            "sort": "comment",
        }

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(
                    "https://openapi.naver.com/v1/search/local",
                    headers=headers,
                    params=params,
                    timeout=10,
                )
                resp.raise_for_status()
                data = resp.json()
        except httpx.HTTPError as e:
            logger.debug("์ง์ญ ๊ฒ์ API ์คํจ [%s]: %s", name, e)
            return None

        for item in data.get("items", []):
            item_addr = item.get("address", "")
            if _is_similar_address(item_addr, address):
                # Extract the place id from the result link.
                # NOTE(review): the module docstring says the link field may
                # contain only an external URL with no place id — in that
                # case this regex never matches and we fall through to None.
                link = item.get("link", "")
                match = re.search(r"/(\d{5,})/?", link)
                if match:
                    return match.group(1)

        return None

    # -- review counts via the unofficial API ---------------------------------

    async def fetch_review_count(self, place_id: str) -> dict:
        """Fetch review counts from Naver's internal place-summary endpoint.

        Unofficial API — may change or be blocked at any time. On failure
        only this place is marked unsuccessful; other channels are
        unaffected.

        Args:
            place_id: Naver place id

        Returns:
            Review-count dict; counts are 0 and success=False on failure.
        """
        api_url = f"https://map.naver.com/p/api/place/summary/{place_id}"
        # Browser-like headers; the internal endpoint may reject anonymous UAs.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            "Referer": "https://map.naver.com/",
        }

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(api_url, headers=headers, timeout=10)
                resp.raise_for_status()
                data = resp.json()

                return {
                    "visitor_review_count": data.get("visitorReviewCount", 0),
                    "blog_review_count": data.get("blogReviewCount", 0),
                    "avg_rating": data.get("visitorReviewScore", 0),
                    "success": True,
                }
        except Exception as e:
            # Broad catch is deliberate: any failure (HTTP, JSON, schema)
            # degrades to a zero-count record for this place only.
            logger.debug("ํ๋ ์ด์ค ๋ฆฌ๋ทฐ ์กฐํ ์คํจ [%s]: %s", place_id, e)
            return {
                "visitor_review_count": 0,
                "blog_review_count": 0,
                "avg_rating": 0,
                "success": False,
                "error": str(e),
            }

    # -- DB persistence --------------------------------------------------------

    def _save_results(self, results: list[dict]) -> int:
        """Persist successful review lookups into the spot_trends table.

        Rows carry source='naver_place', metric_type='review_count'.
        Spots without successful review data are skipped.
        """
        today = date.today()
        period_start = today - timedelta(days=7)
        saved_count = 0

        rows: list[dict] = []
        for r in results:
            # Only persist spots whose review lookup actually succeeded.
            if not r.get("review_data") or not r["review_data"].get("success"):
                continue
            rd = r["review_data"]
            rows.append({
                "spot_id": r.get("spot_id", r.get("kakao_id", "__unknown__")),
                "source": "naver_place",
                "metric_type": "review_count",
                "metric_value": rd["visitor_review_count"],
                "period_start": period_start.isoformat(),
                "period_end": today.isoformat(),
                "raw_data": {
                    "name": r.get("name"),
                    "naver_place_id": r.get("naver_place_id"),
                    "visitor_review_count": rd["visitor_review_count"],
                    "blog_review_count": rd["blog_review_count"],
                    "avg_rating": rd["avg_rating"],
                },
            })

        if not rows:
            return 0

        # Batched insert, 500 rows at a time.
        batch_size = 500
        for i in range(0, len(rows), batch_size):
            batch = rows[i:i + batch_size]
            try:
                result = self.supabase.table("spot_trends").insert(batch).execute()
                saved_count += len(result.data) if result.data else 0
            except Exception as e:
                logger.error("spot_trends ์ ์ฅ ์คํจ (naver_place batch %d): %s", i // batch_size, e)

        self._stats["saved"] = saved_count
        return saved_count

    # -- Full pipeline ----------------------------------------------------------

    async def run(self, kakaomap_spots: list[dict]) -> dict:
        """Run the Naver Place auxiliary collection.

        Based on the Kakao Map scan results (top 100 at most):
        1. match place name -> Naver place id
        2. fetch review counts via the unofficial API
        3. skip the whole cycle when the failure rate reaches 50%

        Note: mutates the dicts in *kakaomap_spots* in place, adding
        "naver_place_id" and "review_data" keys.

        Args:
            kakaomap_spots: spot list handed over from the Kakao collector
                [{"kakao_id": "...", "name": "...", "address": "...", ...}, ...]

        Returns:
            Collection statistics dict.
        """
        logger.info("=== ๋ค์ด๋ฒ ํ๋ ์ด์ค ๋ณด์กฐ ์์ง ์์ ===")
        start_time = datetime.now()

        spots = kakaomap_spots[:100]  # cap at the top 100 spots
        self._stats["total_spots"] = len(spots)

        results: list[dict] = []

        # Stage 1: resolve Naver place ids.
        for spot in spots:
            name = spot.get("name", "")
            address = spot.get("address", "")

            place_id = await self.match_naver_place_id(name, address)
            if place_id:
                self._stats["matched"] += 1
                spot["naver_place_id"] = place_id
            else:
                self._stats["match_failed"] += 1
                spot["naver_place_id"] = None

        # Only query reviews for matched spots.
        matched_spots = [s for s in spots if s.get("naver_place_id")]
        logger.info(
            "ID ๋งค์นญ: %d/%d ์ฑ๊ณต",
            self._stats["matched"],
            self._stats["total_spots"],
        )

        if not matched_spots:
            logger.warning("๋งค์นญ๋ ์ฅ์ ์์ โ ์์ง ์ข๋ฃ")
            duration = (datetime.now() - start_time).total_seconds()
            self._stats["duration_seconds"] = duration
            return self._stats

        # Stage 2: fetch review counts, monitoring the failure rate.
        for spot in matched_spots:
            review_data = await self.fetch_review_count(spot["naver_place_id"])
            spot["review_data"] = review_data

            if review_data["success"]:
                self._stats["review_success"] += 1
            else:
                self._stats["review_failed"] += 1

            results.append(spot)

            # Abort early once >=50% of at least 10 attempts have failed
            # (likely the unofficial API changed or blocked us).
            total_attempts = self._stats["review_success"] + self._stats["review_failed"]
            if total_attempts >= 10:
                failure_rate = self._stats["review_failed"] / total_attempts
                if failure_rate >= 0.5:
                    logger.warning(
                        "๋ฆฌ๋ทฐ API ์คํจ์จ %.0f%% โ ํด๋น ์ฃผ๊ธฐ ๊ฑด๋๋ฐ๊ธฐ",
                        failure_rate * 100,
                    )
                    self._stats["skipped_high_failure"] = True
                    break

        # Stage 3: persist — skipped entirely on a high-failure cycle so stale
        # but consistent data from the previous cycle is preserved.
        if not self._stats["skipped_high_failure"]:
            self._save_results(results)

        duration = (datetime.now() - start_time).total_seconds()
        self._stats["duration_seconds"] = duration
        logger.info(
            "=== ๋ค์ด๋ฒ ํ๋ ์ด์ค ๋ณด์กฐ ์์ง ์๋ฃ (%.1f์ด) === %s",
            duration,
            self._stats,
        )
        return self._stats
|
trend_engine/collectors/youtube.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
YouTube Collector โ YouTube Data API v3 ํค์๋ ๊ฒ์ + ์์น ๊ธฐ๋ฐ ๊ฒ์
|
| 3 |
+
|
| 4 |
+
์์ง ๋ฐฉ์:
|
| 5 |
+
1. 6๊ฐ ํค์๋๋ก ์ต๊ทผ 1์ฃผ ์์ ๊ฒ์ (search.list)
|
| 6 |
+
2. ์์ ์์ธ ์ ๋ณด ์กฐํ (videos.list โ ์กฐํ์, ์ข์์)
|
| 7 |
+
3. ์์น ๊ธฐ๋ฐ ๋ณด์กฐ ๊ฒ์ (์ ์ ์ค์ฌ 10km)
|
| 8 |
+
4. ์์ ์ ๋ชฉ+์ค๋ช
์์ ์ฅ์๋ช
์ถ์ถ (PlaceNameExtractor ์ฐ๋)
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import logging
|
| 13 |
+
from datetime import date, datetime, timedelta, timezone
|
| 14 |
+
|
| 15 |
+
from googleapiclient.discovery import build
|
| 16 |
+
|
| 17 |
+
from trend_engine.place_extractor import PlaceNameExtractor
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 22 |
+
# ์ค์
|
| 23 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 24 |
+
|
| 25 |
+
# Search keywords for the weekly sweep (Korean, Aewol/Jeju themed — runtime
# values, do not translate).
YOUTUBE_KEYWORDS = [
    "애월 여행",
    "제주 애월 카페",
    "애월 브이로그",
    "애월 해안 산책",
    "애월 맛집 추천",
    "한담 해안",
]

# Center coordinate of Aewol; kept as strings because they are interpolated
# directly into the search.list `location` parameter ("lat,lng").
AEWOL_CENTER = {"lat": "33.46", "lng": "126.31"}
LOCATION_RADIUS = "10km"

# search.list page sizes (keyword sweep vs. location fallback sweep)
MAX_RESULTS_PER_KEYWORD = 20
MAX_RESULTS_LOCATION = 30
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class YouTubeCollector:
    """Trend-video collector built on the YouTube Data API v3.

    Pipeline: keyword search → location-based auxiliary search →
    per-video statistics → place-name extraction → spot_trends persistence.
    """

    def __init__(self, supabase_client, spot_matcher=None):
        """
        Args:
            supabase_client: Supabase client used to persist metrics.
            spot_matcher: optional SpotMatcher; when set, extracted place
                names are normalized to trend_spots ids before saving.
        """
        self.supabase = supabase_client
        # NOTE(review): the YouTube API key comes from AEWOL_AI_SYSTEM — an
        # unusual variable name; the hard KeyError fails fast when unset.
        api_key = os.environ["AEWOL_AI_SYSTEM"]
        self.youtube = build("youtube", "v3", developerKey=api_key)
        self.extractor = PlaceNameExtractor(supabase_client)
        self.spot_matcher = spot_matcher

    # ------------------------------------------------------------------
    # Keyword search
    # ------------------------------------------------------------------

    def collect_keyword_videos(self) -> list[dict]:
        """Search videos from the last 7 days for each configured keyword.

        Returns:
            List of video dicts (see _fetch_video_details), de-duplicated by
            video id; each carries the list of keywords that matched it.
        """
        one_week_ago = (
            datetime.now(timezone.utc) - timedelta(days=7)
        ).strftime("%Y-%m-%dT%H:%M:%SZ")

        all_video_ids: list[str] = []
        keyword_map: dict[str, list[str]] = {}  # video_id → matching keywords

        for keyword in YOUTUBE_KEYWORDS:
            try:
                search_resp = (
                    self.youtube.search()
                    .list(
                        q=keyword,
                        type="video",
                        part="id,snippet",
                        order="date",
                        publishedAfter=one_week_ago,
                        maxResults=MAX_RESULTS_PER_KEYWORD,
                    )
                    .execute()
                )
            except Exception as e:
                # One failing keyword must not abort the whole sweep.
                logger.warning("YouTube 검색 실패 (keyword=%s): %s", keyword, e)
                continue

            for item in search_resp.get("items", []):
                vid = item["id"]["videoId"]
                if vid not in keyword_map:
                    keyword_map[vid] = []
                    all_video_ids.append(vid)
                keyword_map[vid].append(keyword)

        logger.info(
            "키워드 검색 완료: %d개 고유 영상 발견 (%d개 키워드)",
            len(all_video_ids), len(YOUTUBE_KEYWORDS),
        )

        # Fetch per-video statistics (batched 50 at a time).
        videos = self._fetch_video_details(all_video_ids, keyword_map)
        return videos

    # ------------------------------------------------------------------
    # Location-based search
    # ------------------------------------------------------------------

    def collect_location_videos(self) -> list[dict]:
        """Search last-7-day videos geotagged within 10km of Aewol center."""
        one_week_ago = (
            datetime.now(timezone.utc) - timedelta(days=7)
        ).strftime("%Y-%m-%dT%H:%M:%SZ")

        try:
            search_resp = (
                self.youtube.search()
                .list(
                    part="id,snippet",
                    type="video",
                    location=f"{AEWOL_CENTER['lat']},{AEWOL_CENTER['lng']}",
                    locationRadius=LOCATION_RADIUS,
                    order="date",
                    publishedAfter=one_week_ago,
                    maxResults=MAX_RESULTS_LOCATION,
                )
                .execute()
            )
        except Exception as e:
            logger.warning("YouTube 위치 검색 실패: %s", e)
            return []

        video_ids = [item["id"]["videoId"] for item in search_resp.get("items", [])]
        logger.info("위치 검색 완료: %d개 영상 발견", len(video_ids))

        # Tag these with a pseudo-keyword so downstream stats can tell
        # location hits apart from keyword hits.
        keyword_map = {vid: ["location_search"] for vid in video_ids}
        return self._fetch_video_details(video_ids, keyword_map)

    # ------------------------------------------------------------------
    # Video detail lookup
    # ------------------------------------------------------------------

    def _fetch_video_details(
        self,
        video_ids: list[str],
        keyword_map: dict[str, list[str]],
    ) -> list[dict]:
        """Fetch view/like/comment statistics via videos.list.

        Args:
            video_ids: ids to look up.
            keyword_map: video_id → keywords that discovered it.

        Returns:
            List of flat video dicts with counts coerced to int
            (missing statistics — e.g. hidden like counts — default to 0).
        """
        videos: list[dict] = []

        # videos.list accepts at most 50 ids per call → batch.
        for i in range(0, len(video_ids), 50):
            batch = video_ids[i : i + 50]
            try:
                resp = (
                    self.youtube.videos()
                    .list(
                        part="statistics,snippet",
                        id=",".join(batch),
                    )
                    .execute()
                )
            except Exception as e:
                logger.warning("YouTube videos.list 실패: %s", e)
                continue

            for video in resp.get("items", []):
                stats = video.get("statistics", {})
                videos.append({
                    "video_id": video["id"],
                    "title": video["snippet"]["title"],
                    "description": video["snippet"].get("description", ""),
                    "channel_title": video["snippet"].get("channelTitle", ""),
                    "view_count": int(stats.get("viewCount", 0)),
                    "like_count": int(stats.get("likeCount", 0)),
                    "comment_count": int(stats.get("commentCount", 0)),
                    "published_at": video["snippet"]["publishedAt"],
                    "keywords": keyword_map.get(video["id"], []),
                })

        return videos

    # ------------------------------------------------------------------
    # Place-name extraction + metric aggregation
    # ------------------------------------------------------------------

    def _extract_place_mentions(self, videos: list[dict]) -> dict[str, dict]:
        """Extract place names from title+description and aggregate metrics.

        Returns:
            {spot_id_or_name: {name, spot_id, method, mention_video_count,
                               total_views, total_likes}}
        """
        place_metrics: dict[str, dict] = {}

        for video in videos:
            text = video["title"] + " " + video["description"]
            places = self.extractor.extract(text)

            for place in places:
                # Prefer the resolved spot_id as aggregation key; fall back
                # to the raw name for pattern-discovered (unmatched) places.
                key = place["spot_id"] or place["name"]
                if key not in place_metrics:
                    place_metrics[key] = {
                        "name": place["name"],
                        "spot_id": place["spot_id"],
                        "method": place["method"],
                        "mention_video_count": 0,
                        "total_views": 0,
                        "total_likes": 0,
                    }
                place_metrics[key]["mention_video_count"] += 1
                place_metrics[key]["total_views"] += video["view_count"]
                place_metrics[key]["total_likes"] += video["like_count"]

        logger.info("장소명 추출 완료: %d개 장소 식별", len(place_metrics))
        return place_metrics

    # ------------------------------------------------------------------
    # DB persistence
    # ------------------------------------------------------------------

    def _save_to_db(self, place_metrics: dict[str, dict], videos: list[dict]) -> int:
        """Persist aggregated metrics into the spot_trends table.

        When a SpotMatcher is configured, names are normalized to
        trend_spots ids; places that fail to match are skipped.
        (*videos* is accepted for interface stability but unused here.)

        Returns:
            Number of successfully inserted view_count rows.
        """
        today = date.today()
        saved = 0
        skipped = 0

        for key, metrics in place_metrics.items():
            # Normalize to a canonical spot_id via SpotMatcher when available.
            if self.spot_matcher:
                spot_id = self.spot_matcher.match(metrics["name"])
                if not spot_id:
                    skipped += 1
                    continue
            else:
                spot_id = metrics["spot_id"] or key

            # view_count metric row
            try:
                self.supabase.table("spot_trends").insert({
                    "spot_id": spot_id,
                    "source": "youtube",
                    "metric_type": "view_count",
                    "metric_value": metrics["total_views"],
                    "period_start": today.isoformat(),
                    "period_end": today.isoformat(),
                    "raw_data": {
                        "mention_video_count": metrics["mention_video_count"],
                        "total_likes": metrics["total_likes"],
                        "name": metrics["name"],
                        "method": metrics["method"],
                    },
                }).execute()
                saved += 1
            except Exception as e:
                logger.warning("spot_trends insert 실패 (youtube view_count, %s): %s", spot_id, e)

            # video_count metric row (needed only for channel scoring)
            try:
                self.supabase.table("spot_trends").insert({
                    "spot_id": spot_id,
                    "source": "youtube",
                    "metric_type": "video_count",
                    "metric_value": metrics["mention_video_count"],
                    "period_start": today.isoformat(),
                    "period_end": today.isoformat(),
                    "raw_data": {"name": metrics["name"]},
                }).execute()
            except Exception as e:
                logger.warning("spot_trends insert 실패 (youtube video_count, %s): %s", spot_id, e)

        if skipped:
            logger.info("YouTube 장소 매칭 실패로 %d건 스킵", skipped)
        logger.info("YouTube DB 저장 완료: %d건", saved)
        return saved

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------

    def run(self) -> dict:
        """Run the full YouTube collection pipeline.

        Returns:
            {"keyword_videos": int, "location_videos": int,
             "total_unique_videos": int, "places_found": int, "saved": int}
        """
        logger.info("=== YouTube 수집 시작 ===")

        # Step 1: keyword search
        keyword_videos = self.collect_keyword_videos()

        # Step 2: auxiliary location-based search
        location_videos = self.collect_location_videos()

        # De-duplicate by video_id into a FRESH list. Fix: the previous
        # implementation appended location videos into keyword_videos before
        # counting, so the "keyword_videos" stat was inflated and always
        # equaled "total_unique_videos".
        all_videos = list(keyword_videos)
        seen_ids = {v["video_id"] for v in keyword_videos}
        for v in location_videos:
            if v["video_id"] not in seen_ids:
                all_videos.append(v)
                seen_ids.add(v["video_id"])

        # Step 3: place-name extraction + metric aggregation
        place_metrics = self._extract_place_mentions(all_videos)

        # Step 4: persistence
        saved = self._save_to_db(place_metrics, all_videos)

        result = {
            "keyword_videos": len(keyword_videos),
            "location_videos": len(location_videos),
            "total_unique_videos": len(all_videos),
            "places_found": len(place_metrics),
            "saved": saved,
        }
        logger.info("=== YouTube 수집 완료: %s ===", result)
        return result
|
trend_engine/place_extractor.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PlaceNameExtractor โ ๋ธ๋ก๊ทธ/์ ํ๋ธ ํ
์คํธ์์ ์ฅ์๋ช
์ ์ถ์ถํ๋ ๊ณตํต ๋ชจ๋
|
| 3 |
+
|
| 4 |
+
์ถ์ถ ์ฐ์ ์์:
|
| 5 |
+
1. ์ฅ์๋ช
์ฌ์ ๋งค์นญ (trend_spots + story_spots ์ด๋ฆ)
|
| 6 |
+
2. ์ ๋ฏธ์ฌ ํจํด ๋งค์นญ (์นดํ, ์๋น, ํด๋ณ ๋ฑ)
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import re
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
from supabase import create_client, Client
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
# ์ฅ์๋ช
์ ๋ฏธ์ฌ ํจํด (์ฌ์ ์ ์๋ ์ ์ฅ์ ๋ฐ๊ฒฌ์ฉ)
|
| 18 |
+
SUFFIX_PATTERNS = [
    re.compile(r"[\w가-힣]{2,}(?:카페|커피)"),
    re.compile(r"[\w가-힣]{2,}(?:식당|밥집|국수)"),
    re.compile(r"[\w가-힣]{2,}(?:해변|해안|포구)"),
    re.compile(r"[\w가-힣]{2,}(?:오름|공원|숲)"),
    re.compile(r"[\w가-힣]{2,}(?:전망대|뷰)"),
    re.compile(r"[\w가-힣]{2,}(?:베이커리|브런치|디저트)"),
]

# Words too generic to be place names — filter pattern hits against these.
# (Korean literals are runtime data; only the grouping comments are English.)
STOPWORDS = frozenset({
    # demonstrative pronoun + category ("this cafe", "that beach", ...)
    "이카페", "그카페", "저카페", "이식당", "그식당",
    "이해변", "그해변", "이오름", "그오름",
    # adjective + category ("nice cafe", "tasty restaurant", ...)
    "좋은카페", "예쁜카페", "맛있는식당", "좋은식당",
    # location word / modifier + category ("nearby cafe", "ocean-view cafe", ...)
    "근처카페", "주변카페", "동네카페", "유명카페",
    "감성카페", "뷰카페", "오션뷰카페",
    "신상카페", "한림카페", "협재카페",
    "제주카페", "제주식당", "제주해변", "제주오름",
    "애월식당", "제주맛집", "애월맛집",
    # bare category nouns (not a place name when standing alone)
    "카페", "커피", "식당", "맛집", "밥집", "국수",
    "해변", "해안", "포구", "오름", "공원", "숲",
    "전망대", "뷰", "베이커리", "브런치", "디저트",
    "산책", "산책로", "드라이브",
    # region name + generic travel phrasing
    "제주여행", "애월여행", "제주도", "애월", "애월리",
})
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _get_supabase_client() -> Client:
    """Build a Supabase client from environment variables.

    Falls back to the Vite-prefixed URL and the legacy service-key
    variable when the primary names are not set.

    Raises:
        ValueError: when either the URL or the service key is missing.
    """
    env = os.environ.get
    url = env("SUPABASE_URL") or env("VITE_SUPABASE_URL")
    key = env("SUPABASE_SERVICE_ROLE_KEY") or env("SUPABASE_SERVICE_KEY")
    if url and key:
        return create_client(url, key)
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY must be set")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class PlaceNameExtractor:
    """Pipeline that extracts place names from free text.

    Extraction priority:
    1. dictionary match against known trend_spots/story_spots names
    2. suffix-pattern match (cafe, restaurant, beach, ... endings)
    """

    def __init__(self, supabase: Client | None = None):
        # Lazily create a client only when the caller did not inject one.
        self.supabase = supabase or _get_supabase_client()
        # {place name (incl. no-space variants): spot_id} — exact-match dictionary
        self.known_places: dict[str, str] = {}
        self._load_place_dictionary()

    # ------------------------------------------------------------------
    # Dictionary construction
    # ------------------------------------------------------------------

    def _load_place_dictionary(self) -> None:
        """Build the place-name dictionary from trend_spots + story_spots.

        Each source is loaded best-effort: a missing table or query failure
        is logged and skipped so the other source can still populate.
        """

        # 1) trend_spots
        try:
            resp = self.supabase.table("trend_spots").select("id, name").execute()
            for row in resp.data or []:
                self._register_name(row["name"], row["id"])
        except Exception as e:
            logger.warning("trend_spots 로드 실패 (테이블 미존재 가능): %s", e)

        # 2) story_spots
        try:
            resp = (
                self.supabase.table("story_spots")
                .select("id, name, aliases")
                .execute()
            )
            for row in resp.data or []:
                self._register_name(row["name"], row["id"])
                # Register every alias in the aliases array as well.
                for alias in row.get("aliases") or []:
                    if alias:
                        self._register_name(alias, row["id"])
        except Exception as e:
            logger.warning("story_spots 로드 실패: %s", e)

        logger.info("장소명 사전 구축 완료: %d건", len(self.known_places))

    def _register_name(self, name: str, spot_id: str) -> None:
        """Register a name plus its whitespace-stripped variant in the dictionary."""
        name = name.strip()
        if not name:
            return
        self.known_places[name] = spot_id
        # Whitespace-stripped variant so "A B cafe" also matches "ABcafe".
        no_space = name.replace(" ", "")
        if no_space != name:
            self.known_places[no_space] = spot_id

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------

    def extract(self, text: str) -> list[dict]:
        """Extract place names from *text*.

        Returns:
            [{"name": str, "spot_id": str|None, "method": "dictionary"|"pattern"}, ...]
            (spot_id is None for pattern-discovered places not in the dictionary)
        """
        if not text:
            return []

        found: list[dict] = []
        found_names: set[str] = set()

        # Pass 1: dictionary matching — longest names first so that a long
        # name is recorded before any shorter name it contains.
        for name in sorted(self.known_places, key=len, reverse=True):
            # Skip short (<=2 chars) names that are also generic stopwords.
            if len(name) <= 2 and name in STOPWORDS:
                continue
            if name in text and name not in found_names:
                found.append({
                    "name": name,
                    "spot_id": self.known_places[name],
                    "method": "dictionary",
                })
                found_names.add(name)

        # Pass 2: suffix patterns — discover places absent from the dictionary.
        for pattern in SUFFIX_PATTERNS:
            for match in pattern.findall(text):
                if match not in found_names and match not in STOPWORDS:
                    found.append({
                        "name": match,
                        "spot_id": None,
                        "method": "pattern",
                    })
                    found_names.add(match)

        return found
|
trend_engine/spot_matcher.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SpotMatcher โ ์ฑ๋๋ณ ์์ง ๊ฒฐ๊ณผ์ spot_id๋ฅผ trend_spots ๋ง์คํฐ ID๋ก ํตํฉ ๋งค์นญ
|
| 3 |
+
|
| 4 |
+
๋ชจ๋ ์ฑ๋์ ์์ง ๊ฒฐ๊ณผ๋ฅผ trend_spots(์นด์นด์ค๋งต ๊ธฐ๋ฐ)๊ณผ
|
| 5 |
+
story_spots(ํฅํ ์ง ๊ธฐ๋ฐ)์์ ์ ์๋ ์ฅ์ ID๋ก ๋งค์นญํ๋ค.
|
| 6 |
+
|
| 7 |
+
๋งค์นญ ์ฐ์ ์์:
|
| 8 |
+
1. trend_spots ์ ํ ๋งค์นญ (๊ณต๋ฐฑ ์ ๊ฑฐ ๋ณํ ํฌํจ)
|
| 9 |
+
2. story_spots ์ ํ ๋งค์นญ
|
| 10 |
+
3. trend_spots ๋ถ๋ถ ๋งค์นญ (์ต์ 3๊ธ์ ๊ฒน์นจ)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
from supabase import create_client, Client
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _get_supabase_client() -> Client:
    """Create a Supabase client from the environment.

    Accepts either the canonical variable names or their fallbacks
    (VITE_SUPABASE_URL / SUPABASE_SERVICE_KEY).

    Raises:
        ValueError: when no usable URL/key pair is present.
    """
    lookup = os.environ.get
    url = lookup("SUPABASE_URL") or lookup("VITE_SUPABASE_URL")
    key = lookup("SUPABASE_SERVICE_ROLE_KEY") or lookup("SUPABASE_SERVICE_KEY")
    if not (url and key):
        raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY must be set")
    return create_client(url, key)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class SpotMatcher:
    """Match place names collected from every channel to trend_spots/story_spots ids.

    The Kakao Map scan results (trend_spots) act as the reference dictionary.
    """

    def __init__(self, supabase: Client | None = None):
        self.supabase = supabase or _get_supabase_client()
        # Exact-match dictionaries: {name (and no-space variant): id}
        self.trend_spots: dict[str, str] = {}
        self.story_spots: dict[str, str] = {}
        self._load_dictionaries()

    def _load_dictionaries(self) -> None:
        """Load the place-name dictionaries from trend_spots + story_spots.

        Each source is best-effort: a failing query is logged and skipped.
        """

        # 1) trend_spots (Kakao Map based)
        try:
            resp = self.supabase.table("trend_spots").select("id, name").execute()
            for row in resp.data or []:
                name = row["name"].strip()
                if not name:
                    continue
                self.trend_spots[name] = row["id"]
                # Also index the whitespace-stripped variant.
                no_space = name.replace(" ", "")
                if no_space != name:
                    self.trend_spots[no_space] = row["id"]
        except Exception as e:
            logger.warning("trend_spots 로드 실패: %s", e)

        # 2) story_spots (only rows that actually have a name)
        try:
            resp = (
                self.supabase.table("story_spots")
                .select("id, name")
                .not_.is_("name", "null")
                .execute()
            )
            for row in resp.data or []:
                name = (row.get("name") or "").strip()
                if name:
                    self.story_spots[name] = row["id"]
        except Exception as e:
            logger.warning("story_spots 로드 실패: %s", e)

        logger.info(
            "SpotMatcher 사전 로드: trend_spots %d건, story_spots %d건",
            len(self.trend_spots),
            len(self.story_spots),
        )

    def match(self, name: str) -> str | None:
        """Map a place name to a trend_spots.id or story_spots.id.

        Matching priority:
        1. trend_spots exact match (including the no-space variant)
        2. story_spots exact match
        3. trend_spots partial match (>= 3 overlapping characters)

        Returns:
            The matched spot id, or None when nothing matches.
        """
        if not name:
            return None
        name = name.strip()

        # 1. trend_spots exact match
        if name in self.trend_spots:
            return self.trend_spots[name]
        no_space = name.replace(" ", "")
        if no_space in self.trend_spots:
            return self.trend_spots[no_space]

        # 2. story_spots exact match
        if name in self.story_spots:
            return self.story_spots[name]

        # 3. Partial (substring) match — try longer known names first so the
        # most specific candidate wins.
        for known_name in sorted(self.trend_spots.keys(), key=len, reverse=True):
            shorter = min(known_name, name, key=len)
            # Require the shorter side to be >= 3 chars to avoid noise hits.
            if len(shorter) >= 3 and (known_name in name or name in known_name):
                return self.trend_spots[known_name]

        return None
|
trend_engine/trend_scorer.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Trend Scorer โ ์ฑ๋๋ณ ์ธ๊ธฐ๋ ์ค์ฝ์ด ๊ณ์ฐ + ์ข
ํฉ ๋ญํน ์์ฑ
|
| 3 |
+
|
| 4 |
+
v3 ์ฑ๋ ๊ฐ์ค์น:
|
| 5 |
+
naver_blog 0.30
|
| 6 |
+
kakaomap 0.25
|
| 7 |
+
instagram 0.25
|
| 8 |
+
youtube 0.15
|
| 9 |
+
naver_place 0.05
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import logging
|
| 14 |
+
from datetime import datetime, timedelta, date
|
| 15 |
+
|
| 16 |
+
from supabase import create_client, Client
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# ์ฑ๋ ๊ฐ์ค์น (v3 โ naver_place ๋นํ์ฑ, 4์ฑ๋ ์ฒด์ )
|
| 21 |
+
# Channel weights (v3 — naver_place disabled, four-channel scheme).
# NOTE: the active weights sum to 0.95; calc_composite_score renormalizes by
# the available weight, so the composite still spans 0–100.
CHANNEL_WEIGHTS: dict[str, float] = {
    "naver_blog": 0.30,
    "kakaomap": 0.25,
    "instagram": 0.25,
    "youtube": 0.15,
    # "naver_place": 0.05,  # disabled — Place ID matching unavailable (2026-02)
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _get_supabase_client() -> Client:
    """Instantiate a Supabase client from environment configuration.

    Primary variables are SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY, with
    VITE_SUPABASE_URL / SUPABASE_SERVICE_KEY as fallbacks.

    Raises:
        ValueError: when either piece of configuration is absent.
    """
    getenv = os.environ.get
    url = getenv("SUPABASE_URL") or getenv("VITE_SUPABASE_URL")
    key = getenv("SUPABASE_SERVICE_ROLE_KEY") or getenv("SUPABASE_SERVICE_KEY")
    if url and key:
        return create_client(url, key)
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY must be set")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ------------------------------------------------------------------
|
| 39 |
+
# ์ ๊ทํ ์ ํธ
|
| 40 |
+
# ------------------------------------------------------------------
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def normalize_score(value: float, max_value: float) -> int:
    """Normalize *value* to 0–100 against the channel-wide maximum.

    Fix: the result is now clamped on BOTH ends. The docstring contract is
    0–100, but a negative *value* (e.g. a week-over-week review decrease)
    previously leaked a negative score into the weighted composite.

    Args:
        value: raw metric value for one spot.
        max_value: largest value observed in the channel (<= 0 means no data).

    Returns:
        Integer score in [0, 100]; 0 when the channel has no data.
    """
    if max_value <= 0:
        return 0
    return max(0, min(100, int((value / max_value) * 100)))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ------------------------------------------------------------------
|
| 51 |
+
# ์ฑ๋๋ณ ์ค์ฝ์ด ๊ณ์ฐ
|
| 52 |
+
# ------------------------------------------------------------------
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def calc_naver_blog_score(
    weekly_mentions: int,
    max_weekly_mentions: int,
    mention_growth: float = 0.0,
) -> int:
    """Naver blog channel: weekly mention count plus a growth bonus.

    The base is the mention count normalized against the channel maximum;
    positive week-over-week growth adds up to 20 bonus points.
    """
    score = normalize_score(weekly_mentions, max_weekly_mentions)
    if mention_growth > 0:
        score += min(20, int(mention_growth * 10))
    return min(100, score)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def calc_kakaomap_score(
    review_count: int,
    max_review_count: int,
    review_growth: int = 0,
    max_review_growth: int = 1,
    search_rank: int = 0,
    max_rank: int = 256,
) -> int:
    """Kakao Map channel: review count (60%) + week-over-week growth (40%).

    When no review data exists, falls back to the search rank
    (rank 1 → 100 points, rank *max_rank* → 0 points). With neither
    signal available the score is 0.
    """
    if review_count > 0:
        weighted = (
            0.6 * normalize_score(review_count, max_review_count)
            + 0.4 * normalize_score(review_growth, max_review_growth)
        )
        return min(100, int(weighted))

    # Fallback when the unofficial review API is blocked: use search rank.
    if search_rank > 0:
        rank_fraction = search_rank / max(max_rank, 1)
        return max(0, int((1 - rank_fraction) * 100))

    return 0
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def calc_instagram_score(
    hashtag_post_count: int,
    max_post_count: int,
    avg_engagement: float = 0.0,
    max_engagement: float = 1.0,
) -> int:
    """Instagram channel: hashtag post count (50%) + average engagement (50%)."""
    post_part = 0.5 * normalize_score(hashtag_post_count, max_post_count)
    engagement_part = 0.5 * normalize_score(avg_engagement, max_engagement)
    return min(100, int(post_part + engagement_part))
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def calc_youtube_score(
    mention_video_count: int,
    max_video_count: int,
    total_views: int = 0,
    max_total_views: int = 1,
) -> int:
    """YouTube channel: number of mentioning videos (40%) + total views (60%)."""
    mention_part = 0.4 * normalize_score(mention_video_count, max_video_count)
    views_part = 0.6 * normalize_score(total_views, max_total_views)
    return min(100, int(mention_part + views_part))
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def calc_naver_place_score(
    visitor_review_count: int,
    max_visitor_reviews: int,
    naver_review_growth: int = 0,
    max_review_growth: int = 1,
    data_available: bool = True,
) -> int | None:
    """Naver Place channel score (auxiliary channel).

    Returns None when no data is available so the composite scorer can
    redistribute this channel's weight to the others.
    """
    if not data_available:
        return None
    blended = (
        normalize_score(visitor_review_count, max_visitor_reviews) * 0.7
        + normalize_score(naver_review_growth, max_review_growth) * 0.3
    )
    return min(100, int(blended))
| 129 |
+
# ------------------------------------------------------------------
# Composite score
# ------------------------------------------------------------------
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def calc_composite_score(channel_scores: dict[str, int | None]) -> int:
    """
    Combine per-channel scores into a single popularity score (0-100)
    using the weights from ``CHANNEL_WEIGHTS``.

    Channels whose score is None (no data) are skipped, and their weight is
    redistributed proportionally across the channels that do have data.

    Args:
        channel_scores: e.g. {"naver_blog": 80, "kakaomap": 60, ..., "naver_place": None}
    """
    parts: list[tuple[float, float]] = []  # (score, weight)
    total_weight = 0.0

    for name, weight in CHANNEL_WEIGHTS.items():
        value = channel_scores.get(name)
        if value is None:
            continue
        parts.append((float(value), weight))
        total_weight += weight

    # No channel reported data at all -> nothing to score.
    if total_weight == 0:
        return 0

    blended = sum(s * (w / total_weight) for s, w in parts)
    return min(100, int(blended))
+
# ------------------------------------------------------------------
# Weekly ranking generation
# ------------------------------------------------------------------
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def generate_weekly_ranking(supabase: Client | None = None) -> dict:
    """
    Generate the weekly composite popularity ranking of spots.

    1. Fetch this week's + last week's metrics from ``spot_trends``
    2. Compute per-channel scores -> composite score
    3. Update ``trend_spots.popularity_score``
    4. Return overall / per-channel rankings

    Args:
        supabase: optional pre-built client; a new one is created when omitted.

    Returns:
        {"overall": [...], "naver_blog": [...], ..., "trending_up": [...]}
    """
    sb = supabase or _get_supabase_client()
    # NOTE(review): datetime.utcnow() is naive and deprecated since Python 3.12;
    # switching to datetime.now(timezone.utc) would change isoformat() output
    # (adds an offset), so it is left as-is here — confirm before migrating.
    now = datetime.utcnow()
    # Monday of the current week (weekday() == 0 on Monday).
    this_week_start = (now - timedelta(days=now.weekday())).date()
    last_week_start = this_week_start - timedelta(days=7)

    # -- Fetch this week's metrics (valid spot_ids only; "__pending__" rows excluded) --
    this_week_resp = (
        sb.table("spot_trends")
        .select("spot_id, source, metric_type, metric_value")
        .gte("period_end", this_week_start.isoformat())
        .neq("spot_id", "__pending__")
        .execute()
    )
    this_week_rows = this_week_resp.data or []

    # -- Fetch last week's metrics (valid spot_ids only) --
    last_week_resp = (
        sb.table("spot_trends")
        .select("spot_id, source, metric_type, metric_value")
        .gte("period_end", last_week_start.isoformat())
        .lt("period_end", this_week_start.isoformat())
        .neq("spot_id", "__pending__")
        .execute()
    )
    last_week_rows = last_week_resp.data or []

    # -- Aggregate metrics per spot: {spot_id: {source: {metric_type: value}}} --
    spots_this: dict[str, dict[str, dict[str, int]]] = {}
    for row in this_week_rows:
        sid = row["spot_id"]
        src = row["source"]
        mt = row["metric_type"]
        spots_this.setdefault(sid, {}).setdefault(src, {})[mt] = row["metric_value"]

    spots_last: dict[str, dict[str, dict[str, int]]] = {}
    for row in last_week_rows:
        sid = row["spot_id"]
        src = row["source"]
        mt = row["metric_type"]
        spots_last.setdefault(sid, {}).setdefault(src, {})[mt] = row["metric_value"]

    # -- Global per-channel maxima (for normalization) --
    maxes = _calc_maxes(spots_this)

    # -- Per-spot channel scores + composite score --
    spot_scores: list[dict] = []

    for spot_id, src_data in spots_this.items():
        last_src = spots_last.get(spot_id, {})

        # Naver blog: mention count + week-over-week growth ratio.
        blog = src_data.get("naver_blog", {})
        blog_last = last_src.get("naver_blog", {})
        wm = blog.get("mention_count", 0)
        wm_last = blog_last.get("mention_count", 0)
        growth = (wm - wm_last) / max(wm_last, 1) if wm_last else 0.0
        nb_score = calc_naver_blog_score(wm, maxes["naver_blog_mentions"], growth)

        # Kakao Map (supports search_rank fallback when review data is blocked).
        km = src_data.get("kakaomap", {})
        km_last = last_src.get("kakaomap", {})
        rc = km.get("review_count", 0)
        rc_last = km_last.get("review_count", 0)
        sr = km.get("search_rank", 0)
        km_score = calc_kakaomap_score(
            rc, maxes["kakaomap_reviews"],
            rc - rc_last, maxes["kakaomap_review_growth"],
            search_rank=sr, max_rank=maxes["kakaomap_max_rank"],
        )

        # Instagram
        ig = src_data.get("instagram", {})
        ig_score = calc_instagram_score(
            ig.get("post_count", 0), maxes["instagram_posts"],
            ig.get("avg_engagement", 0), maxes["instagram_engagement"],
        )

        # YouTube
        yt = src_data.get("youtube", {})
        yt_score = calc_youtube_score(
            yt.get("video_count", 0), maxes["youtube_videos"],
            yt.get("view_count", 0), maxes["youtube_views"],
        )

        # Naver Place -> disabled (Place ID matching not possible, 2026-02).
        # np_score would always be None -> also removed from CHANNEL_WEIGHTS.

        channel_scores = {
            "naver_blog": nb_score,
            "kakaomap": km_score,
            "instagram": ig_score,
            "youtube": yt_score,
        }
        composite = calc_composite_score(channel_scores)

        spot_scores.append({
            "spot_id": spot_id,
            "popularity_score": composite,
            "naver_blog_score": nb_score,
            "kakaomap_score": km_score,
            "instagram_score": ig_score,
            "youtube_score": yt_score,
            "channel_scores": channel_scores,
        })

    # -- Update trend_spots (best-effort: one failed row must not abort the run) --
    for entry in spot_scores:
        try:
            sb.table("trend_spots").update({
                "popularity_score": entry["popularity_score"],
                "popularity_updated_at": now.isoformat(),
            }).eq("id", entry["spot_id"]).execute()
        except Exception as e:
            # Log message is Korean ("trend_spots update failed").
            logger.warning("trend_spots ์๋ฐ์ดํธ ์คํจ (%s): %s", entry["spot_id"], e)

    # -- Build rankings --
    overall = sorted(spot_scores, key=lambda s: s["popularity_score"], reverse=True)

    rankings = {
        "overall": overall[:50],
        "naver_blog": sorted(spot_scores, key=lambda s: s["naver_blog_score"], reverse=True)[:30],
        "kakaomap": sorted(spot_scores, key=lambda s: s["kakaomap_score"], reverse=True)[:30],
        "instagram": sorted(spot_scores, key=lambda s: s["instagram_score"], reverse=True)[:30],
        "youtube": sorted(spot_scores, key=lambda s: s["youtube_score"], reverse=True)[:30],
        # Week-over-week "trending up" comparison only becomes possible from the
        # next cycle (needs two scored weeks); until then this mirrors overall.
        "trending_up": overall[:10],
    }

    # Log message is Korean ("weekly ranking generated: %d spots").
    logger.info("์ฃผ๊ฐ ๋ญํน ์์ฑ ์๋ฃ: %d๊ฐ ์คํ", len(spot_scores))
    return rankings
| 307 |
+
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _calc_maxes(spots_data: dict[str, dict[str, dict[str, int]]]) -> dict[str, int]:
|
| 313 |
+
"""์ ์ฒด ์คํ์์ ์ฑ๋๋ณ ์ต๋๊ฐ ๊ณ์ฐ."""
|
| 314 |
+
maxes = {
|
| 315 |
+
"naver_blog_mentions": 1,
|
| 316 |
+
"kakaomap_reviews": 1,
|
| 317 |
+
"kakaomap_review_growth": 1,
|
| 318 |
+
"kakaomap_max_rank": 1,
|
| 319 |
+
"instagram_posts": 1,
|
| 320 |
+
"instagram_engagement": 1,
|
| 321 |
+
"youtube_videos": 1,
|
| 322 |
+
"youtube_views": 1,
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
for src_data in spots_data.values():
|
| 326 |
+
blog = src_data.get("naver_blog", {})
|
| 327 |
+
maxes["naver_blog_mentions"] = max(maxes["naver_blog_mentions"], blog.get("mention_count", 0))
|
| 328 |
+
|
| 329 |
+
km = src_data.get("kakaomap", {})
|
| 330 |
+
maxes["kakaomap_reviews"] = max(maxes["kakaomap_reviews"], km.get("review_count", 0))
|
| 331 |
+
maxes["kakaomap_max_rank"] = max(maxes["kakaomap_max_rank"], km.get("search_rank", 0))
|
| 332 |
+
|
| 333 |
+
ig = src_data.get("instagram", {})
|
| 334 |
+
maxes["instagram_posts"] = max(maxes["instagram_posts"], ig.get("post_count", 0))
|
| 335 |
+
# avg_engagement: -1 ๊ฐ ํํฐ๋ง (Apify ๋ฏธ์์ง ์ผ์ด์ค)
|
| 336 |
+
ig_eng = ig.get("avg_engagement", 0)
|
| 337 |
+
if ig_eng > 0:
|
| 338 |
+
maxes["instagram_engagement"] = max(maxes["instagram_engagement"], ig_eng)
|
| 339 |
+
|
| 340 |
+
yt = src_data.get("youtube", {})
|
| 341 |
+
maxes["youtube_videos"] = max(maxes["youtube_videos"], yt.get("video_count", 0))
|
| 342 |
+
maxes["youtube_views"] = max(maxes["youtube_views"], yt.get("view_count", 0))
|
| 343 |
+
|
| 344 |
+
return maxes
|