Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- requirements-trend.txt +1 -2
- scripts/run_trend_engine.py +8 -9
- scripts/test_instagram_collector.py +272 -0
- scripts/test_instagram_full.py +162 -0
- trend_engine/collectors/instagram.py +647 -802
- trend_engine/trend_scorer.py +3 -3
- utils/trending_builder.py +4 -4
requirements-trend.txt
CHANGED
|
@@ -4,7 +4,6 @@ beautifulsoup4>=4.12.0
|
|
| 4 |
lxml>=5.0.0
|
| 5 |
requests>=2.31.0
|
| 6 |
google-api-python-client>=2.100.0
|
| 7 |
-
|
| 8 |
-
# ensembledata>=0.2.0
|
| 9 |
supabase>=2.0.0
|
| 10 |
python-dotenv>=1.0.0
|
|
|
|
| 4 |
lxml>=5.0.0
|
| 5 |
requests>=2.31.0
|
| 6 |
google-api-python-client>=2.100.0
|
| 7 |
+
google-genai>=1.0.0,<1.64.0
|
|
|
|
| 8 |
supabase>=2.0.0
|
| 9 |
python-dotenv>=1.0.0
|
scripts/run_trend_engine.py
CHANGED
|
@@ -4,10 +4,10 @@ RE:Play Trend Engine v3 โ ์ฃผ๊ฐ ๋ฐฐ์น ์ค์ผ์คํธ๋ ์ดํฐ
|
|
| 4 |
์์ฐจ ์คํ ํ์ดํ๋ผ์ธ:
|
| 5 |
1. ์นด์นด์ค๋งต ๊ทธ๋ฆฌ๋ ์ค์บ + ๋ฆฌ๋ทฐ ํ์ฑ (trend_spots ๋ง์คํฐ ์์ฑ)
|
| 6 |
2. SpotMatcher ์ด๊ธฐํ (trend_spots + story_spots ์ฌ์ ๋ก๋)
|
| 7 |
-
3.
|
| 8 |
-
4.
|
| 9 |
-
5.
|
| 10 |
-
6.
|
| 11 |
7. ์ข
ํฉ ์ค์ฝ์ด ๊ณ์ฐ + ๋ญํน ์์ฑ
|
| 12 |
|
| 13 |
Usage:
|
|
@@ -122,11 +122,10 @@ def main() -> None:
|
|
| 122 |
youtube = YouTubeCollector(sb, spot_matcher=matcher)
|
| 123 |
run_step("3_youtube", youtube.run, results)
|
| 124 |
|
| 125 |
-
# โโ 4. ์ธ์คํ๊ทธ๋จ
|
| 126 |
-
|
| 127 |
-
logger.info("์ธ์คํ๊ทธ๋จ ๋ฐฑ์๋: %s", ig_backend)
|
| 128 |
instagram = InstagramCollector(sb, spot_matcher=matcher)
|
| 129 |
-
run_step("
|
| 130 |
|
| 131 |
# โโ 5. ๋ค์ด๋ฒ ํ๋ ์ด์ค โ ๋นํ์ฑ (Place ID ๋งค์นญ ๋ถ๊ฐ) โโ
|
| 132 |
logger.info("๋ค์ด๋ฒ ํ๋ ์ด์ค: ๋นํ์ฑ (Place ID ๋งค์นญ ๋ถ๊ฐ, 2026-02)")
|
|
@@ -273,7 +272,7 @@ def main() -> None:
|
|
| 273 |
|
| 274 |
# โโ 8. ์ข
ํฉ ์ค์ฝ์ด ๊ณ์ฐ + ๋ญํน ์์ฑ (์ต์ 2์ฑ๋ ์ฑ๊ณต ์) โโ
|
| 275 |
# ์์ง ์ฑ๋ ๋จ๊ณ๋ง ์นด์ดํธ (1, 3, 4, 6)
|
| 276 |
-
collection_steps = ["1_kakaomap", "3_youtube", "
|
| 277 |
successful_channels = [s for s in collection_steps if results.get(s, {}).get("status") == "ok"]
|
| 278 |
|
| 279 |
def calc_scores():
|
|
|
|
| 4 |
์์ฐจ ์คํ ํ์ดํ๋ผ์ธ:
|
| 5 |
1. ์นด์นด์ค๋งต ๊ทธ๋ฆฌ๋ ์ค์บ + ๋ฆฌ๋ทฐ ํ์ฑ (trend_spots ๋ง์คํฐ ์์ฑ)
|
| 6 |
2. SpotMatcher ์ด๊ธฐํ (trend_spots + story_spots ์ฌ์ ๋ก๋)
|
| 7 |
+
3. ์ ํ๋ธ API (SpotMatcher ์ฐ๋)
|
| 8 |
+
4. ์ธ์คํ๊ทธ๋จ ์ธํ๋ฃจ์ธ์ ๋ชจ๋ํฐ๋ง v5.0 (SpotMatcher ์ฐ๋)
|
| 9 |
+
5. ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์์ง (URL ํ๋ณด + ํฌ๋กค๋ง + DB ์ ์ฅ)
|
| 10 |
+
6. ๋ธ๋ก๊ทธ ๋ณธ๋ฌธ โ ์ฅ์๋ช
์ถ์ถ + mention_count ์ง๊ณ
|
| 11 |
7. ์ข
ํฉ ์ค์ฝ์ด ๊ณ์ฐ + ๋ญํน ์์ฑ
|
| 12 |
|
| 13 |
Usage:
|
|
|
|
| 122 |
youtube = YouTubeCollector(sb, spot_matcher=matcher)
|
| 123 |
run_step("3_youtube", youtube.run, results)
|
| 124 |
|
| 125 |
+
# โโ 4. ์ธ์คํ๊ทธ๋จ ์ธํ๋ฃจ์ธ์ ๋ชจ๋ํฐ๋ง v5.1 Multimodal (SpotMatcher ์ฐ๋) โโ
|
| 126 |
+
logger.info("์ธ์คํ๊ทธ๋จ: ์ธํ๋ฃจ์ธ์ ๋ชจ๋ํฐ๋ง v5.1 (Multimodal AI)")
|
|
|
|
| 127 |
instagram = InstagramCollector(sb, spot_matcher=matcher)
|
| 128 |
+
run_step("4_instagram_influencer", instagram.run, results)
|
| 129 |
|
| 130 |
# โโ 5. ๋ค์ด๋ฒ ํ๋ ์ด์ค โ ๋นํ์ฑ (Place ID ๋งค์นญ ๋ถ๊ฐ) โโ
|
| 131 |
logger.info("๋ค์ด๋ฒ ํ๋ ์ด์ค: ๋นํ์ฑ (Place ID ๋งค์นญ ๋ถ๊ฐ, 2026-02)")
|
|
|
|
| 272 |
|
| 273 |
# โโ 8. ์ข
ํฉ ์ค์ฝ์ด ๊ณ์ฐ + ๋ญํน ์์ฑ (์ต์ 2์ฑ๋ ์ฑ๊ณต ์) โโ
|
| 274 |
# ์์ง ์ฑ๋ ๋จ๊ณ๋ง ์นด์ดํธ (1, 3, 4, 6)
|
| 275 |
+
collection_steps = ["1_kakaomap", "3_youtube", "4_instagram_influencer", "6_naver_blog"]
|
| 276 |
successful_channels = [s for s in collection_steps if results.get(s, {}).get("status") == "ok"]
|
| 277 |
|
| 278 |
def calc_scores():
|
scripts/test_instagram_collector.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Instagram Collector v5.1 โ ํตํฉ ํ
์คํธ ์คํฌ๋ฆฝํธ
|
| 3 |
+
|
| 4 |
+
์ธ์คํ๊ทธ๋จ ์ธํ๋ฃจ์ธ์ ํ์ดํ๋ผ์ธ์ ํต์ฌ ๋จ๊ณ๋ฅผ ๊ฒ์ฆํฉ๋๋ค:
|
| 5 |
+
1. DB์์ ์ธํ๋ฃจ์ธ์ ๊ณ์ ๋ก๋
|
| 6 |
+
2. Apify๋ก 1๊ฐ ๊ณ์ ๊ฒ์๋ฌผ ์์ง (๋น์ฉ ์ต์ํ)
|
| 7 |
+
3. ๊ฒ์๋ฌผ ์ ๊ทํ + ๊ธฐ๊ฐ/engagement ํํฐ
|
| 8 |
+
4. Pass 1: ๊ท์น ๊ธฐ๋ฐ ๋งค์นญ (์์นํ๊ทธ + ํด์ํ๊ทธ)
|
| 9 |
+
5. Pass 2: Gemini ๋ฉํฐ๋ชจ๋ฌ AI ๋ถ์ (์ด๋ฏธ์ง 1๊ฑด)
|
| 10 |
+
6. ๋งค์นญ ๊ฒฐ๊ณผ ์์ฝ (DB ์ ์ฅ ์ ํจ)
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python backend/scripts/test_instagram_collector.py
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import logging
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import time
|
| 21 |
+
from datetime import datetime, timedelta, timezone
|
| 22 |
+
|
| 23 |
+
# backend/ ๋๋ ํ ๋ฆฌ๋ฅผ import path์ ์ถ๊ฐ
|
| 24 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 25 |
+
|
| 26 |
+
# .env ๋ก๋
|
| 27 |
+
try:
|
| 28 |
+
from dotenv import load_dotenv
|
| 29 |
+
env_path = os.path.join(os.path.dirname(__file__), "..", "..", ".env")
|
| 30 |
+
load_dotenv(env_path)
|
| 31 |
+
except ImportError:
|
| 32 |
+
pass
|
| 33 |
+
|
| 34 |
+
logging.basicConfig(
|
| 35 |
+
level=logging.INFO,
|
| 36 |
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
| 37 |
+
datefmt="%H:%M:%S",
|
| 38 |
+
)
|
| 39 |
+
logger = logging.getLogger("test_instagram")
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_step(name: str, func):
|
| 43 |
+
"""ํ
์คํธ ๋จ๊ณ ์คํ ๋ํผ."""
|
| 44 |
+
logger.info("โโโ [TEST] %s โโโ", name)
|
| 45 |
+
start = time.time()
|
| 46 |
+
try:
|
| 47 |
+
result = func()
|
| 48 |
+
elapsed = time.time() - start
|
| 49 |
+
logger.info(" โ PASS โ %.1f์ด", elapsed)
|
| 50 |
+
return result
|
| 51 |
+
except Exception as e:
|
| 52 |
+
elapsed = time.time() - start
|
| 53 |
+
logger.error(" โ FAIL โ %s (%.1f์ด)", e, elapsed)
|
| 54 |
+
import traceback
|
| 55 |
+
traceback.print_exc()
|
| 56 |
+
return None
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def main():
|
| 60 |
+
from supabase import create_client
|
| 61 |
+
|
| 62 |
+
url = os.environ.get("SUPABASE_URL") or os.environ.get("VITE_SUPABASE_URL")
|
| 63 |
+
key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
|
| 64 |
+
if not url or not key:
|
| 65 |
+
logger.error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY ํ๊ฒฝ๋ณ์ ๋๋ฝ")
|
| 66 |
+
sys.exit(1)
|
| 67 |
+
|
| 68 |
+
sb = create_client(url, key)
|
| 69 |
+
|
| 70 |
+
# โโ 1. ์ธํ๋ฃจ์ธ์ ๊ณ์ ๋ก๋ โโ
|
| 71 |
+
def step1_load_accounts():
|
| 72 |
+
from trend_engine.collectors.instagram import InstagramCollector
|
| 73 |
+
collector = InstagramCollector(sb)
|
| 74 |
+
accounts = collector._load_influencer_accounts()
|
| 75 |
+
assert len(accounts) > 0, f"๊ณ์ 0๊ฐ ๋ก๋๋จ"
|
| 76 |
+
logger.info(" ๊ณ์ %d๊ฐ ๋ก๋:", len(accounts))
|
| 77 |
+
for a in accounts:
|
| 78 |
+
logger.info(" [%d] @%s (%s)", a.get("priority", 0), a["username"], a.get("category", ""))
|
| 79 |
+
return accounts
|
| 80 |
+
|
| 81 |
+
accounts = test_step("1. ์ธํ๋ฃจ์ธ์ ๊ณ์ ๋ก๋ (DB)", step1_load_accounts)
|
| 82 |
+
if not accounts:
|
| 83 |
+
logger.error("๊ณ์ ๋ก๋ ์คํจ โ ํ
์คํธ ์ค๋จ")
|
| 84 |
+
sys.exit(1)
|
| 85 |
+
|
| 86 |
+
# โโ 2. Apify ์์ง (1๊ฐ ๊ณ์ ๋ง) โโ
|
| 87 |
+
apify_token = os.environ.get("APIFY_API_TOKEN")
|
| 88 |
+
if not apify_token:
|
| 89 |
+
logger.error("APIFY_API_TOKEN ํ๊ฒฝ๋ณ์ ๋๋ฝ โ ์์ง ํ
์คํธ ์คํต")
|
| 90 |
+
sys.exit(1)
|
| 91 |
+
|
| 92 |
+
# ์ ์ฃผ ๊ฑฐ์ฃผ ์ธํ๋ฃจ์ธ์ ์ฐ์ , ์์ผ๋ฉด priority 1
|
| 93 |
+
jeju_accounts = [a for a in accounts if a.get("category") == "lifestyle"]
|
| 94 |
+
test_account = jeju_accounts[0] if jeju_accounts else accounts[0]
|
| 95 |
+
test_username = test_account["username"]
|
| 96 |
+
|
| 97 |
+
def step2_apify_scrape():
|
| 98 |
+
from trend_engine.collectors.instagram import InstagramCollector
|
| 99 |
+
collector = InstagramCollector(sb)
|
| 100 |
+
|
| 101 |
+
# 1๊ฐ ๊ณ์ ๋ง 5๊ฑด ์ ํ์ผ๋ก ์์ง
|
| 102 |
+
from trend_engine.collectors import instagram as ig_mod
|
| 103 |
+
orig_limit = ig_mod.RESULTS_LIMIT_PER_ACCOUNT
|
| 104 |
+
orig_window = ig_mod.TREND_WINDOW_DAYS
|
| 105 |
+
ig_mod.RESULTS_LIMIT_PER_ACCOUNT = 5 # ํ
์คํธ์ฉ ์ ํ
|
| 106 |
+
ig_mod.TREND_WINDOW_DAYS = 90 # ํ
์คํธ์ฉ: 90์ผ๋ก ํ์ฅ (์ถฉ๋ถํ ๋งค์นญ ๋ฐ์ดํฐ)
|
| 107 |
+
|
| 108 |
+
try:
|
| 109 |
+
posts = collector._scrape_profiles([test_username], "test_batch")
|
| 110 |
+
finally:
|
| 111 |
+
ig_mod.RESULTS_LIMIT_PER_ACCOUNT = orig_limit
|
| 112 |
+
ig_mod.TREND_WINDOW_DAYS = orig_window
|
| 113 |
+
|
| 114 |
+
logger.info(" @%s โ %d๊ฑด ์์ง", test_username, len(posts))
|
| 115 |
+
for i, p in enumerate(posts[:3]):
|
| 116 |
+
logger.info(
|
| 117 |
+
" [%d] type=%s likes=%d loc=%s caption=%.50s...",
|
| 118 |
+
i + 1,
|
| 119 |
+
p.get("media_type", "?"),
|
| 120 |
+
p.get("likes_count", 0),
|
| 121 |
+
p.get("location_name", "")[:30] or "(์์)",
|
| 122 |
+
(p.get("caption", "") or "")[:50],
|
| 123 |
+
)
|
| 124 |
+
if p.get("media_url"):
|
| 125 |
+
logger.info(" media_url=%s", p["media_url"][:80])
|
| 126 |
+
return posts
|
| 127 |
+
|
| 128 |
+
posts = test_step(f"2. Apify ์์ง (@{test_username}, 5๊ฑด ์ ํ)", step2_apify_scrape)
|
| 129 |
+
if not posts:
|
| 130 |
+
logger.warning("๊ฒ์๋ฌผ 0๊ฑด โ ์ดํ ๋งค์นญ ํ
์คํธ๋ ํฉ์ฑ ๋ฐ์ดํฐ๋ก ์งํ")
|
| 131 |
+
posts = []
|
| 132 |
+
|
| 133 |
+
# โโ 3. SpotMatcher ์ด๊ธฐํ โโ
|
| 134 |
+
def step3_spot_matcher():
|
| 135 |
+
from trend_engine.spot_matcher import SpotMatcher
|
| 136 |
+
matcher = SpotMatcher(sb)
|
| 137 |
+
logger.info(
|
| 138 |
+
" trend_spots: %d๊ฑด, story_spots: %d๊ฑด",
|
| 139 |
+
len(matcher.trend_spots), len(matcher.story_spots),
|
| 140 |
+
)
|
| 141 |
+
# ์ํ ๋งค์นญ ํ
์คํธ
|
| 142 |
+
test_names = ["์นดํ ๋ ์ด์ด๋ ์ ์", "ํ์ฌํด์์์ฅ", "์๋ณ์ค๋ฆ", "์ ์ ์นดํ๊ฑฐ๋ฆฌ"]
|
| 143 |
+
for name in test_names:
|
| 144 |
+
sid = matcher.match(name)
|
| 145 |
+
logger.info(" match('%s') โ %s", name, sid or "(๋ฏธ๋งค์นญ)")
|
| 146 |
+
return matcher
|
| 147 |
+
|
| 148 |
+
matcher = test_step("3. SpotMatcher ์ด๊ธฐํ", step3_spot_matcher)
|
| 149 |
+
if not matcher:
|
| 150 |
+
logger.error("SpotMatcher ์ด๊ธฐํ ์คํจ โ ํ
์คํธ ์ค๋จ")
|
| 151 |
+
sys.exit(1)
|
| 152 |
+
|
| 153 |
+
# โโ 4. Pass 1: ๊ท์น ๊ธฐ๋ฐ ๋งค์นญ โโ
|
| 154 |
+
def step4_pass1_matching():
|
| 155 |
+
from trend_engine.collectors.instagram import InstagramCollector
|
| 156 |
+
from trend_engine.collectors import instagram as ig_mod
|
| 157 |
+
collector = InstagramCollector(sb, spot_matcher=matcher)
|
| 158 |
+
|
| 159 |
+
# ํ
์คํธ์ฉ: ๊ธฐ๊ฐ ํํฐ ํ์ฅ (90์ผ)
|
| 160 |
+
orig_window = ig_mod.TREND_WINDOW_DAYS
|
| 161 |
+
ig_mod.TREND_WINDOW_DAYS = 90
|
| 162 |
+
|
| 163 |
+
if not posts:
|
| 164 |
+
logger.info(" ์ค์ ๊ฒ์๋ฌผ ์์ โ ํฉ์ฑ ๋ฐ์ดํฐ๋ก ํ
์คํธ")
|
| 165 |
+
test_posts = [
|
| 166 |
+
{
|
| 167 |
+
"search_term": "@test",
|
| 168 |
+
"search_type": "profile",
|
| 169 |
+
"location_name": "์นดํ ๋ ์ด์ด๋ ์ ์",
|
| 170 |
+
"likes_count": 500,
|
| 171 |
+
"comments_count": 30,
|
| 172 |
+
"caption": "์ ์ ์นดํ ๋๋ฌด ์ข๋ค #์ ์์นดํ #์ ์ฃผ๋",
|
| 173 |
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 174 |
+
"url": "https://instagram.com/p/test1",
|
| 175 |
+
"hashtags": ["์ ์์นดํ", "์ ์ฃผ๋"],
|
| 176 |
+
"media_url": "",
|
| 177 |
+
"media_type": "Image",
|
| 178 |
+
"_source_account": "test",
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"search_term": "@test",
|
| 182 |
+
"search_type": "profile",
|
| 183 |
+
"location_name": "",
|
| 184 |
+
"likes_count": 200,
|
| 185 |
+
"comments_count": 10,
|
| 186 |
+
"caption": "์๋ณ์ค๋ฆ ์ผ์ถ ๋ณด๊ณ ์์ด์ #์๋ณ์ค๋ฆ #์ ์ฃผ์ฌํ",
|
| 187 |
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 188 |
+
"url": "https://instagram.com/p/test2",
|
| 189 |
+
"hashtags": ["์๋ณ์ค๋ฆ", "์ ์ฃผ์ฌํ"],
|
| 190 |
+
"media_url": "https://example.com/image.jpg",
|
| 191 |
+
"media_type": "Image",
|
| 192 |
+
"_source_account": "test",
|
| 193 |
+
},
|
| 194 |
+
]
|
| 195 |
+
else:
|
| 196 |
+
test_posts = posts
|
| 197 |
+
|
| 198 |
+
spot_metrics, unmatched, match_stats = collector._aggregate_with_unmatched(test_posts)
|
| 199 |
+
|
| 200 |
+
logger.info(" ์ด ๊ฒ์๋ฌผ: %d๊ฑด", len(test_posts))
|
| 201 |
+
logger.info(" Pass 1 ๋งค์นญ: %d๊ฐ ์คํ", len(spot_metrics))
|
| 202 |
+
logger.info(" ๋ฏธ๋งค์นญ (โ Pass 2 ๋์): %d๊ฑด", len(unmatched))
|
| 203 |
+
logger.info(" ๋งค์นญ ํต๊ณ: %s", json.dumps(match_stats, ensure_ascii=False))
|
| 204 |
+
|
| 205 |
+
if spot_metrics:
|
| 206 |
+
logger.info(" ๋งค์นญ๋ ์คํ:")
|
| 207 |
+
for sid, m in spot_metrics.items():
|
| 208 |
+
logger.info(
|
| 209 |
+
" %s: posts=%d, engagement=%d, methods=%s",
|
| 210 |
+
sid, m["post_count"], m["weighted_score"],
|
| 211 |
+
m["match_methods"],
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
ig_mod.TREND_WINDOW_DAYS = orig_window
|
| 215 |
+
return spot_metrics, unmatched, match_stats
|
| 216 |
+
|
| 217 |
+
result4 = test_step("4. Pass 1 โ ๊ท์น ๊ธฐ๋ฐ ๋งค์นญ", step4_pass1_matching)
|
| 218 |
+
if not result4:
|
| 219 |
+
logger.error("Pass 1 ํ
์คํธ ์คํจ")
|
| 220 |
+
sys.exit(1)
|
| 221 |
+
|
| 222 |
+
spot_metrics, unmatched, match_stats = result4
|
| 223 |
+
|
| 224 |
+
# โโ 5. Pass 2: AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ (์ ํ์ ) โโ
|
| 225 |
+
gemini_key = os.environ.get("GEMINI_API_KEY")
|
| 226 |
+
if gemini_key and unmatched:
|
| 227 |
+
def step5_ai_multimodal():
|
| 228 |
+
from trend_engine.collectors.instagram import InstagramCollector
|
| 229 |
+
collector = InstagramCollector(sb, spot_matcher=matcher)
|
| 230 |
+
|
| 231 |
+
# 1๊ฑด๋ง ํ
์คํธ
|
| 232 |
+
test_unmatched = unmatched[:1]
|
| 233 |
+
p = test_unmatched[0]
|
| 234 |
+
logger.info(
|
| 235 |
+
" ํ
์คํธ ๊ฒ์๋ฌผ: type=%s, caption=%.60s...",
|
| 236 |
+
p.get("media_type", "?"),
|
| 237 |
+
(p.get("caption", "") or "")[:60],
|
| 238 |
+
)
|
| 239 |
+
if p.get("media_url"):
|
| 240 |
+
logger.info(" media_url=%s", p["media_url"][:80])
|
| 241 |
+
|
| 242 |
+
ai_count = collector._ai_analyze_content(
|
| 243 |
+
test_unmatched, spot_metrics, match_stats,
|
| 244 |
+
)
|
| 245 |
+
logger.info(" AI ๋งค์นญ ๊ฒฐ๊ณผ: %d๊ฑด", ai_count)
|
| 246 |
+
return ai_count
|
| 247 |
+
|
| 248 |
+
test_step("5. Pass 2 โ Gemini ๋ฉํฐ๋ชจ๋ฌ AI (1๊ฑด)", step5_ai_multimodal)
|
| 249 |
+
elif not gemini_key:
|
| 250 |
+
logger.info("โโโ [SKIP] 5. AI ๋ฉํฐ๋ชจ๋ฌ โ GEMINI_API_KEY ๋ฏธ์ค์ โโโ")
|
| 251 |
+
else:
|
| 252 |
+
logger.info("โโโ [SKIP] 5. AI ๋ฉํฐ๋ชจ๋ฌ โ ๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ ์์ โโโ")
|
| 253 |
+
|
| 254 |
+
# โโ 6. ์ต์ข
์์ฝ โโ
|
| 255 |
+
logger.info("")
|
| 256 |
+
logger.info("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 257 |
+
logger.info(" Instagram v5.1 ํ
์คํธ ๊ฒฐ๊ณผ ์์ฝ")
|
| 258 |
+
logger.info("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 259 |
+
logger.info(" ์ธํ๋ฃจ์ธ์ ๊ณ์ : %d๊ฐ (DB)", len(accounts))
|
| 260 |
+
logger.info(" Apify ์์ง: @%s โ %d๊ฑด", test_username, len(posts))
|
| 261 |
+
logger.info(" SpotMatcher: trend=%d, story=%d",
|
| 262 |
+
len(matcher.trend_spots), len(matcher.story_spots))
|
| 263 |
+
logger.info(" Pass 1 ๋งค์นญ: %d๊ฐ ์คํ", len(spot_metrics))
|
| 264 |
+
logger.info(" ๋งค์นญ ํต๊ณ: %s", json.dumps(match_stats, ensure_ascii=False))
|
| 265 |
+
logger.info(" AI ๋ฉํฐ๋ชจ๋ฌ: %s", "ํ์ฑ" if gemini_key else "๋นํ์ฑ")
|
| 266 |
+
logger.info("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 267 |
+
logger.info(" โ ๏ธ DB ์ ์ฅ ์ ํจ (ํ
์คํธ ๋ชจ๋)")
|
| 268 |
+
logger.info("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
if __name__ == "__main__":
|
| 272 |
+
main()
|
scripts/test_instagram_full.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Instagram Collector v5.1 โ ์ ์ฒด ์ธํ๋ฃจ์ธ์ ์ค์ ํ
์คํธ
|
| 3 |
+
|
| 4 |
+
15๊ฐ ์ธํ๋ฃจ์ธ์ ์ ์ฒด์ ๋ํด ์ค์ ํ์ดํ๋ผ์ธ์ ์คํํฉ๋๋ค.
|
| 5 |
+
DB ์ ์ฅ์ ํ์ง ์๊ณ ๊ฒฐ๊ณผ๋ง ํ์ธํฉ๋๋ค.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python3 backend/scripts/test_instagram_full.py
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
import time
|
| 16 |
+
from datetime import datetime, timedelta, timezone
|
| 17 |
+
|
| 18 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
from dotenv import load_dotenv
|
| 22 |
+
load_dotenv(os.path.join(os.path.dirname(__file__), "..", "..", ".env"))
|
| 23 |
+
except ImportError:
|
| 24 |
+
pass
|
| 25 |
+
|
| 26 |
+
logging.basicConfig(
|
| 27 |
+
level=logging.INFO,
|
| 28 |
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
| 29 |
+
datefmt="%H:%M:%S",
|
| 30 |
+
)
|
| 31 |
+
logger = logging.getLogger("test_instagram_full")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def main():
|
| 35 |
+
from supabase import create_client
|
| 36 |
+
from trend_engine.collectors.instagram import InstagramCollector
|
| 37 |
+
from trend_engine.spot_matcher import SpotMatcher
|
| 38 |
+
|
| 39 |
+
url = os.environ.get("SUPABASE_URL") or os.environ.get("VITE_SUPABASE_URL")
|
| 40 |
+
key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
|
| 41 |
+
if not url or not key:
|
| 42 |
+
logger.error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY ํ๊ฒฝ๋ณ์ ๋๋ฝ")
|
| 43 |
+
sys.exit(1)
|
| 44 |
+
|
| 45 |
+
sb = create_client(url, key)
|
| 46 |
+
|
| 47 |
+
# SpotMatcher ์ด๊ธฐํ
|
| 48 |
+
matcher = SpotMatcher(sb)
|
| 49 |
+
logger.info("SpotMatcher: trend=%d, story=%d", len(matcher.trend_spots), len(matcher.story_spots))
|
| 50 |
+
|
| 51 |
+
# InstagramCollector ์ด๊ธฐํ
|
| 52 |
+
collector = InstagramCollector(sb, spot_matcher=matcher)
|
| 53 |
+
|
| 54 |
+
# โโ 1. ๊ณ์ ๋ก๋ โโ
|
| 55 |
+
accounts = collector._load_influencer_accounts()
|
| 56 |
+
logger.info("์ธํ๋ฃจ์ธ์ ๊ณ์ %d๊ฐ ๋ก๋", len(accounts))
|
| 57 |
+
|
| 58 |
+
# โโ 2. ์ ์ฒด ๊ฒ์๋ฌผ ์์ง โโ
|
| 59 |
+
logger.info("โโโ ์ ์ฒด ์ธํ๋ฃจ์ธ์ ๊ฒ์๋ฌผ ์์ง ์์ โโโ")
|
| 60 |
+
start = time.time()
|
| 61 |
+
all_posts = collector._collect_from_accounts(accounts)
|
| 62 |
+
collect_elapsed = time.time() - start
|
| 63 |
+
logger.info("์์ง ์๋ฃ: %d๊ฑด (%.1f์ด)", len(all_posts), collect_elapsed)
|
| 64 |
+
|
| 65 |
+
# ๊ณ์ ๋ณ ์์ง ํต๊ณ
|
| 66 |
+
account_stats: dict[str, int] = {}
|
| 67 |
+
for p in all_posts:
|
| 68 |
+
acct = p.get("_source_account", "unknown")
|
| 69 |
+
account_stats[acct] = account_stats.get(acct, 0) + 1
|
| 70 |
+
|
| 71 |
+
logger.info("โโโ ๊ณ์ ๋ณ ์์ง ํํฉ โโโ")
|
| 72 |
+
for acct, count in sorted(account_stats.items(), key=lambda x: -x[1]):
|
| 73 |
+
logger.info(" @%-25s โ %d๊ฑด", acct, count)
|
| 74 |
+
|
| 75 |
+
if not all_posts:
|
| 76 |
+
logger.error("์์ง๋ ๊ฒ์๋ฌผ ์์ โ ์ข
๋ฃ")
|
| 77 |
+
sys.exit(1)
|
| 78 |
+
|
| 79 |
+
# โโ 3. Pass 1: ๊ท์น ๊ธฐ๋ฐ ๋งค์นญ โโ
|
| 80 |
+
logger.info("โโโ Pass 1: ๊ท์น ๊ธฐ๋ฐ ๋งค์นญ โโโ")
|
| 81 |
+
spot_metrics, unmatched_posts, match_stats = collector._aggregate_with_unmatched(all_posts)
|
| 82 |
+
|
| 83 |
+
logger.info("Pass 1 ๊ฒฐ๊ณผ:")
|
| 84 |
+
logger.info(" ์ ์ฒด ๊ฒ์๋ฌผ: %d๊ฑด", len(all_posts))
|
| 85 |
+
logger.info(" ๊ธฐ๊ฐ ํํฐ ์ ์ธ: %d๊ฑด (30์ผ ์ด๊ณผ)", match_stats["filtered_old"])
|
| 86 |
+
logger.info(" ์ engagement ์ ์ธ: %d๊ฑด", match_stats["filtered_low_engagement"])
|
| 87 |
+
logger.info(" ์์นํ๊ทธ ๋งค์นญ: %d๊ฑด", match_stats["location_tag"])
|
| 88 |
+
logger.info(" ํด์ํ๊ทธ ๋งค์นญ: %d๊ฑด", match_stats["hashtag"])
|
| 89 |
+
logger.info(" ๋ฏธ๋งค์นญ โ AI ๋์: %d๊ฑด", match_stats["unmatched"])
|
| 90 |
+
logger.info(" Pass 1 ์คํ: %d๊ฐ", len(spot_metrics))
|
| 91 |
+
|
| 92 |
+
if spot_metrics:
|
| 93 |
+
logger.info(" ๋งค์นญ๋ ์คํ:")
|
| 94 |
+
for sid, m in sorted(spot_metrics.items(), key=lambda x: -x[1]["weighted_score"]):
|
| 95 |
+
logger.info(
|
| 96 |
+
" %s: posts=%d, score=%d, methods=%s, accounts=%s",
|
| 97 |
+
sid, m["post_count"], m["weighted_score"],
|
| 98 |
+
m["match_methods"], m["source_accounts"],
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
# โโ 4. Pass 2: AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ (์ ์ฒด) โโ
|
| 102 |
+
ai_matched = 0
|
| 103 |
+
if unmatched_posts and os.environ.get("GEMINI_API_KEY"):
|
| 104 |
+
logger.info("โโโ Pass 2: Gemini 2.5 Flash ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ (%d๊ฑด) โโโ", len(unmatched_posts))
|
| 105 |
+
|
| 106 |
+
# ๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ ๋ฏธ๋์ด ํ์
ํต๊ณ
|
| 107 |
+
image_count = sum(1 for p in unmatched_posts if p.get("media_type") != "Video")
|
| 108 |
+
video_count = sum(1 for p in unmatched_posts if p.get("media_type") == "Video")
|
| 109 |
+
logger.info(" ์ด๋ฏธ์ง: %d๊ฑด, ์์: %d๊ฑด", image_count, video_count)
|
| 110 |
+
|
| 111 |
+
# ๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ ์์ธ (์บก์
๋ฏธ๋ฆฌ๋ณด๊ธฐ)
|
| 112 |
+
for i, p in enumerate(unmatched_posts[:10]):
|
| 113 |
+
logger.info(
|
| 114 |
+
" [%d] @%s type=%s caption=%.60s...",
|
| 115 |
+
i + 1, p.get("_source_account", "?"),
|
| 116 |
+
p.get("media_type", "?"),
|
| 117 |
+
(p.get("caption", "") or "")[:60],
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
start_ai = time.time()
|
| 121 |
+
ai_matched = collector._ai_analyze_content(unmatched_posts, spot_metrics, match_stats)
|
| 122 |
+
ai_elapsed = time.time() - start_ai
|
| 123 |
+
logger.info("AI ๋ถ์ ์๋ฃ: %d๊ฑด ๋งค์นญ (%.1f์ด)", ai_matched, ai_elapsed)
|
| 124 |
+
elif not os.environ.get("GEMINI_API_KEY"):
|
| 125 |
+
logger.warning("GEMINI_API_KEY ๋ฏธ์ค์ โ AI ๋ถ์ ์คํต")
|
| 126 |
+
else:
|
| 127 |
+
logger.info("๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ ์์ โ AI ๋ถ์ ๋ถํ์")
|
| 128 |
+
|
| 129 |
+
# โโ 5. ์ต์ข
๊ฒฐ๊ณผ โโ
|
| 130 |
+
logger.info("")
|
| 131 |
+
logger.info("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 132 |
+
logger.info(" Instagram v5.1 ์ ์ฒด ํ
์คํธ ๊ฒฐ๊ณผ")
|
| 133 |
+
logger.info("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 134 |
+
logger.info(" ์ธํ๋ฃจ์ธ์: %d๊ฐ ๊ณ์ ", len(accounts))
|
| 135 |
+
logger.info(" ์์ง ์ฑ๊ณต: %d๊ฐ ๊ณ์ (%d๊ฑด)", len(account_stats), len(all_posts))
|
| 136 |
+
logger.info(" ์์ง ์์: %.1f์ด", collect_elapsed)
|
| 137 |
+
logger.info(" โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 138 |
+
logger.info(" 30์ผ ์ด๋ด ๊ฒ์๋ฌผ: %d๊ฑด", len(all_posts) - match_stats["filtered_old"])
|
| 139 |
+
logger.info(" engagement โฅ 50: %d๊ฑด",
|
| 140 |
+
len(all_posts) - match_stats["filtered_old"] - match_stats["filtered_low_engagement"])
|
| 141 |
+
logger.info(" โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 142 |
+
logger.info(" Pass 1 (์์นํ๊ทธ): %d๊ฑด", match_stats["location_tag"])
|
| 143 |
+
logger.info(" Pass 1 (ํด์ํ๊ทธ): %d๊ฑด", match_stats["hashtag"])
|
| 144 |
+
logger.info(" Pass 2 (AI ๋ฉํฐ๋ชจ๋ฌ): %d๊ฑด", match_stats.get("ai_matched", 0))
|
| 145 |
+
logger.info(" ์ด ๋งค์นญ ์คํ: %d๊ฐ", len(spot_metrics))
|
| 146 |
+
logger.info(" โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 147 |
+
|
| 148 |
+
if spot_metrics:
|
| 149 |
+
logger.info(" ์ต์ข
๋งค์นญ ์คํ ๋ชฉ๋ก:")
|
| 150 |
+
for sid, m in sorted(spot_metrics.items(), key=lambda x: -x[1]["weighted_score"]):
|
| 151 |
+
logger.info(
|
| 152 |
+
" %s: posts=%d, score=%d",
|
| 153 |
+
sid, m["post_count"], m["weighted_score"],
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
logger.info(" โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 157 |
+
logger.info(" โ ๏ธ DB ์ ์ฅ ์ ํจ (ํ
์คํธ ๋ชจ๋)")
|
| 158 |
+
logger.info("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ")
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
if __name__ == "__main__":
|
| 162 |
+
main()
|
trend_engine/collectors/instagram.py
CHANGED
|
@@ -1,165 +1,64 @@
|
|
| 1 |
"""
|
| 2 |
-
Instagram Collector โ
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
| 14 |
5. ๊ฐ์ค ์ง๊ณ: weighted_score = sum(min(engagement, cap))
|
| 15 |
-
6. spot_trends ํ
์ด๋ธ์ ์ ์ฅ (source =
|
| 16 |
-
|
| 17 |
-
Apify (v3.6):
|
| 18 |
-
- Apify instagram-hashtag-scraper Actor (~30๊ฑด/ํด์ํ๊ทธ)
|
| 19 |
-
- directUrls: Location ID ๋ฐ์๋ ํ์ + JSON ์บ์ฑ + SpotMatcher ์ฐ๋
|
| 20 |
-
|
| 21 |
-
EnsembleData (v4.0):
|
| 22 |
-
- REST API /instagram/hashtag/posts (~63๊ฑด/ํด์ํ๊ทธ)
|
| 23 |
-
- ์ผ๋ณ ์ ๋ ์์ฐ ๊ด๋ฆฌ (BudgetTracker)
|
| 24 |
-
- ํด์ํ๊ทธ ๋กํ
์ด์
์ค์ผ์ค (Free Trial: 1๊ฐ/์ผ, Wood: ์ ์ฒด)
|
| 25 |
"""
|
| 26 |
|
| 27 |
from __future__ import annotations
|
| 28 |
|
| 29 |
import json
|
|
|
|
| 30 |
import os
|
| 31 |
import re
|
| 32 |
-
import
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 36 |
-
# Backend Switch
|
| 37 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 38 |
-
|
| 39 |
-
INSTAGRAM_BACKEND = os.getenv("INSTAGRAM_BACKEND", "apify")
|
| 40 |
|
| 41 |
-
|
| 42 |
-
from apify_client import ApifyClient
|
| 43 |
-
elif INSTAGRAM_BACKEND == "ed":
|
| 44 |
-
import httpx
|
| 45 |
|
| 46 |
-
from trend_engine.place_extractor import PlaceNameExtractor
|
| 47 |
from trend_engine.utils import get_week_period, safe_upsert_spot_trend
|
| 48 |
|
| 49 |
logger = logging.getLogger(__name__)
|
| 50 |
|
| 51 |
|
| 52 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 53 |
-
#
|
| 54 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
AREA_SUB_NAMES = ["ํ๋ด", "๊ณฝ์ง"]
|
| 59 |
-
|
| 60 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 61 |
-
# ์นดํ
๊ณ ๋ฆฌ 1: ์ง์ญ ์ผ๋ฐ ํ
ํ๋ฆฟ
|
| 62 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 63 |
-
|
| 64 |
-
GENERAL_SUFFIXES = [
|
| 65 |
-
"์นดํ", "๋ง์ง", "์ฌํ", "๊ฐ๋ณผ๋งํ๊ณณ", "ํซํ",
|
| 66 |
-
"๊ฐ์ฑ", "๋์ ํธ", "ํด์", "์ฐ์ฑ
", "์ผ๋ชฐ", "์ค์
๋ทฐ",
|
| 67 |
-
]
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
def build_general_hashtags(area: str, aliases: list[str]) -> list[str]:
|
| 71 |
-
"""์ง์ญ๋ช
+ ์ผ๋ฐ ์ ๋ฏธ์ฌ ์กฐํฉ์ผ๋ก ํด์ํ๊ทธ ์์ฑ."""
|
| 72 |
-
tags = []
|
| 73 |
-
for suffix in GENERAL_SUFFIXES:
|
| 74 |
-
tags.append(f"{area}{suffix}")
|
| 75 |
-
for alias in aliases:
|
| 76 |
-
tags.append(alias)
|
| 77 |
-
return tags
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 81 |
-
# ์นดํ
๊ณ ๋ฆฌ 2: ํ์ ์ง๋ช
ํด์ํ๊ทธ
|
| 82 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 83 |
-
|
| 84 |
-
SUB_AREA_SUFFIXES = ["ํด๋ณ", "ํด์์์ฅ"]
|
| 85 |
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
tags = []
|
| 90 |
-
for name in sub_names:
|
| 91 |
-
for suffix in SUB_AREA_SUFFIXES:
|
| 92 |
-
tags.append(f"{name}{suffix}")
|
| 93 |
-
return tags
|
| 94 |
|
| 95 |
-
|
| 96 |
-
#
|
| 97 |
-
|
| 98 |
-
#
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def build_spot_hashtags(supabase_client, limit: int = 15) -> list[str]:
|
| 102 |
-
"""trend_spots ํ
์ด๋ธ์์ ์ธ๊ธฐ ์ฅ์๋ช
์ ํด์ํ๊ทธ ํ๋ณด๋ก ์ถ์ถ."""
|
| 103 |
-
try:
|
| 104 |
-
result = (
|
| 105 |
-
supabase_client.table("trend_spots")
|
| 106 |
-
.select("name, category")
|
| 107 |
-
.in_("category", ["๊ด๊ด๋ช
์", "์นดํ", "๋ฌธํ์์ค"])
|
| 108 |
-
.execute()
|
| 109 |
-
)
|
| 110 |
-
except Exception as e:
|
| 111 |
-
logger.warning("trend_spots ์กฐํ ์คํจ (๊ณ ์ ๋ช
์ฌ ํด์ํ๊ทธ): %s", e)
|
| 112 |
-
return []
|
| 113 |
-
|
| 114 |
-
spot_names = []
|
| 115 |
-
for row in result.data or []:
|
| 116 |
-
name = row.get("name", "")
|
| 117 |
-
if not name or len(name) < 2 or len(name) > 15:
|
| 118 |
-
continue
|
| 119 |
-
if " " in name:
|
| 120 |
-
continue
|
| 121 |
-
spot_names.append(name)
|
| 122 |
-
|
| 123 |
-
spot_names.sort(key=len, reverse=True)
|
| 124 |
-
return spot_names[:limit]
|
| 125 |
|
| 126 |
|
| 127 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 128 |
-
#
|
| 129 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 130 |
|
| 131 |
-
|
| 132 |
-
def build_all_hashtags(supabase_client) -> list[str]:
|
| 133 |
-
"""3๊ฐ ์นดํ
๊ณ ๋ฆฌ๋ฅผ ํฉ์ฐํ๊ณ ์ค๋ณต ์ ๊ฑฐํ ์ต์ข
ํด์ํ๊ทธ ๋ฆฌ์คํธ."""
|
| 134 |
-
general = build_general_hashtags(AREA_NAME, AREA_ALIASES)
|
| 135 |
-
sub_area = build_sub_area_hashtags(AREA_SUB_NAMES)
|
| 136 |
-
spots = build_spot_hashtags(supabase_client, limit=15)
|
| 137 |
-
|
| 138 |
-
seen: set[str] = set()
|
| 139 |
-
all_tags: list[str] = []
|
| 140 |
-
for tag in general + sub_area + spots:
|
| 141 |
-
if tag not in seen:
|
| 142 |
-
seen.add(tag)
|
| 143 |
-
all_tags.append(tag)
|
| 144 |
-
|
| 145 |
-
logger.info(
|
| 146 |
-
"ํด์ํ๊ทธ ๊ตฌ์ฑ: ์ผ๋ฐ %d + ํ์์ง๋ช
%d + ๊ณ ์ ๋ช
์ฌ %d = ์ด %d๊ฐ",
|
| 147 |
-
len(general), len(sub_area), len(spots), len(all_tags),
|
| 148 |
-
)
|
| 149 |
-
return all_tags
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 153 |
-
# Shared: ๊ณตํต ์ค์
|
| 154 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 155 |
-
|
| 156 |
-
# ๊ธฐ๊ฐ ํํฐ โ ์ง์ 1์ฃผ + 2~3์ผ ๋ฒํผ
|
| 157 |
-
TREND_WINDOW_DAYS = 10
|
| 158 |
-
|
| 159 |
-
# ๊ฐ์ค ์ง๊ณ ์์
|
| 160 |
-
MIN_ENGAGEMENT = 10 # ์ต์ engagement ์๊ณ๊ฐ (๋ด/์คํธ ํํฐ)
|
| 161 |
-
ENGAGEMENT_CAP = 1000 # ๋จ์ผ ๊ฒ์๋ฌผ engagement ์ํ (์ธํ๋ฃจ์ธ์ ์ง๋ฐฐ ๋ฐฉ์ง)
|
| 162 |
-
|
| 163 |
# ๊ตญ๊ฐ/๊ณตํญ ๋จ์๋ง BLACKLIST
|
| 164 |
LOCATION_BLACKLIST = frozenset({
|
| 165 |
"South Korea", "Korea", "๋ํ๋ฏผ๊ตญ", "ํ๊ตญ",
|
|
@@ -204,683 +103,293 @@ def clean_location_name(name: str) -> str:
|
|
| 204 |
HASHTAG_RE = re.compile(r"#([\w๊ฐ-ํฃ]{2,30})")
|
| 205 |
|
| 206 |
|
| 207 |
-
#
|
| 208 |
-
#
|
| 209 |
-
#
|
| 210 |
-
|
| 211 |
-
# ํด์ํ๊ทธ๋น ์ต๋ ๊ฒ์๋ฌผ ์ (์ธ์คํ๊ทธ๋จ ๊ณต๊ฐ API 1ํ์ด์ง = ~30๊ฑด)
|
| 212 |
-
RESULTS_LIMIT_PER_HASHTAG = 30
|
| 213 |
-
|
| 214 |
-
# Actor ์ค์
|
| 215 |
-
ACTOR_ID = "apify/instagram-hashtag-scraper"
|
| 216 |
-
SEARCH_ACTOR_ID = "apify/instagram-search-scraper"
|
| 217 |
-
|
| 218 |
-
# Location ID ์บ์ ํ์ผ ๊ฒฝ๋ก
|
| 219 |
-
LOCATION_CACHE_FILE = os.path.join(
|
| 220 |
-
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
|
| 221 |
-
"data", "instagram_location_ids.json",
|
| 222 |
-
)
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
def discover_location_ids(
    apify_client: ApifyClient,
    spot_names: list[str],
    area: str = "์ ์",
) -> dict[str, dict]:
    """Look up an Instagram location page for each spot name.

    For every entry of ``spot_names`` the ``SEARCH_ACTOR_ID`` actor is run
    with the query ``"<name> <area>"`` (``searchType="place"``, at most three
    results).  The first returned item that carries both an id
    (``locationId`` or ``id``) and a ``name`` is kept.

    Args:
        apify_client: Client used to start the actor and read its dataset.
        spot_names: Spot names to search for.
        area: Text appended to every query.

    Returns:
        Mapping of ``https://www.instagram.com/explore/locations/<id>/`` to
        ``{"instagram_name": <name from Instagram>, "search_query": <spot name>}``.
        Spots whose search fails or yields no usable item are simply absent.
    """
    # NOTE(review): the non-ASCII literals in this function (default ``area``
    # and the log messages) are mojibake in this copy of the file -- they look
    # like UTF-8 Korean decoded with a Thai code page, with bytes lost.  They
    # are reproduced unchanged; restore them from the upstream source.
    location_map: dict[str, dict] = {}

    for name in spot_names:
        try:
            run = apify_client.actor(SEARCH_ACTOR_ID).call(
                run_input={
                    "search": f"{name} {area}",
                    "searchType": "place",
                    "resultsLimit": 3,
                },
                timeout_secs=60,
            )
            # Stream the dataset: only the first usable item is needed, so
            # there is no reason to materialise every row first.
            for item in apify_client.dataset(run["defaultDatasetId"]).iterate_items():
                loc_id = item.get("locationId") or item.get("id")
                loc_name = item.get("name", "")
                if not (loc_id and loc_name):
                    continue
                url = f"https://www.instagram.com/explore/locations/{loc_id}/"
                location_map[url] = {
                    "instagram_name": loc_name,
                    "search_query": name,
                }
                logger.info("Location ID ํ๋ณด: %s โ %s (%s)", name, loc_id, loc_name)
                break  # first match only
        except Exception as e:
            # Best effort per spot: one failed search must not abort the rest.
            logger.warning("Location ID ํ์ ์คํจ: %s โ %s", name, e)
            continue

    logger.info("Location ID ํ์ ์๋ฃ: %d/%d๊ฐ ์ฑ๊ณต", len(location_map), len(spot_names))
    return location_map
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
def load_or_discover_locations(
    apify_client: ApifyClient, supabase_client, max_age_days: int = 30,
) -> dict[str, dict]:
    """Return the location map from the JSON cache, rediscovering it if stale.

    The cache file ``LOCATION_CACHE_FILE`` holds ``{"updated_at": <ISO
    timestamp>, "locations": {...}}``.  It is used when it can be read and is
    younger than ``max_age_days``.  Otherwise spot names are fetched with
    ``build_spot_hashtags(supabase_client, limit=15)``, resolved through
    ``discover_location_ids`` and written back to the cache file.

    Args:
        apify_client: Passed through to ``discover_location_ids``.
        supabase_client: Passed through to ``build_spot_hashtags``.
        max_age_days: Cache entries at least this many days old are ignored.

    Returns:
        The location map; ``{}`` when there are no spot names to look up.
    """
    # NOTE(review): the non-ASCII log messages below are mojibake in this copy
    # of the file (bytes were lost); they are reproduced unchanged and should
    # be restored from the upstream source.
    if os.path.exists(LOCATION_CACHE_FILE):
        try:
            # Explicit UTF-8: the file is written with ensure_ascii=False.
            with open(LOCATION_CACHE_FILE, encoding="utf-8") as f:
                cached = json.load(f)
            updated_at = cached.get("updated_at", "")
            if updated_at:
                updated = datetime.fromisoformat(updated_at)
                if updated.tzinfo is None:
                    # A naive timestamp cannot be subtracted from an aware
                    # "now"; treat it as UTC, which is what this function writes.
                    updated = updated.replace(tzinfo=timezone.utc)
                if (datetime.now(timezone.utc) - updated).days < max_age_days:
                    locations = cached.get("locations", {})
                    logger.info("Location ID ์บ์ ์ฌ์ฉ (%d๊ฐ, %s)", len(locations), updated_at[:10])
                    return locations
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # ValueError covers JSONDecodeError, UnicodeDecodeError and a bad
            # ISO string; TypeError/AttributeError cover a cache of the wrong
            # shape.  Any of these just means "rebuild the cache".
            logger.warning("Location ID ์บ์ ํ์ฑ ์คํจ: %s", e)

    # Cache missing, unreadable or expired: discover again.
    spot_names = build_spot_hashtags(supabase_client, limit=15)
    if not spot_names:
        logger.warning("๊ณ ์ ๋ช์ฌ ํด์ํ๊ทธ 0๊ฐ โ Location ID ํ์ ์คํต")
        return {}

    locations = discover_location_ids(apify_client, spot_names)

    # Persist the result; failing to write the cache is not fatal.
    try:
        os.makedirs(os.path.dirname(LOCATION_CACHE_FILE), exist_ok=True)
        with open(LOCATION_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "updated_at": datetime.now(timezone.utc).isoformat(),
                    "locations": locations,
                },
                f,
                ensure_ascii=False,
                indent=2,
            )
        logger.info("Location ID ์บ์ ์ ์ฅ: %s (%d๊ฐ)", LOCATION_CACHE_FILE, len(locations))
    except OSError as e:
        logger.warning("Location ID ์บ์ ์ ์ฅ ์คํจ: %s", e)

    return locations
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
def build_direct_urls_with_spot_ids(
    location_map: dict[str, dict], spot_matcher,
) -> dict[str, str]:
    """Map each location URL to the spot id matched from its search query.

    Args:
        location_map: ``{url: {"search_query": ..., ...}}``.
        spot_matcher: Object whose ``match(name)`` returns a spot id or a
            falsy value.  When it is falsy nothing is matched.

    Returns:
        ``{url: spot_id}`` for the entries whose ``search_query`` matched.
    """
    # NOTE(review): the non-ASCII log messages below are mojibake in this copy
    # of the file (bytes were lost); they are reproduced unchanged and should
    # be restored from the upstream source.
    url_to_spot: dict[str, str] = {}

    # The matcher check does not depend on the entry, so do it once.
    if spot_matcher:
        for url, info in location_map.items():
            search_query = info.get("search_query", "")
            if not search_query:
                continue
            spot_id = spot_matcher.match(search_query)
            if spot_id:
                url_to_spot[url] = spot_id
                logger.info("directUrl ๋งคํ: %s โ %s", search_query, spot_id)
            else:
                logger.debug("directUrl ๋งคํ ์คํจ: %s", search_query)

    logger.info("directUrl ๋งคํ ์๋ฃ: %d/%d๊ฐ ์ฑ๊ณต", len(url_to_spot), len(location_map))
    return url_to_spot
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 334 |
-
# EnsembleData-Only: API, Budget, Post Conversion
|
| 335 |
-
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 336 |
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
"
|
| 343 |
-
"
|
| 344 |
-
"
|
| 345 |
-
"
|
| 346 |
-
"
|
| 347 |
-
"
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
]
|
| 350 |
|
| 351 |
|
| 352 |
-
def get_daily_hashtags(all_hashtags: list[str], daily_budget: int) -> list[str]:
    """Return the hashtags to collect today for the given daily unit budget.

    * ``daily_budget >= 1500``: ``all_hashtags`` is returned as is (the same
      list object, not a copy).
    * otherwise: a weekday rotation over ``PRIORITY_HASHTAGS`` -- Monday picks
      index 0, Tuesday index 1, and so on.  Sunday, and any weekday beyond
      the end of ``PRIORITY_HASHTAGS``, yields ``[]``.

    Args:
        all_hashtags: Full hashtag list.
        daily_budget: Daily unit budget.

    Returns:
        The list of hashtags to collect today (possibly empty).
    """
    if daily_budget >= 1500:
        return all_hashtags

    day_of_week = date.today().weekday()  # 0 = Monday ... 6 = Sunday

    # NOTE(review): the non-ASCII literals below are mojibake in this copy of
    # the file (bytes were lost) and are reproduced unchanged.  The weekday
    # lookup string was presumably six Korean weekday initials (Mon-Sat); in
    # its damaged form indexing it no longer yields one weekday character.
    # Restore both literals from the upstream source.
    if day_of_week == 6:
        logger.info("์ผ์์ผ โ ์์ง ์คํต (์ง๊ณ ์ ์ฉ)")
        return []

    if day_of_week >= len(PRIORITY_HASHTAGS):
        return []

    tag = PRIORITY_HASHTAGS[day_of_week]
    logger.info("Free Trial ๋กํ์ด์: %s์์ผ โ #%s", "์ํ์๋ชฉ๊ธํ "[day_of_week], tag)
    return [tag]
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
class BudgetTracker:
    """Track how many API units have been used today.

    Usage is persisted to a small JSON state file
    (``{"date": ..., "used": ..., "limit": ...}``) so that a re-run on the
    same day continues from the recorded usage instead of starting at zero.
    """

    def __init__(self, daily_limit: int, state_file: str | None = None):
        """Create a tracker and load today's usage from the state file.

        Args:
            daily_limit: Maximum number of units allowed per day.
            state_file: Path of the JSON state file.  Defaults to
                ``ed_budget_state.json`` inside ``$BUDGET_STATE_DIR``
                (``/tmp`` when the variable is unset).
        """
        self.daily_limit = daily_limit
        self.state_file = state_file or os.path.join(
            os.environ.get("BUDGET_STATE_DIR", "/tmp"),
            "ed_budget_state.json",
        )
        self.used_today = self._load_today_usage()

    @property
    def remaining(self) -> int:
        """Units still available today; never negative."""
        return max(0, self.daily_limit - self.used_today)

    def can_afford(self, estimated_posts: int = 70) -> bool:
        """Return True when at least ``estimated_posts`` units remain."""
        return self.remaining >= estimated_posts

    def record(self, units: int) -> None:
        """Add ``units`` to today's usage, persist it and log the totals."""
        self.used_today += units
        self._save_state()
        # NOTE(review): this log message is mojibake in this copy of the file
        # (bytes were lost); reproduced unchanged, restore from upstream.
        logger.info(
            "์ ๋ ์ฌ์ฉ: +%d (์ค๋ ํฉ๊ณ: %d/%d, ์์ฌ: %d)",
            units, self.used_today, self.daily_limit, self.remaining,
        )

    def _load_today_usage(self) -> int:
        """Return the usage stored for today's date, or 0.

        A missing, unreadable, malformed or out-of-date state file all count
        as "nothing used yet" -- the tracker must never fail to construct
        because of a bad state file.
        """
        try:
            with open(self.state_file, encoding="utf-8") as f:
                state = json.load(f)
        except (OSError, ValueError):
            # OSError: missing/unreadable file; ValueError: invalid JSON.
            return 0

        if not isinstance(state, dict):
            return 0
        if state.get("date") != date.today().isoformat():
            return 0
        used = state.get("used", 0)
        return used if isinstance(used, int) else 0

    def _save_state(self) -> None:
        """Write today's usage to the state file (best effort)."""
        directory = os.path.dirname(self.state_file)
        try:
            # A bare filename has an empty dirname, and os.makedirs("")
            # raises -- which used to make every save fail silently.
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(self.state_file, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "date": date.today().isoformat(),
                        "used": self.used_today,
                        "limit": self.daily_limit,
                    },
                    f,
                )
        except OSError as e:
            # NOTE(review): mojibake log message, reproduced unchanged.
            logger.warning("์์ฐ ์ํ ์ ์ฅ ์คํจ: %s", e)
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
def _extract_caption(node: dict) -> str:
|
| 430 |
-
"""EnsembleData node์์ ์บก์
ํ
์คํธ ์ถ์ถ."""
|
| 431 |
-
edges = node.get("edge_media_to_caption", {}).get("edges", [])
|
| 432 |
-
if edges:
|
| 433 |
-
return edges[0].get("node", {}).get("text", "")
|
| 434 |
-
return ""
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
def _extract_likes(node: dict) -> int:
|
| 438 |
-
"""์ข์์ ์ ์ถ์ถ (null/hidden likes ์ฒ๋ฆฌ)."""
|
| 439 |
-
likes = node.get("edge_liked_by", {}).get("count")
|
| 440 |
-
if likes is not None:
|
| 441 |
-
return max(likes, 0)
|
| 442 |
-
likes = node.get("edge_media_preview_like", {}).get("count")
|
| 443 |
-
if likes is not None:
|
| 444 |
-
return max(likes, 0)
|
| 445 |
-
return 0
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
def _convert_node_to_post(node: dict, hashtag: str) -> dict:
    """Convert one API post node into the collector's flat post dict.

    Args:
        node: Raw post node.
        hashtag: The hashtag whose search produced this node; stored as
            ``search_term``.

    Returns:
        A dict with ``search_term``, ``search_type`` (always ``"hashtag"``),
        ``location_name``, ``likes_count``, ``comments_count``, ``caption``,
        ``timestamp`` (ISO-8601 in UTC, or ``""``), ``url``, ``hashtags``
        (extracted from the caption with ``HASHTAG_RE``) and the raw location
        details under ``_location_lat`` / ``_location_lng`` /
        ``_location_address`` / ``_location_pk``.
    """
    # Guard against a None caption so the regex below always gets a str.
    caption = _extract_caption(node) or ""
    location = node.get("location") or {}

    timestamp_iso = ""
    ts = node.get("taken_at_timestamp")
    if ts:
        try:
            timestamp_iso = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
        except (ValueError, OSError, OverflowError, TypeError):
            # Out-of-range or non-numeric timestamp: leave it empty.
            pass

    shortcode = node.get("shortcode", "")
    # The comment edge may be missing or null; both count as zero comments.
    comments_count = (node.get("edge_media_to_comment") or {}).get("count", 0) or 0

    return {
        "search_term": hashtag,
        "search_type": "hashtag",
        "location_name": location.get("name", ""),
        "likes_count": _extract_likes(node),
        "comments_count": comments_count,
        "caption": caption,
        "timestamp": timestamp_iso,
        "url": f"https://www.instagram.com/p/{shortcode}/" if shortcode else "",
        "hashtags": HASHTAG_RE.findall(caption),
        "_location_lat": location.get("lat"),
        "_location_lng": location.get("lng"),
        "_location_address": location.get("address", ""),
        "_location_pk": location.get("pk"),
    }
|
| 481 |
-
|
| 482 |
-
|
| 483 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 484 |
-
# InstagramCollector โ
|
| 485 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 486 |
|
| 487 |
|
| 488 |
class InstagramCollector:
|
| 489 |
-
"""Instagram ์์ง๊ธฐ (
|
| 490 |
|
| 491 |
-
|
|
|
|
|
|
|
| 492 |
"""
|
| 493 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
def __init__(self, supabase_client, spot_matcher=None):
|
| 495 |
self.supabase = supabase_client
|
| 496 |
self.spot_matcher = spot_matcher
|
| 497 |
-
self.
|
| 498 |
-
|
| 499 |
-
if INSTAGRAM_BACKEND == "apify":
|
| 500 |
-
self.apify = ApifyClient(os.environ["APIFY_API_TOKEN"])
|
| 501 |
-
elif INSTAGRAM_BACKEND == "ed":
|
| 502 |
-
self.token = os.environ.get("ENSEMBLEDATA_TOKEN", "")
|
| 503 |
-
if not self.token:
|
| 504 |
-
raise ValueError("ENSEMBLEDATA_TOKEN ํ๊ฒฝ๋ณ์๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค")
|
| 505 |
-
daily_limit = int(os.environ.get("ED_DAILY_UNIT_BUDGET", "50"))
|
| 506 |
-
self.budget = BudgetTracker(daily_limit=daily_limit)
|
| 507 |
-
self.http = httpx.Client(timeout=API_TIMEOUT_SECS)
|
| 508 |
-
else:
|
| 509 |
-
raise ValueError(
|
| 510 |
-
f"Unknown INSTAGRAM_BACKEND: {INSTAGRAM_BACKEND!r} "
|
| 511 |
-
"(expected 'apify' or 'ed')"
|
| 512 |
-
)
|
| 513 |
-
|
| 514 |
-
def _get_source_name(self) -> str:
    """Return the value for the DB ``source`` column.

    The value is ``"instagram_"`` followed by the module-level
    ``INSTAGRAM_BACKEND`` setting.
    """
    return f"instagram_{INSTAGRAM_BACKEND}"
|
| 517 |
|
| 518 |
# ==================================================================
|
| 519 |
# Main Entry Point
|
| 520 |
# ==================================================================
|
| 521 |
|
| 522 |
def run(self) -> dict:
|
| 523 |
-
"""Instagram ์์ง ํ์ดํ๋ผ์ธ
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
# ==================================================================
|
| 532 |
-
# Apify Backend (v3.6)
|
| 533 |
-
# ==================================================================
|
| 534 |
-
|
| 535 |
-
ACTOR_MEMORY_MB = 1024
|
| 536 |
-
ACTOR_TIMEOUT_SECS = 120
|
| 537 |
-
|
| 538 |
-
def _run_apify(self) -> dict:
|
| 539 |
-
"""Apify ๋ฐฑ์๋ ์์ง ํ์ดํ๋ผ์ธ.
|
| 540 |
-
|
| 541 |
-
[1] ํด์ํ๊ทธ ๋ฆฌ์คํธ ๋์ ์์ฑ (์ผ๋ฐ + ํ์์ง๋ช
+ ๊ณ ์ ๋ช
์ฌ)
|
| 542 |
-
[2] ํด์ํ๊ทธ ๊ฒ์์ผ๋ก ๊ฒ์๋ฌผ ์์ง
|
| 543 |
-
[3] directUrls๋ก ์ฃผ์ ์ฅ์ ๊ฒ์๋ฌผ ์ถ๊ฐ ์์ง
|
| 544 |
-
[4] 3๋จ๊ณ ๋งค์นญ + ๊ธฐ๊ฐ ํํฐ + ์๊ณ๊ฐ ํํฐ + ๊ฐ์ค ์ง๊ณ
|
| 545 |
-
[5] DB ์ ์ฅ
|
| 546 |
"""
|
| 547 |
-
logger.info("=== Instagram ์์ง ์์ (
|
| 548 |
|
| 549 |
-
# [1]
|
| 550 |
-
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
-
|
| 553 |
-
search_posts = self._collect_posts_apify(hashtags)
|
| 554 |
|
| 555 |
-
# [
|
| 556 |
-
|
| 557 |
-
direct_urls = build_direct_urls_with_spot_ids(location_map, self.spot_matcher)
|
| 558 |
-
direct_posts = self._collect_direct_location_posts(direct_urls)
|
| 559 |
-
|
| 560 |
-
# [4] ํตํฉ ์ง๊ณ
|
| 561 |
-
all_posts = search_posts + direct_posts
|
| 562 |
|
| 563 |
if not all_posts:
|
| 564 |
logger.warning("์์ง๋ ๊ฒ์๋ฌผ ์์ โ ์ข
๋ฃ")
|
| 565 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
|
| 567 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
|
| 569 |
-
#
|
|
|
|
|
|
|
|
|
|
| 570 |
saved = self._save_to_db(spot_metrics)
|
| 571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
result = {
|
| 573 |
-
"
|
| 574 |
-
"
|
| 575 |
"total_posts": len(all_posts),
|
| 576 |
-
"
|
| 577 |
-
"
|
| 578 |
-
"
|
| 579 |
"saved": saved,
|
| 580 |
}
|
| 581 |
-
logger.info("=== Instagram ์์ง ์๋ฃ (
|
| 582 |
return result
|
| 583 |
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
if len(posts) == 0:
|
| 589 |
-
logger.info("[%s] ๊ฒฐ๊ณผ 0๊ฑด โ 1ํ ์ฌ์๋", label)
|
| 590 |
-
posts = self._execute_hashtag_actor(hashtag, f"{label} retry")
|
| 591 |
-
|
| 592 |
-
return posts
|
| 593 |
|
| 594 |
-
def
|
| 595 |
-
"""
|
| 596 |
-
run_input = {
|
| 597 |
-
"hashtags": [hashtag],
|
| 598 |
-
"resultsLimit": RESULTS_LIMIT_PER_HASHTAG,
|
| 599 |
-
"proxy": {
|
| 600 |
-
"useApifyProxy": True,
|
| 601 |
-
"apifyProxyGroups": ["RESIDENTIAL"],
|
| 602 |
-
},
|
| 603 |
-
}
|
| 604 |
|
|
|
|
|
|
|
| 605 |
try:
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
|
|
|
|
|
|
|
|
|
| 610 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
except Exception as e:
|
| 612 |
-
logger.warning("
|
| 613 |
-
return []
|
| 614 |
-
|
| 615 |
-
if run.get("status") not in ("SUCCEEDED", None):
|
| 616 |
-
logger.warning(
|
| 617 |
-
"Apify Actor ๋น์ ์ ์ข
๋ฃ [hashtag=%s]: status=%s",
|
| 618 |
-
hashtag, run.get("status"),
|
| 619 |
-
)
|
| 620 |
-
return []
|
| 621 |
-
|
| 622 |
-
posts: list[dict] = []
|
| 623 |
-
dataset_id = run["defaultDatasetId"]
|
| 624 |
-
|
| 625 |
-
for item in self.apify.dataset(dataset_id).iterate_items():
|
| 626 |
-
likes = item.get("likesCount", 0)
|
| 627 |
-
if likes == -1:
|
| 628 |
-
likes = 0
|
| 629 |
-
|
| 630 |
-
hashtags = item.get("hashtags") or []
|
| 631 |
-
if not hashtags:
|
| 632 |
-
caption = item.get("caption", "")
|
| 633 |
-
if caption:
|
| 634 |
-
hashtags = HASHTAG_RE.findall(caption)
|
| 635 |
-
|
| 636 |
-
posts.append({
|
| 637 |
-
"search_term": hashtag,
|
| 638 |
-
"search_type": "hashtag",
|
| 639 |
-
"location_name": item.get("locationName", ""),
|
| 640 |
-
"likes_count": likes,
|
| 641 |
-
"comments_count": item.get("commentsCount", 0),
|
| 642 |
-
"caption": item.get("caption", ""),
|
| 643 |
-
"timestamp": item.get("timestamp", ""),
|
| 644 |
-
"url": item.get("url", ""),
|
| 645 |
-
"hashtags": hashtags,
|
| 646 |
-
})
|
| 647 |
-
|
| 648 |
-
logger.info("[%s] #%s โ %d๊ฑด", label, hashtag, len(posts))
|
| 649 |
-
return posts
|
| 650 |
-
|
| 651 |
-
def _collect_posts_apify(self, hashtags: list[str]) -> list[dict]:
|
| 652 |
-
"""ํด์ํ๊ทธ ๋ฆฌ์คํธ์์ ๊ฒ์๋ฌผ์ ์์งํ๋ค (Apify). URL ๊ธฐ๋ฐ ์ค๋ณต ์ ๊ฑฐ."""
|
| 653 |
-
logger.info(
|
| 654 |
-
"Apify Actor ์คํ ์์: %d๊ฐ ํด์ํ๊ทธ (resultsLimit=%d)",
|
| 655 |
-
len(hashtags), RESULTS_LIMIT_PER_HASHTAG,
|
| 656 |
-
)
|
| 657 |
-
|
| 658 |
-
all_posts: list[dict] = []
|
| 659 |
-
|
| 660 |
-
for i, tag in enumerate(hashtags, 1):
|
| 661 |
-
label = f"{i}/{len(hashtags)}"
|
| 662 |
-
posts = self._scrape_hashtag(tag, label)
|
| 663 |
-
all_posts.extend(posts)
|
| 664 |
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
return unique_posts
|
| 668 |
-
|
| 669 |
-
def _collect_direct_location_posts(self, direct_urls: dict[str, str]) -> list[dict]:
|
| 670 |
-
"""์ธ์คํ๊ทธ๋จ ์์น ํ์ด์ง์์ ๊ฒ์๋ฌผ์ ์์งํ๋ค.
|
| 671 |
-
|
| 672 |
-
๊ฐ ์์น ํ์ด์ง์ spot_id๊ฐ ์ด๋ฏธ ํ์ ๋์ด ์์ผ๋ฏ๋ก
|
| 673 |
-
๊ฒ์๋ฌผ์ _direct_spot_id๋ฅผ ์ฒจ๋ถํ๋ค.
|
| 674 |
-
"""
|
| 675 |
-
if not direct_urls:
|
| 676 |
-
logger.info("directUrls ์์ โ ๋ฐฉํฅ B ์คํต")
|
| 677 |
-
return []
|
| 678 |
-
|
| 679 |
-
logger.info("directUrls ์์ง ์์: %d๊ฐ ์์น ํ์ด์ง", len(direct_urls))
|
| 680 |
-
all_posts: list[dict] = []
|
| 681 |
-
|
| 682 |
-
for url, spot_id in direct_urls.items():
|
| 683 |
-
run_input = {
|
| 684 |
-
"directUrls": [url],
|
| 685 |
-
"resultsLimit": RESULTS_LIMIT_PER_HASHTAG,
|
| 686 |
-
"proxy": {
|
| 687 |
-
"useApifyProxy": True,
|
| 688 |
-
"apifyProxyGroups": ["RESIDENTIAL"],
|
| 689 |
-
},
|
| 690 |
-
}
|
| 691 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 692 |
try:
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
memory_mbytes=self.ACTOR_MEMORY_MB,
|
| 697 |
-
)
|
| 698 |
except Exception as e:
|
| 699 |
-
logger.
|
| 700 |
-
continue
|
| 701 |
-
|
| 702 |
-
if run.get("status") not in ("SUCCEEDED", None):
|
| 703 |
-
logger.warning("directUrls Actor ๋น์ ์ ์ข
๋ฃ [%s]: status=%s", url, run.get("status"))
|
| 704 |
-
continue
|
| 705 |
-
|
| 706 |
-
dataset_id = run["defaultDatasetId"]
|
| 707 |
-
count = 0
|
| 708 |
-
for item in self.apify.dataset(dataset_id).iterate_items():
|
| 709 |
-
likes = item.get("likesCount", 0)
|
| 710 |
-
if likes == -1:
|
| 711 |
-
likes = 0
|
| 712 |
-
|
| 713 |
-
all_posts.append({
|
| 714 |
-
"search_term": "__direct__",
|
| 715 |
-
"search_type": "direct",
|
| 716 |
-
"location_name": item.get("locationName", ""),
|
| 717 |
-
"likes_count": likes,
|
| 718 |
-
"comments_count": item.get("commentsCount", 0),
|
| 719 |
-
"caption": item.get("caption", ""),
|
| 720 |
-
"timestamp": item.get("timestamp", ""),
|
| 721 |
-
"url": item.get("url", ""),
|
| 722 |
-
"hashtags": item.get("hashtags") or [],
|
| 723 |
-
"_direct_spot_id": spot_id,
|
| 724 |
-
})
|
| 725 |
-
count += 1
|
| 726 |
-
|
| 727 |
-
logger.info("directUrls [%s] โ %d๊ฑด (spot_id=%s)", url, count, spot_id)
|
| 728 |
|
| 729 |
-
logger.info("
|
| 730 |
-
return all_posts
|
| 731 |
|
| 732 |
# ==================================================================
|
| 733 |
-
#
|
| 734 |
# ==================================================================
|
| 735 |
|
| 736 |
-
def
|
| 737 |
-
"""
|
| 738 |
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
[3] EnsembleData API๋ก ๊ฒ์๋ฌผ ์์ง
|
| 742 |
-
[4] 3๋จ๊ณ ๋งค์นญ + ๊ฐ์ค ์ง๊ณ
|
| 743 |
-
[5] DB ์ ์ฅ
|
| 744 |
"""
|
| 745 |
-
|
| 746 |
logger.info(
|
| 747 |
-
"
|
| 748 |
-
|
| 749 |
)
|
| 750 |
|
| 751 |
-
|
| 752 |
-
all_hashtags = build_all_hashtags(self.supabase)
|
| 753 |
-
|
| 754 |
-
# [2] ์์ฐ์ ๋ง์ถฐ ์ค๋์ ํด์ํ๊ทธ ์ ์
|
| 755 |
-
hashtags = get_daily_hashtags(all_hashtags, self.budget.daily_limit)
|
| 756 |
-
|
| 757 |
-
if not hashtags:
|
| 758 |
-
logger.info("์ค๋ ์์งํ ํด์ํ๊ทธ ์์ โ ์ข
๋ฃ")
|
| 759 |
-
return {
|
| 760 |
-
"total_posts": 0, "spots_matched": 0, "saved": 0,
|
| 761 |
-
"backend": "ed", "reason": "no_hashtags_today",
|
| 762 |
-
}
|
| 763 |
-
|
| 764 |
-
# [3] ๊ฒ์๋ฌผ ์์ง
|
| 765 |
-
posts = self._collect_posts_ed(hashtags)
|
| 766 |
-
|
| 767 |
-
if not posts:
|
| 768 |
-
logger.warning("์์ง๋ ๊ฒ์๋ฌผ ์์ โ ์ข
๋ฃ")
|
| 769 |
-
return {"total_posts": 0, "spots_matched": 0, "saved": 0, "backend": "ed"}
|
| 770 |
|
| 771 |
-
#
|
| 772 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
|
| 774 |
-
|
| 775 |
-
|
|
|
|
| 776 |
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
"
|
| 781 |
-
"
|
| 782 |
-
"
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
}
|
| 787 |
-
logger.info("=== Instagram ์์ง ์๋ฃ (v4.0 โ EnsembleData): %s ===", result)
|
| 788 |
-
return result
|
| 789 |
-
|
| 790 |
-
def _fetch_hashtag_posts(self, hashtag: str, label: str) -> list[dict]:
|
| 791 |
-
"""EnsembleData API๋ก ํด์ํ๊ทธ ๊ฒ์๋ฌผ์ ๊ฐ์ ธ์จ๋ค.
|
| 792 |
-
|
| 793 |
-
top_posts + recent_posts๋ฅผ ํตํฉํ์ฌ ๋ฐํํ๋ค.
|
| 794 |
-
"""
|
| 795 |
-
if not self.budget.can_afford(estimated_posts=30):
|
| 796 |
-
logger.warning("[%s] ์ ๋ ์์ฐ ๋ถ์กฑ (์์ฌ: %d) โ ์คํต", label, self.budget.remaining)
|
| 797 |
-
return []
|
| 798 |
|
| 799 |
try:
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
|
|
|
| 803 |
)
|
| 804 |
-
except
|
| 805 |
-
logger.warning("
|
| 806 |
-
return []
|
| 807 |
-
|
| 808 |
-
if resp.status_code == 495:
|
| 809 |
-
logger.error("[%s] ์ผ์ผ ์ ๋ ํ๋ ์ด๊ณผ โ ์์ง ์ค๋จ", label)
|
| 810 |
-
self.budget.record(self.budget.remaining)
|
| 811 |
return []
|
| 812 |
|
| 813 |
-
if
|
| 814 |
-
logger.warning(
|
|
|
|
|
|
|
|
|
|
| 815 |
return []
|
| 816 |
|
| 817 |
-
data = resp.json().get("data", {})
|
| 818 |
-
|
| 819 |
posts: list[dict] = []
|
| 820 |
-
|
| 821 |
-
recent_nodes = data.get("recent_posts", [])
|
| 822 |
|
| 823 |
-
for
|
| 824 |
-
|
| 825 |
-
|
|
|
|
| 826 |
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
|
| 831 |
-
|
|
|
|
|
|
|
|
|
|
| 832 |
|
| 833 |
-
logger.info(
|
| 834 |
-
"[%s] #%s โ %d๊ฑด (top %d + recent %d, ํด์ํ๊ทธ ์ ์ฒด %s๊ฑด)",
|
| 835 |
-
label, hashtag, len(posts), len(top_nodes), len(recent_nodes),
|
| 836 |
-
f"{data.get('count', 0):,}" if isinstance(data.get("count"), int) else "?",
|
| 837 |
-
)
|
| 838 |
return posts
|
| 839 |
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 865 |
|
| 866 |
# ==================================================================
|
| 867 |
-
#
|
| 868 |
# ==================================================================
|
| 869 |
|
| 870 |
def _match_post_to_spot(self, post: dict) -> tuple[str | None, str]:
|
| 871 |
-
"""๊ฒ์๋ฌผ 1๊ฑด์ ๋ํด spot_id๋ฅผ ๋งค์นญํ๋ค.
|
| 872 |
|
| 873 |
๋งค์นญ ์ฐ์ ์์:
|
| 874 |
-
0. directUrls ๊ฒ์๋ฌผ (_direct_spot_id ์ด๋ฏธ ํ์ ) โ Apify only
|
| 875 |
1. locationName ํ๊ทธ โ ์ ๋์ฌ ์ ๊ฑฐ โ SpotMatcher
|
| 876 |
2. hashtags ๋ฐฐ์ด โ SpotMatcher.match_hashtag() (๋ฐฉํฅ ์ ํ)
|
| 877 |
-
3. caption โ PlaceNameExtractor โ SpotMatcher
|
| 878 |
-
"""
|
| 879 |
-
# Stage 0: directUrls (Apify only โ ED posts don't have this field)
|
| 880 |
-
direct_sid = post.get("_direct_spot_id")
|
| 881 |
-
if direct_sid:
|
| 882 |
-
return direct_sid, "direct"
|
| 883 |
|
|
|
|
|
|
|
| 884 |
# Stage 1: locationName โ ์ ๋์ฌ ์ ๊ฑฐ โ SpotMatcher
|
| 885 |
loc = post.get("location_name", "")
|
| 886 |
if loc and loc not in LOCATION_BLACKLIST and self.spot_matcher:
|
|
@@ -900,42 +409,28 @@ class InstagramCollector:
|
|
| 900 |
if sid:
|
| 901 |
return sid, "hashtag"
|
| 902 |
|
| 903 |
-
# Stage 3: caption โ PlaceNameExtractor โ SpotMatcher
|
| 904 |
-
caption = post.get("caption", "")
|
| 905 |
-
if caption and len(caption) >= 5:
|
| 906 |
-
places = self.extractor.extract(caption)
|
| 907 |
-
for place in places:
|
| 908 |
-
if self.spot_matcher:
|
| 909 |
-
sid = self.spot_matcher.match(place["name"])
|
| 910 |
-
if sid:
|
| 911 |
-
return sid, "caption"
|
| 912 |
-
elif place.get("spot_id"):
|
| 913 |
-
return place["spot_id"], "caption"
|
| 914 |
-
|
| 915 |
return None, "unmatched"
|
| 916 |
|
| 917 |
# ==================================================================
|
| 918 |
-
#
|
| 919 |
# ==================================================================
|
| 920 |
|
| 921 |
-
def
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
- ์ต์ engagement ์๊ณ๊ฐ: MIN_ENGAGEMENT(10) ๋ฏธ๋ง ์ ์ธ
|
| 926 |
-
- engagement cap: ENGAGEMENT_CAP(1000) ์ด๊ณผ ์ ์ ์ญ
|
| 927 |
-
- weighted_score: sum(min(engagement, cap))
|
| 928 |
|
| 929 |
Returns:
|
| 930 |
-
|
| 931 |
-
avg_engagement, weighted_score, match_methods, hashtags}}
|
| 932 |
"""
|
| 933 |
cutoff = datetime.now(timezone.utc) - timedelta(days=TREND_WINDOW_DAYS)
|
| 934 |
|
| 935 |
spot_metrics: dict[str, dict] = {}
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
"
|
|
|
|
|
|
|
| 939 |
"filtered_old": 0, "filtered_low_engagement": 0,
|
| 940 |
}
|
| 941 |
|
|
@@ -951,14 +446,7 @@ class InstagramCollector:
|
|
| 951 |
except (ValueError, TypeError):
|
| 952 |
pass # ํ์ฑ ์คํจ ์ ํฌํจ
|
| 953 |
|
| 954 |
-
#
|
| 955 |
-
spot_id, method = self._match_post_to_spot(post)
|
| 956 |
-
match_stats[method] += 1
|
| 957 |
-
|
| 958 |
-
if not spot_id:
|
| 959 |
-
continue
|
| 960 |
-
|
| 961 |
-
# ์ต์ engagement ์๊ณ๊ฐ
|
| 962 |
likes = post.get("likes_count", 0) or 0
|
| 963 |
comments = post.get("comments_count", 0) or 0
|
| 964 |
engagement = likes + comments
|
|
@@ -967,44 +455,253 @@ class InstagramCollector:
|
|
| 967 |
match_stats["filtered_low_engagement"] += 1
|
| 968 |
continue
|
| 969 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
# engagement cap
|
| 971 |
capped_engagement = min(engagement, ENGAGEMENT_CAP)
|
| 972 |
|
| 973 |
# ์ง๊ณ
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 999 |
)
|
| 1000 |
-
metrics["match_methods"] = sorted(metrics["match_methods"])
|
| 1001 |
-
metrics["hashtags"] = sorted(metrics["hashtags"])
|
| 1002 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1003 |
if match_stats["filtered_old"] > 0:
|
| 1004 |
logger.info(
|
| 1005 |
-
"๊ธฐ๊ฐ ํํฐ: %d๊ฑด ์ ์ธ (์ต๊ทผ %d์ผ ์ธ)
|
| 1006 |
match_stats["filtered_old"], TREND_WINDOW_DAYS,
|
| 1007 |
-
len(posts) - match_stats["filtered_old"],
|
| 1008 |
)
|
| 1009 |
if match_stats["filtered_low_engagement"] > 0:
|
| 1010 |
logger.info(
|
|
@@ -1012,29 +709,30 @@ class InstagramCollector:
|
|
| 1012 |
match_stats["filtered_low_engagement"], MIN_ENGAGEMENT,
|
| 1013 |
)
|
| 1014 |
logger.info(
|
| 1015 |
-
"
|
| 1016 |
-
"
|
| 1017 |
-
|
| 1018 |
match_stats["location_tag"],
|
| 1019 |
match_stats["hashtag"],
|
| 1020 |
-
match_stats
|
| 1021 |
-
match_stats["direct"],
|
| 1022 |
match_stats["unmatched"],
|
| 1023 |
)
|
| 1024 |
-
return spot_metrics
|
| 1025 |
|
| 1026 |
# ==================================================================
|
| 1027 |
-
#
|
| 1028 |
# ==================================================================
|
| 1029 |
|
| 1030 |
def _save_to_db(self, spot_metrics: dict[str, dict]) -> int:
|
| 1031 |
"""์ง๊ณ๋ ๋ฉํธ๋ฆญ์ spot_trends ํ
์ด๋ธ์ ์ ์ฅํ๋ค.
|
| 1032 |
|
| 1033 |
-
source =
|
| 1034 |
์ ์ฅ ๋ฉํธ๋ฆญ: post_count, avg_engagement, weighted_score
|
| 1035 |
"""
|
|
|
|
|
|
|
|
|
|
| 1036 |
period_start, period_end = get_week_period()
|
| 1037 |
-
source =
|
| 1038 |
saved = 0
|
| 1039 |
|
| 1040 |
for spot_id, metrics in spot_metrics.items():
|
|
@@ -1043,7 +741,7 @@ class InstagramCollector:
|
|
| 1043 |
"total_comments": metrics["total_comments"],
|
| 1044 |
"match_methods": metrics["match_methods"],
|
| 1045 |
"hashtags": metrics["hashtags"],
|
| 1046 |
-
"
|
| 1047 |
}
|
| 1048 |
|
| 1049 |
# post_count
|
|
@@ -1063,7 +761,7 @@ class InstagramCollector:
|
|
| 1063 |
})
|
| 1064 |
saved += 1
|
| 1065 |
except Exception as e:
|
| 1066 |
-
logger.warning("spot_trends ์ ์ฅ ์คํจ (
|
| 1067 |
|
| 1068 |
# avg_engagement
|
| 1069 |
if metrics["avg_engagement"] > 0:
|
|
@@ -1078,7 +776,7 @@ class InstagramCollector:
|
|
| 1078 |
"raw_data": {"match_methods": metrics["match_methods"]},
|
| 1079 |
})
|
| 1080 |
except Exception as e:
|
| 1081 |
-
logger.warning("spot_trends ์ ์ฅ ์คํจ (
|
| 1082 |
|
| 1083 |
# weighted_score
|
| 1084 |
if metrics["weighted_score"] > 0:
|
|
@@ -1093,17 +791,164 @@ class InstagramCollector:
|
|
| 1093 |
"raw_data": {},
|
| 1094 |
})
|
| 1095 |
except Exception as e:
|
| 1096 |
-
logger.warning("spot_trends ์ ์ฅ ์คํจ (
|
| 1097 |
|
| 1098 |
logger.info("Instagram DB ์ ์ฅ: %d๊ฑด (%d ์คํ, source=%s)", saved, len(spot_metrics), source)
|
| 1099 |
return saved
|
| 1100 |
|
| 1101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1102 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 1103 |
# Shared Utility
|
| 1104 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 1105 |
|
| 1106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1107 |
def _dedup_posts_by_url(posts: list[dict]) -> list[dict]:
|
| 1108 |
"""URL ๊ธฐ๋ฐ ์ค๋ณต ์ ๊ฑฐ."""
|
| 1109 |
seen_urls: set[str] = set()
|
|
|
|
| 1 |
"""
|
| 2 |
+
Instagram Collector โ Influencer Monitoring (v5.1 Multimodal)
|
| 3 |
+
|
| 4 |
+
ํ๋ ์ดํ
๋ ์ ์ฃผ ์ฌํ ์ธํ๋ฃจ์ธ์ ๊ณ์ ์ ์ต๊ทผ ๊ฒ์๋ฌผ์ ์์งํฉ๋๋ค.
|
| 5 |
+
|
| 6 |
+
ํ์ดํ๋ผ์ธ:
|
| 7 |
+
1. influencer_accounts ํ
์ด๋ธ์์ ํ์ฑ ๊ณ์ ๋ชฉ๋ก ๋ก๋ (DB ์คํจ ์ ๊ธฐ๋ณธ๊ฐ ํด๋ฐฑ)
|
| 8 |
+
2. Apify instagram-profile-scraper๋ก ๊ณ์ ๋ณ ์ต๊ทผ ๊ฒ์๋ฌผ ์์ง
|
| 9 |
+
3. ๊ฒ์๋ฌผ ์ ๊ทํ + ์ค๋ณต ์ ๊ฑฐ
|
| 10 |
+
4. 2-pass ํ์ด๋ธ๋ฆฌ๋ ์ฅ์ ๋งค์นญ:
|
| 11 |
+
Pass 1 (๊ท์น ๊ธฐ๋ฐ): ์์นํ๊ทธ โ ํด์ํ๊ทธ ๋งค์นญ (๊ณ ์ ๋ขฐ ์ ํธ๋ง)
|
| 12 |
+
Pass 2 (AI ๋ฉํฐ๋ชจ๋ฌ): Gemini 2.0 Flash๋ก ์ฝํ
์ธ ๋ถ์
|
| 13 |
+
- ์ด๋ฏธ์ง ๊ฒ์๊ธ: ์ด๋ฏธ์ง + ์บก์
โ ๊ฐํ/๊ฒฝ๊ด/ํ
์คํธ ์ธ์
|
| 14 |
+
- ๋ฆด์ค(์์): ์์ + ์บก์
โ ๋๋ ์ด์
/์๋ง/๊ฐํ ์ธ์
|
| 15 |
+
- ๋ฏธ๋์ด ์์: ์บก์
ํ
์คํธ ๋ถ์ (ํด๋ฐฑ)
|
| 16 |
5. ๊ฐ์ค ์ง๊ณ: weighted_score = sum(min(engagement, cap))
|
| 17 |
+
6. spot_trends ํ
์ด๋ธ์ ์ ์ฅ (source = instagram_influencer)
|
| 18 |
+
7. ๊ณ์ ๋ณ last_scraped_at ์
๋ฐ์ดํธ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""
|
| 20 |
|
| 21 |
from __future__ import annotations
|
| 22 |
|
| 23 |
import json
|
| 24 |
+
import logging
|
| 25 |
import os
|
| 26 |
import re
|
| 27 |
+
import tempfile
|
| 28 |
+
import time
|
| 29 |
+
from datetime import datetime, timedelta, timezone
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
from apify_client import ApifyClient
|
|
|
|
|
|
|
|
|
|
| 32 |
|
|
|
|
| 33 |
from trend_engine.utils import get_week_period, safe_upsert_spot_trend
|
| 34 |
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
| 38 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 39 |
+
# ์์ง ์ค์
|
| 40 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 41 |
|
| 42 |
+
# ๊ธฐ๊ฐ ํํฐ โ ์ง์ 30์ผ (7์ผ์ ์คํ ๋งค์นญ๋ฅ ๊ณผ์)
|
| 43 |
+
TREND_WINDOW_DAYS = 30
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
+
# ๊ฐ์ค ์ง๊ณ ์์ (์ธํ๋ฃจ์ธ์ ์ฝํ
์ธ ๊ธฐ์ค)
|
| 46 |
+
MIN_ENGAGEMENT = 50 # ์ต์ engagement ์๊ณ๊ฐ
|
| 47 |
+
ENGAGEMENT_CAP = 5000 # ๋จ์ผ ๊ฒ์๋ฌผ engagement ์ํ
|
| 48 |
|
| 49 |
+
# ๊ณ์ ๋น ์ต๋ ๊ฒ์๋ฌผ ์
|
| 50 |
+
RESULTS_LIMIT_PER_ACCOUNT = 20
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
# AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ ์ค์
|
| 53 |
+
MAX_IMAGE_BYTES = 5 * 1024 * 1024 # ์ด๋ฏธ์ง ์ต๋ 5MB
|
| 54 |
+
MAX_VIDEO_BYTES = 50 * 1024 * 1024 # ์์ ์ต๋ 50MB
|
| 55 |
+
MEDIA_DOWNLOAD_TIMEOUT = 15 # ๋ฏธ๋์ด ๋ค์ด๋ก๋ ํ์์์(์ด)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 59 |
+
# ์์น ํ๊ทธ ์ ๋ฆฌ
|
| 60 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
# ๊ตญ๊ฐ/๊ณตํญ ๋จ์๋ง BLACKLIST
|
| 63 |
LOCATION_BLACKLIST = frozenset({
|
| 64 |
"South Korea", "Korea", "๋ํ๋ฏผ๊ตญ", "ํ๊ตญ",
|
|
|
|
| 103 |
HASHTAG_RE = re.compile(r"#([\w๊ฐ-ํฃ]{2,30})")
|
| 104 |
|
| 105 |
|
| 106 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 107 |
+
# ๊ธฐ๋ณธ ์ธํ๋ฃจ์ธ์ ๊ณ์ (DB ์กฐํ ์คํจ ์ ํด๋ฐฑ)
|
| 108 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
+
DEFAULT_INFLUENCER_ACCOUNTS = [
|
| 111 |
+
{"username": "_sohee.e", "category": "travel", "priority": 1},
|
| 112 |
+
{"username": "foto_ycy", "category": "photo", "priority": 2},
|
| 113 |
+
{"username": "bbo_muksta", "category": "food", "priority": 3},
|
| 114 |
+
{"username": "yoontheroad", "category": "photo", "priority": 4},
|
| 115 |
+
{"username": "siniple", "category": "photo", "priority": 5},
|
| 116 |
+
{"username": "bigg_jun", "category": "photo", "priority": 6},
|
| 117 |
+
{"username": "aria.leeee", "category": "travel", "priority": 7},
|
| 118 |
+
{"username": "gamttanam", "category": "lifestyle", "priority": 8},
|
| 119 |
+
{"username": "by_malgm", "category": "photo", "priority": 9},
|
| 120 |
+
{"username": "colorny", "category": "travel", "priority": 10},
|
| 121 |
+
{"username": "mongle_jyh", "category": "photo", "priority": 11},
|
| 122 |
+
{"username": "ryuppeum", "category": "travel", "priority": 12},
|
| 123 |
+
{"username": "thesoulofseoulblog", "category": "travel", "priority": 13},
|
| 124 |
+
{"username": "hey_jejuisland", "category": "lifestyle", "priority": 14},
|
| 125 |
+
{"username": "yooonjeju", "category": "travel", "priority": 15},
|
| 126 |
]
|
| 127 |
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 130 |
+
# InstagramCollector โ Influencer Monitoring v5.1
|
| 131 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 132 |
|
| 133 |
|
| 134 |
class InstagramCollector:
|
| 135 |
+
"""Instagram ์์ง๊ธฐ (v5.1 โ Influencer Monitoring + Multimodal AI).
|
| 136 |
|
| 137 |
+
ํ๋ ์ดํ
๋ ์ธํ๋ฃจ์ธ์ ๊ณ์ ์ ์ต๊ทผ ๊ฒ์๋ฌผ์ ์์งํ๊ณ
|
| 138 |
+
2-pass ํ์ด๋ธ๋ฆฌ๋ ๋งค์นญ(์์นํ๊ทธ/ํด์ํ๊ทธ + Gemini ๋ฉํฐ๋ชจ๋ฌ) ํ
|
| 139 |
+
spot_trends์ ์ ์ฅํฉ๋๋ค.
|
| 140 |
"""
|
| 141 |
|
| 142 |
+
ACTOR_ID = "apify/instagram-profile-scraper"
|
| 143 |
+
ACTOR_MEMORY_MB = 1024
|
| 144 |
+
ACTOR_TIMEOUT_SECS = 180
|
| 145 |
+
|
| 146 |
def __init__(self, supabase_client, spot_matcher=None):
    """Wire up the collector's external clients.

    Args:
        supabase_client: Supabase client, stored as ``self.supabase``.
        spot_matcher: Optional matcher, stored as ``self.spot_matcher``;
            may be ``None``.

    Raises:
        KeyError: If the ``APIFY_API_TOKEN`` environment variable is unset.
    """
    # Read the token first so a missing variable fails before any state is set.
    apify_token = os.environ["APIFY_API_TOKEN"]

    self.supabase = supabase_client
    self.spot_matcher = spot_matcher
    self.apify = ApifyClient(apify_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
# ==================================================================
|
| 152 |
# Main Entry Point
|
| 153 |
# ==================================================================
|
| 154 |
|
| 155 |
def run(self) -> dict:
|
| 156 |
+
"""Instagram ์ธํ๋ฃจ์ธ์ ์์ง ํ์ดํ๋ผ์ธ v5.1.
|
| 157 |
+
|
| 158 |
+
[1] ์ธํ๋ฃจ์ธ์ ๊ณ์ ๋ชฉ๋ก ๋ก๋ (DB โ ํด๋ฐฑ)
|
| 159 |
+
[2] Apify profile-scraper๋ก ๊ณ์ ๋ณ ์ต๊ทผ ๊ฒ์๋ฌผ ์์ง
|
| 160 |
+
[3] Pass 1: ์์นํ๊ทธ + ํด์ํ๊ทธ ๋งค์นญ
|
| 161 |
+
[3b] Pass 2: ๋ฏธ๋งค์นญ โ Gemini ๋ฉํฐ๋ชจ๋ฌ (์ด๋ฏธ์ง/์์ + ์บก์
) โ SpotMatcher
|
| 162 |
+
[4] DB ์ ์ฅ
|
| 163 |
+
[5] last_scraped_at ์
๋ฐ์ดํธ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
"""
|
| 165 |
+
logger.info("=== Instagram ์์ง ์์ (v5.1 โ Multimodal AI) ===")
|
| 166 |
|
| 167 |
+
# [1] ์ธํ๋ฃจ์ธ์ ๊ณ์ ๋ชฉ๋ก
|
| 168 |
+
accounts = self._load_influencer_accounts()
|
| 169 |
+
if not accounts:
|
| 170 |
+
logger.warning("ํ์ฑ ์ธํ๋ฃจ์ธ์ ๊ณ์ ์์ โ ์ข
๋ฃ")
|
| 171 |
+
return {"total_posts": 0, "spots_matched": 0, "saved": 0, "accounts": 0}
|
| 172 |
|
| 173 |
+
logger.info("์ธํ๋ฃจ์ธ์ ๊ณ์ %d๊ฐ ๋ก๋", len(accounts))
|
|
|
|
| 174 |
|
| 175 |
+
# [2] ๊ฒ์๋ฌผ ์์ง
|
| 176 |
+
all_posts = self._collect_from_accounts(accounts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
if not all_posts:
|
| 179 |
logger.warning("์์ง๋ ๊ฒ์๋ฌผ ์์ โ ์ข
๋ฃ")
|
| 180 |
+
return {
|
| 181 |
+
"total_posts": 0, "spots_matched": 0, "saved": 0,
|
| 182 |
+
"accounts": len(accounts),
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
# [3] Pass 1: ๊ท์น ๊ธฐ๋ฐ ๋งค์นญ + ์ง๊ณ
|
| 186 |
+
spot_metrics, unmatched_posts, match_stats = self._aggregate_with_unmatched(all_posts)
|
| 187 |
+
pass1_matched = len(spot_metrics)
|
| 188 |
|
| 189 |
+
# [3b] Pass 2: AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ (๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ โ ์ด๋ฏธ์ง/์์ + ์บก์
๋ถ์)
|
| 190 |
+
ai_matched_count = 0
|
| 191 |
+
if unmatched_posts and os.environ.get("GEMINI_API_KEY"):
|
| 192 |
+
ai_matched_count = self._ai_analyze_content(unmatched_posts, spot_metrics, match_stats)
|
| 193 |
+
elif unmatched_posts:
|
| 194 |
+
logger.info("GEMINI_API_KEY ๋ฏธ์ค์ โ AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ ์คํต (%d๊ฑด ๋ฏธ๋งค์นญ)", len(unmatched_posts))
|
| 195 |
|
| 196 |
+
# ์ต์ข
ํต๊ณ ๋ก๊น
|
| 197 |
+
self._log_match_stats(match_stats, len(all_posts), len(spot_metrics))
|
| 198 |
+
|
| 199 |
+
# [4] DB ์ ์ฅ
|
| 200 |
saved = self._save_to_db(spot_metrics)
|
| 201 |
|
| 202 |
+
# [5] last_scraped_at ์
๋ฐ์ดํธ
|
| 203 |
+
scraped_usernames = list({
|
| 204 |
+
p.get("_source_account", "")
|
| 205 |
+
for p in all_posts if p.get("_source_account")
|
| 206 |
+
})
|
| 207 |
+
self._update_last_scraped(scraped_usernames)
|
| 208 |
+
|
| 209 |
result = {
|
| 210 |
+
"accounts": len(accounts),
|
| 211 |
+
"accounts_scraped": len(scraped_usernames),
|
| 212 |
"total_posts": len(all_posts),
|
| 213 |
+
"pass1_spots_matched": pass1_matched,
|
| 214 |
+
"ai_matched": ai_matched_count,
|
| 215 |
+
"total_spots_matched": len(spot_metrics),
|
| 216 |
"saved": saved,
|
| 217 |
}
|
| 218 |
+
logger.info("=== Instagram ์์ง ์๋ฃ (v5.1 โ Multimodal AI): %s ===", result)
|
| 219 |
return result
|
| 220 |
|
| 221 |
+
# ==================================================================
|
| 222 |
+
# ์ธํ๋ฃจ์ธ์ ๊ณ์ ๊ด๋ฆฌ
|
| 223 |
+
# ==================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
+
def _load_influencer_accounts(self) -> list[dict]:
|
| 226 |
+
"""influencer_accounts ํ
์ด๋ธ์์ ํ์ฑ ๊ณ์ ๋ชฉ๋ก์ ๋ก๋ํ๋ค.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
+
DB ์กฐํ ์คํจ ์ DEFAULT_INFLUENCER_ACCOUNTS๋ก ํด๋ฐฑ.
|
| 229 |
+
"""
|
| 230 |
try:
|
| 231 |
+
resp = (
|
| 232 |
+
self.supabase.table("influencer_accounts")
|
| 233 |
+
.select("username, category, priority")
|
| 234 |
+
.eq("platform", "instagram")
|
| 235 |
+
.eq("is_active", True)
|
| 236 |
+
.order("priority")
|
| 237 |
+
.execute()
|
| 238 |
)
|
| 239 |
+
accounts = resp.data or []
|
| 240 |
+
if accounts:
|
| 241 |
+
logger.info("DB์์ ์ธํ๋ฃจ์ธ์ ๊ณ์ %d๊ฐ ๋ก๋", len(accounts))
|
| 242 |
+
return accounts
|
| 243 |
except Exception as e:
|
| 244 |
+
logger.warning("influencer_accounts ์กฐํ ์คํจ (ํด๋ฐฑ ์ฌ์ฉ): %s", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
+
logger.info("๊ธฐ๋ณธ ์ธํ๋ฃจ์ธ์ ๊ณ์ %d๊ฐ ์ฌ์ฉ (ํด๋ฐฑ)", len(DEFAULT_INFLUENCER_ACCOUNTS))
|
| 247 |
+
return list(DEFAULT_INFLUENCER_ACCOUNTS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
+
def _update_last_scraped(self, usernames: list[str]) -> None:
|
| 250 |
+
"""์์ง ์๋ฃ๋ ๊ณ์ ์ last_scraped_at์ ์
๋ฐ์ดํธํ๋ค."""
|
| 251 |
+
if not usernames:
|
| 252 |
+
return
|
| 253 |
+
now = datetime.now(timezone.utc).isoformat()
|
| 254 |
+
for username in usernames:
|
| 255 |
try:
|
| 256 |
+
self.supabase.table("influencer_accounts").update({
|
| 257 |
+
"last_scraped_at": now,
|
| 258 |
+
}).eq("platform", "instagram").eq("username", username).execute()
|
|
|
|
|
|
|
| 259 |
except Exception as e:
|
| 260 |
+
logger.debug("last_scraped_at ์
๋ฐ์ดํธ ์คํจ (%s): %s", username, e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
+
logger.info("last_scraped_at ์
๋ฐ์ดํธ: %d๊ฐ ๊ณ์ ", len(usernames))
|
|
|
|
| 263 |
|
| 264 |
# ==================================================================
|
| 265 |
+
# ๊ฒ์๋ฌผ ์์ง (Apify Profile Scraper)
|
| 266 |
# ==================================================================
|
| 267 |
|
| 268 |
+
def _collect_from_accounts(self, accounts: list[dict]) -> list[dict]:
|
| 269 |
+
"""์ธํ๋ฃจ์ธ์ ๊ณ์ ์์ ๊ฒ์๋ฌผ์ ์์งํ๋ค.
|
| 270 |
|
| 271 |
+
Apify instagram-profile-scraper Actor๋ฅผ ์ฌ์ฉํ์ฌ
|
| 272 |
+
๊ณ์ ๋ณ ์ต๊ทผ ๊ฒ์๋ฌผ์ ๊ฐ์ ธ์จ๋ค. 5๊ฐ์ฉ ๋ฐฐ์น ์คํ.
|
|
|
|
|
|
|
|
|
|
| 273 |
"""
|
| 274 |
+
usernames = [a["username"] for a in accounts]
|
| 275 |
logger.info(
|
| 276 |
+
"Apify Actor ์คํ: %d๊ฐ ๊ณ์ , ๊ณ์ ๋น ์ต๋ %d๊ฑด",
|
| 277 |
+
len(usernames), RESULTS_LIMIT_PER_ACCOUNT,
|
| 278 |
)
|
| 279 |
|
| 280 |
+
all_posts: list[dict] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
+
# ๊ณ์ ์ 5๊ฐ์ฉ ๋ฐฐ์น๋ก ์คํ (๋จ์ผ Actor ํธ์ถ ์คํจ ์ ์ํฅ ์ต์ํ)
|
| 283 |
+
batch_size = 5
|
| 284 |
+
for i in range(0, len(usernames), batch_size):
|
| 285 |
+
batch = usernames[i:i + batch_size]
|
| 286 |
+
label = f"batch {i // batch_size + 1}/{(len(usernames) + batch_size - 1) // batch_size}"
|
| 287 |
+
posts = self._scrape_profiles(batch, label)
|
| 288 |
+
all_posts.extend(posts)
|
| 289 |
|
| 290 |
+
unique_posts = _dedup_posts_by_url(all_posts)
|
| 291 |
+
logger.info("๊ฒ์๋ฌผ ์์ง ์๋ฃ: %d๊ฑด (%d๊ฐ ๊ณ์ )", len(unique_posts), len(usernames))
|
| 292 |
+
return unique_posts
|
| 293 |
|
| 294 |
+
def _scrape_profiles(self, usernames: list[str], label: str) -> list[dict]:
|
| 295 |
+
"""Apify instagram-profile-scraper Actor๋ก ํ๋กํ ๊ฒ์๋ฌผ ์์ง."""
|
| 296 |
+
run_input = {
|
| 297 |
+
"usernames": usernames,
|
| 298 |
+
"resultsLimit": RESULTS_LIMIT_PER_ACCOUNT,
|
| 299 |
+
"proxy": {
|
| 300 |
+
"useApifyProxy": True,
|
| 301 |
+
"apifyProxyGroups": ["RESIDENTIAL"],
|
| 302 |
+
},
|
| 303 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
|
| 305 |
try:
|
| 306 |
+
run = self.apify.actor(self.ACTOR_ID).call(
|
| 307 |
+
run_input=run_input,
|
| 308 |
+
timeout_secs=self.ACTOR_TIMEOUT_SECS,
|
| 309 |
+
memory_mbytes=self.ACTOR_MEMORY_MB,
|
| 310 |
)
|
| 311 |
+
except Exception as e:
|
| 312 |
+
logger.warning("Apify Actor ์คํ ์คํจ [%s]: %s", label, e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
return []
|
| 314 |
|
| 315 |
+
if run.get("status") not in ("SUCCEEDED", None):
|
| 316 |
+
logger.warning(
|
| 317 |
+
"Apify Actor ๋น์ ์ ์ข
๋ฃ [%s]: status=%s",
|
| 318 |
+
label, run.get("status"),
|
| 319 |
+
)
|
| 320 |
return []
|
| 321 |
|
|
|
|
|
|
|
| 322 |
posts: list[dict] = []
|
| 323 |
+
dataset_id = run["defaultDatasetId"]
|
|
|
|
| 324 |
|
| 325 |
+
for profile in self.apify.dataset(dataset_id).iterate_items():
|
| 326 |
+
# profile-scraper๋ ํ๋กํ 1๊ฐ = ์์ดํ
1๊ฐ, ๊ฒ์๋ฌผ์ latestPosts ์
|
| 327 |
+
profile_username = profile.get("username", "")
|
| 328 |
+
latest_posts = profile.get("latestPosts", [])
|
| 329 |
|
| 330 |
+
if not latest_posts:
|
| 331 |
+
logger.debug("๊ฒ์๋ฌผ ์์: @%s", profile_username)
|
| 332 |
+
continue
|
| 333 |
|
| 334 |
+
for item in latest_posts:
|
| 335 |
+
post = self._normalize_post(item)
|
| 336 |
+
if post:
|
| 337 |
+
posts.append(post)
|
| 338 |
|
| 339 |
+
logger.info("[%s] %s โ %d๊ฑด", label, usernames, len(posts))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
return posts
|
| 341 |
|
| 342 |
+
@staticmethod
def _normalize_post(item: dict) -> dict | None:
    """Convert one Apify profile-scraper post item into the standard post dict.

    Fields that are missing *or* explicitly ``None`` in the scraper output are
    coerced to empty defaults (``""``, ``0``, ``[]``, ``"Image"``) so that
    downstream code can slice strings and add counters without None checks.

    Args:
        item: A single post entry taken from a profile's ``latestPosts`` list.

    Returns:
        The normalized post dict, or ``None`` when the item carries no
        ``ownerUsername`` (such a post cannot be attributed to an account).
    """
    owner = item.get("ownerUsername") or ""
    if not owner:
        return None

    # A negative like count (the scraper emits -1, presumably for hidden
    # likes -- confirm against the actor docs) and null both count as 0.
    likes = item.get("likesCount") or 0
    if not isinstance(likes, (int, float)) or likes < 0:
        likes = 0

    caption = item.get("caption") or ""
    hashtags = item.get("hashtags") or []
    if not hashtags and caption:
        # No structured hashtag list: fall back to parsing the caption.
        hashtags = HASHTAG_RE.findall(caption)

    # Content type (Image, Video, Sidecar) decides which media URL to use.
    post_type = item.get("type") or "Image"
    display_url = item.get("displayUrl") or ""
    if post_type == "Video":
        # Prefer the video itself; fall back to the still image.
        media_url = item.get("videoUrl") or display_url
    else:
        media_url = display_url

    return {
        "search_term": f"@{owner}",
        "search_type": "profile",
        "location_name": item.get("locationName") or "",
        "likes_count": likes,
        "comments_count": item.get("commentsCount") or 0,
        "caption": caption,
        "timestamp": item.get("timestamp") or "",
        "url": item.get("url") or "",
        "hashtags": hashtags,
        "media_url": media_url,
        "media_type": post_type,
        "_source_account": owner,
    }
|
| 379 |
|
| 380 |
# ==================================================================
|
| 381 |
+
# 3๋จ๊ณ ์ฅ์ ๋งค์นญ (v5.0 โ ํ๋กํ ๊ธฐ๋ฐ)
|
| 382 |
# ==================================================================
|
| 383 |
|
| 384 |
def _match_post_to_spot(self, post: dict) -> tuple[str | None, str]:
|
| 385 |
+
"""๊ฒ์๋ฌผ 1๊ฑด์ ๋ํด spot_id๋ฅผ ๋งค์นญํ๋ค (๊ณ ์ ๋ขฐ ์ ํธ๋ง).
|
| 386 |
|
| 387 |
๋งค์นญ ์ฐ์ ์์:
|
|
|
|
| 388 |
1. locationName ํ๊ทธ โ ์ ๋์ฌ ์ ๊ฑฐ โ SpotMatcher
|
| 389 |
2. hashtags ๋ฐฐ์ด โ SpotMatcher.match_hashtag() (๋ฐฉํฅ ์ ํ)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
|
| 391 |
+
์บก์
/์ด๋ฏธ์ง/์์ ๊ธฐ๋ฐ ๋ถ์์ Pass 2 (AI ๋ฉํฐ๋ชจ๋ฌ)์์ ์ฒ๋ฆฌ.
|
| 392 |
+
"""
|
| 393 |
# Stage 1: locationName โ ์ ๋์ฌ ์ ๊ฑฐ โ SpotMatcher
|
| 394 |
loc = post.get("location_name", "")
|
| 395 |
if loc and loc not in LOCATION_BLACKLIST and self.spot_matcher:
|
|
|
|
| 409 |
if sid:
|
| 410 |
return sid, "hashtag"
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
return None, "unmatched"
|
| 413 |
|
| 414 |
# ==================================================================
|
| 415 |
+
# Pass 1: ๊ท์น ๊ธฐ๋ฐ ๋งค์นญ + ์ง๊ณ (๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ ์์ง)
|
| 416 |
# ==================================================================
|
| 417 |
|
| 418 |
+
def _aggregate_with_unmatched(
|
| 419 |
+
self, posts: list[dict],
|
| 420 |
+
) -> tuple[dict[str, dict], list[dict], dict[str, int]]:
|
| 421 |
+
"""์์ง๋ ๊ฒ์๋ฌผ์ spot_id ๊ธฐ์ค์ผ๋ก ์ง๊ณํ๊ณ , ๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ์ ๋ณ๋ ๋ฐํํ๋ค.
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
Returns:
|
| 424 |
+
(spot_metrics, unmatched_posts, match_stats)
|
|
|
|
| 425 |
"""
|
| 426 |
cutoff = datetime.now(timezone.utc) - timedelta(days=TREND_WINDOW_DAYS)
|
| 427 |
|
| 428 |
spot_metrics: dict[str, dict] = {}
|
| 429 |
+
unmatched_posts: list[dict] = []
|
| 430 |
+
match_stats: dict[str, int] = {
|
| 431 |
+
"location_tag": 0,
|
| 432 |
+
"hashtag": 0, "unmatched": 0,
|
| 433 |
+
"ai_matched": 0,
|
| 434 |
"filtered_old": 0, "filtered_low_engagement": 0,
|
| 435 |
}
|
| 436 |
|
|
|
|
| 446 |
except (ValueError, TypeError):
|
| 447 |
pass # ํ์ฑ ์คํจ ์ ํฌํจ
|
| 448 |
|
| 449 |
+
# ์ต์ engagement ์๊ณ๊ฐ (๋งค์นญ ์ ํํฐ โ ์ engagement๋ AI๋ ์ฒ๋ฆฌ ๋ถ์)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
likes = post.get("likes_count", 0) or 0
|
| 451 |
comments = post.get("comments_count", 0) or 0
|
| 452 |
engagement = likes + comments
|
|
|
|
| 455 |
match_stats["filtered_low_engagement"] += 1
|
| 456 |
continue
|
| 457 |
|
| 458 |
+
# 3๋จ๊ณ ๋งค์นญ
|
| 459 |
+
spot_id, method = self._match_post_to_spot(post)
|
| 460 |
+
match_stats[method] += 1
|
| 461 |
+
|
| 462 |
+
if not spot_id:
|
| 463 |
+
# ๋ฏธ๋งค์นญ โ Pass 2 ๋์
|
| 464 |
+
unmatched_posts.append(post)
|
| 465 |
+
continue
|
| 466 |
+
|
| 467 |
# engagement cap
|
| 468 |
capped_engagement = min(engagement, ENGAGEMENT_CAP)
|
| 469 |
|
| 470 |
# ์ง๊ณ
|
| 471 |
+
_add_to_metrics(spot_metrics, spot_id, post, capped_engagement, method)
|
| 472 |
+
|
| 473 |
+
logger.info(
|
| 474 |
+
"Pass 1 ์๋ฃ: %d๊ฐ ์คํ ๋งค์นญ, %d๊ฑด ๋ฏธ๋งค์นญ โ AI ๋์",
|
| 475 |
+
len(spot_metrics), len(unmatched_posts),
|
| 476 |
+
)
|
| 477 |
+
return spot_metrics, unmatched_posts, match_stats
|
| 478 |
+
|
| 479 |
+
# ==================================================================
|
| 480 |
+
# Pass 2: AI multimodal analysis (Gemini 2.5 Flash)
|
| 481 |
+
# ==================================================================
|
| 482 |
+
|
| 483 |
+
def _ai_analyze_content(
|
| 484 |
+
self,
|
| 485 |
+
unmatched_posts: list[dict],
|
| 486 |
+
spot_metrics: dict[str, dict],
|
| 487 |
+
match_stats: dict[str, int],
|
| 488 |
+
) -> int:
|
| 489 |
+
"""๋ฏธ๋งค์นญ ๊ฒ์๋ฌผ์ Gemini ๋ฉํฐ๋ชจ๋ฌ๋ก ๋ถ์ํ์ฌ ์ฅ์๋ฅผ ์ถ์ถํ๋ค.
|
| 490 |
+
|
| 491 |
+
์ด๋ฏธ์ง ๊ฒ์๊ธ: ์ด๋ฏธ์ง ๋ค์ด๋ก๋ โ Gemini Vision + ์บก์
๋ถ์
|
| 492 |
+
๋ฆด์ค(์์): ์์ ๋ค์ด๋ก๋ โ Gemini File API + ์บก์
๋ถ์
|
| 493 |
+
๋ฏธ๋์ด ์์: ์บก์
ํ
์คํธ๋ง ๋ถ์ (ํด๋ฐฑ)
|
| 494 |
+
|
| 495 |
+
Returns:
|
| 496 |
+
AI๋ก ์ถ๊ฐ ๋งค์นญ๋ ๊ฒ์๋ฌผ ์
|
| 497 |
+
"""
|
| 498 |
+
try:
|
| 499 |
+
from google import genai
|
| 500 |
+
from google.genai import types
|
| 501 |
+
except ImportError:
|
| 502 |
+
logger.warning("google-genai ๋ฏธ์ค์น โ AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ ์คํต")
|
| 503 |
+
return 0
|
| 504 |
+
|
| 505 |
+
api_key = os.environ.get("GEMINI_API_KEY")
|
| 506 |
+
if not api_key:
|
| 507 |
+
return 0
|
| 508 |
+
|
| 509 |
+
client = genai.Client(api_key=api_key)
|
| 510 |
+
|
| 511 |
+
image_count = sum(1 for p in unmatched_posts if p.get("media_type") != "Video")
|
| 512 |
+
video_count = sum(1 for p in unmatched_posts if p.get("media_type") == "Video")
|
| 513 |
+
logger.info(
|
| 514 |
+
"AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ ์์: %d๊ฑด (์ด๋ฏธ์ง %d, ์์ %d)",
|
| 515 |
+
len(unmatched_posts), image_count, video_count,
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
matched_count = 0
|
| 519 |
+
analyzed = 0
|
| 520 |
+
|
| 521 |
+
for post in unmatched_posts:
|
| 522 |
+
media_url = post.get("media_url", "")
|
| 523 |
+
media_type = post.get("media_type", "Image")
|
| 524 |
+
caption = post.get("caption", "")[:500]
|
| 525 |
+
hashtags = ", ".join(post.get("hashtags", [])[:10])
|
| 526 |
+
|
| 527 |
+
places: list[str] = []
|
| 528 |
+
try:
|
| 529 |
+
if media_type == "Video" and media_url:
|
| 530 |
+
places = self._ai_extract_from_video(
|
| 531 |
+
client, types, media_url, caption, hashtags,
|
| 532 |
+
)
|
| 533 |
+
elif media_url:
|
| 534 |
+
places = self._ai_extract_from_image(
|
| 535 |
+
client, types, media_url, caption, hashtags,
|
| 536 |
+
)
|
| 537 |
+
elif caption and len(caption) >= 10:
|
| 538 |
+
places = self._ai_extract_from_text(
|
| 539 |
+
client, types, caption, hashtags,
|
| 540 |
+
)
|
| 541 |
+
analyzed += 1
|
| 542 |
+
except Exception as e:
|
| 543 |
+
logger.debug("AI ๋ถ์ ์คํจ (%s): %s", post.get("url", "")[:60], e)
|
| 544 |
+
continue
|
| 545 |
+
|
| 546 |
+
# ์ถ์ถ๋ ์ฅ์๋ช
โ SpotMatcher
|
| 547 |
+
if places:
|
| 548 |
+
logger.info(
|
| 549 |
+
"AI ์ถ์ถ ์ฅ์: %s โ @%s (%s)",
|
| 550 |
+
[p[:30] for p in places], post.get("_source_account", "?"), media_type,
|
| 551 |
+
)
|
| 552 |
+
else:
|
| 553 |
+
logger.debug(
|
| 554 |
+
"AI ์ถ์ถ ์ฅ์ ์์ โ @%s (%s)",
|
| 555 |
+
post.get("_source_account", "?"), media_type,
|
| 556 |
+
)
|
| 557 |
+
for name in places:
|
| 558 |
+
if not name or len(name) < 2:
|
| 559 |
+
continue
|
| 560 |
+
sid = self.spot_matcher.match(name) if self.spot_matcher else None
|
| 561 |
+
if sid:
|
| 562 |
+
capped = min(
|
| 563 |
+
(post.get("likes_count", 0) or 0)
|
| 564 |
+
+ (post.get("comments_count", 0) or 0),
|
| 565 |
+
ENGAGEMENT_CAP,
|
| 566 |
+
)
|
| 567 |
+
_add_to_metrics(spot_metrics, sid, post, capped, "ai")
|
| 568 |
+
match_stats["ai_matched"] += 1
|
| 569 |
+
match_stats["unmatched"] = max(0, match_stats["unmatched"] - 1)
|
| 570 |
+
matched_count += 1
|
| 571 |
+
break # ํ ๊ฒ์๋ฌผ์์ ์ฒซ ๋งค์นญ๋ง
|
| 572 |
+
|
| 573 |
+
logger.info("AI ๋ฉํฐ๋ชจ๋ฌ ๋ถ์ ์๋ฃ: %d๊ฑด ๋ถ์, %d๊ฑด ๋งค์นญ", analyzed, matched_count)
|
| 574 |
+
return matched_count
|
| 575 |
+
|
| 576 |
+
def _ai_extract_from_image(
|
| 577 |
+
self, client, types, media_url: str, caption: str, hashtags: str,
|
| 578 |
+
) -> list[str]:
|
| 579 |
+
"""์ด๋ฏธ์ง ๊ฒ์๋ฌผ์์ Gemini Vision์ผ๋ก ์ฅ์๋ฅผ ์ถ์ถํ๋ค."""
|
| 580 |
+
image_bytes = _download_media(media_url, MAX_IMAGE_BYTES)
|
| 581 |
+
|
| 582 |
+
prompt = _build_spot_prompt(
|
| 583 |
+
content_type="๊ฒ์๊ธ (์ด๋ฏธ์ง)",
|
| 584 |
+
caption=caption,
|
| 585 |
+
hashtags=hashtags,
|
| 586 |
+
media_instruction=(
|
| 587 |
+
"์ด๋ฏธ์ง์์ ๊ฐํ, ๋ฉ๋ดํ, ํน์ง์ ๊ฒฝ๊ด์ ํ์ธํ๊ณ "
|
| 588 |
+
"์บก์
๋ด์ฉ๋ ํจ๊ป ๋ถ์ํ์ฌ ์ฅ์๋ฅผ ์๋ณํ์ธ์."
|
| 589 |
+
),
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
contents: list = []
|
| 593 |
+
if image_bytes:
|
| 594 |
+
contents.append(
|
| 595 |
+
types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
|
| 596 |
+
)
|
| 597 |
+
contents.append(prompt)
|
| 598 |
+
|
| 599 |
+
response = client.models.generate_content(
|
| 600 |
+
model="gemini-2.5-flash",
|
| 601 |
+
contents=contents,
|
| 602 |
+
config=types.GenerateContentConfig(
|
| 603 |
+
temperature=0.1, max_output_tokens=200,
|
| 604 |
+
thinking_config=types.ThinkingConfig(thinking_budget=0),
|
| 605 |
+
),
|
| 606 |
+
)
|
| 607 |
+
return _parse_ai_places(response.text)
|
| 608 |
+
|
| 609 |
+
def _ai_extract_from_video(
|
| 610 |
+
self, client, types, media_url: str, caption: str, hashtags: str,
|
| 611 |
+
) -> list[str]:
|
| 612 |
+
"""๋ฆด์ค(์์)์์ Gemini๋ก ์ฅ์๋ฅผ ์ถ์ถํ๋ค."""
|
| 613 |
+
video_bytes = _download_media(media_url, MAX_VIDEO_BYTES)
|
| 614 |
+
if not video_bytes:
|
| 615 |
+
# ๋ค์ด๋ก๋ ์คํจ โ ์บก์
๋ง ๋ถ์
|
| 616 |
+
if caption and len(caption) >= 10:
|
| 617 |
+
return self._ai_extract_from_text(client, types, caption, hashtags)
|
| 618 |
+
return []
|
| 619 |
+
|
| 620 |
+
temp_path = None
|
| 621 |
+
try:
|
| 622 |
+
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
|
| 623 |
+
f.write(video_bytes)
|
| 624 |
+
temp_path = f.name
|
| 625 |
+
|
| 626 |
+
# Gemini File API์ ์
๋ก๋
|
| 627 |
+
video_file = client.files.upload(file=temp_path)
|
| 628 |
+
|
| 629 |
+
# ์ฒ๋ฆฌ ๋๊ธฐ (์ต๋ 60์ด)
|
| 630 |
+
wait_count = 0
|
| 631 |
+
while wait_count < 30:
|
| 632 |
+
state_name = getattr(video_file.state, "name", str(video_file.state))
|
| 633 |
+
if "PROCESSING" not in state_name:
|
| 634 |
+
break
|
| 635 |
+
time.sleep(2)
|
| 636 |
+
video_file = client.files.get(name=video_file.name)
|
| 637 |
+
wait_count += 1
|
| 638 |
+
|
| 639 |
+
prompt = _build_spot_prompt(
|
| 640 |
+
content_type="๋ฆด์ค (์์)",
|
| 641 |
+
caption=caption,
|
| 642 |
+
hashtags=hashtags,
|
| 643 |
+
media_instruction=(
|
| 644 |
+
"์์ ์ ๊ฐํ, ์๋ง, ๋๋ ์ด์
, ํน์ง์ ๊ฒฝ๊ด์ ํ์ธํ๊ณ "
|
| 645 |
+
"์บก์
๋ ํจ๊ป ๋ถ์ํ์ฌ ์ฅ์๋ฅผ ์๋ณํ์ธ์."
|
| 646 |
+
),
|
| 647 |
+
)
|
| 648 |
+
|
| 649 |
+
response = client.models.generate_content(
|
| 650 |
+
model="gemini-2.5-flash",
|
| 651 |
+
contents=[video_file, prompt],
|
| 652 |
+
config=types.GenerateContentConfig(
|
| 653 |
+
temperature=0.1, max_output_tokens=200,
|
| 654 |
+
thinking_config=types.ThinkingConfig(thinking_budget=0),
|
| 655 |
+
),
|
| 656 |
)
|
|
|
|
|
|
|
| 657 |
|
| 658 |
+
# ์
๋ก๋ ํ์ผ ์ ๋ฆฌ
|
| 659 |
+
try:
|
| 660 |
+
client.files.delete(name=video_file.name)
|
| 661 |
+
except Exception:
|
| 662 |
+
pass
|
| 663 |
+
|
| 664 |
+
return _parse_ai_places(response.text)
|
| 665 |
+
finally:
|
| 666 |
+
if temp_path:
|
| 667 |
+
try:
|
| 668 |
+
os.unlink(temp_path)
|
| 669 |
+
except OSError:
|
| 670 |
+
pass
|
| 671 |
+
|
| 672 |
+
@staticmethod
|
| 673 |
+
def _ai_extract_from_text(client, types, caption: str, hashtags: str) -> list[str]:
|
| 674 |
+
"""์บก์
ํ
์คํธ๋ง์ผ๋ก ์ฅ์๋ฅผ ์ถ์ถํ๋ค (๋ฏธ๋์ด ๋ค์ด๋ก๋ ์คํจ ์ ํด๋ฐฑ)."""
|
| 675 |
+
prompt = _build_spot_prompt(
|
| 676 |
+
content_type="๊ฒ์๊ธ",
|
| 677 |
+
caption=caption,
|
| 678 |
+
hashtags=hashtags,
|
| 679 |
+
media_instruction="์บก์
ํ
์คํธ์์ ์ฅ์๋ฅผ ์ถ์ถํ์ธ์.",
|
| 680 |
+
)
|
| 681 |
+
|
| 682 |
+
response = client.models.generate_content(
|
| 683 |
+
model="gemini-2.5-flash",
|
| 684 |
+
contents=prompt,
|
| 685 |
+
config=types.GenerateContentConfig(
|
| 686 |
+
temperature=0.1, max_output_tokens=200,
|
| 687 |
+
thinking_config=types.ThinkingConfig(thinking_budget=0),
|
| 688 |
+
),
|
| 689 |
+
)
|
| 690 |
+
return _parse_ai_places(response.text)
|
| 691 |
+
|
| 692 |
+
# ==================================================================
|
| 693 |
+
# ๋งค์นญ ํต๊ณ ๋ก๊น
|
| 694 |
+
# ==================================================================
|
| 695 |
+
|
| 696 |
+
@staticmethod
|
| 697 |
+
def _log_match_stats(
|
| 698 |
+
match_stats: dict[str, int], total_posts: int, total_spots: int,
|
| 699 |
+
) -> None:
|
| 700 |
+
"""Pass 1 + Pass 2 ํตํฉ ๋งค์นญ ํต๊ณ๋ฅผ ๋ก๊น
ํ๋ค."""
|
| 701 |
if match_stats["filtered_old"] > 0:
|
| 702 |
logger.info(
|
| 703 |
+
"๊ธฐ๊ฐ ํํฐ: %d๊ฑด ์ ์ธ (์ต๊ทผ %d์ผ ์ธ)",
|
| 704 |
match_stats["filtered_old"], TREND_WINDOW_DAYS,
|
|
|
|
| 705 |
)
|
| 706 |
if match_stats["filtered_low_engagement"] > 0:
|
| 707 |
logger.info(
|
|
|
|
| 709 |
match_stats["filtered_low_engagement"], MIN_ENGAGEMENT,
|
| 710 |
)
|
| 711 |
logger.info(
|
| 712 |
+
"์ต์ข
๋งค์นญ ํต๊ณ: %d๊ฐ ์คํ โ "
|
| 713 |
+
"์์นํ๊ทธ %d, ํด์ํ๊ทธ %d, AI๋ฉํฐ๋ชจ๋ฌ %d, ๋ฏธ์๋ณ %d",
|
| 714 |
+
total_spots,
|
| 715 |
match_stats["location_tag"],
|
| 716 |
match_stats["hashtag"],
|
| 717 |
+
match_stats.get("ai_matched", 0),
|
|
|
|
| 718 |
match_stats["unmatched"],
|
| 719 |
)
|
|
|
|
| 720 |
|
| 721 |
# ==================================================================
|
| 722 |
+
# DB ์ ์ฅ
|
| 723 |
# ==================================================================
|
| 724 |
|
| 725 |
def _save_to_db(self, spot_metrics: dict[str, dict]) -> int:
|
| 726 |
"""์ง๊ณ๋ ๋ฉํธ๋ฆญ์ spot_trends ํ
์ด๋ธ์ ์ ์ฅํ๋ค.
|
| 727 |
|
| 728 |
+
source = instagram_influencer
|
| 729 |
์ ์ฅ ๋ฉํธ๋ฆญ: post_count, avg_engagement, weighted_score
|
| 730 |
"""
|
| 731 |
+
# set โ sorted list ๋ณํ + avg_engagement ๊ณ์ฐ
|
| 732 |
+
_finalize_metrics(spot_metrics)
|
| 733 |
+
|
| 734 |
period_start, period_end = get_week_period()
|
| 735 |
+
source = "instagram_influencer"
|
| 736 |
saved = 0
|
| 737 |
|
| 738 |
for spot_id, metrics in spot_metrics.items():
|
|
|
|
| 741 |
"total_comments": metrics["total_comments"],
|
| 742 |
"match_methods": metrics["match_methods"],
|
| 743 |
"hashtags": metrics["hashtags"],
|
| 744 |
+
"source_accounts": metrics["source_accounts"],
|
| 745 |
}
|
| 746 |
|
| 747 |
# post_count
|
|
|
|
| 761 |
})
|
| 762 |
saved += 1
|
| 763 |
except Exception as e:
|
| 764 |
+
logger.warning("spot_trends ์ ์ฅ ์คํจ (post_count, %s): %s", spot_id, e)
|
| 765 |
|
| 766 |
# avg_engagement
|
| 767 |
if metrics["avg_engagement"] > 0:
|
|
|
|
| 776 |
"raw_data": {"match_methods": metrics["match_methods"]},
|
| 777 |
})
|
| 778 |
except Exception as e:
|
| 779 |
+
logger.warning("spot_trends ์ ์ฅ ์คํจ (avg_engagement, %s): %s", spot_id, e)
|
| 780 |
|
| 781 |
# weighted_score
|
| 782 |
if metrics["weighted_score"] > 0:
|
|
|
|
| 791 |
"raw_data": {},
|
| 792 |
})
|
| 793 |
except Exception as e:
|
| 794 |
+
logger.warning("spot_trends ์ ์ฅ ์คํจ (weighted_score, %s): %s", spot_id, e)
|
| 795 |
|
| 796 |
logger.info("Instagram DB ์ ์ฅ: %d๊ฑด (%d ์คํ, source=%s)", saved, len(spot_metrics), source)
|
| 797 |
return saved
|
| 798 |
|
| 799 |
|
| 800 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 801 |
+
# AI ๋ฉํฐ๋ชจ๋ฌ ์ ํธ๋ฆฌํฐ
|
| 802 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 803 |
+
|
| 804 |
+
|
| 805 |
+
def _build_spot_prompt(
|
| 806 |
+
content_type: str, caption: str, hashtags: str, media_instruction: str,
|
| 807 |
+
) -> str:
|
| 808 |
+
"""Gemini์ฉ ์ฅ์ ์ถ์ถ ํ๋กฌํํธ๋ฅผ ์์ฑํ๋ค."""
|
| 809 |
+
return (
|
| 810 |
+
f"์ ์ฃผ๋์ ์ธ์คํ๊ทธ๋จ {content_type}์ ๋ถ์ํ์ฌ "
|
| 811 |
+
"๊ตฌ์ฒด์ ์ธ ์ฅ์๋ช
(์ํธ๋ช
)์ ์ถ์ถํ์ธ์.\n\n"
|
| 812 |
+
f"์บก์
: {caption or '(์์)'}\n"
|
| 813 |
+
f"ํด์ํ๊ทธ: {hashtags or '(์์)'}\n\n"
|
| 814 |
+
f"{media_instruction}\n\n"
|
| 815 |
+
"์ถ์ถ ๋์:\n"
|
| 816 |
+
"- ์นดํ, ์๋น, ๋ฒ ์ด์ปค๋ฆฌ ๋ฑ ์ํธ๋ช
(์: ์นดํ ๋ ์ด์ด๋, ๋ด๋ ์ ํ
์ด๋ธ)\n"
|
| 817 |
+
"- ๊ด๊ด์ง, ํด๋ณ, ์ค๋ฆ ๊ณ ์ ๋ช
์ฌ (์: ์๋ณ์ค๋ฆ, ํ์ฌํด์์์ฅ, ๊ตฐ์ฐ์ค๋ฆ)\n"
|
| 818 |
+
"- ๊ณต์, ๋ง์, ๊ฑฐ๋ฆฌ ๊ณ ์ ๋ช
์ฌ (์: ํ๋ดํด์์ฐ์ฑ
๋ก, ๊ณฝ์ง๊ณผ๋ฌผํด๋ณ)\n\n"
|
| 819 |
+
"์ ์ธ ๋์:\n"
|
| 820 |
+
"- '์ ์', '์ ์ฃผ', '์ ์ฃผ๋', 'ํ๋ฆผ', '์๊ทํฌ' ๊ฐ์ ๊ด์ญ ์ง๋ช
\n"
|
| 821 |
+
"- '์นดํ', '๋ง์ง', 'ํด๋ณ' ๊ฐ์ ์ผ๋ฐ ์นดํ
๊ณ ๋ฆฌ๋ช
\n\n"
|
| 822 |
+
"์ค์: thinking ์์ด JSON ๋ฐฐ์ด๋ง ์ถ๋ ฅํ์ธ์.\n"
|
| 823 |
+
'์๋ต ํ์: ["์ฅ์๋ช
1", "์ฅ์๋ช
2"]\n'
|
| 824 |
+
"์ฅ์๊ฐ ์์ผ๋ฉด: []"
|
| 825 |
+
)
|
| 826 |
+
|
| 827 |
+
|
| 828 |
+
def _download_media(url: str, max_bytes: int) -> bytes | None:
|
| 829 |
+
"""๋ฏธ๋์ด URL์์ ๋ฐ์ดํธ๋ฅผ ๋ค์ด๋ก๋ํ๋ค."""
|
| 830 |
+
if not url:
|
| 831 |
+
return None
|
| 832 |
+
try:
|
| 833 |
+
import httpx
|
| 834 |
+
|
| 835 |
+
with httpx.stream(
|
| 836 |
+
"GET", url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True,
|
| 837 |
+
) as resp:
|
| 838 |
+
if resp.status_code != 200:
|
| 839 |
+
return None
|
| 840 |
+
chunks: list[bytes] = []
|
| 841 |
+
total = 0
|
| 842 |
+
for chunk in resp.iter_bytes(chunk_size=8192):
|
| 843 |
+
total += len(chunk)
|
| 844 |
+
if total > max_bytes:
|
| 845 |
+
logger.debug("๋ฏธ๋์ด ํฌ๊ธฐ ์ด๊ณผ (%d > %d): %s", total, max_bytes, url[:60])
|
| 846 |
+
return None
|
| 847 |
+
chunks.append(chunk)
|
| 848 |
+
return b"".join(chunks)
|
| 849 |
+
except Exception as e:
|
| 850 |
+
logger.debug("๋ฏธ๋์ด ๋ค์ด๋ก๋ ์คํจ: %s โ %s", url[:60], e)
|
| 851 |
+
return None
|
| 852 |
+
|
| 853 |
+
|
| 854 |
+
def _parse_ai_places(text: str) -> list[str]:
|
| 855 |
+
"""Gemini ์๋ต์์ ์ฅ์๋ช
๋ฐฐ์ด์ ํ์ฑํ๋ค."""
|
| 856 |
+
if not text:
|
| 857 |
+
logger.debug("AI ์๋ต ๋น์ด์์")
|
| 858 |
+
return []
|
| 859 |
+
|
| 860 |
+
raw_text = text # ๋๋ฒ๊น
์ฉ ์๋ณธ ๋ณด์กด
|
| 861 |
+
text = text.strip()
|
| 862 |
+
|
| 863 |
+
# Gemini 2.5 Flash thinking ๋ธ๋ก ์ ๊ฑฐ
|
| 864 |
+
if "<think>" in text:
|
| 865 |
+
# thinking ๋ธ๋ก ์ดํ์ ์ค์ ์๋ต๋ง ์ถ์ถ
|
| 866 |
+
parts = text.split("</think>")
|
| 867 |
+
text = parts[-1].strip() if len(parts) > 1 else text
|
| 868 |
+
|
| 869 |
+
# ๋งํฌ๋ค์ด ์ฝ๋ ๋ธ๋ก ์ ๊ฑฐ
|
| 870 |
+
if text.startswith("```"):
|
| 871 |
+
text = text.split("\n", 1)[-1]
|
| 872 |
+
if text.endswith("```"):
|
| 873 |
+
text = text.rsplit("```", 1)[0]
|
| 874 |
+
text = text.strip()
|
| 875 |
+
|
| 876 |
+
# JSON ๋ฐฐ์ด ์ง์ ํ์ฑ ์๋
|
| 877 |
+
try:
|
| 878 |
+
result = json.loads(text)
|
| 879 |
+
if isinstance(result, list):
|
| 880 |
+
return [p.strip() for p in result if isinstance(p, str) and p.strip()]
|
| 881 |
+
except (json.JSONDecodeError, ValueError):
|
| 882 |
+
pass
|
| 883 |
+
|
| 884 |
+
# ํ
์คํธ ์์ JSON ๋ฐฐ์ด์ด ํฌํจ๋ ๊ฒฝ์ฐ ์ถ์ถ
|
| 885 |
+
match = re.search(r'\[.*?\]', text, re.DOTALL)
|
| 886 |
+
if match:
|
| 887 |
+
try:
|
| 888 |
+
result = json.loads(match.group())
|
| 889 |
+
if isinstance(result, list):
|
| 890 |
+
return [p.strip() for p in result if isinstance(p, str) and p.strip()]
|
| 891 |
+
except (json.JSONDecodeError, ValueError):
|
| 892 |
+
pass
|
| 893 |
+
|
| 894 |
+
logger.debug("AI ์๋ต ํ์ฑ ์คํจ: %.200s", raw_text)
|
| 895 |
+
return []
|
| 896 |
+
|
| 897 |
+
|
| 898 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 899 |
# Shared Utility
|
| 900 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 901 |
|
| 902 |
|
| 903 |
+
def _add_to_metrics(
|
| 904 |
+
spot_metrics: dict[str, dict],
|
| 905 |
+
spot_id: str,
|
| 906 |
+
post: dict,
|
| 907 |
+
capped_engagement: int,
|
| 908 |
+
method: str,
|
| 909 |
+
) -> None:
|
| 910 |
+
"""๊ฒ์๋ฌผ 1๊ฑด์ spot_metrics์ ์ง๊ณํ๋ค."""
|
| 911 |
+
if spot_id not in spot_metrics:
|
| 912 |
+
spot_metrics[spot_id] = {
|
| 913 |
+
"post_count": 0,
|
| 914 |
+
"total_likes": 0,
|
| 915 |
+
"total_comments": 0,
|
| 916 |
+
"weighted_score": 0,
|
| 917 |
+
"match_methods": set(),
|
| 918 |
+
"hashtags": set(),
|
| 919 |
+
"source_accounts": set(),
|
| 920 |
+
}
|
| 921 |
+
|
| 922 |
+
likes = post.get("likes_count", 0) or 0
|
| 923 |
+
comments = post.get("comments_count", 0) or 0
|
| 924 |
+
|
| 925 |
+
m = spot_metrics[spot_id]
|
| 926 |
+
m["post_count"] += 1
|
| 927 |
+
m["total_likes"] += likes
|
| 928 |
+
m["total_comments"] += comments
|
| 929 |
+
m["weighted_score"] += capped_engagement
|
| 930 |
+
m["match_methods"].add(method)
|
| 931 |
+
for tag in post.get("hashtags", []):
|
| 932 |
+
tag = tag.strip().lstrip("#")
|
| 933 |
+
if tag and len(tag) >= 2:
|
| 934 |
+
m["hashtags"].add(tag)
|
| 935 |
+
account = post.get("_source_account", "")
|
| 936 |
+
if account:
|
| 937 |
+
m["source_accounts"].add(account)
|
| 938 |
+
|
| 939 |
+
|
| 940 |
+
def _finalize_metrics(spot_metrics: dict[str, dict]) -> None:
|
| 941 |
+
"""set โ sorted list ๋ณํ + avg_engagement ๊ณ์ฐ."""
|
| 942 |
+
for metrics in spot_metrics.values():
|
| 943 |
+
count = max(metrics["post_count"], 1)
|
| 944 |
+
metrics["avg_engagement"] = int(
|
| 945 |
+
round((metrics["total_likes"] + metrics["total_comments"]) / count)
|
| 946 |
+
)
|
| 947 |
+
metrics["match_methods"] = sorted(metrics["match_methods"])
|
| 948 |
+
metrics["hashtags"] = sorted(metrics["hashtags"])
|
| 949 |
+
metrics["source_accounts"] = sorted(metrics["source_accounts"])
|
| 950 |
+
|
| 951 |
+
|
| 952 |
def _dedup_posts_by_url(posts: list[dict]) -> list[dict]:
|
| 953 |
"""URL ๊ธฐ๋ฐ ์ค๋ณต ์ ๊ฑฐ."""
|
| 954 |
seen_urls: set[str] = set()
|
trend_engine/trend_scorer.py
CHANGED
|
@@ -236,12 +236,12 @@ def generate_weekly_ranking(supabase: Client | None = None) -> dict:
|
|
| 236 |
mt = row["metric_type"]
|
| 237 |
spots_last.setdefault(sid, {}).setdefault(src, {})[mt] = row["metric_value"]
|
| 238 |
|
| 239 |
-
# --
|
| 240 |
-
# DB
|
| 241 |
# ์ค์ฝ์ด๋ฌ๋ "instagram" ํค๋ก ์ฐธ์กฐํ๋ฏ๋ก ๊ฐ์ฅ ์ต๊ทผ ๋ฐฑ์๋ ๋ฐ์ดํฐ๋ฅผ ์ฌ์ฉ
|
| 242 |
for spots_dict in (spots_this, spots_last):
|
| 243 |
for sid in list(spots_dict.keys()):
|
| 244 |
-
for ig_src in ("instagram_apify", "instagram_ed"):
|
| 245 |
if ig_src in spots_dict[sid]:
|
| 246 |
spots_dict[sid]["instagram"] = spots_dict[sid].pop(ig_src)
|
| 247 |
break # ์ฒซ ๋ฒ์งธ ๋ฐ๊ฒฌ๋ ๋ฐฑ์๋ ์ฌ์ฉ (ํ๋๋ง ํ์ฑ)
|
|
|
|
| 236 |
mt = row["metric_type"]
|
| 237 |
spots_last.setdefault(sid, {}).setdefault(src, {})[mt] = row["metric_value"]
|
| 238 |
|
| 239 |
+
# -- instagram_* โ "instagram" ํค๋ก ํตํฉ --
|
| 240 |
+
# DB source๊ฐ instagram_apify/instagram_ed/instagram_influencer๋ก ๋ถ๋ฆฌ๋์์ผ๋
|
| 241 |
# ์ค์ฝ์ด๋ฌ๋ "instagram" ํค๋ก ์ฐธ์กฐํ๋ฏ๋ก ๊ฐ์ฅ ์ต๊ทผ ๋ฐฑ์๋ ๋ฐ์ดํฐ๋ฅผ ์ฌ์ฉ
|
| 242 |
for spots_dict in (spots_this, spots_last):
|
| 243 |
for sid in list(spots_dict.keys()):
|
| 244 |
+
for ig_src in ("instagram_influencer", "instagram_apify", "instagram_ed"):
|
| 245 |
if ig_src in spots_dict[sid]:
|
| 246 |
spots_dict[sid]["instagram"] = spots_dict[sid].pop(ig_src)
|
| 247 |
break # ์ฒซ ๋ฒ์งธ ๋ฐ๊ฒฌ๋ ๋ฐฑ์๋ ์ฌ์ฉ (ํ๋๋ง ํ์ฑ)
|
utils/trending_builder.py
CHANGED
|
@@ -65,8 +65,8 @@ CHANNEL_THEMES: dict[str, dict] = {
|
|
| 65 |
"metric_type": "post_count",
|
| 66 |
"sort_by": "metric_value",
|
| 67 |
"min_spots": 3,
|
| 68 |
-
# DB ์์ค๋ช
์ด
|
| 69 |
-
"db_sources": ["instagram_apify", "instagram_ed"],
|
| 70 |
},
|
| 71 |
}
|
| 72 |
|
|
@@ -103,7 +103,7 @@ class TrendingBuilder:
|
|
| 103 |
period_start, _ = get_week_period()
|
| 104 |
|
| 105 |
# ์ด๋ฒ ์ฃผ spot_trends์์ ํด๋น ์ฑ๋+metric_type ์กฐํ
|
| 106 |
-
# instagram์ DB ์์ค๋ช
์ด instagram_apify/instagram_ed๋ก ๋ถ๋ฆฌ๋จ โ in_ ์ฟผ๋ฆฌ
|
| 107 |
sort_ascending = theme.get("sort_ascending", False)
|
| 108 |
db_sources = theme.get("db_sources", [channel])
|
| 109 |
query = (
|
|
@@ -127,7 +127,7 @@ class TrendingBuilder:
|
|
| 127 |
if not trend_rows:
|
| 128 |
return []
|
| 129 |
|
| 130 |
-
# ๊ฐ์ spot_id๊ฐ ์ฌ๋ฌ ์์ค(์:
|
| 131 |
# ์ฌ ์ ์์ผ๋ฏ๋ก spot_id ๊ธฐ์ค ์ค๋ณต ์ ๊ฑฐ (metric_value๊ฐ ํฐ ํ ์ฐ์ )
|
| 132 |
if len(db_sources) > 1:
|
| 133 |
seen: dict[str, dict] = {}
|
|
|
|
| 65 |
"metric_type": "post_count",
|
| 66 |
"sort_by": "metric_value",
|
| 67 |
"min_spots": 3,
|
| 68 |
+
# DB ์์ค๋ช
์ด instagram_influencer (v5.0) + ๋ ๊ฑฐ์ ๋ถ๋ฆฌ ์์ค
|
| 69 |
+
"db_sources": ["instagram_influencer", "instagram_apify", "instagram_ed"],
|
| 70 |
},
|
| 71 |
}
|
| 72 |
|
|
|
|
| 103 |
period_start, _ = get_week_period()
|
| 104 |
|
| 105 |
# ์ด๋ฒ ์ฃผ spot_trends์์ ํด๋น ์ฑ๋+metric_type ์กฐํ
|
| 106 |
+
# instagram์ DB ์์ค๋ช
์ด instagram_influencer/instagram_apify/instagram_ed๋ก ๋ถ๋ฆฌ๋จ โ in_ ์ฟผ๋ฆฌ
|
| 107 |
sort_ascending = theme.get("sort_ascending", False)
|
| 108 |
db_sources = theme.get("db_sources", [channel])
|
| 109 |
query = (
|
|
|
|
| 127 |
if not trend_rows:
|
| 128 |
return []
|
| 129 |
|
| 130 |
+
# ๊ฐ์ spot_id๊ฐ ์ฌ๋ฌ ์์ค(์: instagram_influencer + ๋ ๊ฑฐ์)์์
|
| 131 |
# ์ฌ ์ ์์ผ๋ฏ๋ก spot_id ๊ธฐ์ค ์ค๋ณต ์ ๊ฑฐ (metric_value๊ฐ ํฐ ํ ์ฐ์ )
|
| 132 |
if len(db_sources) > 1:
|
| 133 |
seen: dict[str, dict] = {}
|