JHyeok5 committed on
Commit
206c732
·
verified ·
1 Parent(s): 19aa2f9

Upload folder using huggingface_hub

requirements-trend.txt CHANGED
@@ -4,7 +4,6 @@ beautifulsoup4>=4.12.0
  lxml>=5.0.0
  requests>=2.31.0
  google-api-python-client>=2.100.0
- # ensembledata SDK optional (called directly via httpx)
- # ensembledata>=0.2.0
  supabase>=2.0.0
  python-dotenv>=1.0.0

  lxml>=5.0.0
  requests>=2.31.0
  google-api-python-client>=2.100.0
+ google-genai>=1.0.0,<1.64.0
  supabase>=2.0.0
  python-dotenv>=1.0.0
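
Review note: the new google-genai pin backs the Pass 2 multimodal analysis added in this commit (the upper bound presumably guards against breaking SDK releases). A minimal sketch of driving the pinned SDK for one post image; the model name matches the "Gemini 2.5 Flash" cited in the test scripts below, while the helper name and prompt are illustrative assumptions, not this repo's actual _ai_analyze_content internals:

import os
from google import genai
from google.genai import types

def guess_spot_name(image_bytes: bytes, caption: str) -> str:
    # Hypothetical helper: ask Gemini which place one post image shows.
    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
    resp = client.models.generate_content(
        model="gemini-2.5-flash",  # model named in the v5.1 test logs
        contents=[
            types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
            f"Caption: {caption}\nWhich Jeju/Aewol spot does this post show? Reply with the place name only.",
        ],
    )
    return (resp.text or "").strip()
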
scripts/run_trend_engine.py CHANGED
@@ -4,10 +4,10 @@ RE:Play Trend Engine v3 — weekly batch orchestrator
  Sequential pipeline:
  1. Kakao Map grid scan + review parsing (builds the trend_spots master)
  2. SpotMatcher initialization (preloads trend_spots + story_spots)
- 3. Naver blog collection (URL discovery + crawling + DB save)
- 4. Blog body → place-name extraction + mention_count aggregation
- 5. YouTube API (SpotMatcher integration)
- 6. Instagram Apify (SpotMatcher integration)
  7. Composite score calculation + ranking generation

  Usage:
@@ -122,11 +122,10 @@ def main() -> None:
      youtube = YouTubeCollector(sb, spot_matcher=matcher)
      run_step("3_youtube", youtube.run, results)

-     # ── 4. Instagram (Apify/EnsembleData dual backend, SpotMatcher integration) ──
-     ig_backend = os.environ.get("INSTAGRAM_BACKEND", "apify")
-     logger.info("Instagram backend: %s", ig_backend)
      instagram = InstagramCollector(sb, spot_matcher=matcher)
-     run_step("4_instagram", instagram.run, results)

      # ── 5. Naver Place — disabled (Place ID matching unavailable) ──
      logger.info("Naver Place: disabled (Place ID matching unavailable, 2026-02)")
@@ -273,7 +272,7 @@ def main() -> None:

      # ── 8. Composite score calculation + ranking generation (requires ≥2 successful channels) ──
      # Count only the collection-channel steps (1, 3, 4, 6)
-     collection_steps = ["1_kakaomap", "3_youtube", "4_instagram", "6_naver_blog"]
      successful_channels = [s for s in collection_steps if results.get(s, {}).get("status") == "ok"]

      def calc_scores():

  Sequential pipeline:
  1. Kakao Map grid scan + review parsing (builds the trend_spots master)
  2. SpotMatcher initialization (preloads trend_spots + story_spots)
+ 3. YouTube API (SpotMatcher integration)
+ 4. Instagram influencer monitoring v5.0 (SpotMatcher integration)
+ 5. Naver blog collection (URL discovery + crawling + DB save)
+ 6. Blog body → place-name extraction + mention_count aggregation
  7. Composite score calculation + ranking generation

  Usage:

      youtube = YouTubeCollector(sb, spot_matcher=matcher)
      run_step("3_youtube", youtube.run, results)

+     # ── 4. Instagram influencer monitoring v5.1 Multimodal (SpotMatcher integration) ──
+     logger.info("Instagram: influencer monitoring v5.1 (Multimodal AI)")
      instagram = InstagramCollector(sb, spot_matcher=matcher)
+     run_step("4_instagram_influencer", instagram.run, results)

      # ── 5. Naver Place — disabled (Place ID matching unavailable) ──
      logger.info("Naver Place: disabled (Place ID matching unavailable, 2026-02)")

      # ── 8. Composite score calculation + ranking generation (requires ≥2 successful channels) ──
      # Count only the collection-channel steps (1, 3, 4, 6)
+     collection_steps = ["1_kakaomap", "3_youtube", "4_instagram_influencer", "6_naver_blog"]
      successful_channels = [s for s in collection_steps if results.get(s, {}).get("status") == "ok"]

      def calc_scores():
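
Review note: the renamed step key matters because the scoring gate counts statuses by exact key. A small self-contained sketch of that gate; only the ≥2-channel rule and the step keys come from the hunk above, the function wrapper is an assumption:

def should_score(results: dict) -> bool:
    # Scoring runs only when at least two collection channels finished "ok".
    collection_steps = ["1_kakaomap", "3_youtube", "4_instagram_influencer", "6_naver_blog"]
    ok = [s for s in collection_steps if results.get(s, {}).get("status") == "ok"]
    return len(ok) >= 2

# e.g. should_score({"1_kakaomap": {"status": "ok"}, "3_youtube": {"status": "ok"}}) → True
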
scripts/test_instagram_collector.py ADDED
@@ -0,0 +1,272 @@
+ """
+ Instagram Collector v5.1 — integration test script
+
+ Verifies the core stages of the Instagram influencer pipeline:
+ 1. Load influencer accounts from the DB
+ 2. Collect one account's posts via Apify (keeps cost minimal)
+ 3. Post normalization + time-window/engagement filters
+ 4. Pass 1: rule-based matching (location tags + hashtags)
+ 5. Pass 2: Gemini multimodal AI analysis (a single image)
+ 6. Match-result summary (no DB writes)
+
+ Usage:
+     python backend/scripts/test_instagram_collector.py
+ """
+
+ import json
+ import logging
+ import os
+ import sys
+ import time
+ from datetime import datetime, timedelta, timezone
+
+ # Add backend/ to the import path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+ # Load .env
+ try:
+     from dotenv import load_dotenv
+     env_path = os.path.join(os.path.dirname(__file__), "..", "..", ".env")
+     load_dotenv(env_path)
+ except ImportError:
+     pass
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+     datefmt="%H:%M:%S",
+ )
+ logger = logging.getLogger("test_instagram")
+
+
+ def test_step(name: str, func):
+     """Wrapper that runs one test stage and reports pass/fail."""
+     logger.info("━━━ [TEST] %s ━━━", name)
+     start = time.time()
+     try:
+         result = func()
+         elapsed = time.time() - start
+         logger.info(" ✓ PASS — %.1fs", elapsed)
+         return result
+     except Exception as e:
+         elapsed = time.time() - start
+         logger.error(" ✗ FAIL — %s (%.1fs)", e, elapsed)
+         import traceback
+         traceback.print_exc()
+         return None
+
+
+ def main():
+     from supabase import create_client
+
+     url = os.environ.get("SUPABASE_URL") or os.environ.get("VITE_SUPABASE_URL")
+     key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
+     if not url or not key:
+         logger.error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY env vars missing")
+         sys.exit(1)
+
+     sb = create_client(url, key)
+
+     # ── 1. Load influencer accounts ──
+     def step1_load_accounts():
+         from trend_engine.collectors.instagram import InstagramCollector
+         collector = InstagramCollector(sb)
+         accounts = collector._load_influencer_accounts()
+         assert len(accounts) > 0, "0 accounts loaded"
+         logger.info(" %d accounts loaded:", len(accounts))
+         for a in accounts:
+             logger.info("   [%d] @%s (%s)", a.get("priority", 0), a["username"], a.get("category", ""))
+         return accounts
+
+     accounts = test_step("1. Load influencer accounts (DB)", step1_load_accounts)
+     if not accounts:
+         logger.error("account load failed — aborting test")
+         sys.exit(1)
+
+     # ── 2. Apify scrape (one account only) ──
+     apify_token = os.environ.get("APIFY_API_TOKEN")
+     if not apify_token:
+         logger.error("APIFY_API_TOKEN env var missing — skipping the scrape test")
+         sys.exit(1)
+
+     # Prefer a Jeju-based influencer; otherwise fall back to the first (priority 1) account
+     jeju_accounts = [a for a in accounts if a.get("category") == "lifestyle"]
+     test_account = jeju_accounts[0] if jeju_accounts else accounts[0]
+     test_username = test_account["username"]
+
+     def step2_apify_scrape():
+         from trend_engine.collectors.instagram import InstagramCollector
+         collector = InstagramCollector(sb)
+
+         # Scrape a single account, capped at 5 posts
+         from trend_engine.collectors import instagram as ig_mod
+         orig_limit = ig_mod.RESULTS_LIMIT_PER_ACCOUNT
+         orig_window = ig_mod.TREND_WINDOW_DAYS
+         ig_mod.RESULTS_LIMIT_PER_ACCOUNT = 5  # test-only cap
+         ig_mod.TREND_WINDOW_DAYS = 90  # test-only: widen to 90 days (enough matching data)
+
+         try:
+             posts = collector._scrape_profiles([test_username], "test_batch")
+         finally:
+             ig_mod.RESULTS_LIMIT_PER_ACCOUNT = orig_limit
+             ig_mod.TREND_WINDOW_DAYS = orig_window
+
+         logger.info(" @%s → %d posts", test_username, len(posts))
+         for i, p in enumerate(posts[:3]):
+             logger.info(
+                 "   [%d] type=%s likes=%d loc=%s caption=%.50s...",
+                 i + 1,
+                 p.get("media_type", "?"),
+                 p.get("likes_count", 0),
+                 p.get("location_name", "")[:30] or "(none)",
+                 (p.get("caption", "") or "")[:50],
+             )
+             if p.get("media_url"):
+                 logger.info("       media_url=%s", p["media_url"][:80])
+         return posts
+
+     posts = test_step(f"2. Apify scrape (@{test_username}, capped at 5)", step2_apify_scrape)
+     if not posts:
+         logger.warning("0 posts — the matching tests below will use synthetic data")
+         posts = []
+
+     # ── 3. SpotMatcher initialization ──
+     def step3_spot_matcher():
+         from trend_engine.spot_matcher import SpotMatcher
+         matcher = SpotMatcher(sb)
+         logger.info(
+             " trend_spots: %d, story_spots: %d",
+             len(matcher.trend_spots), len(matcher.story_spots),
+         )
+         # Sample matching check
+         test_names = ["์นดํŽ˜ ๋ ˆ์ด์–ด๋“œ ์• ์›”", "ํ˜‘์žฌํ•ด์ˆ˜์š•์žฅ", "์ƒˆ๋ณ„์˜ค๋ฆ„", "์• ์›” ์นดํŽ˜๊ฑฐ๋ฆฌ"]
+         for name in test_names:
+             sid = matcher.match(name)
+             logger.info(" match('%s') → %s", name, sid or "(unmatched)")
+         return matcher
+
+     matcher = test_step("3. SpotMatcher initialization", step3_spot_matcher)
+     if not matcher:
+         logger.error("SpotMatcher initialization failed — aborting test")
+         sys.exit(1)
+
+     # ── 4. Pass 1: rule-based matching ──
+     def step4_pass1_matching():
+         from trend_engine.collectors.instagram import InstagramCollector
+         from trend_engine.collectors import instagram as ig_mod
+         collector = InstagramCollector(sb, spot_matcher=matcher)
+
+         # Test-only: widen the time-window filter (90 days)
+         orig_window = ig_mod.TREND_WINDOW_DAYS
+         ig_mod.TREND_WINDOW_DAYS = 90
+
+         if not posts:
+             logger.info(" no real posts — testing with synthetic data")
+             test_posts = [
+                 {
+                     "search_term": "@test",
+                     "search_type": "profile",
+                     "location_name": "์นดํŽ˜ ๋ ˆ์ด์–ด๋“œ ์• ์›”",
+                     "likes_count": 500,
+                     "comments_count": 30,
+                     "caption": "์• ์›” ์นดํŽ˜ ๋„ˆ๋ฌด ์ข‹๋‹ค #์• ์›”์นดํŽ˜ #์ œ์ฃผ๋„",
+                     "timestamp": datetime.now(timezone.utc).isoformat(),
+                     "url": "https://instagram.com/p/test1",
+                     "hashtags": ["์• ์›”์นดํŽ˜", "์ œ์ฃผ๋„"],
+                     "media_url": "",
+                     "media_type": "Image",
+                     "_source_account": "test",
+                 },
+                 {
+                     "search_term": "@test",
+                     "search_type": "profile",
+                     "location_name": "",
+                     "likes_count": 200,
+                     "comments_count": 10,
+                     "caption": "์ƒˆ๋ณ„์˜ค๋ฆ„ ์ผ์ถœ ๋ณด๊ณ ์™”์–ด์š” #์ƒˆ๋ณ„์˜ค๋ฆ„ #์ œ์ฃผ์—ฌํ–‰",
+                     "timestamp": datetime.now(timezone.utc).isoformat(),
+                     "url": "https://instagram.com/p/test2",
+                     "hashtags": ["์ƒˆ๋ณ„์˜ค๋ฆ„", "์ œ์ฃผ์—ฌํ–‰"],
+                     "media_url": "https://example.com/image.jpg",
+                     "media_type": "Image",
+                     "_source_account": "test",
+                 },
+             ]
+         else:
+             test_posts = posts
+
+         spot_metrics, unmatched, match_stats = collector._aggregate_with_unmatched(test_posts)
+
+         logger.info(" total posts: %d", len(test_posts))
+         logger.info(" Pass 1 matched: %d spots", len(spot_metrics))
+         logger.info(" unmatched (→ Pass 2 candidates): %d", len(unmatched))
+         logger.info(" match stats: %s", json.dumps(match_stats, ensure_ascii=False))
+
+         if spot_metrics:
+             logger.info(" matched spots:")
+             for sid, m in spot_metrics.items():
+                 logger.info(
+                     "   %s: posts=%d, engagement=%d, methods=%s",
+                     sid, m["post_count"], m["weighted_score"],
+                     m["match_methods"],
+                 )
+
+         ig_mod.TREND_WINDOW_DAYS = orig_window
+         return spot_metrics, unmatched, match_stats
+
+     result4 = test_step("4. Pass 1 — rule-based matching", step4_pass1_matching)
+     if not result4:
+         logger.error("Pass 1 test failed")
+         sys.exit(1)
+
+     spot_metrics, unmatched, match_stats = result4
+
+     # ── 5. Pass 2: AI multimodal analysis (optional) ──
+     gemini_key = os.environ.get("GEMINI_API_KEY")
+     if gemini_key and unmatched:
+         def step5_ai_multimodal():
+             from trend_engine.collectors.instagram import InstagramCollector
+             collector = InstagramCollector(sb, spot_matcher=matcher)
+
+             # Analyze a single post only
+             test_unmatched = unmatched[:1]
+             p = test_unmatched[0]
+             logger.info(
+                 " test post: type=%s, caption=%.60s...",
+                 p.get("media_type", "?"),
+                 (p.get("caption", "") or "")[:60],
+             )
+             if p.get("media_url"):
+                 logger.info(" media_url=%s", p["media_url"][:80])
+
+             ai_count = collector._ai_analyze_content(
+                 test_unmatched, spot_metrics, match_stats,
+             )
+             logger.info(" AI matched: %d posts", ai_count)
+             return ai_count
+
+         test_step("5. Pass 2 — Gemini multimodal AI (1 post)", step5_ai_multimodal)
+     elif not gemini_key:
+         logger.info("━━━ [SKIP] 5. AI multimodal — GEMINI_API_KEY not set ━━━")
+     else:
+         logger.info("━━━ [SKIP] 5. AI multimodal — no unmatched posts ━━━")
+
+     # ── 6. Final summary ──
+     logger.info("")
+     logger.info("═══════════════════════════════════════════")
+     logger.info(" Instagram v5.1 test result summary")
+     logger.info("═══════════════════════════════════════════")
+     logger.info(" influencer accounts: %d (DB)", len(accounts))
+     logger.info(" Apify scrape: @%s → %d posts", test_username, len(posts))
+     logger.info(" SpotMatcher: trend=%d, story=%d",
+                 len(matcher.trend_spots), len(matcher.story_spots))
+     logger.info(" Pass 1 matched: %d spots", len(spot_metrics))
+     logger.info(" match stats: %s", json.dumps(match_stats, ensure_ascii=False))
+     logger.info(" AI multimodal: %s", "enabled" if gemini_key else "disabled")
+     logger.info("═══════════════════════════════════════════")
+     logger.info(" ⚠️ nothing saved to the DB (test mode)")
+     logger.info("═══════════════════════════════════════════")
+
+
+ if __name__ == "__main__":
+     main()
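
Review note: both test scripts mutate module constants and restore them by hand, and step 4 above even skips the restore if _aggregate_with_unmatched raises. Under pytest the same effect is safer with the monkeypatch fixture, which always restores on teardown. A sketch, assuming only the module attributes shown above:

from trend_engine.collectors import instagram as ig_mod

def test_scrape_with_tight_limits(monkeypatch):
    # pytest restores both attributes at teardown, even if the test raises
    monkeypatch.setattr(ig_mod, "RESULTS_LIMIT_PER_ACCOUNT", 5)
    monkeypatch.setattr(ig_mod, "TREND_WINDOW_DAYS", 90)
    assert ig_mod.RESULTS_LIMIT_PER_ACCOUNT == 5  # exercise the collector here
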
scripts/test_instagram_full.py ADDED
@@ -0,0 +1,162 @@
+ """
+ Instagram Collector v5.1 — full influencer live test
+
+ Runs the real pipeline across all 15 influencers.
+ Nothing is saved to the DB; only the results are inspected.
+
+ Usage:
+     python3 backend/scripts/test_instagram_full.py
+ """
+
+ import json
+ import logging
+ import os
+ import sys
+ import time
+ from datetime import datetime, timedelta, timezone
+
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+ try:
+     from dotenv import load_dotenv
+     load_dotenv(os.path.join(os.path.dirname(__file__), "..", "..", ".env"))
+ except ImportError:
+     pass
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+     datefmt="%H:%M:%S",
+ )
+ logger = logging.getLogger("test_instagram_full")
+
+
+ def main():
+     from supabase import create_client
+     from trend_engine.collectors.instagram import InstagramCollector
+     from trend_engine.spot_matcher import SpotMatcher
+
+     url = os.environ.get("SUPABASE_URL") or os.environ.get("VITE_SUPABASE_URL")
+     key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
+     if not url or not key:
+         logger.error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY env vars missing")
+         sys.exit(1)
+
+     sb = create_client(url, key)
+
+     # Initialize SpotMatcher
+     matcher = SpotMatcher(sb)
+     logger.info("SpotMatcher: trend=%d, story=%d", len(matcher.trend_spots), len(matcher.story_spots))
+
+     # Initialize InstagramCollector
+     collector = InstagramCollector(sb, spot_matcher=matcher)
+
+     # ── 1. Load accounts ──
+     accounts = collector._load_influencer_accounts()
+     logger.info("%d influencer accounts loaded", len(accounts))
+
+     # ── 2. Collect all posts ──
+     logger.info("━━━ collecting posts from every influencer ━━━")
+     start = time.time()
+     all_posts = collector._collect_from_accounts(accounts)
+     collect_elapsed = time.time() - start
+     logger.info("collection done: %d posts (%.1fs)", len(all_posts), collect_elapsed)
+
+     # Per-account collection stats
+     account_stats: dict[str, int] = {}
+     for p in all_posts:
+         acct = p.get("_source_account", "unknown")
+         account_stats[acct] = account_stats.get(acct, 0) + 1
+
+     logger.info("━━━ per-account collection ━━━")
+     for acct, count in sorted(account_stats.items(), key=lambda x: -x[1]):
+         logger.info(" @%-25s → %d posts", acct, count)
+
+     if not all_posts:
+         logger.error("no posts collected — exiting")
+         sys.exit(1)
+
+     # ── 3. Pass 1: rule-based matching ──
+     logger.info("━━━ Pass 1: rule-based matching ━━━")
+     spot_metrics, unmatched_posts, match_stats = collector._aggregate_with_unmatched(all_posts)
+
+     logger.info("Pass 1 results:")
+     logger.info(" total posts: %d", len(all_posts))
+     logger.info(" dropped by time window: %d (older than 30 days)", match_stats["filtered_old"])
+     logger.info(" dropped for low engagement: %d", match_stats["filtered_low_engagement"])
+     logger.info(" location-tag matches: %d", match_stats["location_tag"])
+     logger.info(" hashtag matches: %d", match_stats["hashtag"])
+     logger.info(" unmatched → AI candidates: %d", match_stats["unmatched"])
+     logger.info(" Pass 1 spots: %d", len(spot_metrics))
+
+     if spot_metrics:
+         logger.info(" matched spots:")
+         for sid, m in sorted(spot_metrics.items(), key=lambda x: -x[1]["weighted_score"]):
+             logger.info(
+                 "   %s: posts=%d, score=%d, methods=%s, accounts=%s",
+                 sid, m["post_count"], m["weighted_score"],
+                 m["match_methods"], m["source_accounts"],
+             )
+
+     # ── 4. Pass 2: AI multimodal analysis (all unmatched posts) ──
+     ai_matched = 0
+     if unmatched_posts and os.environ.get("GEMINI_API_KEY"):
+         logger.info("━━━ Pass 2: Gemini 2.5 Flash multimodal analysis (%d posts) ━━━", len(unmatched_posts))
+
+         # Media-type stats for the unmatched posts
+         image_count = sum(1 for p in unmatched_posts if p.get("media_type") != "Video")
+         video_count = sum(1 for p in unmatched_posts if p.get("media_type") == "Video")
+         logger.info(" images: %d, videos: %d", image_count, video_count)
+
+         # Unmatched post details (caption preview)
+         for i, p in enumerate(unmatched_posts[:10]):
+             logger.info(
+                 "   [%d] @%s type=%s caption=%.60s...",
+                 i + 1, p.get("_source_account", "?"),
+                 p.get("media_type", "?"),
+                 (p.get("caption", "") or "")[:60],
+             )
+
+         start_ai = time.time()
+         ai_matched = collector._ai_analyze_content(unmatched_posts, spot_metrics, match_stats)
+         ai_elapsed = time.time() - start_ai
+         logger.info("AI analysis done: %d matched (%.1fs)", ai_matched, ai_elapsed)
+     elif not os.environ.get("GEMINI_API_KEY"):
+         logger.warning("GEMINI_API_KEY not set — skipping AI analysis")
+     else:
+         logger.info("no unmatched posts — AI analysis not needed")
+
+     # ── 5. Final results ──
+     logger.info("")
+     logger.info("═══════════════════════════════════════════════════")
+     logger.info(" Instagram v5.1 full test results")
+     logger.info("═══════════════════════════════════════════════════")
+     logger.info(" influencers: %d accounts", len(accounts))
+     logger.info(" collected: %d accounts (%d posts)", len(account_stats), len(all_posts))
+     logger.info(" collection time: %.1fs", collect_elapsed)
+     logger.info(" ─────────────────────────────────────────────")
+     logger.info(" posts within 30 days: %d", len(all_posts) - match_stats["filtered_old"])
+     logger.info(" engagement ≥ 50: %d",
+                 len(all_posts) - match_stats["filtered_old"] - match_stats["filtered_low_engagement"])
+     logger.info(" ─────────────────────────────────────────────")
+     logger.info(" Pass 1 (location tags): %d", match_stats["location_tag"])
+     logger.info(" Pass 1 (hashtags): %d", match_stats["hashtag"])
+     logger.info(" Pass 2 (AI multimodal): %d", match_stats.get("ai_matched", 0))
+     logger.info(" total matched spots: %d", len(spot_metrics))
+     logger.info(" ─────────────────────────────────────────────")
+
+     if spot_metrics:
+         logger.info(" final matched spots:")
+         for sid, m in sorted(spot_metrics.items(), key=lambda x: -x[1]["weighted_score"]):
+             logger.info(
+                 "   %s: posts=%d, score=%d",
+                 sid, m["post_count"], m["weighted_score"],
+             )
+
+     logger.info(" ─────────────────────────────────────────────")
+     logger.info(" ⚠️ nothing saved to the DB (test mode)")
+     logger.info("═══════════════════════════════════════════════════")
+
+
+ if __name__ == "__main__":
+     main()
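
Review note: the summary above derives its funnel from match_stats instead of re-scanning the posts. That arithmetic is only consistent if the counters partition the input (every post lands in exactly one of filtered_old, filtered_low_engagement, location_tag, hashtag, unmatched), which the summary layout suggests but the collector does not assert. A small self-check sketch under that assumption:

def check_funnel(total: int, s: dict) -> None:
    # Posts surviving both filters should be fully accounted for by Pass 1 outcomes.
    within_window = total - s["filtered_old"]
    survivors = within_window - s["filtered_low_engagement"]
    assert survivors == s["location_tag"] + s["hashtag"] + s["unmatched"]
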
trend_engine/collectors/instagram.py CHANGED
@@ -1,165 +1,64 @@
  """
- Instagram Collector — Dual Backend (v4.1)
-
- The INSTAGRAM_BACKEND environment variable selects the collection backend:
- - "apify" (default / Phase 1): Apify instagram-hashtag-scraper + directUrls
- - "ed" (Phase 2): EnsembleData REST API + daily budget management
-
- Shared pipeline:
- 1. Template-based dynamic hashtag generation (general + sub-area + proper nouns)
- 2. Backend-specific post collection (Apify Actor / EnsembleData REST)
- 3. Time-window filter (10 days) + minimum engagement threshold + cap
- 4. 3-stage place matching: location tag (prefix stripped) → hashtag (direction-restricted) → caption extraction
-    - Apify extra: Stage 0 directUrls (_direct_spot_id pre-resolved)
  5. Weighted aggregation: weighted_score = sum(min(engagement, cap))
- 6. Saved to the spot_trends table (source = instagram_apify / instagram_ed)
-
- Apify (v3.6):
- - Apify instagram-hashtag-scraper Actor (~30 posts/hashtag)
- - directUrls: semi-automatic Location ID discovery + JSON caching + SpotMatcher integration
-
- EnsembleData (v4.0):
- - REST API /instagram/hashtag/posts (~63 posts/hashtag)
- - Daily unit budget management (BudgetTracker)
- - Hashtag rotation schedule (Free Trial: 1/day, Wood: all)
  """

  from __future__ import annotations

  import json
  import os
  import re
- import logging
- from datetime import datetime, timedelta, timezone, date
-
- # ────────────────────────────────────────────────
- # Backend Switch
- # ────────────────────────────────────────────────
-
- INSTAGRAM_BACKEND = os.getenv("INSTAGRAM_BACKEND", "apify")

- if INSTAGRAM_BACKEND == "apify":
-     from apify_client import ApifyClient
- elif INSTAGRAM_BACKEND == "ed":
-     import httpx

- from trend_engine.place_extractor import PlaceNameExtractor
  from trend_engine.utils import get_week_period, safe_upsert_spot_trend

  logger = logging.getLogger(__name__)


  # ══════════════════════════════════════════════
- # Shared: area settings — when expanding service areas, edit only this section
  # ══════════════════════════════════════════════

- AREA_NAME = "์• ์›”"
- AREA_ALIASES = ["์ œ์ฃผ์• ์›”", "์ œ์ฃผ๋„์• ์›”"]
- AREA_SUB_NAMES = ["ํ•œ๋‹ด", "๊ณฝ์ง€"]
-
- # ────────────────────────────────────────────────
- # Category 1: general area templates
- # ────────────────────────────────────────────────
-
- GENERAL_SUFFIXES = [
-     "์นดํŽ˜", "๋ง›์ง‘", "์—ฌํ–‰", "๊ฐ€๋ณผ๋งŒํ•œ๊ณณ", "ํ•ซํ”Œ",
-     "๊ฐ์„ฑ", "๋””์ €ํŠธ", "ํ•ด์•ˆ", "์‚ฐ์ฑ…", "์ผ๋ชฐ", "์˜ค์…˜๋ทฐ",
- ]
-
-
- def build_general_hashtags(area: str, aliases: list[str]) -> list[str]:
-     """Build hashtags by combining the area name with general suffixes."""
-     tags = []
-     for suffix in GENERAL_SUFFIXES:
-         tags.append(f"{area}{suffix}")
-     for alias in aliases:
-         tags.append(alias)
-     return tags
-
-
- # ────────────────────────────────────────────────
- # Category 2: sub-area hashtags
- # ────────────────────────────────────────────────
-
- SUB_AREA_SUFFIXES = ["ํ•ด๋ณ€", "ํ•ด์ˆ˜์š•์žฅ"]

- def build_sub_area_hashtags(sub_names: list[str]) -> list[str]:
-     """Combine sub-area names with terrain suffixes."""
-     tags = []
-     for name in sub_names:
-         for suffix in SUB_AREA_SUFFIXES:
-             tags.append(f"{name}{suffix}")
-     return tags

-
- # ────────────────────────────────────────────────
- # Category 3: place proper nouns (generated dynamically)
- # ────────────────────────────────────────────────
-
-
- def build_spot_hashtags(supabase_client, limit: int = 15) -> list[str]:
-     """Extract popular place names from the trend_spots table as hashtag candidates."""
-     try:
-         result = (
-             supabase_client.table("trend_spots")
-             .select("name, category")
-             .in_("category", ["๊ด€๊ด‘๋ช…์†Œ", "์นดํŽ˜", "๋ฌธํ™”์‹œ์„ค"])
-             .execute()
-         )
-     except Exception as e:
-         logger.warning("trend_spots query failed (proper-noun hashtags): %s", e)
-         return []
-
-     spot_names = []
-     for row in result.data or []:
-         name = row.get("name", "")
-         if not name or len(name) < 2 or len(name) > 15:
-             continue
-         if " " in name:
-             continue
-         spot_names.append(name)
-
-     spot_names.sort(key=len, reverse=True)
-     return spot_names[:limit]


  # ────────────────────────────────────────────────
- # Final hashtag list assembly
  # ────────────────────────────────────────────────

-
- def build_all_hashtags(supabase_client) -> list[str]:
-     """Merge the three categories and deduplicate into the final hashtag list."""
-     general = build_general_hashtags(AREA_NAME, AREA_ALIASES)
-     sub_area = build_sub_area_hashtags(AREA_SUB_NAMES)
-     spots = build_spot_hashtags(supabase_client, limit=15)
-
-     seen: set[str] = set()
-     all_tags: list[str] = []
-     for tag in general + sub_area + spots:
-         if tag not in seen:
-             seen.add(tag)
-             all_tags.append(tag)
-
-     logger.info(
-         "hashtag mix: general %d + sub-area %d + proper nouns %d = %d total",
-         len(general), len(sub_area), len(spots), len(all_tags),
-     )
-     return all_tags
-
-
- # ══════════════════════════════════════════════
- # Shared: common settings
- # ══════════════════════════════════════════════
-
- # Time-window filter — previous week plus a 2-3 day buffer
- TREND_WINDOW_DAYS = 10
-
- # Weighted-aggregation constants
- MIN_ENGAGEMENT = 10  # minimum engagement threshold (bot/spam filter)
- ENGAGEMENT_CAP = 1000  # per-post engagement cap (prevents influencer dominance)
-
  # Blacklist only country/airport-level names
  LOCATION_BLACKLIST = frozenset({
      "South Korea", "Korea", "๋Œ€ํ•œ๋ฏผ๊ตญ", "ํ•œ๊ตญ",
@@ -204,683 +103,293 @@ def clean_location_name(name: str) -> str:
  HASHTAG_RE = re.compile(r"#([\w๊ฐ€-ํžฃ]{2,30})")


- # ══════════════════════════════════════════════
- # Apify-Only: Actor settings & Location Cache
- # ══════════════════════════════════════════════
-
- # Max posts per hashtag (one page of Instagram's public API ≈ 30)
- RESULTS_LIMIT_PER_HASHTAG = 30
-
- # Actor settings
- ACTOR_ID = "apify/instagram-hashtag-scraper"
- SEARCH_ACTOR_ID = "apify/instagram-search-scraper"
-
- # Location ID cache file path
- LOCATION_CACHE_FILE = os.path.join(
-     os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-     "data", "instagram_location_ids.json",
- )
-
-
- def discover_location_ids(
-     apify_client: ApifyClient,
-     spot_names: list[str],
-     area: str = "์• ์›”",
- ) -> dict[str, dict]:
-     """Discover Instagram Location IDs for key places.
-
-     Searches each place name with the Instagram Search Scraper → extracts locationId.
-     Intended for a one-off initial run or a monthly refresh; results are cached to a file.
-     """
-     location_map: dict[str, dict] = {}
-
-     for name in spot_names:
-         try:
-             run_input = {
-                 "search": f"{name} {area}",
-                 "searchType": "place",
-                 "resultsLimit": 3,
-             }
-             run = apify_client.actor(SEARCH_ACTOR_ID).call(
-                 run_input=run_input,
-                 timeout_secs=60,
-             )
-             items = list(
-                 apify_client.dataset(run["defaultDatasetId"]).iterate_items()
-             )
-
-             for item in items:
-                 loc_id = item.get("locationId") or item.get("id")
-                 loc_name = item.get("name", "")
-                 if loc_id and loc_name:
-                     url = f"https://www.instagram.com/explore/locations/{loc_id}/"
-                     location_map[url] = {
-                         "instagram_name": loc_name,
-                         "search_query": name,
-                     }
-                     logger.info("Location ID acquired: %s → %s (%s)", name, loc_id, loc_name)
-                     break  # first match only
-
-         except Exception as e:
-             logger.warning("Location ID discovery failed: %s — %s", name, e)
-             continue
-
-     logger.info("Location ID discovery done: %d/%d succeeded", len(location_map), len(spot_names))
-     return location_map
-
-
- def load_or_discover_locations(
-     apify_client: ApifyClient, supabase_client, max_age_days: int = 30,
- ) -> dict[str, dict]:
-     """Load cached Location IDs, or discover them if missing."""
-     if os.path.exists(LOCATION_CACHE_FILE):
-         try:
-             with open(LOCATION_CACHE_FILE) as f:
-                 cached = json.load(f)
-             updated_at = cached.get("updated_at", "")
-             if updated_at:
-                 updated = datetime.fromisoformat(updated_at)
-                 if (datetime.now(timezone.utc) - updated).days < max_age_days:
-                     locations = cached.get("locations", {})
-                     logger.info("using Location ID cache (%d entries, %s)", len(locations), updated_at[:10])
-                     return locations
-         except (json.JSONDecodeError, ValueError, KeyError) as e:
-             logger.warning("Location ID cache parse failed: %s", e)
-
-     # cache missing or expired → rediscover
-     spot_names = build_spot_hashtags(supabase_client, limit=15)
-     if not spot_names:
-         logger.warning("0 proper-noun hashtags — skipping Location ID discovery")
-         return {}
-
-     locations = discover_location_ids(apify_client, spot_names)
-
-     # save the cache
-     try:
-         os.makedirs(os.path.dirname(LOCATION_CACHE_FILE), exist_ok=True)
-         with open(LOCATION_CACHE_FILE, "w") as f:
-             json.dump({
-                 "updated_at": datetime.now(timezone.utc).isoformat(),
-                 "locations": locations,
-             }, f, ensure_ascii=False, indent=2)
-         logger.info("Location ID cache saved: %s (%d entries)", LOCATION_CACHE_FILE, len(locations))
-     except OSError as e:
-         logger.warning("Location ID cache save failed: %s", e)
-
-     return locations
-
-
- def build_direct_urls_with_spot_ids(
-     location_map: dict[str, dict], spot_matcher,
- ) -> dict[str, str]:
-     """Build the Location URL → spot_id mapping."""
-     url_to_spot: dict[str, str] = {}
-     for url, info in location_map.items():
-         search_query = info.get("search_query", "")
-         if not search_query or not spot_matcher:
-             continue
-         spot_id = spot_matcher.match(search_query)
-         if spot_id:
-             url_to_spot[url] = spot_id
-             logger.info("directUrl mapped: %s → %s", search_query, spot_id)
-         else:
-             logger.debug("directUrl mapping failed: %s", search_query)
-
-     logger.info("directUrl mapping done: %d/%d succeeded", len(url_to_spot), len(location_map))
-     return url_to_spot
-
-
- # ══════════════════════════════════════════════
- # EnsembleData-Only: API, Budget, Post Conversion
- # ══════════════════════════════════════════════

- ENSEMBLEDATA_BASE_URL = "https://ensembledata.com/apis"
- API_TIMEOUT_SECS = 30
-
- # Priority hashtags (for the Free Trial rotation)
- PRIORITY_HASHTAGS = [
-     "์• ์›”์นดํŽ˜",  # Mon (Day 0)
-     "์• ์›”๋ง›์ง‘",  # Tue (Day 1)
-     "์• ์›”์—ฌํ–‰",  # Wed (Day 2)
-     "์ œ์ฃผ์• ์›”",  # Thu (Day 3)
-     "์• ์›”ํ•ซํ”Œ",  # Fri (Day 4)
-     "์• ์›”๊ฐ์„ฑ",  # Sat (Day 5)
-     # Sunday (Day 6) = aggregation only, no collection
  ]


- def get_daily_hashtags(all_hashtags: list[str], daily_budget: int) -> list[str]:
-     """Return the hashtags to collect today, sized to the daily budget.
-
-     - daily_budget >= 1500 (Wood plan): return all hashtags
-     - daily_budget < 1500 (Free Trial): weekday-based rotation
-     """
-     if daily_budget >= 1500:
-         return all_hashtags
-
-     day_of_week = date.today().weekday()  # 0=Mon, 6=Sun
-
-     if day_of_week == 6:
-         logger.info("Sunday — skipping collection (aggregation only)")
-         return []
-
-     if day_of_week < len(PRIORITY_HASHTAGS):
-         tag = PRIORITY_HASHTAGS[day_of_week]
-         logger.info("Free Trial rotation: weekday %s → #%s", "์›”ํ™”์ˆ˜๋ชฉ๊ธˆํ† "[day_of_week], tag)
-         return [tag]
-
-     return []
-
-
- class BudgetTracker:
-     """Tracks daily EnsembleData unit usage.
-
-     Records today's usage in a state file so a rerun cannot exceed the budget.
-     """
-
-     def __init__(self, daily_limit: int, state_file: str | None = None):
-         self.daily_limit = daily_limit
-         self.state_file = state_file or os.path.join(
-             os.environ.get("BUDGET_STATE_DIR", "/tmp"),
-             "ed_budget_state.json",
-         )
-         self.used_today = self._load_today_usage()
-
-     @property
-     def remaining(self) -> int:
-         return max(0, self.daily_limit - self.used_today)
-
-     def can_afford(self, estimated_posts: int = 70) -> bool:
-         """Check whether the estimated post count fits within the budget."""
-         return self.remaining >= estimated_posts
-
-     def record(self, units: int) -> None:
-         """Record unit usage."""
-         self.used_today += units
-         self._save_state()
-         logger.info(
-             "units used: +%d (today: %d/%d, remaining: %d)",
-             units, self.used_today, self.daily_limit, self.remaining,
-         )
-
-     def _load_today_usage(self) -> int:
-         try:
-             with open(self.state_file) as f:
-                 state = json.load(f)
-             if state.get("date") == date.today().isoformat():
-                 return state.get("used", 0)
-         except (FileNotFoundError, json.JSONDecodeError, KeyError):
-             pass
-         return 0
-
-     def _save_state(self) -> None:
-         try:
-             os.makedirs(os.path.dirname(self.state_file), exist_ok=True)
-             with open(self.state_file, "w") as f:
-                 json.dump({
-                     "date": date.today().isoformat(),
-                     "used": self.used_today,
-                     "limit": self.daily_limit,
-                 }, f)
-         except OSError as e:
-             logger.warning("budget state save failed: %s", e)
-
-
- def _extract_caption(node: dict) -> str:
-     """Extract the caption text from an EnsembleData node."""
-     edges = node.get("edge_media_to_caption", {}).get("edges", [])
-     if edges:
-         return edges[0].get("node", {}).get("text", "")
-     return ""
-
-
- def _extract_likes(node: dict) -> int:
-     """Extract the like count (handles null/hidden likes)."""
-     likes = node.get("edge_liked_by", {}).get("count")
-     if likes is not None:
-         return max(likes, 0)
-     likes = node.get("edge_media_preview_like", {}).get("count")
-     if likes is not None:
-         return max(likes, 0)
-     return 0
-
-
- def _convert_node_to_post(node: dict, hashtag: str) -> dict:
-     """Convert an EnsembleData node into the standard post dict."""
-     caption = _extract_caption(node)
-     hashtags = HASHTAG_RE.findall(caption)
-
-     location = node.get("location") or {}
-     location_name = location.get("name", "")
-
-     ts = node.get("taken_at_timestamp")
-     timestamp_iso = ""
-     if ts:
-         try:
-             timestamp_iso = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
-         except (ValueError, OSError):
-             pass
-
-     shortcode = node.get("shortcode", "")
-
-     return {
-         "search_term": hashtag,
-         "search_type": "hashtag",
-         "location_name": location_name,
-         "likes_count": _extract_likes(node),
-         "comments_count": node.get("edge_media_to_comment", {}).get("count", 0) or 0,
-         "caption": caption,
-         "timestamp": timestamp_iso,
-         "url": f"https://www.instagram.com/p/{shortcode}/" if shortcode else "",
-         "hashtags": hashtags,
-         "_location_lat": location.get("lat"),
-         "_location_lng": location.get("lng"),
-         "_location_address": location.get("address", ""),
-         "_location_pk": location.get("pk"),
-     }
-
-
  # ══════════════════════════════════════════════
- # InstagramCollector — Dual Backend
  # ══════════════════════════════════════════════


  class InstagramCollector:
-     """Instagram collector (v4.1 — Dual Backend).

-     Uses Apify or EnsembleData depending on the INSTAGRAM_BACKEND env var.
      """

      def __init__(self, supabase_client, spot_matcher=None):
          self.supabase = supabase_client
          self.spot_matcher = spot_matcher
-         self.extractor = PlaceNameExtractor(supabase_client)
-
-         if INSTAGRAM_BACKEND == "apify":
-             self.apify = ApifyClient(os.environ["APIFY_API_TOKEN"])
-         elif INSTAGRAM_BACKEND == "ed":
-             self.token = os.environ.get("ENSEMBLEDATA_TOKEN", "")
-             if not self.token:
-                 raise ValueError("the ENSEMBLEDATA_TOKEN environment variable is not set")
-             daily_limit = int(os.environ.get("ED_DAILY_UNIT_BUDGET", "50"))
-             self.budget = BudgetTracker(daily_limit=daily_limit)
-             self.http = httpx.Client(timeout=API_TIMEOUT_SECS)
-         else:
-             raise ValueError(
-                 f"Unknown INSTAGRAM_BACKEND: {INSTAGRAM_BACKEND!r} "
-                 "(expected 'apify' or 'ed')"
-             )
-
-     def _get_source_name(self) -> str:
-         """DB source column value: instagram_apify or instagram_ed."""
-         return f"instagram_{INSTAGRAM_BACKEND}"

      # ==================================================================
      # Main Entry Point
      # ==================================================================

      def run(self) -> dict:
-         """Instagram collection pipeline — branches by backend."""
-         if INSTAGRAM_BACKEND == "apify":
-             return self._run_apify()
-         elif INSTAGRAM_BACKEND == "ed":
-             return self._run_ensembledata()
-         else:
-             raise ValueError(f"Unknown INSTAGRAM_BACKEND: {INSTAGRAM_BACKEND!r}")
-
-     # ==================================================================
-     # Apify Backend (v3.6)
-     # ==================================================================
-
-     ACTOR_MEMORY_MB = 1024
-     ACTOR_TIMEOUT_SECS = 120
-
-     def _run_apify(self) -> dict:
-         """Apify backend collection pipeline.
-
-         [1] Dynamically build the hashtag list (general + sub-area + proper nouns)
-         [2] Collect posts via hashtag search
-         [3] Collect additional posts for key places via directUrls
-         [4] 3-stage matching + time-window filter + threshold filter + weighted aggregation
-         [5] DB save
          """
-         logger.info("=== Instagram collection start (v3.6 — Apify) ===")

-         # [1] dynamic hashtag generation
-         hashtags = build_all_hashtags(self.supabase)

-         # [2] hashtag search
-         search_posts = self._collect_posts_apify(hashtags)

-         # [3] directUrls (Location ID cache → spot_id mapping → collection)
-         location_map = load_or_discover_locations(self.apify, self.supabase)
-         direct_urls = build_direct_urls_with_spot_ids(location_map, self.spot_matcher)
-         direct_posts = self._collect_direct_location_posts(direct_urls)
-
-         # [4] combined aggregation
-         all_posts = search_posts + direct_posts

          if not all_posts:
              logger.warning("no posts collected — exiting")
-             return {"total_posts": 0, "spots_matched": 0, "saved": 0, "backend": "apify"}

-         spot_metrics = self.aggregate_spot_metrics(all_posts)

-         # [5] DB save

          saved = self._save_to_db(spot_metrics)

          result = {
-             "backend": "apify",
-             "hashtag_count": len(hashtags),
              "total_posts": len(all_posts),
-             "search_posts": len(search_posts),
-             "direct_posts": len(direct_posts),
-             "spots_matched": len(spot_metrics),
              "saved": saved,
          }
-         logger.info("=== Instagram collection done (v3.6 — Apify): %s ===", result)
          return result

-     def _scrape_hashtag(self, hashtag: str, label: str) -> list[dict]:
-         """Scrape posts from a single hashtag page; retry once on failure."""
-         posts = self._execute_hashtag_actor(hashtag, label)
-
-         if len(posts) == 0:
-             logger.info("[%s] 0 results — retrying once", label)
-             posts = self._execute_hashtag_actor(hashtag, f"{label} retry")
-
-         return posts

-     def _execute_hashtag_actor(self, hashtag: str, label: str) -> list[dict]:
-         """Run the instagram-hashtag-scraper Actor once."""
-         run_input = {
-             "hashtags": [hashtag],
-             "resultsLimit": RESULTS_LIMIT_PER_HASHTAG,
-             "proxy": {
-                 "useApifyProxy": True,
-                 "apifyProxyGroups": ["RESIDENTIAL"],
-             },
-         }

          try:
-             run = self.apify.actor(ACTOR_ID).call(
-                 run_input=run_input,
-                 timeout_secs=self.ACTOR_TIMEOUT_SECS,
-                 memory_mbytes=self.ACTOR_MEMORY_MB,
              )
          except Exception as e:
-             logger.warning("Apify Actor run failed [hashtag=%s]: %s", hashtag, e)
-             return []
-
-         if run.get("status") not in ("SUCCEEDED", None):
-             logger.warning(
-                 "Apify Actor abnormal exit [hashtag=%s]: status=%s",
-                 hashtag, run.get("status"),
-             )
-             return []
-
-         posts: list[dict] = []
-         dataset_id = run["defaultDatasetId"]
-
-         for item in self.apify.dataset(dataset_id).iterate_items():
-             likes = item.get("likesCount", 0)
-             if likes == -1:
-                 likes = 0
-
-             hashtags = item.get("hashtags") or []
-             if not hashtags:
-                 caption = item.get("caption", "")
-                 if caption:
-                     hashtags = HASHTAG_RE.findall(caption)
-
-             posts.append({
-                 "search_term": hashtag,
-                 "search_type": "hashtag",
-                 "location_name": item.get("locationName", ""),
-                 "likes_count": likes,
-                 "comments_count": item.get("commentsCount", 0),
-                 "caption": item.get("caption", ""),
-                 "timestamp": item.get("timestamp", ""),
-                 "url": item.get("url", ""),
-                 "hashtags": hashtags,
-             })
-
-         logger.info("[%s] #%s → %d posts", label, hashtag, len(posts))
-         return posts
-
-     def _collect_posts_apify(self, hashtags: list[str]) -> list[dict]:
-         """Collect posts across the hashtag list (Apify); URL-based dedup."""
-         logger.info(
-             "Apify Actor run start: %d hashtags (resultsLimit=%d)",
-             len(hashtags), RESULTS_LIMIT_PER_HASHTAG,
-         )
-
-         all_posts: list[dict] = []
-
-         for i, tag in enumerate(hashtags, 1):
-             label = f"{i}/{len(hashtags)}"
-             posts = self._scrape_hashtag(tag, label)
-             all_posts.extend(posts)

-         unique_posts = _dedup_posts_by_url(all_posts)
-         logger.info("post collection done: %d posts (%d hashtags)", len(unique_posts), len(hashtags))
-         return unique_posts
-
-     def _collect_direct_location_posts(self, direct_urls: dict[str, str]) -> list[dict]:
-         """Collect posts from Instagram location pages.
-
-         Each location page's spot_id is already resolved,
-         so _direct_spot_id is attached to every post.
-         """
-         if not direct_urls:
-             logger.info("no directUrls — skipping track B")
-             return []
-
-         logger.info("directUrls collection start: %d location pages", len(direct_urls))
-         all_posts: list[dict] = []
-
-         for url, spot_id in direct_urls.items():
-             run_input = {
-                 "directUrls": [url],
-                 "resultsLimit": RESULTS_LIMIT_PER_HASHTAG,
-                 "proxy": {
-                     "useApifyProxy": True,
-                     "apifyProxyGroups": ["RESIDENTIAL"],
-                 },
-             }

              try:
-                 run = self.apify.actor(ACTOR_ID).call(
-                     run_input=run_input,
-                     timeout_secs=self.ACTOR_TIMEOUT_SECS,
-                     memory_mbytes=self.ACTOR_MEMORY_MB,
-                 )
              except Exception as e:
-                 logger.warning("directUrls Actor run failed [%s]: %s", url, e)
-                 continue
-
-             if run.get("status") not in ("SUCCEEDED", None):
-                 logger.warning("directUrls Actor abnormal exit [%s]: status=%s", url, run.get("status"))
-                 continue
-
-             dataset_id = run["defaultDatasetId"]
-             count = 0
-             for item in self.apify.dataset(dataset_id).iterate_items():
-                 likes = item.get("likesCount", 0)
-                 if likes == -1:
-                     likes = 0
-
-                 all_posts.append({
-                     "search_term": "__direct__",
-                     "search_type": "direct",
-                     "location_name": item.get("locationName", ""),
-                     "likes_count": likes,
-                     "comments_count": item.get("commentsCount", 0),
-                     "caption": item.get("caption", ""),
-                     "timestamp": item.get("timestamp", ""),
-                     "url": item.get("url", ""),
-                     "hashtags": item.get("hashtags") or [],
-                     "_direct_spot_id": spot_id,
-                 })
-                 count += 1
-
-             logger.info("directUrls [%s] → %d posts (spot_id=%s)", url, count, spot_id)

-         logger.info("directUrls collection done: %d posts", len(all_posts))
-         return all_posts

      # ==================================================================
-     # EnsembleData Backend (v4.0)
      # ==================================================================

-     def _run_ensembledata(self) -> dict:
-         """EnsembleData backend collection pipeline.

-         [1] Dynamically build the hashtag list
-         [2] Select hashtags within the daily budget
-         [3] Collect posts via the EnsembleData API
-         [4] 3-stage matching + weighted aggregation
-         [5] DB save
          """
-         logger.info("=== Instagram collection start (v4.0 — EnsembleData) ===")
          logger.info(
-             "daily budget: %d/%d units (remaining: %d)",
-             self.budget.used_today, self.budget.daily_limit, self.budget.remaining,
          )

-         # [1] dynamic hashtag generation
-         all_hashtags = build_all_hashtags(self.supabase)
-
-         # [2] select today's hashtags within budget
-         hashtags = get_daily_hashtags(all_hashtags, self.budget.daily_limit)
-
-         if not hashtags:
-             logger.info("no hashtags to collect today — exiting")
-             return {
-                 "total_posts": 0, "spots_matched": 0, "saved": 0,
-                 "backend": "ed", "reason": "no_hashtags_today",
-             }
-
-         # [3] post collection
-         posts = self._collect_posts_ed(hashtags)
-
-         if not posts:
-             logger.warning("no posts collected — exiting")
-             return {"total_posts": 0, "spots_matched": 0, "saved": 0, "backend": "ed"}

-         # [4] aggregation
-         spot_metrics = self.aggregate_spot_metrics(posts)

-         # [5] DB save
-         saved = self._save_to_db(spot_metrics)

-         result = {
-             "backend": "ed",
-             "hashtag_count": len(hashtags),
-             "hashtags_collected": hashtags,
-             "total_posts": len(posts),
-             "spots_matched": len(spot_metrics),
-             "saved": saved,
-             "budget_used": self.budget.used_today,
-             "budget_remaining": self.budget.remaining,
          }
-         logger.info("=== Instagram collection done (v4.0 — EnsembleData): %s ===", result)
-         return result
-
-     def _fetch_hashtag_posts(self, hashtag: str, label: str) -> list[dict]:
-         """Fetch hashtag posts via the EnsembleData API.
-
-         Returns top_posts + recent_posts merged.
-         """
-         if not self.budget.can_afford(estimated_posts=30):
-             logger.warning("[%s] unit budget short (remaining: %d) — skipping", label, self.budget.remaining)
-             return []

          try:
-             resp = self.http.get(
-                 f"{ENSEMBLEDATA_BASE_URL}/instagram/hashtag/posts",
-                 params={"name": hashtag, "token": self.token},
              )
-         except httpx.HTTPError as e:
-             logger.warning("[%s] API call failed (#%s): %s", label, hashtag, e)
-             return []
-
-         if resp.status_code == 495:
-             logger.error("[%s] daily unit limit exceeded — stopping collection", label)
-             self.budget.record(self.budget.remaining)
              return []

-         if resp.status_code != 200:
-             logger.warning("[%s] API response error (#%s): status=%d", label, hashtag, resp.status_code)

              return []

-         data = resp.json().get("data", {})
-
          posts: list[dict] = []
-         top_nodes = data.get("top_posts", [])
-         recent_nodes = data.get("recent_posts", [])

-         for item in top_nodes:
-             node = item.get("node", item)
-             posts.append(_convert_node_to_post(node, hashtag))

-         for item in recent_nodes:
-             node = item.get("node", item)
-             posts.append(_convert_node_to_post(node, hashtag))

-         self.budget.record(len(posts))

-         logger.info(
-             "[%s] #%s → %d posts (top %d + recent %d, hashtag total %s)",
-             label, hashtag, len(posts), len(top_nodes), len(recent_nodes),
-             f"{data.get('count', 0):,}" if isinstance(data.get("count"), int) else "?",
-         )
          return posts

-     def _collect_posts_ed(self, hashtags: list[str]) -> list[dict]:
-         """Collect posts across the hashtag list (EnsembleData); URL-based dedup."""
-         if not hashtags:
-             logger.info("no hashtags to collect — skipping")
-             return []
-
-         logger.info(
-             "EnsembleData collection start: %d hashtags (daily budget: %d/%d)",
-             len(hashtags), self.budget.remaining, self.budget.daily_limit,
-         )
-
-         all_posts: list[dict] = []
-
-         for i, tag in enumerate(hashtags, 1):
-             if not self.budget.can_afford(estimated_posts=30):
-                 logger.warning("unit budget exhausted — skipping remaining %d hashtags", len(hashtags) - i + 1)
-                 break
-
-             label = f"{i}/{len(hashtags)}"
-             posts = self._fetch_hashtag_posts(tag, label)
-             all_posts.extend(posts)
-
-         unique_posts = _dedup_posts_by_url(all_posts)
-         logger.info("post collection done: %d posts (%d hashtags)", len(unique_posts), len(hashtags))
-         return unique_posts

      # ==================================================================
-     # Shared: 3-stage place matching (v3.6 — includes Stage 0 directUrls)
      # ==================================================================

      def _match_post_to_spot(self, post: dict) -> tuple[str | None, str]:
-         """Match a single post to a spot_id.

          Matching priority:
-         0. directUrls posts (_direct_spot_id already resolved) — Apify only
          1. locationName tag → prefix stripping → SpotMatcher
          2. hashtags array → SpotMatcher.match_hashtag() (direction-restricted)
-         3. caption → PlaceNameExtractor → SpotMatcher
-         """
-         # Stage 0: directUrls (Apify only — ED posts don't have this field)
-         direct_sid = post.get("_direct_spot_id")
-         if direct_sid:
-             return direct_sid, "direct"

          # Stage 1: locationName → prefix stripping → SpotMatcher
          loc = post.get("location_name", "")
          if loc and loc not in LOCATION_BLACKLIST and self.spot_matcher:
@@ -900,42 +409,28 @@ class InstagramCollector:
          if sid:
              return sid, "hashtag"

-         # Stage 3: caption → PlaceNameExtractor → SpotMatcher
-         caption = post.get("caption", "")
-         if caption and len(caption) >= 5:
-             places = self.extractor.extract(caption)
-             for place in places:
-                 if self.spot_matcher:
-                     sid = self.spot_matcher.match(place["name"])
-                     if sid:
-                         return sid, "caption"
-                 elif place.get("spot_id"):
-                     return place["spot_id"], "caption"
-
          return None, "unmatched"

      # ==================================================================
-     # Shared: weighted aggregation (v3.6)
      # ==================================================================

-     def aggregate_spot_metrics(self, posts: list[dict]) -> dict[str, dict]:
-         """Aggregate the collected posts by spot_id.
-
-         - time-window filter: keep only posts within TREND_WINDOW_DAYS (10 days)
-         - minimum engagement threshold: drop posts below MIN_ENGAGEMENT (10)
-         - engagement cap: truncate above ENGAGEMENT_CAP (1000)
-         - weighted_score: sum(min(engagement, cap))

          Returns:
-             {spot_id: {post_count, total_likes, total_comments,
-                        avg_engagement, weighted_score, match_methods, hashtags}}
          """
          cutoff = datetime.now(timezone.utc) - timedelta(days=TREND_WINDOW_DAYS)

          spot_metrics: dict[str, dict] = {}
-         match_stats = {
-             "direct": 0, "location_tag": 0,
-             "hashtag": 0, "caption": 0, "unmatched": 0,
              "filtered_old": 0, "filtered_low_engagement": 0,
          }
@@ -951,14 +446,7 @@ class InstagramCollector:
             except (ValueError, TypeError):
                 pass  # 파싱 실패 시 포함
 
-            # 3단계 매칭
-            spot_id, method = self._match_post_to_spot(post)
-            match_stats[method] += 1
-
-            if not spot_id:
-                continue
-
-            # 최소 engagement 임계값
             likes = post.get("likes_count", 0) or 0
             comments = post.get("comments_count", 0) or 0
             engagement = likes + comments
@@ -967,44 +455,253 @@ class InstagramCollector:
                 match_stats["filtered_low_engagement"] += 1
                 continue
 
             # engagement cap
             capped_engagement = min(engagement, ENGAGEMENT_CAP)
 
             # 집계
-            if spot_id not in spot_metrics:
-                spot_metrics[spot_id] = {
-                    "post_count": 0,
-                    "total_likes": 0,
-                    "total_comments": 0,
-                    "weighted_score": 0,
-                    "match_methods": set(),
-                    "hashtags": set(),
-                }
-
-            m = spot_metrics[spot_id]
-            m["post_count"] += 1
-            m["total_likes"] += likes
-            m["total_comments"] += comments
-            m["weighted_score"] += capped_engagement
-            m["match_methods"].add(method)
-            term = post.get("search_term", "")
-            if term and term != "__direct__":
-                m["hashtags"].add(term)
-
-        # avg_engagement 계산 + set → list 변환
-        for metrics in spot_metrics.values():
-            count = max(metrics["post_count"], 1)
-            metrics["avg_engagement"] = int(
-                round((metrics["total_likes"] + metrics["total_comments"]) / count)
             )
-            metrics["match_methods"] = sorted(metrics["match_methods"])
-            metrics["hashtags"] = sorted(metrics["hashtags"])
 
         if match_stats["filtered_old"] > 0:
             logger.info(
-                "기간 필터: %d건 제외 (최근 %d일 외), %d건 처리",
                 match_stats["filtered_old"], TREND_WINDOW_DAYS,
-                len(posts) - match_stats["filtered_old"],
             )
         if match_stats["filtered_low_engagement"] > 0:
             logger.info(
@@ -1012,29 +709,30 @@ class InstagramCollector:
                 match_stats["filtered_low_engagement"], MIN_ENGAGEMENT,
             )
         logger.info(
-            "스팟 매칭 완료: %d개 스팟 "
-            "(위치태그 %d, 해시태그 %d, 캡션 %d, direct %d, 미식별 %d)",
-            len(spot_metrics),
             match_stats["location_tag"],
             match_stats["hashtag"],
-            match_stats["caption"],
-            match_stats["direct"],
             match_stats["unmatched"],
         )
-        return spot_metrics
 
     # ==================================================================
-    # Shared: DB 저장
     # ==================================================================
 
     def _save_to_db(self, spot_metrics: dict[str, dict]) -> int:
         """집계된 메트릭을 spot_trends 테이블에 저장한다.
 
-        source = instagram_apify 또는 instagram_ed (INSTAGRAM_BACKEND 기반).
         저장 메트릭: post_count, avg_engagement, weighted_score
         """
         period_start, period_end = get_week_period()
-        source = self._get_source_name()
         saved = 0
 
         for spot_id, metrics in spot_metrics.items():
@@ -1043,7 +741,7 @@ class InstagramCollector:
                 "total_comments": metrics["total_comments"],
                 "match_methods": metrics["match_methods"],
                 "hashtags": metrics["hashtags"],
-                "backend": INSTAGRAM_BACKEND,
             }
 
             # post_count
@@ -1063,7 +761,7 @@ class InstagramCollector:
                 })
                 saved += 1
             except Exception as e:
-                logger.warning("spot_trends 저장 실패 (%s post_count, %s): %s", source, spot_id, e)
 
             # avg_engagement
             if metrics["avg_engagement"] > 0:
@@ -1078,7 +776,7 @@ class InstagramCollector:
                     "raw_data": {"match_methods": metrics["match_methods"]},
                 })
             except Exception as e:
-                logger.warning("spot_trends 저장 실패 (%s avg_engagement, %s): %s", source, spot_id, e)
 
             # weighted_score
             if metrics["weighted_score"] > 0:
@@ -1093,17 +791,164 @@ class InstagramCollector:
                     "raw_data": {},
                 })
             except Exception as e:
-                logger.warning("spot_trends 저장 실패 (%s weighted_score, %s): %s", source, spot_id, e)
 
         logger.info("Instagram DB 저장: %d건 (%d 스팟, source=%s)", saved, len(spot_metrics), source)
         return saved
 
 
 # ══════════════════════════════════════════════════════════════════
 # Shared Utility
 # ══════════════════════════════════════════════════════════════════
 
 
 def _dedup_posts_by_url(posts: list[dict]) -> list[dict]:
     """URL 기반 중복 제거."""
     seen_urls: set[str] = set()
 
 """
+Instagram Collector — Influencer Monitoring (v5.1 Multimodal)
+
+큐레이팅된 제주 여행 인플루언서 계정의 최근 게시물을 수집합니다.
+
+파이프라인:
+1. influencer_accounts 테이블에서 활성 계정 목록 로드 (DB 실패 시 기본값 폴백)
+2. Apify instagram-profile-scraper로 계정별 최근 게시물 수집
+3. 게시물 정규화 + 중복 제거
+4. 2-pass 하이브리드 장소 매칭:
+   Pass 1 (규칙 기반): 위치태그 → 해시태그 매칭 (고신뢰 신호만)
+   Pass 2 (AI 멀티모달): Gemini 2.5 Flash로 콘텐츠 분석
+     - 이미지 게시글: 이미지 + 캡션 → 간판/경관/텍스트 인식
+     - 릴스(영상): 영상 + 캡션 → 나레이션/자막/간판 인식
+     - 미디어 없음: 캡션 텍스트 분석 (폴백)
 5. 가중 집계: weighted_score = sum(min(engagement, cap))
+6. spot_trends 테이블에 저장 (source = instagram_influencer)
+7. 계정별 last_scraped_at 업데이트
 """
 
 from __future__ import annotations
 
 import json
+import logging
 import os
 import re
+import tempfile
+import time
+from datetime import datetime, timedelta, timezone
 
+from apify_client import ApifyClient
 
 from trend_engine.utils import get_week_period, safe_upsert_spot_trend
 
 logger = logging.getLogger(__name__)
 
 
 # ══════════════════════════════════════════════════════════════════
+# 수집 설정
 # ══════════════════════════════════════════════════════════════════
 
+# 기간 필터 — 직전 30일 (7일은 스팟 매칭률 과소)
+TREND_WINDOW_DAYS = 30
 
+# 가중 집계 상수 (인플루언서 콘텐츠 기준)
+MIN_ENGAGEMENT = 50       # 최소 engagement 임계값
+ENGAGEMENT_CAP = 5000     # 단일 게시물 engagement 상한
 
+# 계정당 최대 게시물 수
+RESULTS_LIMIT_PER_ACCOUNT = 20
 
+# AI 멀티모달 분석 설정
+MAX_IMAGE_BYTES = 5 * 1024 * 1024     # 이미지 최대 5MB
+MAX_VIDEO_BYTES = 50 * 1024 * 1024    # 영상 최대 50MB
+MEDIA_DOWNLOAD_TIMEOUT = 15           # 미디어 다운로드 타임아웃(초)
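
# A quick worked example of how the two thresholds above interact
# (engagement numbers are illustrative, not from the repo):
#     engagements = [40, 120, 8400]                                 # likes + comments
#     kept = [min(e, ENGAGEMENT_CAP) for e in engagements if e >= MIN_ENGAGEMENT]
#     # kept == [120, 5000]: the 40-engagement post is filtered out and the
#     # viral 8,400 one is capped before it enters weighted_score.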
 
58
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
59
+ # ์œ„์น˜ ํƒœ๊ทธ ์ •๋ฆฌ
60
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  # ๊ตญ๊ฐ€/๊ณตํ•ญ ๋‹จ์œ„๋งŒ BLACKLIST
63
  LOCATION_BLACKLIST = frozenset({
64
  "South Korea", "Korea", "๋Œ€ํ•œ๋ฏผ๊ตญ", "ํ•œ๊ตญ",
 
103
  HASHTAG_RE = re.compile(r"#([\w๊ฐ€-ํžฃ]{2,30})")
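
# HASHTAG_RE at a glance (sample caption is made up): it captures 2–30
# word/Korean characters after "#", so
#     HASHTAG_RE.findall("노을 명소 #새별오름 #jeju_cafe #a")
# returns ["새별오름", "jeju_cafe"] — the one-character "#a" is skipped.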
 
 
 # ────────────────────────────────────────────────
+# 기본 인플루언서 계정 (DB 조회 실패 시 폴백)
 # ────────────────────────────────────────────────
 
+DEFAULT_INFLUENCER_ACCOUNTS = [
+    {"username": "_sohee.e", "category": "travel", "priority": 1},
+    {"username": "foto_ycy", "category": "photo", "priority": 2},
+    {"username": "bbo_muksta", "category": "food", "priority": 3},
+    {"username": "yoontheroad", "category": "photo", "priority": 4},
+    {"username": "siniple", "category": "photo", "priority": 5},
+    {"username": "bigg_jun", "category": "photo", "priority": 6},
+    {"username": "aria.leeee", "category": "travel", "priority": 7},
+    {"username": "gamttanam", "category": "lifestyle", "priority": 8},
+    {"username": "by_malgm", "category": "photo", "priority": 9},
+    {"username": "colorny", "category": "travel", "priority": 10},
+    {"username": "mongle_jyh", "category": "photo", "priority": 11},
+    {"username": "ryuppeum", "category": "travel", "priority": 12},
+    {"username": "thesoulofseoulblog", "category": "travel", "priority": 13},
+    {"username": "hey_jejuisland", "category": "lifestyle", "priority": 14},
+    {"username": "yooonjeju", "category": "travel", "priority": 15},
 ]
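
# A hedged seeding sketch (hypothetical helper, not part of the repo):
# _load_influencer_accounts() below selects username/category/priority rows
# where platform="instagram" and is_active=True, so the fallback list could
# be pushed into that table roughly like this:
#     for acc in DEFAULT_INFLUENCER_ACCOUNTS:
#         supabase.table("influencer_accounts").upsert(
#             {**acc, "platform": "instagram", "is_active": True},
#         ).execute()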
 
 
 # ══════════════════════════════════════════════════════════════════
+# InstagramCollector — Influencer Monitoring v5.1
 # ══════════════════════════════════════════════════════════════════
 
 
 class InstagramCollector:
+    """Instagram 수집기 (v5.1 — Influencer Monitoring + Multimodal AI).
+
+    큐레이팅된 인플루언서 계정의 최근 게시물을 수집하고
+    2-pass 하이브리드 매칭(위치태그/해시태그 + Gemini 멀티모달) 후
+    spot_trends에 저장합니다.
     """
 
+    ACTOR_ID = "apify/instagram-profile-scraper"
+    ACTOR_MEMORY_MB = 1024
+    ACTOR_TIMEOUT_SECS = 180
+
     def __init__(self, supabase_client, spot_matcher=None):
         self.supabase = supabase_client
         self.spot_matcher = spot_matcher
+        self.apify = ApifyClient(os.environ["APIFY_API_TOKEN"])
 
     # ==================================================================
     # Main Entry Point
     # ==================================================================
 
     def run(self) -> dict:
+        """Instagram 인플루언서 수집 파이프라인 v5.1.
+
+        [1] 인플루언서 계정 목록 로드 (DB → 폴백)
+        [2] Apify profile-scraper로 계정별 최근 게시물 수집
+        [3] Pass 1: 위치태그 + 해시태그 매칭
+        [3b] Pass 2: 미매칭 → Gemini 멀티모달 (이미지/영상 + 캡션) → SpotMatcher
+        [4] DB 저장
+        [5] last_scraped_at 업데이트
         """
+        logger.info("=== Instagram 수집 시작 (v5.1 — Multimodal AI) ===")
 
+        # [1] 인플루언서 계정 목록
+        accounts = self._load_influencer_accounts()
+        if not accounts:
+            logger.warning("활성 인플루언서 계정 없음 — 종료")
+            return {"total_posts": 0, "spots_matched": 0, "saved": 0, "accounts": 0}
 
+        logger.info("인플루언서 계정 %d개 로드", len(accounts))
 
+        # [2] 게시물 수집
+        all_posts = self._collect_from_accounts(accounts)
 
         if not all_posts:
             logger.warning("수집된 게시물 없음 — 종료")
+            return {
+                "total_posts": 0, "spots_matched": 0, "saved": 0,
+                "accounts": len(accounts),
+            }
+
+        # [3] Pass 1: 규칙 기반 매칭 + 집계
+        spot_metrics, unmatched_posts, match_stats = self._aggregate_with_unmatched(all_posts)
+        pass1_matched = len(spot_metrics)
 
+        # [3b] Pass 2: AI 멀티모달 분석 (미매칭 게시물 → 이미지/영상 + 캡션 분석)
+        ai_matched_count = 0
+        if unmatched_posts and os.environ.get("GEMINI_API_KEY"):
+            ai_matched_count = self._ai_analyze_content(unmatched_posts, spot_metrics, match_stats)
+        elif unmatched_posts:
+            logger.info("GEMINI_API_KEY 미설정 — AI 멀티모달 분석 스킵 (%d건 미매칭)", len(unmatched_posts))
 
+        # 최종 통계 로깅
+        self._log_match_stats(match_stats, len(all_posts), len(spot_metrics))
+
+        # [4] DB 저장
         saved = self._save_to_db(spot_metrics)
 
+        # [5] last_scraped_at 업데이트
+        scraped_usernames = list({
+            p.get("_source_account", "")
+            for p in all_posts if p.get("_source_account")
+        })
+        self._update_last_scraped(scraped_usernames)
+
         result = {
+            "accounts": len(accounts),
+            "accounts_scraped": len(scraped_usernames),
             "total_posts": len(all_posts),
+            "pass1_spots_matched": pass1_matched,
+            "ai_matched": ai_matched_count,
+            "total_spots_matched": len(spot_metrics),
             "saved": saved,
         }
+        logger.info("=== Instagram 수집 완료 (v5.1 — Multimodal AI): %s ===", result)
         return result
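        # Example return value (keys from the dict above; counts illustrative):
        #     {"accounts": 15, "accounts_scraped": 14, "total_posts": 212,
        #      "pass1_spots_matched": 23, "ai_matched": 31,
        #      "total_spots_matched": 41, "saved": 108}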
 
+    # ==================================================================
+    # 인플루언서 계정 관리
+    # ==================================================================
 
+    def _load_influencer_accounts(self) -> list[dict]:
+        """influencer_accounts 테이블에서 활성 계정 목록을 로드한다.
 
+        DB 조회 실패 시 DEFAULT_INFLUENCER_ACCOUNTS로 폴백.
+        """
         try:
+            resp = (
+                self.supabase.table("influencer_accounts")
+                .select("username, category, priority")
+                .eq("platform", "instagram")
+                .eq("is_active", True)
+                .order("priority")
+                .execute()
             )
+            accounts = resp.data or []
+            if accounts:
+                logger.info("DB에서 인플루언서 계정 %d개 로드", len(accounts))
+                return accounts
         except Exception as e:
+            logger.warning("influencer_accounts 조회 실패 (폴백 사용): %s", e)
 
+        logger.info("기본 인플루언서 계정 %d개 사용 (폴백)", len(DEFAULT_INFLUENCER_ACCOUNTS))
+        return list(DEFAULT_INFLUENCER_ACCOUNTS)
 
+    def _update_last_scraped(self, usernames: list[str]) -> None:
+        """수집 완료된 계정의 last_scraped_at을 업데이트한다."""
+        if not usernames:
+            return
+        now = datetime.now(timezone.utc).isoformat()
+        for username in usernames:
             try:
+                self.supabase.table("influencer_accounts").update({
+                    "last_scraped_at": now,
+                }).eq("platform", "instagram").eq("username", username).execute()
             except Exception as e:
+                logger.debug("last_scraped_at 업데이트 실패 (%s): %s", username, e)
 
+        logger.info("last_scraped_at 업데이트: %d개 계정", len(usernames))
 
     # ==================================================================
+    # 게시물 수집 (Apify Profile Scraper)
     # ==================================================================
 
+    def _collect_from_accounts(self, accounts: list[dict]) -> list[dict]:
+        """인플루언서 계정에서 게시물을 수집한다.
 
+        Apify instagram-profile-scraper Actor를 사용하여
+        계정별 최근 게시물을 가져온다. 5개씩 배치 실행.
         """
+        usernames = [a["username"] for a in accounts]
         logger.info(
+            "Apify Actor 실행: %d개 계정, 계정당 최대 %d건",
+            len(usernames), RESULTS_LIMIT_PER_ACCOUNT,
         )
 
+        all_posts: list[dict] = []
 
+        # 계정을 5개씩 배치로 실행 (단일 Actor 호출 실패 시 영향 최소화)
+        batch_size = 5
+        for i in range(0, len(usernames), batch_size):
+            batch = usernames[i:i + batch_size]
+            label = f"batch {i // batch_size + 1}/{(len(usernames) + batch_size - 1) // batch_size}"
+            posts = self._scrape_profiles(batch, label)
+            all_posts.extend(posts)
 
+        unique_posts = _dedup_posts_by_url(all_posts)
+        logger.info("게시물 수집 완료: %d건 (%d개 계정)", len(unique_posts), len(usernames))
+        return unique_posts
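        # e.g. the 15 fallback accounts run as three Actor calls labelled
        # "batch 1/3" … "batch 3/3", so one failed call costs at most
        # batch_size (5) accounts' posts.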
 
+    def _scrape_profiles(self, usernames: list[str], label: str) -> list[dict]:
+        """Apify instagram-profile-scraper Actor로 프로필 게시물 수집."""
+        run_input = {
+            "usernames": usernames,
+            "resultsLimit": RESULTS_LIMIT_PER_ACCOUNT,
+            "proxy": {
+                "useApifyProxy": True,
+                "apifyProxyGroups": ["RESIDENTIAL"],
+            },
         }
 
         try:
+            run = self.apify.actor(self.ACTOR_ID).call(
+                run_input=run_input,
+                timeout_secs=self.ACTOR_TIMEOUT_SECS,
+                memory_mbytes=self.ACTOR_MEMORY_MB,
             )
+        except Exception as e:
+            logger.warning("Apify Actor 실행 실패 [%s]: %s", label, e)
             return []
 
+        if run.get("status") not in ("SUCCEEDED", None):
+            logger.warning(
+                "Apify Actor 비정상 종료 [%s]: status=%s",
+                label, run.get("status"),
+            )
             return []
 
         posts: list[dict] = []
+        dataset_id = run["defaultDatasetId"]
 
+        for profile in self.apify.dataset(dataset_id).iterate_items():
+            # profile-scraper는 프로필 1개 = 아이템 1개, 게시물은 latestPosts 안
+            profile_username = profile.get("username", "")
+            latest_posts = profile.get("latestPosts", [])
 
+            if not latest_posts:
+                logger.debug("게시물 없음: @%s", profile_username)
+                continue
 
+            for item in latest_posts:
+                post = self._normalize_post(item)
+                if post:
+                    posts.append(post)
 
+        logger.info("[%s] %s → %d건", label, usernames, len(posts))
         return posts
 
+    @staticmethod
+    def _normalize_post(item: dict) -> dict | None:
+        """Apify profile-scraper 응답 → 표준 post dict로 변환."""
+        owner = item.get("ownerUsername", "") or ""
+        if not owner:
+            return None
+
+        likes = item.get("likesCount", 0)
+        if likes == -1:
+            likes = 0
+
+        caption = item.get("caption", "") or ""
+        hashtags = item.get("hashtags") or []
+        if not hashtags and caption:
+            hashtags = HASHTAG_RE.findall(caption)
+
+        # 콘텐츠 유형 및 미디어 URL
+        post_type = item.get("type", "Image")  # Image, Video, Sidecar
+        if post_type == "Video":
+            media_url = item.get("videoUrl", "") or item.get("displayUrl", "") or ""
+        else:
+            media_url = item.get("displayUrl", "") or ""
+
+        return {
+            "search_term": f"@{owner}",
+            "search_type": "profile",
+            "location_name": item.get("locationName", "") or "",
+            "likes_count": likes,
+            "comments_count": item.get("commentsCount", 0) or 0,
+            "caption": caption,
+            "timestamp": item.get("timestamp", ""),
+            "url": item.get("url", ""),
+            "hashtags": hashtags,
+            "media_url": media_url,
+            "media_type": post_type,
+            "_source_account": owner,
+        }
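
    # Worked example (input modeled on the fields read above; values made up):
    #     _normalize_post({"ownerUsername": "yooonjeju", "likesCount": -1,
    #                      "commentsCount": 42, "type": "Video",
    #                      "videoUrl": "https://…/reel.mp4",
    #                      "caption": "노을 명소 #새별오름"})
    # yields likes_count 0 (the -1 hidden-likes sentinel is zeroed),
    # media_type "Video", and hashtags ["새별오름"] recovered via HASHTAG_RE.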
 
     # ==================================================================
+    # 2단계 장소 매칭 (v5.0 — 프로필 기반)
     # ==================================================================
 
     def _match_post_to_spot(self, post: dict) -> tuple[str | None, str]:
+        """게시물 1건에 대해 spot_id를 매칭한다 (고신뢰 신호만).
 
         매칭 우선순위:
         1. locationName 태그 → 접두사 제거 → SpotMatcher
         2. hashtags 배열 → SpotMatcher.match_hashtag() (방향 제한)
 
+        캡션/이미지/영상 기반 분석은 Pass 2 (AI 멀티모달)에서 처리.
+        """
         # Stage 1: locationName → 접두사 제거 → SpotMatcher
         loc = post.get("location_name", "")
         if loc and loc not in LOCATION_BLACKLIST and self.spot_matcher:
 …
             if sid:
                 return sid, "hashtag"
 
         return None, "unmatched"
 
     # ==================================================================
+    # Pass 1: 규칙 기반 매칭 + 집계 (미매칭 게시물 수집)
     # ==================================================================
 
+    def _aggregate_with_unmatched(
+        self, posts: list[dict],
+    ) -> tuple[dict[str, dict], list[dict], dict[str, int]]:
+        """수집된 게시물을 spot_id 기준으로 집계하고, 미매칭 게시물을 별도 반환한다.
 
         Returns:
+            (spot_metrics, unmatched_posts, match_stats)
         """
         cutoff = datetime.now(timezone.utc) - timedelta(days=TREND_WINDOW_DAYS)
 
         spot_metrics: dict[str, dict] = {}
+        unmatched_posts: list[dict] = []
+        match_stats: dict[str, int] = {
+            "location_tag": 0,
+            "hashtag": 0, "unmatched": 0,
+            "ai_matched": 0,
             "filtered_old": 0, "filtered_low_engagement": 0,
         }
 …
             except (ValueError, TypeError):
                 pass  # 파싱 실패 시 포함
 
+            # 최소 engagement 임계값 (매칭 전 필터 — 저engagement는 AI도 처리 불요)
             likes = post.get("likes_count", 0) or 0
             comments = post.get("comments_count", 0) or 0
             engagement = likes + comments
 …
                 match_stats["filtered_low_engagement"] += 1
                 continue
 
+            # 2단계 매칭 (위치태그 → 해시태그)
+            spot_id, method = self._match_post_to_spot(post)
+            match_stats[method] += 1
+
+            if not spot_id:
+                # 미매칭 → Pass 2 대상
+                unmatched_posts.append(post)
+                continue
+
             # engagement cap
             capped_engagement = min(engagement, ENGAGEMENT_CAP)
 
             # 집계
+            _add_to_metrics(spot_metrics, spot_id, post, capped_engagement, method)
+
+        logger.info(
+            "Pass 1 완료: %d개 스팟 매칭, %d건 미매칭 → AI 대상",
+            len(spot_metrics), len(unmatched_posts),
+        )
+        return spot_metrics, unmatched_posts, match_stats
+
+    # ==================================================================
+    # Pass 2: AI 멀티모달 분석 (Gemini 2.5 Flash)
+    # ==================================================================
+
+    def _ai_analyze_content(
+        self,
+        unmatched_posts: list[dict],
+        spot_metrics: dict[str, dict],
+        match_stats: dict[str, int],
+    ) -> int:
+        """미매칭 게시물을 Gemini 멀티모달로 분석하여 장소를 추출한다.
+
+        이미지 게시글: 이미지 다운로드 → Gemini Vision + 캡션 분석
+        릴스(영상): 영상 다운로드 → Gemini File API + 캡션 분석
+        미디어 없음: 캡션 텍스트만 분석 (폴백)
+
+        Returns:
+            AI로 추가 매칭된 게시물 수
+        """
+        try:
+            from google import genai
+            from google.genai import types
+        except ImportError:
+            logger.warning("google-genai 미설치 — AI 멀티모달 분석 스킵")
+            return 0
+
+        api_key = os.environ.get("GEMINI_API_KEY")
+        if not api_key:
+            return 0
+
+        client = genai.Client(api_key=api_key)
+
+        image_count = sum(1 for p in unmatched_posts if p.get("media_type") != "Video")
+        video_count = sum(1 for p in unmatched_posts if p.get("media_type") == "Video")
+        logger.info(
+            "AI 멀티모달 분석 시작: %d건 (이미지 %d, 영상 %d)",
+            len(unmatched_posts), image_count, video_count,
+        )
+
+        matched_count = 0
+        analyzed = 0
+
+        for post in unmatched_posts:
+            media_url = post.get("media_url", "")
+            media_type = post.get("media_type", "Image")
+            caption = post.get("caption", "")[:500]
+            hashtags = ", ".join(post.get("hashtags", [])[:10])
+
+            places: list[str] = []
+            try:
+                if media_type == "Video" and media_url:
+                    places = self._ai_extract_from_video(
+                        client, types, media_url, caption, hashtags,
+                    )
+                elif media_url:
+                    places = self._ai_extract_from_image(
+                        client, types, media_url, caption, hashtags,
+                    )
+                elif caption and len(caption) >= 10:
+                    places = self._ai_extract_from_text(
+                        client, types, caption, hashtags,
+                    )
+                analyzed += 1
+            except Exception as e:
+                logger.debug("AI 분석 실패 (%s): %s", post.get("url", "")[:60], e)
+                continue
+
+            # 추출된 장소명 → SpotMatcher
+            if places:
+                logger.info(
+                    "AI 추출 장소: %s ← @%s (%s)",
+                    [p[:30] for p in places], post.get("_source_account", "?"), media_type,
+                )
+            else:
+                logger.debug(
+                    "AI 추출 장소 없음 ← @%s (%s)",
+                    post.get("_source_account", "?"), media_type,
+                )
+            for name in places:
+                if not name or len(name) < 2:
+                    continue
+                sid = self.spot_matcher.match(name) if self.spot_matcher else None
+                if sid:
+                    capped = min(
+                        (post.get("likes_count", 0) or 0)
+                        + (post.get("comments_count", 0) or 0),
+                        ENGAGEMENT_CAP,
+                    )
+                    _add_to_metrics(spot_metrics, sid, post, capped, "ai")
+                    match_stats["ai_matched"] += 1
+                    match_stats["unmatched"] = max(0, match_stats["unmatched"] - 1)
+                    matched_count += 1
+                    break  # 한 게시물에서 첫 매칭만
+
+        logger.info("AI 멀티모달 분석 완료: %d건 분석, %d건 매칭", analyzed, matched_count)
+        return matched_count
+
+
576
+ def _ai_extract_from_image(
577
+ self, client, types, media_url: str, caption: str, hashtags: str,
578
+ ) -> list[str]:
579
+ """์ด๋ฏธ์ง€ ๊ฒŒ์‹œ๋ฌผ์—์„œ Gemini Vision์œผ๋กœ ์žฅ์†Œ๋ฅผ ์ถ”์ถœํ•œ๋‹ค."""
580
+ image_bytes = _download_media(media_url, MAX_IMAGE_BYTES)
581
+
582
+ prompt = _build_spot_prompt(
583
+ content_type="๊ฒŒ์‹œ๊ธ€ (์ด๋ฏธ์ง€)",
584
+ caption=caption,
585
+ hashtags=hashtags,
586
+ media_instruction=(
587
+ "์ด๋ฏธ์ง€์—์„œ ๊ฐ„ํŒ, ๋ฉ”๋‰ดํŒ, ํŠน์ง•์  ๊ฒฝ๊ด€์„ ํ™•์ธํ•˜๊ณ  "
588
+ "์บก์…˜ ๋‚ด์šฉ๋„ ํ•จ๊ป˜ ๋ถ„์„ํ•˜์—ฌ ์žฅ์†Œ๋ฅผ ์‹๋ณ„ํ•˜์„ธ์š”."
589
+ ),
590
+ )
591
+
592
+ contents: list = []
593
+ if image_bytes:
594
+ contents.append(
595
+ types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
596
+ )
597
+ contents.append(prompt)
598
+
599
+ response = client.models.generate_content(
600
+ model="gemini-2.5-flash",
601
+ contents=contents,
602
+ config=types.GenerateContentConfig(
603
+ temperature=0.1, max_output_tokens=200,
604
+ thinking_config=types.ThinkingConfig(thinking_budget=0),
605
+ ),
606
+ )
607
+ return _parse_ai_places(response.text)
608
+
609
+ def _ai_extract_from_video(
610
+ self, client, types, media_url: str, caption: str, hashtags: str,
611
+ ) -> list[str]:
612
+ """๋ฆด์Šค(์˜์ƒ)์—์„œ Gemini๋กœ ์žฅ์†Œ๋ฅผ ์ถ”์ถœํ•œ๋‹ค."""
613
+ video_bytes = _download_media(media_url, MAX_VIDEO_BYTES)
614
+ if not video_bytes:
615
+ # ๋‹ค์šด๋กœ๋“œ ์‹คํŒจ โ†’ ์บก์…˜๋งŒ ๋ถ„์„
616
+ if caption and len(caption) >= 10:
617
+ return self._ai_extract_from_text(client, types, caption, hashtags)
618
+ return []
619
+
620
+ temp_path = None
621
+ try:
622
+ with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
623
+ f.write(video_bytes)
624
+ temp_path = f.name
625
+
626
+ # Gemini File API์— ์—…๋กœ๋“œ
627
+ video_file = client.files.upload(file=temp_path)
628
+
629
+ # ์ฒ˜๋ฆฌ ๋Œ€๊ธฐ (์ตœ๋Œ€ 60์ดˆ)
630
+ wait_count = 0
631
+ while wait_count < 30:
632
+ state_name = getattr(video_file.state, "name", str(video_file.state))
633
+ if "PROCESSING" not in state_name:
634
+ break
635
+ time.sleep(2)
636
+ video_file = client.files.get(name=video_file.name)
637
+ wait_count += 1
638
+
639
+ prompt = _build_spot_prompt(
640
+ content_type="๋ฆด์Šค (์˜์ƒ)",
641
+ caption=caption,
642
+ hashtags=hashtags,
643
+ media_instruction=(
644
+ "์˜์ƒ ์† ๊ฐ„ํŒ, ์ž๋ง‰, ๋‚˜๋ ˆ์ด์…˜, ํŠน์ง•์  ๊ฒฝ๊ด€์„ ํ™•์ธํ•˜๊ณ  "
645
+ "์บก์…˜๋„ ํ•จ๊ป˜ ๋ถ„์„ํ•˜์—ฌ ์žฅ์†Œ๋ฅผ ์‹๋ณ„ํ•˜์„ธ์š”."
646
+ ),
647
+ )
648
+
649
+ response = client.models.generate_content(
650
+ model="gemini-2.5-flash",
651
+ contents=[video_file, prompt],
652
+ config=types.GenerateContentConfig(
653
+ temperature=0.1, max_output_tokens=200,
654
+ thinking_config=types.ThinkingConfig(thinking_budget=0),
655
+ ),
656
  )
 
 
657
 
658
+ # ์—…๋กœ๋“œ ํŒŒ์ผ ์ •๋ฆฌ
659
+ try:
660
+ client.files.delete(name=video_file.name)
661
+ except Exception:
662
+ pass
663
+
664
+ return _parse_ai_places(response.text)
665
+ finally:
666
+ if temp_path:
667
+ try:
668
+ os.unlink(temp_path)
669
+ except OSError:
670
+ pass
671
+
672
+ @staticmethod
673
+ def _ai_extract_from_text(client, types, caption: str, hashtags: str) -> list[str]:
674
+ """์บก์…˜ ํ…์ŠคํŠธ๋งŒ์œผ๋กœ ์žฅ์†Œ๋ฅผ ์ถ”์ถœํ•œ๋‹ค (๋ฏธ๋””์–ด ๋‹ค์šด๋กœ๋“œ ์‹คํŒจ ์‹œ ํด๋ฐฑ)."""
675
+ prompt = _build_spot_prompt(
676
+ content_type="๊ฒŒ์‹œ๊ธ€",
677
+ caption=caption,
678
+ hashtags=hashtags,
679
+ media_instruction="์บก์…˜ ํ…์ŠคํŠธ์—์„œ ์žฅ์†Œ๋ฅผ ์ถ”์ถœํ•˜์„ธ์š”.",
680
+ )
681
+
682
+ response = client.models.generate_content(
683
+ model="gemini-2.5-flash",
684
+ contents=prompt,
685
+ config=types.GenerateContentConfig(
686
+ temperature=0.1, max_output_tokens=200,
687
+ thinking_config=types.ThinkingConfig(thinking_budget=0),
688
+ ),
689
+ )
690
+ return _parse_ai_places(response.text)
691
+
692
+ # ==================================================================
693
+ # ๋งค์นญ ํ†ต๊ณ„ ๋กœ๊น…
694
+ # ==================================================================
695
+
696
+ @staticmethod
697
+ def _log_match_stats(
698
+ match_stats: dict[str, int], total_posts: int, total_spots: int,
699
+ ) -> None:
700
+ """Pass 1 + Pass 2 ํ†ตํ•ฉ ๋งค์นญ ํ†ต๊ณ„๋ฅผ ๋กœ๊น…ํ•œ๋‹ค."""
701
  if match_stats["filtered_old"] > 0:
702
  logger.info(
703
+ "๊ธฐ๊ฐ„ ํ•„ํ„ฐ: %d๊ฑด ์ œ์™ธ (์ตœ๊ทผ %d์ผ ์™ธ)",
704
  match_stats["filtered_old"], TREND_WINDOW_DAYS,
 
705
  )
706
  if match_stats["filtered_low_engagement"] > 0:
707
  logger.info(
 
709
  match_stats["filtered_low_engagement"], MIN_ENGAGEMENT,
710
  )
711
  logger.info(
712
+ "์ตœ์ข… ๋งค์นญ ํ†ต๊ณ„: %d๊ฐœ ์ŠคํŒŸ โ€” "
713
+ "์œ„์น˜ํƒœ๊ทธ %d, ํ•ด์‹œํƒœ๊ทธ %d, AI๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ %d, ๋ฏธ์‹๋ณ„ %d",
714
+ total_spots,
715
  match_stats["location_tag"],
716
  match_stats["hashtag"],
717
+ match_stats.get("ai_matched", 0),
 
718
  match_stats["unmatched"],
719
  )
 
720
 
     # ==================================================================
+    # DB 저장
     # ==================================================================
 
     def _save_to_db(self, spot_metrics: dict[str, dict]) -> int:
         """집계된 메트릭을 spot_trends 테이블에 저장한다.
 
+        source = instagram_influencer
         저장 메트릭: post_count, avg_engagement, weighted_score
         """
+        # set → sorted list 변환 + avg_engagement 계산
+        _finalize_metrics(spot_metrics)
+
         period_start, period_end = get_week_period()
+        source = "instagram_influencer"
         saved = 0
 
         for spot_id, metrics in spot_metrics.items():
 …
                 "total_comments": metrics["total_comments"],
                 "match_methods": metrics["match_methods"],
                 "hashtags": metrics["hashtags"],
+                "source_accounts": metrics["source_accounts"],
             }
 
             # post_count
 …
                 })
                 saved += 1
             except Exception as e:
+                logger.warning("spot_trends 저장 실패 (post_count, %s): %s", spot_id, e)
 
             # avg_engagement
             if metrics["avg_engagement"] > 0:
 …
                     "raw_data": {"match_methods": metrics["match_methods"]},
                 })
             except Exception as e:
+                logger.warning("spot_trends 저장 실패 (avg_engagement, %s): %s", spot_id, e)
 
             # weighted_score
             if metrics["weighted_score"] > 0:
 …
                     "raw_data": {},
                 })
             except Exception as e:
+                logger.warning("spot_trends 저장 실패 (weighted_score, %s): %s", spot_id, e)
 
         logger.info("Instagram DB 저장: %d건 (%d 스팟, source=%s)", saved, len(spot_metrics), source)
         return saved
 
 
+# ══════════════════════════════════════════════════════════════════
+# AI 멀티모달 유틸리티
+# ══════════════════════════════════════════════════════════════════
+
+
+def _build_spot_prompt(
+    content_type: str, caption: str, hashtags: str, media_instruction: str,
+) -> str:
+    """Gemini용 장소 추출 프롬프트를 생성한다."""
+    return (
+        f"제주도의 인스타그램 {content_type}을 분석하여 "
+        "구체적인 장소명(상호명)을 추출하세요.\n\n"
+        f"캡션: {caption or '(없음)'}\n"
+        f"해시태그: {hashtags or '(없음)'}\n\n"
+        f"{media_instruction}\n\n"
+        "추출 대상:\n"
+        "- 카페, 식당, 베이커리 등 상호명 (예: 카페 레이어드, 봄날의 테이블)\n"
+        "- 관광지, 해변, 오름 고유명사 (예: 새별오름, 협재해수욕장, 군산오름)\n"
+        "- 공원, 마을, 거리 고유명사 (예: 한담해안산책로, 곽지과물해변)\n\n"
+        "제외 대상:\n"
+        "- '애월', '제주', '제주도', '한림', '서귀포' 같은 광역 지명\n"
+        "- '카페', '맛집', '해변' 같은 일반 카테고리명\n\n"
+        "중요: thinking 없이 JSON 배열만 출력하세요.\n"
+        '응답 형식: ["장소명1", "장소명2"]\n'
+        "장소가 없으면: []"
+    )
+
+
+def _download_media(url: str, max_bytes: int) -> bytes | None:
+    """미디어 URL에서 바이트를 다운로드한다."""
+    if not url:
+        return None
+    try:
+        import httpx
+
+        with httpx.stream(
+            "GET", url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True,
+        ) as resp:
+            if resp.status_code != 200:
+                return None
+            chunks: list[bytes] = []
+            total = 0
+            for chunk in resp.iter_bytes(chunk_size=8192):
+                total += len(chunk)
+                if total > max_bytes:
+                    logger.debug("미디어 크기 초과 (%d > %d): %s", total, max_bytes, url[:60])
+                    return None
+                chunks.append(chunk)
+            return b"".join(chunks)
+    except Exception as e:
+        logger.debug("미디어 다운로드 실패: %s — %s", url[:60], e)
+        return None
+
+
853
+
854
+ def _parse_ai_places(text: str) -> list[str]:
855
+ """Gemini ์‘๋‹ต์—์„œ ์žฅ์†Œ๋ช… ๋ฐฐ์—ด์„ ํŒŒ์‹ฑํ•œ๋‹ค."""
856
+ if not text:
857
+ logger.debug("AI ์‘๋‹ต ๋น„์–ด์žˆ์Œ")
858
+ return []
859
+
860
+ raw_text = text # ๋””๋ฒ„๊น…์šฉ ์›๋ณธ ๋ณด์กด
861
+ text = text.strip()
862
+
863
+ # Gemini 2.5 Flash thinking ๋ธ”๋ก ์ œ๊ฑฐ
864
+ if "<think>" in text:
865
+ # thinking ๋ธ”๋ก ์ดํ›„์˜ ์‹ค์ œ ์‘๋‹ต๋งŒ ์ถ”์ถœ
866
+ parts = text.split("</think>")
867
+ text = parts[-1].strip() if len(parts) > 1 else text
868
+
869
+ # ๋งˆํฌ๋‹ค์šด ์ฝ”๋“œ ๋ธ”๋ก ์ œ๊ฑฐ
870
+ if text.startswith("```"):
871
+ text = text.split("\n", 1)[-1]
872
+ if text.endswith("```"):
873
+ text = text.rsplit("```", 1)[0]
874
+ text = text.strip()
875
+
876
+ # JSON ๋ฐฐ์—ด ์ง์ ‘ ํŒŒ์‹ฑ ์‹œ๋„
877
+ try:
878
+ result = json.loads(text)
879
+ if isinstance(result, list):
880
+ return [p.strip() for p in result if isinstance(p, str) and p.strip()]
881
+ except (json.JSONDecodeError, ValueError):
882
+ pass
883
+
884
+ # ํ…์ŠคํŠธ ์•ˆ์— JSON ๋ฐฐ์—ด์ด ํฌํ•จ๋œ ๊ฒฝ์šฐ ์ถ”์ถœ
885
+ match = re.search(r'\[.*?\]', text, re.DOTALL)
886
+ if match:
887
+ try:
888
+ result = json.loads(match.group())
889
+ if isinstance(result, list):
890
+ return [p.strip() for p in result if isinstance(p, str) and p.strip()]
891
+ except (json.JSONDecodeError, ValueError):
892
+ pass
893
+
894
+ logger.debug("AI ์‘๋‹ต ํŒŒ์‹ฑ ์‹คํŒจ: %.200s", raw_text)
895
+ return []
896
+
897
+
898
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
899
  # Shared Utility
900
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
901
 
902
 
903
+ def _add_to_metrics(
904
+ spot_metrics: dict[str, dict],
905
+ spot_id: str,
906
+ post: dict,
907
+ capped_engagement: int,
908
+ method: str,
909
+ ) -> None:
910
+ """๊ฒŒ์‹œ๋ฌผ 1๊ฑด์„ spot_metrics์— ์ง‘๊ณ„ํ•œ๋‹ค."""
911
+ if spot_id not in spot_metrics:
912
+ spot_metrics[spot_id] = {
913
+ "post_count": 0,
914
+ "total_likes": 0,
915
+ "total_comments": 0,
916
+ "weighted_score": 0,
917
+ "match_methods": set(),
918
+ "hashtags": set(),
919
+ "source_accounts": set(),
920
+ }
921
+
922
+ likes = post.get("likes_count", 0) or 0
923
+ comments = post.get("comments_count", 0) or 0
924
+
925
+ m = spot_metrics[spot_id]
926
+ m["post_count"] += 1
927
+ m["total_likes"] += likes
928
+ m["total_comments"] += comments
929
+ m["weighted_score"] += capped_engagement
930
+ m["match_methods"].add(method)
931
+ for tag in post.get("hashtags", []):
932
+ tag = tag.strip().lstrip("#")
933
+ if tag and len(tag) >= 2:
934
+ m["hashtags"].add(tag)
935
+ account = post.get("_source_account", "")
936
+ if account:
937
+ m["source_accounts"].add(account)
938
+
939
+
940
+ def _finalize_metrics(spot_metrics: dict[str, dict]) -> None:
941
+ """set โ†’ sorted list ๋ณ€ํ™˜ + avg_engagement ๊ณ„์‚ฐ."""
942
+ for metrics in spot_metrics.values():
943
+ count = max(metrics["post_count"], 1)
944
+ metrics["avg_engagement"] = int(
945
+ round((metrics["total_likes"] + metrics["total_comments"]) / count)
946
+ )
947
+ metrics["match_methods"] = sorted(metrics["match_methods"])
948
+ metrics["hashtags"] = sorted(metrics["hashtags"])
949
+ metrics["source_accounts"] = sorted(metrics["source_accounts"])
950
+
951
+
952
  def _dedup_posts_by_url(posts: list[dict]) -> list[dict]:
953
  """URL ๊ธฐ๋ฐ˜ ์ค‘๋ณต ์ œ๊ฑฐ."""
954
  seen_urls: set[str] = set()
trend_engine/trend_scorer.py CHANGED
@@ -236,12 +236,12 @@ def generate_weekly_ranking(supabase: Client | None = None) -> dict:
         mt = row["metric_type"]
         spots_last.setdefault(sid, {}).setdefault(src, {})[mt] = row["metric_value"]
 
-    # -- instagram_apify/instagram_ed → "instagram" 키로 통합 --
-    # DB 마이그레이션 후 source가 instagram_apify/instagram_ed로 분리되었으나
+    # -- instagram_* → "instagram" 키로 통합 --
+    # DB source가 instagram_apify/instagram_ed/instagram_influencer로 분리되었으나
     # 스코어러는 "instagram" 키로 참조하므로 가장 최근 백엔드 데이터를 사용
     for spots_dict in (spots_this, spots_last):
         for sid in list(spots_dict.keys()):
-            for ig_src in ("instagram_apify", "instagram_ed"):
+            for ig_src in ("instagram_influencer", "instagram_apify", "instagram_ed"):
                 if ig_src in spots_dict[sid]:
                     spots_dict[sid]["instagram"] = spots_dict[sid].pop(ig_src)
                     break  # 첫 번째 발견된 백엔드 사용 (하나만 활성)
utils/trending_builder.py CHANGED
@@ -65,8 +65,8 @@ CHANNEL_THEMES: dict[str, dict] = {
         "metric_type": "post_count",
         "sort_by": "metric_value",
         "min_spots": 3,
-        # DB 소스명이 instagram_apify/instagram_ed로 분리되어 있음
-        "db_sources": ["instagram_apify", "instagram_ed"],
+        # DB 소스명: instagram_influencer (v5.0) + 레거시 분리 소스
+        "db_sources": ["instagram_influencer", "instagram_apify", "instagram_ed"],
     },
 }
 
@@ -103,7 +103,7 @@ class TrendingBuilder:
         period_start, _ = get_week_period()
 
         # 이번 주 spot_trends에서 해당 채널+metric_type 조회
-        # instagram은 DB 소스명이 instagram_apify/instagram_ed로 분리됨 → in_ 쿼리
+        # instagram은 DB 소스명이 instagram_influencer/instagram_apify/instagram_ed로 분리됨 → in_ 쿼리
         sort_ascending = theme.get("sort_ascending", False)
         db_sources = theme.get("db_sources", [channel])
         query = (
@@ -127,7 +127,7 @@
         if not trend_rows:
             return []
 
-        # 같은 spot_id가 여러 소스(예: instagram_apify + instagram_ed)에서
+        # 같은 spot_id가 여러 소스(예: instagram_influencer + 레거시)에서
         # 올 수 있으므로 spot_id 기준 중복 제거 (metric_value가 큰 행 우선)
         if len(db_sources) > 1:
             seen: dict[str, dict] = {}
  seen: dict[str, dict] = {}