Spaces:

JHyeok5
/

samchun-gemini

Sleeping

App Files Files Community

JHyeok5 committed on Feb 24

Commit

206c732

verified ·

1 Parent(s): 19aa2f9

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

requirements-trend.txt +1 -2
scripts/run_trend_engine.py +8 -9
scripts/test_instagram_collector.py +272 -0
scripts/test_instagram_full.py +162 -0
trend_engine/collectors/instagram.py +647 -802
trend_engine/trend_scorer.py +3 -3
utils/trending_builder.py +4 -4

requirements-trend.txt CHANGED Viewed

@@ -4,7 +4,6 @@ beautifulsoup4>=4.12.0
 lxml>=5.0.0
 requests>=2.31.0
 google-api-python-client>=2.100.0
-# ensembledata SDK optional (httpx로 직접 호출)
-# ensembledata>=0.2.0
 supabase>=2.0.0
 python-dotenv>=1.0.0

 lxml>=5.0.0
 requests>=2.31.0
 google-api-python-client>=2.100.0
+google-genai>=1.0.0,<1.64.0
 supabase>=2.0.0
 python-dotenv>=1.0.0

scripts/run_trend_engine.py CHANGED Viewed

@@ -4,10 +4,10 @@ RE:Play Trend Engine v3 — 주간 배치 오케스트레이터
 순차 실행 파이프라인:
   1. 카카오맵 그리드 스캔 + 리뷰 파싱 (trend_spots 마스터 생성)
   2. SpotMatcher 초기화 (trend_spots + story_spots 사전 로드)
-  3. 네이버 블로그 수집 (URL 확보 + 크롤링 + DB 저장)
-  4. 블로그 본문 → 장소명 추출 + mention_count 집계
-  5. 유튜브 API (SpotMatcher 연동)
-  6. 인스타그램 Apify (SpotMatcher 연동)
   7. 종합 스코어 계산 + 랭킹 생성
 Usage:
@@ -122,11 +122,10 @@ def main() -> None:
     youtube = YouTubeCollector(sb, spot_matcher=matcher)
     run_step("3_youtube", youtube.run, results)
-    # ── 4. 인스타그램 (Apify/EnsembleData 듀얼 백엔드, SpotMatcher 연동) ──
-    ig_backend = os.environ.get("INSTAGRAM_BACKEND", "apify")
-    logger.info("인스타그램 백엔드: %s", ig_backend)
     instagram = InstagramCollector(sb, spot_matcher=matcher)
-    run_step("4_instagram", instagram.run, results)
     # ── 5. 네이버 플레이스 — 비활성 (Place ID 매칭 불가) ──
     logger.info("네이버 플레이스: 비활성 (Place ID 매칭 불가, 2026-02)")
@@ -273,7 +272,7 @@ def main() -> None:
     # ── 8. 종합 스코어 계산 + 랭킹 생성 (최소 2채널 성공 시) ──
     # 수집 채널 단계만 카운트 (1, 3, 4, 6)
-    collection_steps = ["1_kakaomap", "3_youtube", "4_instagram", "6_naver_blog"]
     successful_channels = [s for s in collection_steps if results.get(s, {}).get("status") == "ok"]
     def calc_scores():

 순차 실행 파이프라인:
   1. 카카오맵 그리드 스캔 + 리뷰 파싱 (trend_spots 마스터 생성)
   2. SpotMatcher 초기화 (trend_spots + story_spots 사전 로드)
+  3. 유튜브 API (SpotMatcher 연동)
+  4. 인스타그램 인플루언서 모니터링 v5.1 (SpotMatcher 연동)
+  5. 네이버 블로그 수집 (URL 확보 + 크롤링 + DB 저장)
+  6. 블로그 본문 → 장소명 추출 + mention_count 집계
   7. 종합 스코어 계산 + 랭킹 생성
 Usage:
     youtube = YouTubeCollector(sb, spot_matcher=matcher)
     run_step("3_youtube", youtube.run, results)
+    # ── 4. 인스타그램 인플루언서 모니터링 v5.1 Multimodal (SpotMatcher 연동) ──
+    logger.info("인스타그램: 인플루언서 모니터링 v5.1 (Multimodal AI)")
     instagram = InstagramCollector(sb, spot_matcher=matcher)
+    run_step("4_instagram_influencer", instagram.run, results)
     # ── 5. 네이버 플레이스 — 비활성 (Place ID 매칭 불가) ──
     logger.info("네이버 플레이스: 비활성 (Place ID 매칭 불가, 2026-02)")
     # ── 8. 종합 스코어 계산 + 랭킹 생성 (최소 2채널 성공 시) ──
     # 수집 채널 단계만 카운트 (1, 3, 4, 6)
+    collection_steps = ["1_kakaomap", "3_youtube", "4_instagram_influencer", "6_naver_blog"]
     successful_channels = [s for s in collection_steps if results.get(s, {}).get("status") == "ok"]
     def calc_scores():

scripts/test_instagram_collector.py ADDED Viewed

	@@ -0,0 +1,272 @@

+"""
+Instagram Collector v5.1 — 통합 테스트 스크립트
+인스타그램 인플루언서 파이프라인의 핵심 단계를 검증합니다:
+  1. DB에서 인플루언서 계정 로드
+  2. Apify로 1개 계정 게시물 수집 (비용 최소화)
+  3. 게시물 정규화 + 기간/engagement 필터
+  4. Pass 1: 규칙 기반 매칭 (위치태그 + 해시태그)
+  5. Pass 2: Gemini 멀티모달 AI 분석 (이미지 1건)
+  6. 매칭 결과 요약 (DB 저장 안 함)
+Usage:
+  python backend/scripts/test_instagram_collector.py
+"""
+import json
+import logging
+import os
+import sys
+import time
+from datetime import datetime, timedelta, timezone
+# backend/ 디렉토리를 import path에 추가
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+# .env 로드
+try:
+    from dotenv import load_dotenv
+    env_path = os.path.join(os.path.dirname(__file__), "..", "..", ".env")
+    load_dotenv(env_path)
+except ImportError:
+    pass
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    datefmt="%H:%M:%S",
+)
+logger = logging.getLogger("test_instagram")
+def test_step(name: str, func):
+    """테스트 단계 실행 래퍼."""
+    logger.info("━━━ [TEST] %s ━━━", name)
+    start = time.time()
+    try:
+        result = func()
+        elapsed = time.time() - start
+        logger.info("  ✓ PASS — %.1f초", elapsed)
+        return result
+    except Exception as e:
+        elapsed = time.time() - start
+        logger.error("  ✗ FAIL — %s (%.1f초)", e, elapsed)
+        import traceback
+        traceback.print_exc()
+        return None
+def main():
+    from supabase import create_client
+    url = os.environ.get("SUPABASE_URL") or os.environ.get("VITE_SUPABASE_URL")
+    key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
+    if not url or not key:
+        logger.error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY 환경변수 누락")
+        sys.exit(1)
+    sb = create_client(url, key)
+    # ── 1. 인플루언서 계정 로드 ──
+    def step1_load_accounts():
+        from trend_engine.collectors.instagram import InstagramCollector
+        collector = InstagramCollector(sb)
+        accounts = collector._load_influencer_accounts()
+        assert len(accounts) > 0, f"계정 0개 로드됨"
+        logger.info("  계정 %d개 로드:", len(accounts))
+        for a in accounts:
+            logger.info("    [%d] @%s (%s)", a.get("priority", 0), a["username"], a.get("category", ""))
+        return accounts
+    accounts = test_step("1. 인플루언서 계정 로드 (DB)", step1_load_accounts)
+    if not accounts:
+        logger.error("계정 로드 실패 — 테스트 중단")
+        sys.exit(1)
+    # ── 2. Apify 수집 (1개 계정만) ──
+    apify_token = os.environ.get("APIFY_API_TOKEN")
+    if not apify_token:
+        logger.error("APIFY_API_TOKEN 환경변수 누락 — 수집 테스트 스킵")
+        sys.exit(1)
+    # 제주 거주 인플루언서 우선, 없으면 priority 1
+    jeju_accounts = [a for a in accounts if a.get("category") == "lifestyle"]
+    test_account = jeju_accounts[0] if jeju_accounts else accounts[0]
+    test_username = test_account["username"]
+    def step2_apify_scrape():
+        from trend_engine.collectors.instagram import InstagramCollector
+        collector = InstagramCollector(sb)
+        # 1개 계정만 5건 제한으로 수집
+        from trend_engine.collectors import instagram as ig_mod
+        orig_limit = ig_mod.RESULTS_LIMIT_PER_ACCOUNT
+        orig_window = ig_mod.TREND_WINDOW_DAYS
+        ig_mod.RESULTS_LIMIT_PER_ACCOUNT = 5  # 테스트용 제한
+        ig_mod.TREND_WINDOW_DAYS = 90  # 테스트용: 90일로 확장 (충분한 매칭 데이터)
+        try:
+            posts = collector._scrape_profiles([test_username], "test_batch")
+        finally:
+            ig_mod.RESULTS_LIMIT_PER_ACCOUNT = orig_limit
+            ig_mod.TREND_WINDOW_DAYS = orig_window
+        logger.info("  @%s → %d건 수집", test_username, len(posts))
+        for i, p in enumerate(posts[:3]):
+            logger.info(
+                "    [%d] type=%s likes=%d loc=%s caption=%.50s...",
+                i + 1,
+                p.get("media_type", "?"),
+                p.get("likes_count", 0),
+                p.get("location_name", "")[:30] or "(없음)",
+                (p.get("caption", "") or "")[:50],
+            )
+            if p.get("media_url"):
+                logger.info("        media_url=%s", p["media_url"][:80])
+        return posts
+    posts = test_step(f"2. Apify 수집 (@{test_username}, 5건 제한)", step2_apify_scrape)
+    if not posts:
+        logger.warning("게시물 0건 — 이후 매칭 테스트는 합성 데이터로 진행")
+        posts = []
+    # ── 3. SpotMatcher 초기화 ──
+    def step3_spot_matcher():
+        from trend_engine.spot_matcher import SpotMatcher
+        matcher = SpotMatcher(sb)
+        logger.info(
+            "  trend_spots: %d건, story_spots: %d건",
+            len(matcher.trend_spots), len(matcher.story_spots),
+        )
+        # 샘플 매칭 테스트
+        test_names = ["카페 레이어드 애월", "협재해수욕장", "새별오름", "애월 카페거리"]
+        for name in test_names:
+            sid = matcher.match(name)
+            logger.info("    match('%s') → %s", name, sid or "(미매칭)")
+        return matcher
+    matcher = test_step("3. SpotMatcher 초기화", step3_spot_matcher)
+    if not matcher:
+        logger.error("SpotMatcher 초기화 실패 — 테스트 중단")
+        sys.exit(1)
+    # ── 4. Pass 1: 규칙 기반 매칭 ──
+    def step4_pass1_matching():
+        from trend_engine.collectors.instagram import InstagramCollector
+        from trend_engine.collectors import instagram as ig_mod
+        collector = InstagramCollector(sb, spot_matcher=matcher)
+        # 테스트용: 기간 필터 확장 (90일)
+        orig_window = ig_mod.TREND_WINDOW_DAYS
+        ig_mod.TREND_WINDOW_DAYS = 90
+        if not posts:
+            logger.info("  실제 게시물 없음 — 합성 데이터로 테스트")
+            test_posts = [
+                {
+                    "search_term": "@test",
+                    "search_type": "profile",
+                    "location_name": "카페 레이어드 애월",
+                    "likes_count": 500,
+                    "comments_count": 30,
+                    "caption": "애월 카페 너무 좋다 #애월카페 #제주도",
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "url": "https://instagram.com/p/test1",
+                    "hashtags": ["애월카페", "제주도"],
+                    "media_url": "",
+                    "media_type": "Image",
+                    "_source_account": "test",
+                },
+                {
+                    "search_term": "@test",
+                    "search_type": "profile",
+                    "location_name": "",
+                    "likes_count": 200,
+                    "comments_count": 10,
+                    "caption": "새별오름 일출 보고왔어요 #새별오름 #제주여행",
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "url": "https://instagram.com/p/test2",
+                    "hashtags": ["새별오름", "제주여행"],
+                    "media_url": "https://example.com/image.jpg",
+                    "media_type": "Image",
+                    "_source_account": "test",
+                },
+            ]
+        else:
+            test_posts = posts
+        spot_metrics, unmatched, match_stats = collector._aggregate_with_unmatched(test_posts)
+        logger.info("  총 게시물: %d건", len(test_posts))
+        logger.info("  Pass 1 매칭: %d개 스팟", len(spot_metrics))
+        logger.info("  미매칭 (→ Pass 2 대상): %d건", len(unmatched))
+        logger.info("  매칭 통계: %s", json.dumps(match_stats, ensure_ascii=False))
+        if spot_metrics:
+            logger.info("  매칭된 스팟:")
+            for sid, m in spot_metrics.items():
+                logger.info(
+                    "    %s: posts=%d, engagement=%d, methods=%s",
+                    sid, m["post_count"], m["weighted_score"],
+                    m["match_methods"],
+                )
+        ig_mod.TREND_WINDOW_DAYS = orig_window
+        return spot_metrics, unmatched, match_stats
+    result4 = test_step("4. Pass 1 — 규칙 기반 매칭", step4_pass1_matching)
+    if not result4:
+        logger.error("Pass 1 테스트 실패")
+        sys.exit(1)
+    spot_metrics, unmatched, match_stats = result4
+    # ── 5. Pass 2: AI 멀티모달 분석 (선택적) ──
+    gemini_key = os.environ.get("GEMINI_API_KEY")
+    if gemini_key and unmatched:
+        def step5_ai_multimodal():
+            from trend_engine.collectors.instagram import InstagramCollector
+            collector = InstagramCollector(sb, spot_matcher=matcher)
+            # 1건만 테스트
+            test_unmatched = unmatched[:1]
+            p = test_unmatched[0]
+            logger.info(
+                "  테스트 게시물: type=%s, caption=%.60s...",
+                p.get("media_type", "?"),
+                (p.get("caption", "") or "")[:60],
+            )
+            if p.get("media_url"):
+                logger.info("  media_url=%s", p["media_url"][:80])
+            ai_count = collector._ai_analyze_content(
+                test_unmatched, spot_metrics, match_stats,
+            )
+            logger.info("  AI 매칭 결과: %d건", ai_count)
+            return ai_count
+        test_step("5. Pass 2 — Gemini 멀티모달 AI (1건)", step5_ai_multimodal)
+    elif not gemini_key:
+        logger.info("━━━ [SKIP] 5. AI 멀티모달 — GEMINI_API_KEY 미설정 ━━━")
+    else:
+        logger.info("━━━ [SKIP] 5. AI 멀티모달 — 미매칭 게시물 없음 ━━━")
+    # ── 6. 최종 요약 ──
+    logger.info("")
+    logger.info("═══════════════════════════════════════════")
+    logger.info("  Instagram v5.1 테스트 결과 요약")
+    logger.info("═══════════════════════════════════════════")
+    logger.info("  인플루언서 계정: %d개 (DB)", len(accounts))
+    logger.info("  Apify 수집: @%s → %d건", test_username, len(posts))
+    logger.info("  SpotMatcher: trend=%d, story=%d",
+                len(matcher.trend_spots), len(matcher.story_spots))
+    logger.info("  Pass 1 매칭: %d개 스팟", len(spot_metrics))
+    logger.info("  매칭 통계: %s", json.dumps(match_stats, ensure_ascii=False))
+    logger.info("  AI 멀티모달: %s", "활성" if gemini_key else "비활성")
+    logger.info("═══════════════════════════════════════════")
+    logger.info("  ⚠️  DB 저장 안 함 (테스트 모드)")
+    logger.info("═══════════════════════════════════════════")
+if __name__ == "__main__":
+    main()

scripts/test_instagram_full.py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""
+Instagram Collector v5.1 — 전체 인플루언서 실전 테스트
+15개 인플루언서 전체에 대해 실제 파이프라인을 실행합니다.
+DB 저장은 하지 않고 결과만 확인합니다.
+Usage:
+  python3 backend/scripts/test_instagram_full.py
+"""
+import json
+import logging
+import os
+import sys
+import time
+from datetime import datetime, timedelta, timezone
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+try:
+    from dotenv import load_dotenv
+    load_dotenv(os.path.join(os.path.dirname(__file__), "..", "..", ".env"))
+except ImportError:
+    pass
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    datefmt="%H:%M:%S",
+)
+logger = logging.getLogger("test_instagram_full")
+def main():
+    from supabase import create_client
+    from trend_engine.collectors.instagram import InstagramCollector
+    from trend_engine.spot_matcher import SpotMatcher
+    url = os.environ.get("SUPABASE_URL") or os.environ.get("VITE_SUPABASE_URL")
+    key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
+    if not url or not key:
+        logger.error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY 환경변수 누락")
+        sys.exit(1)
+    sb = create_client(url, key)
+    # SpotMatcher 초기화
+    matcher = SpotMatcher(sb)
+    logger.info("SpotMatcher: trend=%d, story=%d", len(matcher.trend_spots), len(matcher.story_spots))
+    # InstagramCollector 초기화
+    collector = InstagramCollector(sb, spot_matcher=matcher)
+    # ── 1. 계정 로드 ──
+    accounts = collector._load_influencer_accounts()
+    logger.info("인플루언서 계정 %d개 로드", len(accounts))
+    # ── 2. 전체 게시물 수집 ──
+    logger.info("━━━ 전체 인플루언서 게시물 수집 시작 ━━━")
+    start = time.time()
+    all_posts = collector._collect_from_accounts(accounts)
+    collect_elapsed = time.time() - start
+    logger.info("수집 완료: %d건 (%.1f초)", len(all_posts), collect_elapsed)
+    # 계정별 수집 통계
+    account_stats: dict[str, int] = {}
+    for p in all_posts:
+        acct = p.get("_source_account", "unknown")
+        account_stats[acct] = account_stats.get(acct, 0) + 1
+    logger.info("━━━ 계정별 수집 현황 ━━━")
+    for acct, count in sorted(account_stats.items(), key=lambda x: -x[1]):
+        logger.info("  @%-25s → %d건", acct, count)
+    if not all_posts:
+        logger.error("수집된 게시물 없음 — 종료")
+        sys.exit(1)
+    # ── 3. Pass 1: 규칙 기반 매칭 ──
+    logger.info("━━━ Pass 1: 규칙 기반 매칭 ━━━")
+    spot_metrics, unmatched_posts, match_stats = collector._aggregate_with_unmatched(all_posts)
+    logger.info("Pass 1 결과:")
+    logger.info("  전체 게시물: %d건", len(all_posts))
+    logger.info("  기간 필터 제외: %d건 (30일 초과)", match_stats["filtered_old"])
+    logger.info("  저engagement 제외: %d건", match_stats["filtered_low_engagement"])
+    logger.info("  위치태그 매칭: %d건", match_stats["location_tag"])
+    logger.info("  해시태그 매칭: %d건", match_stats["hashtag"])
+    logger.info("  미매칭 → AI 대상: %d건", match_stats["unmatched"])
+    logger.info("  Pass 1 스팟: %d개", len(spot_metrics))
+    if spot_metrics:
+        logger.info("  매칭된 스팟:")
+        for sid, m in sorted(spot_metrics.items(), key=lambda x: -x[1]["weighted_score"]):
+            logger.info(
+                "    %s: posts=%d, score=%d, methods=%s, accounts=%s",
+                sid, m["post_count"], m["weighted_score"],
+                m["match_methods"], m["source_accounts"],
+            )
+    # ── 4. Pass 2: AI 멀티모달 분석 (전체) ──
+    ai_matched = 0
+    if unmatched_posts and os.environ.get("GEMINI_API_KEY"):
+        logger.info("━━━ Pass 2: Gemini 2.5 Flash 멀티모달 분석 (%d건) ━━━", len(unmatched_posts))
+        # 미매칭 게시물 미디어 타입 통계
+        image_count = sum(1 for p in unmatched_posts if p.get("media_type") != "Video")
+        video_count = sum(1 for p in unmatched_posts if p.get("media_type") == "Video")
+        logger.info("  이미지: %d건, 영상: %d건", image_count, video_count)
+        # 미매칭 게시물 상세 (캡션 미리보기)
+        for i, p in enumerate(unmatched_posts[:10]):
+            logger.info(
+                "  [%d] @%s type=%s caption=%.60s...",
+                i + 1, p.get("_source_account", "?"),
+                p.get("media_type", "?"),
+                (p.get("caption", "") or "")[:60],
+            )
+        start_ai = time.time()
+        ai_matched = collector._ai_analyze_content(unmatched_posts, spot_metrics, match_stats)
+        ai_elapsed = time.time() - start_ai
+        logger.info("AI 분석 완료: %d건 매칭 (%.1f초)", ai_matched, ai_elapsed)
+    elif not os.environ.get("GEMINI_API_KEY"):
+        logger.warning("GEMINI_API_KEY 미설정 — AI 분석 스킵")
+    else:
+        logger.info("미매칭 게시물 없음 — AI 분석 불필요")
+    # ── 5. 최종 결과 ──
+    logger.info("")
+    logger.info("═══════════════════════════════════════════════════")
+    logger.info("  Instagram v5.1 전체 테스트 결과")
+    logger.info("═══════════════════════════════════════════════════")
+    logger.info("  인플루언서: %d개 계정", len(accounts))
+    logger.info("  수집 성공: %d개 계정 (%d건)", len(account_stats), len(all_posts))
+    logger.info("  수집 소요: %.1f초", collect_elapsed)
+    logger.info("  ─────────────────────────────────────────────")
+    logger.info("  30일 이내 게시물: %d건", len(all_posts) - match_stats["filtered_old"])
+    logger.info("  engagement ≥ 50: %d건",
+                len(all_posts) - match_stats["filtered_old"] - match_stats["filtered_low_engagement"])
+    logger.info("  ─────────────────────────────────────────────")
+    logger.info("  Pass 1 (위치태그): %d건", match_stats["location_tag"])
+    logger.info("  Pass 1 (해시태그): %d건", match_stats["hashtag"])
+    logger.info("  Pass 2 (AI 멀티모달): %d건", match_stats.get("ai_matched", 0))
+    logger.info("  총 매칭 스팟: %d개", len(spot_metrics))
+    logger.info("  ─────────────────────────────────────────────")
+    if spot_metrics:
+        logger.info("  최종 매칭 스팟 목록:")
+        for sid, m in sorted(spot_metrics.items(), key=lambda x: -x[1]["weighted_score"]):
+            logger.info(
+                "    %s: posts=%d, score=%d",
+                sid, m["post_count"], m["weighted_score"],
+            )
+    logger.info("  ─────────────────────────────────────────────")
+    logger.info("  ⚠️  DB 저장 안 함 (테스트 모드)")
+    logger.info("═══════════════════════════════════════════════════")
+if __name__ == "__main__":
+    main()

trend_engine/collectors/instagram.py CHANGED Viewed

@@ -1,165 +1,64 @@
 """
-Instagram Collector — Dual Backend (v4.1)
-환경변수 INSTAGRAM_BACKEND로 수집 백엔드를 선택합니다:
-  - "apify" (기본값/Phase 1): Apify instagram-hashtag-scraper + directUrls
-  - "ed" (Phase 2): EnsembleData REST API + 일별 예산 관리
-공통 파이프라인:
-  1. 템플릿 기반 해시태그 동적 생성 (일반 + 하위지명 + 고유명사)
-  2. 백엔드별 게시물 수집 (Apify Actor / EnsembleData REST)
-  3. 기간 필터(10일) + 최소 engagement 임계값 + cap 적용
-  4. 3단계 장소 매칭: 위치태그(접두사 제거) → 해시태그(방향 제한) → 캡션 추출
-     - Apify 추가: Stage 0 directUrls (_direct_spot_id 확정)
   5. 가중 집계: weighted_score = sum(min(engagement, cap))
-  6. spot_trends 테이블에 저장 (source = instagram_apify / instagram_ed)
-Apify (v3.6):
-  - Apify instagram-hashtag-scraper Actor (~30건/해시태그)
-  - directUrls: Location ID 반자동 탐색 + JSON 캐싱 + SpotMatcher 연동
-EnsembleData (v4.0):
-  - REST API /instagram/hashtag/posts (~63건/해시태그)
-  - 일별 유닛 예산 관리 (BudgetTracker)
-  - 해시태그 로테이션 스케줄 (Free Trial: 1개/일, Wood: 전체)
 """
 from __future__ import annotations
 import json
 import os
 import re
-import logging
-from datetime import datetime, timedelta, timezone, date
-# ──────────────────────────────────────────────
-# Backend Switch
-# ──────────────────────────────────────────────
-INSTAGRAM_BACKEND = os.getenv("INSTAGRAM_BACKEND", "apify")
-if INSTAGRAM_BACKEND == "apify":
-    from apify_client import ApifyClient
-elif INSTAGRAM_BACKEND == "ed":
-    import httpx
-from trend_engine.place_extractor import PlaceNameExtractor
 from trend_engine.utils import get_week_period, safe_upsert_spot_trend
 logger = logging.getLogger(__name__)
 # ══════════════════════════════════════════════
-# Shared: 지역 설정 — 서비스 지역 확장 시 이 섹션만 수정
 # ══════════════════════════════════════════════
-AREA_NAME = "애월"
-AREA_ALIASES = ["제주애월", "제주도애월"]
-AREA_SUB_NAMES = ["한담", "곽지"]
-# ──────────────────────────────────────────────
-# 카테고리 1: 지역 일반 템플릿
-# ──────────────────────────────────────────────
-GENERAL_SUFFIXES = [
-    "카페", "맛집", "여행", "가볼만한곳", "핫플",
-    "감성", "디저트", "해안", "산책", "일몰", "오션뷰",
-]
-def build_general_hashtags(area: str, aliases: list[str]) -> list[str]:
-    """지역명 + 일반 접미사 조합으로 해시태그 생성."""
-    tags = []
-    for suffix in GENERAL_SUFFIXES:
-        tags.append(f"{area}{suffix}")
-    for alias in aliases:
-        tags.append(alias)
-    return tags
-# ──────────────────────────────────────────────
-# 카테고리 2: 하위 지명 해시태그
-# ──────────────────────────────────────────────
-SUB_AREA_SUFFIXES = ["해변", "해수욕장"]
-def build_sub_area_hashtags(sub_names: list[str]) -> list[str]:
-    """하위 지명 + 지형 접미사 조합."""
-    tags = []
-    for name in sub_names:
-        for suffix in SUB_AREA_SUFFIXES:
-            tags.append(f"{name}{suffix}")
-    return tags
-# ──────────────────────────────────────────────
-# 카테고리 3: 장소 고유명사 (동적 생성)
-# ──────────────────────────────────────────────
-def build_spot_hashtags(supabase_client, limit: int = 15) -> list[str]:
-    """trend_spots 테이블에서 인기 장소명을 해시태그 후보로 추출."""
-    try:
-        result = (
-            supabase_client.table("trend_spots")
-            .select("name, category")
-            .in_("category", ["관광명소", "카페", "문화시설"])
-            .execute()
-        )
-    except Exception as e:
-        logger.warning("trend_spots 조회 실패 (고유명사 해시태그): %s", e)
-        return []
-    spot_names = []
-    for row in result.data or []:
-        name = row.get("name", "")
-        if not name or len(name) < 2 or len(name) > 15:
-            continue
-        if " " in name:
-            continue
-        spot_names.append(name)
-    spot_names.sort(key=len, reverse=True)
-    return spot_names[:limit]
 # ──────────────────────────────────────────────
-# 최종 해시태그 리스트 조합
 # ──────────────────────────────────────────────
-def build_all_hashtags(supabase_client) -> list[str]:
-    """3개 카테고리를 합산하고 중복 제거한 최종 해시태그 리스트."""
-    general = build_general_hashtags(AREA_NAME, AREA_ALIASES)
-    sub_area = build_sub_area_hashtags(AREA_SUB_NAMES)
-    spots = build_spot_hashtags(supabase_client, limit=15)
-    seen: set[str] = set()
-    all_tags: list[str] = []
-    for tag in general + sub_area + spots:
-        if tag not in seen:
-            seen.add(tag)
-            all_tags.append(tag)
-    logger.info(
-        "해시태그 구성: 일반 %d + 하위지명 %d + 고유명사 %d = 총 %d개",
-        len(general), len(sub_area), len(spots), len(all_tags),
-    )
-    return all_tags
-# ══════════════════════════════════════════════
-# Shared: 공통 설정
-# ══════════════════════════════════════════════
-# 기간 필터 — 직전 1주 + 2~3일 버퍼
-TREND_WINDOW_DAYS = 10
-# 가중 집계 상수
-MIN_ENGAGEMENT = 10       # 최소 engagement 임계값 (봇/스팸 필터)
-ENGAGEMENT_CAP = 1000     # 단일 게시물 engagement 상한 (인플루언서 지배 방지)
 # 국가/공항 단위만 BLACKLIST
 LOCATION_BLACKLIST = frozenset({
     "South Korea", "Korea", "대한민국", "한국",
@@ -204,683 +103,293 @@ def clean_location_name(name: str) -> str:
 HASHTAG_RE = re.compile(r"#([\w가-힣]{2,30})")
-# ══════════════════════════════════════════════
-# Apify-Only: Actor 설정 & Location Cache
-# ══════════════════════════════════════════════
-# 해시태그당 최대 게시물 수 (인스타그램 공개 API 1페이지 = ~30건)
-RESULTS_LIMIT_PER_HASHTAG = 30
-# Actor 설정
-ACTOR_ID = "apify/instagram-hashtag-scraper"
-SEARCH_ACTOR_ID = "apify/instagram-search-scraper"
-# Location ID 캐시 파일 경로
-LOCATION_CACHE_FILE = os.path.join(
-    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-    "data", "instagram_location_ids.json",
-)
-def discover_location_ids(
-    apify_client: ApifyClient,
-    spot_names: list[str],
-    area: str = "애월",
-) -> dict[str, dict]:
-    """주요 장소의 Instagram Location ID를 탐색한다.
-    Instagram Search Scraper로 장소명 검색 → locationId 추출.
-    초기 1회 실행 또는 월 1회 갱신용. 결과는 파일에 캐싱.
-    """
-    location_map: dict[str, dict] = {}
-    for name in spot_names:
-        try:
-            run_input = {
-                "search": f"{name} {area}",
-                "searchType": "place",
-                "resultsLimit": 3,
-            }
-            run = apify_client.actor(SEARCH_ACTOR_ID).call(
-                run_input=run_input,
-                timeout_secs=60,
-            )
-            items = list(
-                apify_client.dataset(run["defaultDatasetId"]).iterate_items()
-            )
-            for item in items:
-                loc_id = item.get("locationId") or item.get("id")
-                loc_name = item.get("name", "")
-                if loc_id and loc_name:
-                    url = f"https://www.instagram.com/explore/locations/{loc_id}/"
-                    location_map[url] = {
-                        "instagram_name": loc_name,
-                        "search_query": name,
-                    }
-                    logger.info("Location ID 확보: %s → %s (%s)", name, loc_id, loc_name)
-                    break  # 첫 번째 매칭만
-        except Exception as e:
-            logger.warning("Location ID 탐색 실패: %s — %s", name, e)
-            continue
-    logger.info("Location ID 탐색 완료: %d/%d개 성공", len(location_map), len(spot_names))
-    return location_map
-def load_or_discover_locations(
-    apify_client: ApifyClient, supabase_client, max_age_days: int = 30,
-) -> dict[str, dict]:
-    """캐시된 Location ID를 불러오거나, 없으면 탐색한다."""
-    if os.path.exists(LOCATION_CACHE_FILE):
-        try:
-            with open(LOCATION_CACHE_FILE) as f:
-                cached = json.load(f)
-            updated_at = cached.get("updated_at", "")
-            if updated_at:
-                updated = datetime.fromisoformat(updated_at)
-                if (datetime.now(timezone.utc) - updated).days < max_age_days:
-                    locations = cached.get("locations", {})
-                    logger.info("Location ID 캐시 사용 (%d개, %s)", len(locations), updated_at[:10])
-                    return locations
-        except (json.JSONDecodeError, ValueError, KeyError) as e:
-            logger.warning("Location ID 캐시 파싱 실패: %s", e)
-    # 캐시 없거나 만료 → 재탐색
-    spot_names = build_spot_hashtags(supabase_client, limit=15)
-    if not spot_names:
-        logger.warning("고유명사 해시태그 0개 — Location ID 탐색 스킵")
-        return {}
-    locations = discover_location_ids(apify_client, spot_names)
-    # 캐시 저장
-    try:
-        os.makedirs(os.path.dirname(LOCATION_CACHE_FILE), exist_ok=True)
-        with open(LOCATION_CACHE_FILE, "w") as f:
-            json.dump({
-                "updated_at": datetime.now(timezone.utc).isoformat(),
-                "locations": locations,
-            }, f, ensure_ascii=False, indent=2)
-        logger.info("Location ID 캐시 저장: %s (%d개)", LOCATION_CACHE_FILE, len(locations))
-    except OSError as e:
-        logger.warning("Location ID 캐시 저장 실패: %s", e)
-    return locations
-def build_direct_urls_with_spot_ids(
-    location_map: dict[str, dict], spot_matcher,
-) -> dict[str, str]:
-    """Location URL → spot_id 매핑을 구축한다."""
-    url_to_spot: dict[str, str] = {}
-    for url, info in location_map.items():
-        search_query = info.get("search_query", "")
-        if not search_query or not spot_matcher:
-            continue
-        spot_id = spot_matcher.match(search_query)
-        if spot_id:
-            url_to_spot[url] = spot_id
-            logger.info("directUrl 매핑: %s → %s", search_query, spot_id)
-        else:
-            logger.debug("directUrl 매핑 실패: %s", search_query)
-    logger.info("directUrl 매핑 완료: %d/%d개 성공", len(url_to_spot), len(location_map))
-    return url_to_spot
-# ══════════════════════════════════════════════
-# EnsembleData-Only: API, Budget, Post Conversion
-# ══════════════════════════════════════════════
-ENSEMBLEDATA_BASE_URL = "https://ensembledata.com/apis"
-API_TIMEOUT_SECS = 30
-# 우선순위 해시태그 (Free Trial 로테이션용)
-PRIORITY_HASHTAGS = [
-    "애월카페",       # 월 (Day 0)
-    "애월맛집",       # 화 (Day 1)
-    "애월여행",       # 수 (Day 2)
-    "제주애월",       # 목 (Day 3)
-    "애월핫플",       # 금 (Day 4)
-    "애월감성",       # 토 (Day 5)
-    # 일요일 (Day 6) = 집계 전용, 수집 없음
 ]
-def get_daily_hashtags(all_hashtags: list[str], daily_budget: int) -> list[str]:
-    """일별 예산에 맞춰 오늘 수집할 해시태그 목록을 반환한다.
-    - daily_budget >= 1500 (Wood 플랜): 전체 해시태그 반환
-    - daily_budget < 1500 (Free Trial): 요일 기반 로테이션
-    """
-    if daily_budget >= 1500:
-        return all_hashtags
-    day_of_week = date.today().weekday()  # 0=월, 6=일
-    if day_of_week == 6:
-        logger.info("일요일 — 수집 스킵 (집계 전용)")
-        return []
-    if day_of_week < len(PRIORITY_HASHTAGS):
-        tag = PRIORITY_HASHTAGS[day_of_week]
-        logger.info("Free Trial 로테이션: %s요일 → #%s", "월화수목금토"[day_of_week], tag)
-        return [tag]
-    return []
-class BudgetTracker:
-    """일별 EnsembleData 유닛 사용량 추적.
-    상태 파일에 오늘 사용량을 기록하여 재실행 시에도 예산 초과를 방지한다.
-    """
-    def __init__(self, daily_limit: int, state_file: str | None = None):
-        self.daily_limit = daily_limit
-        self.state_file = state_file or os.path.join(
-            os.environ.get("BUDGET_STATE_DIR", "/tmp"),
-            "ed_budget_state.json",
-        )
-        self.used_today = self._load_today_usage()
-    @property
-    def remaining(self) -> int:
-        return max(0, self.daily_limit - self.used_today)
-    def can_afford(self, estimated_posts: int = 70) -> bool:
-        """예상 게시물 수 기준으로 예산 내인지 확인."""
-        return self.remaining >= estimated_posts
-    def record(self, units: int) -> None:
-        """유닛 사용 기록."""
-        self.used_today += units
-        self._save_state()
-        logger.info(
-            "유닛 사용: +%d (오늘 합계: %d/%d, 잔여: %d)",
-            units, self.used_today, self.daily_limit, self.remaining,
-        )
-    def _load_today_usage(self) -> int:
-        try:
-            with open(self.state_file) as f:
-                state = json.load(f)
-            if state.get("date") == date.today().isoformat():
-                return state.get("used", 0)
-        except (FileNotFoundError, json.JSONDecodeError, KeyError):
-            pass
-        return 0
-    def _save_state(self) -> None:
-        try:
-            os.makedirs(os.path.dirname(self.state_file), exist_ok=True)
-            with open(self.state_file, "w") as f:
-                json.dump({
-                    "date": date.today().isoformat(),
-                    "used": self.used_today,
-                    "limit": self.daily_limit,
-                }, f)
-        except OSError as e:
-            logger.warning("예산 상태 저장 실패: %s", e)
-def _extract_caption(node: dict) -> str:
-    """Extract the caption text from an EnsembleData node.
-    Returns the "text" of the first caption edge, or "" when there are no edges.
-    """
-    # NOTE(review): a present-but-null "edge_media_to_caption" value would make
-    # the chained .get() raise AttributeError — confirm the API never sends null.
-    edges = node.get("edge_media_to_caption", {}).get("edges", [])
-    if edges:
-        return edges[0].get("node", {}).get("text", "")
-    return ""
-def _extract_likes(node: dict) -> int:
-    """Extract the like count (handles null/hidden likes).
-    Reads "edge_liked_by" first and falls back to "edge_media_preview_like".
-    Negative counts are clamped to 0; 0 is returned when neither has a count.
-    """
-    likes = node.get("edge_liked_by", {}).get("count")
-    if likes is not None:
-        return max(likes, 0)
-    # Fallback field, used only when the primary count is missing or null.
-    likes = node.get("edge_media_preview_like", {}).get("count")
-    if likes is not None:
-        return max(likes, 0)
-    return 0
-def _convert_node_to_post(node: dict, hashtag: str) -> dict:
-    """Convert an EnsembleData node into the standard post dict.
-    Args:
-        node: Raw post node from the EnsembleData response.
-        hashtag: Hashtag that was searched; stored as "search_term".
-    Returns:
-        Post dict with caption, engagement counts, ISO timestamp, post URL,
-        hashtags parsed from the caption and "_location_*" fields.
-    """
-    caption = _extract_caption(node)
-    hashtags = HASHTAG_RE.findall(caption)
-    # "location" may be missing or null; normalize to an empty dict.
-    location = node.get("location") or {}
-    location_name = location.get("name", "")
-    ts = node.get("taken_at_timestamp")
-    timestamp_iso = ""
-    if ts:
-        try:
-            # Interpreted as a Unix timestamp and rendered as a UTC ISO-8601 string.
-            timestamp_iso = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
-        except (ValueError, OSError):
-            # Out-of-range timestamp: leave the field empty.
-            pass
-    shortcode = node.get("shortcode", "")
-    return {
-        "search_term": hashtag,
-        "search_type": "hashtag",
-        "location_name": location_name,
-        "likes_count": _extract_likes(node),
-        "comments_count": node.get("edge_media_to_comment", {}).get("count", 0) or 0,
-        "caption": caption,
-        "timestamp": timestamp_iso,
-        "url": f"https://www.instagram.com/p/{shortcode}/" if shortcode else "",
-        "hashtags": hashtags,
-        "_location_lat": location.get("lat"),
-        "_location_lng": location.get("lng"),
-        "_location_address": location.get("address", ""),
-        "_location_pk": location.get("pk"),
-    }
 # ══════════════════════════════════════════════
-# InstagramCollector — Dual Backend
 # ══════════════════════════════════════════════
 class InstagramCollector:
-    """Instagram 수집기 (v4.1 — Dual Backend).
-    INSTAGRAM_BACKEND 환경변수에 따라 Apify 또는 EnsembleData를 사용합니다.
     """
     def __init__(self, supabase_client, spot_matcher=None):
-        """Store collaborators and set up the client for INSTAGRAM_BACKEND.
-        "apify" creates an ApifyClient from APIFY_API_TOKEN; "ed" requires
-        ENSEMBLEDATA_TOKEN and creates a BudgetTracker plus an httpx client.
-        Raises:
-            KeyError: backend is "apify" and APIFY_API_TOKEN is unset.
-            ValueError: ENSEMBLEDATA_TOKEN is empty, ED_DAILY_UNIT_BUDGET is not
-                an integer, or the backend name is unknown.
-        """
         self.supabase = supabase_client
         self.spot_matcher = spot_matcher
-        self.extractor = PlaceNameExtractor(supabase_client)
-        if INSTAGRAM_BACKEND == "apify":
-            self.apify = ApifyClient(os.environ["APIFY_API_TOKEN"])
-        elif INSTAGRAM_BACKEND == "ed":
-            self.token = os.environ.get("ENSEMBLEDATA_TOKEN", "")
-            if not self.token:
-                raise ValueError("ENSEMBLEDATA_TOKEN 환경변수가 설정되지 않았습니다")
-            # Daily unit budget defaults to 50 when the variable is unset.
-            daily_limit = int(os.environ.get("ED_DAILY_UNIT_BUDGET", "50"))
-            self.budget = BudgetTracker(daily_limit=daily_limit)
-            self.http = httpx.Client(timeout=API_TIMEOUT_SECS)
-        else:
-            raise ValueError(
-                f"Unknown INSTAGRAM_BACKEND: {INSTAGRAM_BACKEND!r} "
-                "(expected 'apify' or 'ed')"
-            )
-    def _get_source_name(self) -> str:
-        """Value for the DB source column: instagram_apify or instagram_ed."""
-        return f"instagram_{INSTAGRAM_BACKEND}"
     # ==================================================================
     # Main Entry Point
     # ==================================================================
     def run(self) -> dict:
-        """Instagram collection pipeline — dispatches on the configured backend.
-        Raises:
-            ValueError: INSTAGRAM_BACKEND is neither "apify" nor "ed".
-        """
-        if INSTAGRAM_BACKEND == "apify":
-            return self._run_apify()
-        elif INSTAGRAM_BACKEND == "ed":
-            return self._run_ensembledata()
-        else:
-            raise ValueError(f"Unknown INSTAGRAM_BACKEND: {INSTAGRAM_BACKEND!r}")
-    # ==================================================================
-    # Apify Backend (v3.6)
-    # ==================================================================
-    ACTOR_MEMORY_MB = 1024
-    ACTOR_TIMEOUT_SECS = 120
-    def _run_apify(self) -> dict:
-        """Collection pipeline for the Apify backend.
-        [1] Build the hashtag list dynamically (general + sub-region names + proper nouns)
-        [2] Collect posts via hashtag search
-        [3] Collect extra posts for major places via directUrls
-        [4] 3-stage matching + period filter + threshold filter + weighted aggregation
-        [5] Save to the DB
-        Returns a summary dict of counts for this run.
         """
-        logger.info("=== Instagram 수집 시작 (v3.6 — Apify) ===")
-        # [1] Build hashtags dynamically
-        hashtags = build_all_hashtags(self.supabase)
-        # [2] Hashtag search
-        search_posts = self._collect_posts_apify(hashtags)
-        # [3] directUrls (location-ID cache → spot_id mapping → collection)
-        location_map = load_or_discover_locations(self.apify, self.supabase)
-        direct_urls = build_direct_urls_with_spot_ids(location_map, self.spot_matcher)
-        direct_posts = self._collect_direct_location_posts(direct_urls)
-        # [4] Combined aggregation
-        all_posts = search_posts + direct_posts
         if not all_posts:
             logger.warning("수집된 게시물 없음 — 종료")
-            return {"total_posts": 0, "spots_matched": 0, "saved": 0, "backend": "apify"}
-        spot_metrics = self.aggregate_spot_metrics(all_posts)
-        # [5] Save to the DB
         saved = self._save_to_db(spot_metrics)
         result = {
-            "backend": "apify",
-            "hashtag_count": len(hashtags),
             "total_posts": len(all_posts),
-            "search_posts": len(search_posts),
-            "direct_posts": len(direct_posts),
-            "spots_matched": len(spot_metrics),
             "saved": saved,
         }
-        logger.info("=== Instagram 수집 완료 (v3.6 — Apify): %s ===", result)
         return result
-    def _scrape_hashtag(self, hashtag: str, label: str) -> list[dict]:
-        """Scrape posts for a single hashtag page; retry once when nothing came back."""
-        posts = self._execute_hashtag_actor(hashtag, label)
-        # An empty result covers both a failed run and a genuinely empty hashtag.
-        if len(posts) == 0:
-            logger.info("[%s] 결과 0건 — 1회 재시도", label)
-            posts = self._execute_hashtag_actor(hashtag, f"{label} retry")
-        return posts
-    def _execute_hashtag_actor(self, hashtag: str, label: str) -> list[dict]:
-        """Run the instagram-hashtag-scraper Actor once.
-        Returns the standard post dicts read from the run's default dataset, or
-        an empty list when the Actor call raises or ends with a non-success status.
-        """
-        run_input = {
-            "hashtags": [hashtag],
-            "resultsLimit": RESULTS_LIMIT_PER_HASHTAG,
-            "proxy": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"],
-            },
-        }
         try:
-            run = self.apify.actor(ACTOR_ID).call(
-                run_input=run_input,
-                timeout_secs=self.ACTOR_TIMEOUT_SECS,
-                memory_mbytes=self.ACTOR_MEMORY_MB,
             )
         except Exception as e:
-            logger.warning("Apify Actor 실행 실패 [hashtag=%s]: %s", hashtag, e)
-            return []
-        # NOTE(review): if call() can return None, run.get() raises here — confirm
-        # against the apify-client version in use.
-        # A missing status is treated the same as SUCCEEDED.
-        if run.get("status") not in ("SUCCEEDED", None):
-            logger.warning(
-                "Apify Actor 비정상 종료 [hashtag=%s]: status=%s",
-                hashtag, run.get("status"),
-            )
-            return []
-        posts: list[dict] = []
-        dataset_id = run["defaultDatasetId"]
-        for item in self.apify.dataset(dataset_id).iterate_items():
-            likes = item.get("likesCount", 0)
-            # -1 is normalized to 0 (presumably a hidden like count — TODO confirm).
-            if likes == -1:
-                likes = 0
-            hashtags = item.get("hashtags") or []
-            # Fall back to parsing hashtags out of the caption.
-            if not hashtags:
-                caption = item.get("caption", "")
-                if caption:
-                    hashtags = HASHTAG_RE.findall(caption)
-            posts.append({
-                "search_term": hashtag,
-                "search_type": "hashtag",
-                "location_name": item.get("locationName", ""),
-                "likes_count": likes,
-                "comments_count": item.get("commentsCount", 0),
-                "caption": item.get("caption", ""),
-                "timestamp": item.get("timestamp", ""),
-                "url": item.get("url", ""),
-                "hashtags": hashtags,
-            })
-        logger.info("[%s] #%s → %d건", label, hashtag, len(posts))
-        return posts
-    def _collect_posts_apify(self, hashtags: list[str]) -> list[dict]:
-        """Collect posts for a list of hashtags (Apify), de-duplicated by URL."""
-        logger.info(
-            "Apify Actor 실행 시작: %d개 해시태그 (resultsLimit=%d)",
-            len(hashtags), RESULTS_LIMIT_PER_HASHTAG,
-        )
-        all_posts: list[dict] = []
-        for i, tag in enumerate(hashtags, 1):
-            # Progress label "i/total", used only in log messages.
-            label = f"{i}/{len(hashtags)}"
-            posts = self._scrape_hashtag(tag, label)
-            all_posts.extend(posts)
-        unique_posts = _dedup_posts_by_url(all_posts)
-        logger.info("게시물 수집 완료: %d건 (%d개 해시태그)", len(unique_posts), len(hashtags))
-        return unique_posts
-    def _collect_direct_location_posts(self, direct_urls: dict[str, str]) -> list[dict]:
-        """Collect posts from Instagram location pages.
-        The spot_id of each location page is already known, so every post gets
-        it attached as _direct_spot_id.
-        Args:
-            direct_urls: Mapping of location-page URL to spot_id.
-        Returns:
-            Post dicts with search_type "direct"; URLs whose Actor run fails are skipped.
-        """
-        if not direct_urls:
-            logger.info("directUrls 없음 — 방향 B 스킵")
-            return []
-        logger.info("directUrls 수집 시작: %d개 위치 페이지", len(direct_urls))
-        all_posts: list[dict] = []
-        # One Actor run per URL, so a single failure only loses that location.
-        for url, spot_id in direct_urls.items():
-            run_input = {
-                "directUrls": [url],
-                "resultsLimit": RESULTS_LIMIT_PER_HASHTAG,
-                "proxy": {
-                    "useApifyProxy": True,
-                    "apifyProxyGroups": ["RESIDENTIAL"],
-                },
-            }
             try:
-                run = self.apify.actor(ACTOR_ID).call(
-                    run_input=run_input,
-                    timeout_secs=self.ACTOR_TIMEOUT_SECS,
-                    memory_mbytes=self.ACTOR_MEMORY_MB,
-                )
             except Exception as e:
-                logger.warning("directUrls Actor 실행 실패 [%s]: %s", url, e)
-                continue
-            # NOTE(review): if call() can return None, run.get() raises here — confirm.
-            if run.get("status") not in ("SUCCEEDED", None):
-                logger.warning("directUrls Actor 비정상 종료 [%s]: status=%s", url, run.get("status"))
-                continue
-            dataset_id = run["defaultDatasetId"]
-            count = 0
-            for item in self.apify.dataset(dataset_id).iterate_items():
-                likes = item.get("likesCount", 0)
-                # -1 is normalized to 0 (presumably a hidden like count — TODO confirm).
-                if likes == -1:
-                    likes = 0
-                all_posts.append({
-                    "search_term": "__direct__",
-                    "search_type": "direct",
-                    "location_name": item.get("locationName", ""),
-                    "likes_count": likes,
-                    "comments_count": item.get("commentsCount", 0),
-                    "caption": item.get("caption", ""),
-                    "timestamp": item.get("timestamp", ""),
-                    "url": item.get("url", ""),
-                    "hashtags": item.get("hashtags") or [],
-                    "_direct_spot_id": spot_id,
-                })
-                count += 1
-            logger.info("directUrls [%s] → %d건 (spot_id=%s)", url, count, spot_id)
-        logger.info("directUrls 수집 완료: %d건", len(all_posts))
-        return all_posts
     # ==================================================================
-    # EnsembleData Backend (v4.0)
     # ==================================================================
-    def _run_ensembledata(self) -> dict:
-        """Collection pipeline for the EnsembleData backend.
-        [1] Build the hashtag list dynamically
-        [2] Select hashtags that fit the daily budget
-        [3] Collect posts through the EnsembleData API
-        [4] 3-stage matching + weighted aggregation
-        [5] Save to the DB
-        Returns a summary dict of counts and budget usage for this run.
         """
-        logger.info("=== Instagram 수집 시작 (v4.0 — EnsembleData) ===")
         logger.info(
-            "일별 예산: %d/%d유닛 (잔여: %d)",
-            self.budget.used_today, self.budget.daily_limit, self.budget.remaining,
         )
-        # [1] Build hashtags dynamically
-        all_hashtags = build_all_hashtags(self.supabase)
-        # [2] Pick today's hashtags to fit the budget
-        hashtags = get_daily_hashtags(all_hashtags, self.budget.daily_limit)
-        if not hashtags:
-            logger.info("오늘 수집할 해시태그 없음 — 종료")
-            return {
-                "total_posts": 0, "spots_matched": 0, "saved": 0,
-                "backend": "ed", "reason": "no_hashtags_today",
-            }
-        # [3] Collect posts
-        posts = self._collect_posts_ed(hashtags)
-        if not posts:
-            logger.warning("수집된 게시물 없음 — 종료")
-            return {"total_posts": 0, "spots_matched": 0, "saved": 0, "backend": "ed"}
-        # [4] Aggregate
-        spot_metrics = self.aggregate_spot_metrics(posts)
-        # [5] Save to the DB
-        saved = self._save_to_db(spot_metrics)
-        result = {
-            "backend": "ed",
-            "hashtag_count": len(hashtags),
-            "hashtags_collected": hashtags,
-            "total_posts": len(posts),
-            "spots_matched": len(spot_metrics),
-            "saved": saved,
-            "budget_used": self.budget.used_today,
-            "budget_remaining": self.budget.remaining,
         }
-        logger.info("=== Instagram 수집 완료 (v4.0 — EnsembleData): %s ===", result)
-        return result
-    def _fetch_hashtag_posts(self, hashtag: str, label: str) -> list[dict]:
-        """Fetch the posts of one hashtag through the EnsembleData API.
-        Returns top_posts and recent_posts combined, or an empty list when the
-        budget is too low, the request fails or the status is not 200.
-        """
-        if not self.budget.can_afford(estimated_posts=30):
-            logger.warning("[%s] 유닛 예산 부족 (잔여: %d) — 스킵", label, self.budget.remaining)
-            return []
         try:
-            resp = self.http.get(
-                f"{ENSEMBLEDATA_BASE_URL}/instagram/hashtag/posts",
-                params={"name": hashtag, "token": self.token},
             )
-        except httpx.HTTPError as e:
-            logger.warning("[%s] API 호출 실패 (#%s): %s", label, hashtag, e)
-            return []
-        # 495 is handled as "daily unit limit exceeded": record all remaining
-        # units as used so later calls in this run are skipped by the budget check.
-        if resp.status_code == 495:
-            logger.error("[%s] 일일 유닛 한도 초과 — 수집 중단", label)
-            self.budget.record(self.budget.remaining)
             return []
-        if resp.status_code != 200:
-            logger.warning("[%s] API 응답 오류 (#%s): status=%d", label, hashtag, resp.status_code)
             return []
-        data = resp.json().get("data", {})
         posts: list[dict] = []
-        top_nodes = data.get("top_posts", [])
-        recent_nodes = data.get("recent_posts", [])
-        for item in top_nodes:
-            # Items may wrap the post under "node" or be the node itself.
-            node = item.get("node", item)
-            posts.append(_convert_node_to_post(node, hashtag))
-        for item in recent_nodes:
-            node = item.get("node", item)
-            posts.append(_convert_node_to_post(node, hashtag))
-        # Units are recorded as one per returned post.
-        self.budget.record(len(posts))
-        logger.info(
-            "[%s] #%s → %d건 (top %d + recent %d, 해시태그 전체 %s건)",
-            label, hashtag, len(posts), len(top_nodes), len(recent_nodes),
-            f"{data.get('count', 0):,}" if isinstance(data.get("count"), int) else "?",
-        )
         return posts
-    def _collect_posts_ed(self, hashtags: list[str]) -> list[dict]:
-        """Collect posts for a list of hashtags (EnsembleData), de-duplicated by URL.
-        Stops early once the remaining budget cannot cover another hashtag.
-        """
-        if not hashtags:
-            logger.info("수집할 해시태그 없음 — 스킵")
-            return []
-        logger.info(
-            "EnsembleData 수집 시작: %d개 해시태그 (일일 예산: %d/%d)",
-            len(hashtags), self.budget.remaining, self.budget.daily_limit,
-        )
-        all_posts: list[dict] = []
-        for i, tag in enumerate(hashtags, 1):
-            if not self.budget.can_afford(estimated_posts=30):
-                # i is 1-based, so len - i + 1 counts this hashtag and all later ones.
-                logger.warning("유닛 예산 소진 — 나머지 %d개 해시태그 스킵", len(hashtags) - i + 1)
-                break
-            label = f"{i}/{len(hashtags)}"
-            posts = self._fetch_hashtag_posts(tag, label)
-            all_posts.extend(posts)
-        unique_posts = _dedup_posts_by_url(all_posts)
-        logger.info("게시물 수집 완료: %d건 (%d개 해시태그)", len(unique_posts), len(hashtags))
-        return unique_posts
     # ==================================================================
-    # Shared: 3단계 장소 매칭 (v3.6 — Stage 0 directUrls 포함)
     # ==================================================================
     def _match_post_to_spot(self, post: dict) -> tuple[str | None, str]:
-        """게시물 1건에 대해 spot_id를 매칭한다.
         매칭 우선순위:
-          0. directUrls 게시물 (_direct_spot_id 이미 확정) — Apify only
           1. locationName 태그 → 접두사 제거 → SpotMatcher
           2. hashtags 배열 → SpotMatcher.match_hashtag() (방향 제한)
-          3. caption → PlaceNameExtractor → SpotMatcher
-        """
-        # Stage 0: directUrls (Apify only — ED posts don't have this field)
-        direct_sid = post.get("_direct_spot_id")
-        if direct_sid:
-            return direct_sid, "direct"
         # Stage 1: locationName → 접두사 제거 → SpotMatcher
         loc = post.get("location_name", "")
         if loc and loc not in LOCATION_BLACKLIST and self.spot_matcher:
@@ -900,42 +409,28 @@ class InstagramCollector:
                 if sid:
                     return sid, "hashtag"
-        # Stage 3: caption → PlaceNameExtractor → SpotMatcher
-        caption = post.get("caption", "")
-        if caption and len(caption) >= 5:
-            places = self.extractor.extract(caption)
-            for place in places:
-                if self.spot_matcher:
-                    sid = self.spot_matcher.match(place["name"])
-                    if sid:
-                        return sid, "caption"
-                elif place.get("spot_id"):
-                    return place["spot_id"], "caption"
         return None, "unmatched"
     # ==================================================================
-    # Shared: 가중 집계 (v3.6)
     # ==================================================================
-    def aggregate_spot_metrics(self, posts: list[dict]) -> dict[str, dict]:
-        """수집된 게시물을 spot_id 기준으로 집계한다.
-        - 기간 필터: 최근 TREND_WINDOW_DAYS(10일) 이내만 처리
-        - 최소 engagement 임계값: MIN_ENGAGEMENT(10) 미만 제외
-        - engagement cap: ENGAGEMENT_CAP(1000) 초과 시 절삭
-        - weighted_score: sum(min(engagement, cap))
         Returns:
-            {spot_id: {post_count, total_likes, total_comments,
-                       avg_engagement, weighted_score, match_methods, hashtags}}
         """
         cutoff = datetime.now(timezone.utc) - timedelta(days=TREND_WINDOW_DAYS)
         spot_metrics: dict[str, dict] = {}
-        match_stats = {
-            "direct": 0, "location_tag": 0,
-            "hashtag": 0, "caption": 0, "unmatched": 0,
             "filtered_old": 0, "filtered_low_engagement": 0,
         }
@@ -951,14 +446,7 @@ class InstagramCollector:
                 except (ValueError, TypeError):
                     pass  # 파싱 실패 시 포함
-            # 3단계 매칭
-            spot_id, method = self._match_post_to_spot(post)
-            match_stats[method] += 1
-            if not spot_id:
-                continue
-            # 최소 engagement 임계값
             likes = post.get("likes_count", 0) or 0
             comments = post.get("comments_count", 0) or 0
             engagement = likes + comments
@@ -967,44 +455,253 @@ class InstagramCollector:
                 match_stats["filtered_low_engagement"] += 1
                 continue
             # engagement cap
             capped_engagement = min(engagement, ENGAGEMENT_CAP)
             # 집계
-            if spot_id not in spot_metrics:
-                spot_metrics[spot_id] = {
-                    "post_count": 0,
-                    "total_likes": 0,
-                    "total_comments": 0,
-                    "weighted_score": 0,
-                    "match_methods": set(),
-                    "hashtags": set(),
-                }
-            m = spot_metrics[spot_id]
-            m["post_count"] += 1
-            m["total_likes"] += likes
-            m["total_comments"] += comments
-            m["weighted_score"] += capped_engagement
-            m["match_methods"].add(method)
-            term = post.get("search_term", "")
-            if term and term != "__direct__":
-                m["hashtags"].add(term)
-        # avg_engagement 계산 + set → list 변환
-        for metrics in spot_metrics.values():
-            count = max(metrics["post_count"], 1)
-            metrics["avg_engagement"] = int(
-                round((metrics["total_likes"] + metrics["total_comments"]) / count)
             )
-            metrics["match_methods"] = sorted(metrics["match_methods"])
-            metrics["hashtags"] = sorted(metrics["hashtags"])
         if match_stats["filtered_old"] > 0:
             logger.info(
-                "기간 필터: %d건 제외 (최근 %d일 외), %d건 처리",
                 match_stats["filtered_old"], TREND_WINDOW_DAYS,
-                len(posts) - match_stats["filtered_old"],
             )
         if match_stats["filtered_low_engagement"] > 0:
             logger.info(
@@ -1012,29 +709,30 @@ class InstagramCollector:
                 match_stats["filtered_low_engagement"], MIN_ENGAGEMENT,
             )
         logger.info(
-            "스팟 매칭 완료: %d개 스팟 "
-            "(위치태그 %d, 해시태그 %d, 캡션 %d, direct %d, 미식별 %d)",
-            len(spot_metrics),
             match_stats["location_tag"],
             match_stats["hashtag"],
-            match_stats["caption"],
-            match_stats["direct"],
             match_stats["unmatched"],
         )
-        return spot_metrics
     # ==================================================================
-    # Shared: DB 저장
     # ==================================================================
     def _save_to_db(self, spot_metrics: dict[str, dict]) -> int:
         """집계된 메트릭을 spot_trends 테이블에 저장한다.
-        source = instagram_apify 또는 instagram_ed (INSTAGRAM_BACKEND 기반).
         저장 메트릭: post_count, avg_engagement, weighted_score
         """
         period_start, period_end = get_week_period()
-        source = self._get_source_name()
         saved = 0
         for spot_id, metrics in spot_metrics.items():
@@ -1043,7 +741,7 @@ class InstagramCollector:
                 "total_comments": metrics["total_comments"],
                 "match_methods": metrics["match_methods"],
                 "hashtags": metrics["hashtags"],
-                "backend": INSTAGRAM_BACKEND,
             }
             # post_count
@@ -1063,7 +761,7 @@ class InstagramCollector:
                 })
                 saved += 1
             except Exception as e:
-                logger.warning("spot_trends 저장 실패 (%s post_count, %s): %s", source, spot_id, e)
             # avg_engagement
             if metrics["avg_engagement"] > 0:
@@ -1078,7 +776,7 @@ class InstagramCollector:
                         "raw_data": {"match_methods": metrics["match_methods"]},
                     })
                 except Exception as e:
-                    logger.warning("spot_trends 저장 실패 (%s avg_engagement, %s): %s", source, spot_id, e)
             # weighted_score
             if metrics["weighted_score"] > 0:
@@ -1093,17 +791,164 @@ class InstagramCollector:
                         "raw_data": {},
                     })
                 except Exception as e:
-                    logger.warning("spot_trends 저장 실패 (%s weighted_score, %s): %s", source, spot_id, e)
         logger.info("Instagram DB 저장: %d건 (%d 스팟, source=%s)", saved, len(spot_metrics), source)
         return saved
 # ══════════════════════════════════════════════
 # Shared Utility
 # ══════════════════════════════════════════════
 def _dedup_posts_by_url(posts: list[dict]) -> list[dict]:
     """URL 기반 중복 제거."""
     seen_urls: set[str] = set()

 """
+Instagram Collector — Influencer Monitoring (v5.1 Multimodal)
+큐레이팅된 제주 여행 인플루언서 계정의 최근 게시물을 수집합니다.
+파이프라인:
+  1. influencer_accounts 테이블에서 활성 계정 목록 로드 (DB 실패 시 기본값 폴백)
+  2. Apify instagram-profile-scraper로 계정별 최근 게시물 수집
+  3. 게시물 정규화 + 중복 제거
+  4. 2-pass 하이브리드 장소 매칭:
+     Pass 1 (규칙 기반): 위치태그 → 해시태그 매칭 (고신뢰 신호만)
+     Pass 2 (AI 멀티모달): Gemini 2.0 Flash로 콘텐츠 분석
+       - 이미지 게시글: 이미지 + 캡션 → 간판/경관/텍스트 인식
+       - 릴스(영상): 영상 + 캡션 → 나레이션/자막/간판 인식
+       - 미디어 없음: 캡션 텍스트 분석 (폴백)
   5. 가중 집계: weighted_score = sum(min(engagement, cap))
+  6. spot_trends 테이블에 저장 (source = instagram_influencer)
+  7. 계정별 last_scraped_at 업데이트
 """
 from __future__ import annotations
 import json
+import logging
 import os
 import re
+import tempfile
+import time
+from datetime import datetime, timedelta, timezone
+from apify_client import ApifyClient
 from trend_engine.utils import get_week_period, safe_upsert_spot_trend
 logger = logging.getLogger(__name__)
 # ══════════════════════════════════════════════
+# 수집 설정
 # ══════════════════════════════════════════════
+# Period filter — the last 30 days (a 7-day window matched too few spots)
+TREND_WINDOW_DAYS = 30
+# Weighted-aggregation constants (tuned for influencer content)
+MIN_ENGAGEMENT = 50       # minimum engagement threshold
+ENGAGEMENT_CAP = 5000     # upper bound on a single post's engagement
+# Maximum number of posts per account
+RESULTS_LIMIT_PER_ACCOUNT = 20
+# AI multimodal analysis settings
+MAX_IMAGE_BYTES = 5 * 1024 * 1024     # images: at most 5 MB
+MAX_VIDEO_BYTES = 50 * 1024 * 1024    # videos: at most 50 MB
+MEDIA_DOWNLOAD_TIMEOUT = 15           # media download timeout (seconds)
 # ──────────────────────────────────────────────
+# 위치 태그 정리
 # ──────────────────────────────────────────────
 # 국가/공항 단위만 BLACKLIST
 LOCATION_BLACKLIST = frozenset({
     "South Korea", "Korea", "대한민국", "한국",
 HASHTAG_RE = re.compile(r"#([\w가-힣]{2,30})")
+# ──────────────────────────────────────────────
+# Default influencer accounts (fallback when the DB lookup fails)
+# ──────────────────────────────────────────────
+DEFAULT_INFLUENCER_ACCOUNTS = [
+    {"username": "_sohee.e", "category": "travel", "priority": 1},
+    {"username": "foto_ycy", "category": "photo", "priority": 2},
+    {"username": "bbo_muksta", "category": "food", "priority": 3},
+    {"username": "yoontheroad", "category": "photo", "priority": 4},
+    {"username": "siniple", "category": "photo", "priority": 5},
+    {"username": "bigg_jun", "category": "photo", "priority": 6},
+    {"username": "aria.leeee", "category": "travel", "priority": 7},
+    {"username": "gamttanam", "category": "lifestyle", "priority": 8},
+    {"username": "by_malgm", "category": "photo", "priority": 9},
+    {"username": "colorny", "category": "travel", "priority": 10},
+    {"username": "mongle_jyh", "category": "photo", "priority": 11},
+    {"username": "ryuppeum", "category": "travel", "priority": 12},
+    {"username": "thesoulofseoulblog", "category": "travel", "priority": 13},
+    {"username": "hey_jejuisland", "category": "lifestyle", "priority": 14},
+    {"username": "yooonjeju", "category": "travel", "priority": 15},
 ]
 # ══════════════════════════════════════════════
+# InstagramCollector — Influencer Monitoring v5.1
 # ══════════════════════════════════════════════
 class InstagramCollector:
+    """Instagram 수집기 (v5.1 — Influencer Monitoring + Multimodal AI).
+    큐레이팅된 인플루언서 계정의 최근 게시물을 수집하고
+    2-pass 하이브리드 매칭(위치태그/해시태그 + Gemini 멀티모달) 후
+    spot_trends에 저장합니다.
     """
+    ACTOR_ID = "apify/instagram-profile-scraper"
+    ACTOR_MEMORY_MB = 1024
+    ACTOR_TIMEOUT_SECS = 180
     def __init__(self, supabase_client, spot_matcher=None):
+        """Store the Supabase client and optional SpotMatcher; create the Apify client.
+        Raises:
+            KeyError: APIFY_API_TOKEN is not set in the environment.
+        """
         self.supabase = supabase_client
         self.spot_matcher = spot_matcher
+        self.apify = ApifyClient(os.environ["APIFY_API_TOKEN"])
     # ==================================================================
     # Main Entry Point
     # ==================================================================
     def run(self) -> dict:
+        """Instagram influencer collection pipeline v5.1.
+        [1] Load the influencer account list (DB → fallback)
+        [2] Collect each account's recent posts with the Apify profile-scraper
+        [3] Pass 1: location-tag + hashtag matching
+        [3b] Pass 2: unmatched → Gemini multimodal (image/video + caption) → SpotMatcher
+        [4] Save to the DB
+        [5] Update last_scraped_at
+        Returns a summary dict of counts for this run.
         """
+        logger.info("=== Instagram 수집 시작 (v5.1 — Multimodal AI) ===")
+        # [1] Influencer account list
+        accounts = self._load_influencer_accounts()
+        # NOTE(review): the early-return dicts use "spots_matched" while the final
+        # result uses "total_spots_matched" — confirm which key callers read.
+        if not accounts:
+            logger.warning("활성 인플루언서 계정 없음 — 종료")
+            return {"total_posts": 0, "spots_matched": 0, "saved": 0, "accounts": 0}
+        logger.info("인플루언서 계정 %d개 로드", len(accounts))
+        # [2] Collect posts
+        all_posts = self._collect_from_accounts(accounts)
         if not all_posts:
             logger.warning("수집된 게시물 없음 — 종료")
+            return {
+                "total_posts": 0, "spots_matched": 0, "saved": 0,
+                "accounts": len(accounts),
+            }
+        # [3] Pass 1: rule-based matching + aggregation
+        spot_metrics, unmatched_posts, match_stats = self._aggregate_with_unmatched(all_posts)
+        # Captured before Pass 2, which receives spot_metrics and may add to it.
+        pass1_matched = len(spot_metrics)
+        # [3b] Pass 2: AI multimodal analysis (unmatched posts → image/video + caption)
+        ai_matched_count = 0
+        if unmatched_posts and os.environ.get("GEMINI_API_KEY"):
+            ai_matched_count = self._ai_analyze_content(unmatched_posts, spot_metrics, match_stats)
+        elif unmatched_posts:
+            logger.info("GEMINI_API_KEY 미설정 — AI 멀티모달 분석 스킵 (%d건 미매칭)", len(unmatched_posts))
+        # Log the final statistics
+        self._log_match_stats(match_stats, len(all_posts), len(spot_metrics))
+        # [4] Save to the DB
         saved = self._save_to_db(spot_metrics)
+        # [5] Update last_scraped_at for the distinct accounts that yielded posts
+        scraped_usernames = list({
+            p.get("_source_account", "")
+            for p in all_posts if p.get("_source_account")
+        })
+        self._update_last_scraped(scraped_usernames)
         result = {
+            "accounts": len(accounts),
+            "accounts_scraped": len(scraped_usernames),
             "total_posts": len(all_posts),
+            "pass1_spots_matched": pass1_matched,
+            "ai_matched": ai_matched_count,
+            "total_spots_matched": len(spot_metrics),
             "saved": saved,
         }
+        logger.info("=== Instagram 수집 완료 (v5.1 — Multimodal AI): %s ===", result)
         return result
+    # ==================================================================
+    # 인플루언서 계정 관리
+    # ==================================================================
+    def _load_influencer_accounts(self) -> list[dict]:
+        """Load the active accounts from the influencer_accounts table.
+        Falls back to DEFAULT_INFLUENCER_ACCOUNTS when the query raises or
+        returns no rows.
+        """
         try:
+            resp = (
+                self.supabase.table("influencer_accounts")
+                .select("username, category, priority")
+                .eq("platform", "instagram")
+                .eq("is_active", True)
+                .order("priority")
+                .execute()
             )
+            accounts = resp.data or []
+            if accounts:
+                logger.info("DB에서 인플루언서 계정 %d개 로드", len(accounts))
+                return accounts
         except Exception as e:
+            logger.warning("influencer_accounts 조회 실패 (폴백 사용): %s", e)
+        logger.info("기본 인플루언서 계정 %d개 사용 (폴백)", len(DEFAULT_INFLUENCER_ACCOUNTS))
+        # Return a copy of the list so callers cannot mutate the module-level default.
+        return list(DEFAULT_INFLUENCER_ACCOUNTS)
+    def _update_last_scraped(self, usernames: list[str]) -> None:
+        """Update last_scraped_at for the accounts that were collected.
+        Best effort: a failed update is logged at debug level and skipped.
+        """
+        if not usernames:
+            return
+        # One shared UTC timestamp for every account in this run.
+        now = datetime.now(timezone.utc).isoformat()
+        for username in usernames:
             try:
+                self.supabase.table("influencer_accounts").update({
+                    "last_scraped_at": now,
+                }).eq("platform", "instagram").eq("username", username).execute()
             except Exception as e:
+                logger.debug("last_scraped_at 업데이트 실패 (%s): %s", username, e)
+        # NOTE(review): this logs the number of attempted accounts, including
+        # those whose update failed above.
+        logger.info("last_scraped_at 업데이트: %d개 계정", len(usernames))
     # ==================================================================
+    # 게시물 수집 (Apify Profile Scraper)
     # ==================================================================
+    def _collect_from_accounts(self, accounts: list[dict]) -> list[dict]:
+        """Collect posts from the influencer accounts.
+        Uses the Apify instagram-profile-scraper Actor to fetch each account's
+        recent posts, running the accounts in batches of 5, and returns the
+        posts de-duplicated by URL.
         """
+        usernames = [a["username"] for a in accounts]
         logger.info(
+            "Apify Actor 실행: %d개 계정, 계정당 최대 %d건",
+            len(usernames), RESULTS_LIMIT_PER_ACCOUNT,
         )
+        all_posts: list[dict] = []
+        # Run accounts in batches of 5 (limits the impact of one failed Actor call)
+        batch_size = 5
+        for i in range(0, len(usernames), batch_size):
+            batch = usernames[i:i + batch_size]
+            # "batch k/total" where total is the ceiling of len / batch_size.
+            label = f"batch {i // batch_size + 1}/{(len(usernames) + batch_size - 1) // batch_size}"
+            posts = self._scrape_profiles(batch, label)
+            all_posts.extend(posts)
+        unique_posts = _dedup_posts_by_url(all_posts)
+        logger.info("게시물 수집 완료: %d건 (%d개 계정)", len(unique_posts), len(usernames))
+        return unique_posts
+    def _scrape_profiles(self, usernames: list[str], label: str) -> list[dict]:
+        """Apify instagram-profile-scraper Actor로 프로필 게시물 수집."""
+        run_input = {
+            "usernames": usernames,
+            "resultsLimit": RESULTS_LIMIT_PER_ACCOUNT,
+            "proxy": {
+                "useApifyProxy": True,
+                "apifyProxyGroups": ["RESIDENTIAL"],
+            },
         }
         try:
+            run = self.apify.actor(self.ACTOR_ID).call(
+                run_input=run_input,
+                timeout_secs=self.ACTOR_TIMEOUT_SECS,
+                memory_mbytes=self.ACTOR_MEMORY_MB,
             )
+        except Exception as e:
+            logger.warning("Apify Actor 실행 실패 [%s]: %s", label, e)
             return []
+        if run.get("status") not in ("SUCCEEDED", None):
+            logger.warning(
+                "Apify Actor 비정상 종료 [%s]: status=%s",
+                label, run.get("status"),
+            )
             return []
         posts: list[dict] = []
+        dataset_id = run["defaultDatasetId"]
+        for profile in self.apify.dataset(dataset_id).iterate_items():
+            # profile-scraper는 프로필 1개 = 아이템 1개, 게시물은 latestPosts 안
+            profile_username = profile.get("username", "")
+            latest_posts = profile.get("latestPosts", [])
+            if not latest_posts:
+                logger.debug("게시물 없음: @%s", profile_username)
+                continue
+            for item in latest_posts:
+                post = self._normalize_post(item)
+                if post:
+                    posts.append(post)
+        logger.info("[%s] %s → %d건", label, usernames, len(posts))
         return posts
+    @staticmethod
+    def _normalize_post(item: dict) -> dict | None:
+        """Apify profile-scraper 응답 → 표준 post dict로 변환."""
+        owner = item.get("ownerUsername", "") or ""
+        if not owner:
+            return None
+        likes = item.get("likesCount", 0)
+        if likes == -1:
+            likes = 0
+        caption = item.get("caption", "") or ""
+        hashtags = item.get("hashtags") or []
+        if not hashtags and caption:
+            hashtags = HASHTAG_RE.findall(caption)
+        # 콘텐츠 유형 및 미디어 URL
+        post_type = item.get("type", "Image")  # Image, Video, Sidecar
+        if post_type == "Video":
+            media_url = item.get("videoUrl", "") or item.get("displayUrl", "") or ""
+        else:
+            media_url = item.get("displayUrl", "") or ""
+        return {
+            "search_term": f"@{owner}",
+            "search_type": "profile",
+            "location_name": item.get("locationName", "") or "",
+            "likes_count": likes,
+            "comments_count": item.get("commentsCount", 0) or 0,
+            "caption": caption,
+            "timestamp": item.get("timestamp", ""),
+            "url": item.get("url", ""),
+            "hashtags": hashtags,
+            "media_url": media_url,
+            "media_type": post_type,
+            "_source_account": owner,
+        }
     # ==================================================================
+    # 3단계 장소 매칭 (v5.0 — 프로필 기반)
     # ==================================================================
     def _match_post_to_spot(self, post: dict) -> tuple[str | None, str]:
+        """게시물 1건에 대해 spot_id를 매칭한다 (고신뢰 신호만).
         매칭 우선순위:
           1. locationName 태그 → 접두사 제거 → SpotMatcher
           2. hashtags 배열 → SpotMatcher.match_hashtag() (방향 제한)
+        캡션/이미지/영상 기반 분석은 Pass 2 (AI 멀티모달)에서 처리.
+        """
         # Stage 1: locationName → 접두사 제거 → SpotMatcher
         loc = post.get("location_name", "")
         if loc and loc not in LOCATION_BLACKLIST and self.spot_matcher:
                 if sid:
                     return sid, "hashtag"
         return None, "unmatched"
     # ==================================================================
+    # Pass 1: 규칙 기반 매칭 + 집계 (미매칭 게시물 수집)
     # ==================================================================
+    def _aggregate_with_unmatched(
+        self, posts: list[dict],
+    ) -> tuple[dict[str, dict], list[dict], dict[str, int]]:
+        """수집된 게시물을 spot_id 기준으로 집계하고, 미매칭 게시물을 별도 반환한다.
         Returns:
+            (spot_metrics, unmatched_posts, match_stats)
         """
         cutoff = datetime.now(timezone.utc) - timedelta(days=TREND_WINDOW_DAYS)
         spot_metrics: dict[str, dict] = {}
+        unmatched_posts: list[dict] = []
+        match_stats: dict[str, int] = {
+            "location_tag": 0,
+            "hashtag": 0, "unmatched": 0,
+            "ai_matched": 0,
             "filtered_old": 0, "filtered_low_engagement": 0,
         }
                 except (ValueError, TypeError):
                     pass  # 파싱 실패 시 포함
+            # 최소 engagement 임계값 (매칭 전 필터 — 저engagement는 AI도 처리 불요)
             likes = post.get("likes_count", 0) or 0
             comments = post.get("comments_count", 0) or 0
             engagement = likes + comments
                 match_stats["filtered_low_engagement"] += 1
                 continue
+            # 3단계 매칭
+            spot_id, method = self._match_post_to_spot(post)
+            match_stats[method] += 1
+            if not spot_id:
+                # 미매칭 → Pass 2 대상
+                unmatched_posts.append(post)
+                continue
             # engagement cap
             capped_engagement = min(engagement, ENGAGEMENT_CAP)
             # 집계
+            _add_to_metrics(spot_metrics, spot_id, post, capped_engagement, method)
+        logger.info(
+            "Pass 1 완료: %d개 스팟 매칭, %d건 미매칭 → AI 대상",
+            len(spot_metrics), len(unmatched_posts),
+        )
+        return spot_metrics, unmatched_posts, match_stats
+    # ==================================================================
+    # Pass 2: AI 멀티모달 분석 (Gemini 2.5 Flash)
+    # ==================================================================
+    def _ai_analyze_content(
+        self,
+        unmatched_posts: list[dict],
+        spot_metrics: dict[str, dict],
+        match_stats: dict[str, int],
+    ) -> int:
+        """미매칭 게시물을 Gemini 멀티모달로 분석하여 장소를 추출한다.
+        이미지 게시글: 이미지 다운로드 → Gemini Vision + 캡션 분석
+        릴스(영상): 영상 다운로드 → Gemini File API + 캡션 분석
+        미디어 없음: 캡션 텍스트만 분석 (폴백)
+        Returns:
+            AI로 추가 매칭된 게시물 수
+        """
+        try:
+            from google import genai
+            from google.genai import types
+        except ImportError:
+            logger.warning("google-genai 미설치 — AI 멀티모달 분석 스킵")
+            return 0
+        api_key = os.environ.get("GEMINI_API_KEY")
+        if not api_key:
+            return 0
+        client = genai.Client(api_key=api_key)
+        image_count = sum(1 for p in unmatched_posts if p.get("media_type") != "Video")
+        video_count = sum(1 for p in unmatched_posts if p.get("media_type") == "Video")
+        logger.info(
+            "AI 멀티모달 분석 시작: %d건 (이미지 %d, 영상 %d)",
+            len(unmatched_posts), image_count, video_count,
+        )
+        matched_count = 0
+        analyzed = 0
+        for post in unmatched_posts:
+            media_url = post.get("media_url", "")
+            media_type = post.get("media_type", "Image")
+            caption = post.get("caption", "")[:500]
+            hashtags = ", ".join(post.get("hashtags", [])[:10])
+            places: list[str] = []
+            try:
+                if media_type == "Video" and media_url:
+                    places = self._ai_extract_from_video(
+                        client, types, media_url, caption, hashtags,
+                    )
+                elif media_url:
+                    places = self._ai_extract_from_image(
+                        client, types, media_url, caption, hashtags,
+                    )
+                elif caption and len(caption) >= 10:
+                    places = self._ai_extract_from_text(
+                        client, types, caption, hashtags,
+                    )
+                analyzed += 1
+            except Exception as e:
+                logger.debug("AI 분석 실패 (%s): %s", post.get("url", "")[:60], e)
+                continue
+            # 추출된 장소명 → SpotMatcher
+            if places:
+                logger.info(
+                    "AI 추출 장소: %s ← @%s (%s)",
+                    [p[:30] for p in places], post.get("_source_account", "?"), media_type,
+                )
+            else:
+                logger.debug(
+                    "AI 추출 장소 없음 ← @%s (%s)",
+                    post.get("_source_account", "?"), media_type,
+                )
+            for name in places:
+                if not name or len(name) < 2:
+                    continue
+                sid = self.spot_matcher.match(name) if self.spot_matcher else None
+                if sid:
+                    capped = min(
+                        (post.get("likes_count", 0) or 0)
+                        + (post.get("comments_count", 0) or 0),
+                        ENGAGEMENT_CAP,
+                    )
+                    _add_to_metrics(spot_metrics, sid, post, capped, "ai")
+                    match_stats["ai_matched"] += 1
+                    match_stats["unmatched"] = max(0, match_stats["unmatched"] - 1)
+                    matched_count += 1
+                    break  # 한 게시물에서 첫 매칭만
+        logger.info("AI 멀티모달 분석 완료: %d건 분석, %d건 매칭", analyzed, matched_count)
+        return matched_count
+    def _ai_extract_from_image(
+        self, client, types, media_url: str, caption: str, hashtags: str,
+    ) -> list[str]:
+        """이미지 게시물에서 Gemini Vision으로 장소를 추출한다."""
+        image_bytes = _download_media(media_url, MAX_IMAGE_BYTES)
+        prompt = _build_spot_prompt(
+            content_type="게시글 (이미지)",
+            caption=caption,
+            hashtags=hashtags,
+            media_instruction=(
+                "이미지에서 간판, 메뉴판, 특징적 경관을 확인하고 "
+                "캡션 내용도 함께 분석하여 장소를 식별하세요."
+            ),
+        )
+        contents: list = []
+        if image_bytes:
+            contents.append(
+                types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
+            )
+        contents.append(prompt)
+        response = client.models.generate_content(
+            model="gemini-2.5-flash",
+            contents=contents,
+            config=types.GenerateContentConfig(
+                temperature=0.1, max_output_tokens=200,
+                thinking_config=types.ThinkingConfig(thinking_budget=0),
+            ),
+        )
+        return _parse_ai_places(response.text)
+    def _ai_extract_from_video(
+        self, client, types, media_url: str, caption: str, hashtags: str,
+    ) -> list[str]:
+        """릴스(영상)에서 Gemini로 장소를 추출한다."""
+        video_bytes = _download_media(media_url, MAX_VIDEO_BYTES)
+        if not video_bytes:
+            # 다운로드 실패 → 캡션만 분석
+            if caption and len(caption) >= 10:
+                return self._ai_extract_from_text(client, types, caption, hashtags)
+            return []
+        temp_path = None
+        try:
+            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
+                f.write(video_bytes)
+                temp_path = f.name
+            # Gemini File API에 업로드
+            video_file = client.files.upload(file=temp_path)
+            # 처리 대기 (최대 60초)
+            wait_count = 0
+            while wait_count < 30:
+                state_name = getattr(video_file.state, "name", str(video_file.state))
+                if "PROCESSING" not in state_name:
+                    break
+                time.sleep(2)
+                video_file = client.files.get(name=video_file.name)
+                wait_count += 1
+            prompt = _build_spot_prompt(
+                content_type="릴스 (영상)",
+                caption=caption,
+                hashtags=hashtags,
+                media_instruction=(
+                    "영상 속 간판, 자막, 나레이션, 특징적 경관을 확인하고 "
+                    "캡션도 함께 분석하여 장소를 식별하세요."
+                ),
+            )
+            response = client.models.generate_content(
+                model="gemini-2.5-flash",
+                contents=[video_file, prompt],
+                config=types.GenerateContentConfig(
+                    temperature=0.1, max_output_tokens=200,
+                    thinking_config=types.ThinkingConfig(thinking_budget=0),
+                ),
             )
+            # 업로드 파일 정리
+            try:
+                client.files.delete(name=video_file.name)
+            except Exception:
+                pass
+            return _parse_ai_places(response.text)
+        finally:
+            if temp_path:
+                try:
+                    os.unlink(temp_path)
+                except OSError:
+                    pass
+    @staticmethod
+    def _ai_extract_from_text(client, types, caption: str, hashtags: str) -> list[str]:
+        """캡션 텍스트만으로 장소를 추출한다 (미디어 다운로드 실패 시 폴백)."""
+        prompt = _build_spot_prompt(
+            content_type="게시글",
+            caption=caption,
+            hashtags=hashtags,
+            media_instruction="캡션 텍스트에서 장소를 추출하세요.",
+        )
+        response = client.models.generate_content(
+            model="gemini-2.5-flash",
+            contents=prompt,
+            config=types.GenerateContentConfig(
+                temperature=0.1, max_output_tokens=200,
+                thinking_config=types.ThinkingConfig(thinking_budget=0),
+            ),
+        )
+        return _parse_ai_places(response.text)
+    # ==================================================================
+    # 매칭 통계 로깅
+    # ==================================================================
+    @staticmethod
+    def _log_match_stats(
+        match_stats: dict[str, int], total_posts: int, total_spots: int,
+    ) -> None:
+        """Pass 1 + Pass 2 통합 매칭 통계를 로깅한다."""
         if match_stats["filtered_old"] > 0:
             logger.info(
+                "기간 필터: %d건 제외 (최근 %d일 외)",
                 match_stats["filtered_old"], TREND_WINDOW_DAYS,
             )
         if match_stats["filtered_low_engagement"] > 0:
             logger.info(
                 match_stats["filtered_low_engagement"], MIN_ENGAGEMENT,
             )
         logger.info(
+            "최종 매칭 통계: %d개 스팟 — "
+            "위치태그 %d, 해시태그 %d, AI멀티모달 %d, 미식별 %d",
+            total_spots,
             match_stats["location_tag"],
             match_stats["hashtag"],
+            match_stats.get("ai_matched", 0),
             match_stats["unmatched"],
         )
     # ==================================================================
+    # DB 저장
     # ==================================================================
     def _save_to_db(self, spot_metrics: dict[str, dict]) -> int:
         """집계된 메트릭을 spot_trends 테이블에 저장한다.
+        source = instagram_influencer
         저장 메트릭: post_count, avg_engagement, weighted_score
         """
+        # set → sorted list 변환 + avg_engagement 계산
+        _finalize_metrics(spot_metrics)
         period_start, period_end = get_week_period()
+        source = "instagram_influencer"
         saved = 0
         for spot_id, metrics in spot_metrics.items():
                 "total_comments": metrics["total_comments"],
                 "match_methods": metrics["match_methods"],
                 "hashtags": metrics["hashtags"],
+                "source_accounts": metrics["source_accounts"],
             }
             # post_count
                 })
                 saved += 1
             except Exception as e:
+                logger.warning("spot_trends 저장 실패 (post_count, %s): %s", spot_id, e)
             # avg_engagement
             if metrics["avg_engagement"] > 0:
                         "raw_data": {"match_methods": metrics["match_methods"]},
                     })
                 except Exception as e:
+                    logger.warning("spot_trends 저장 실패 (avg_engagement, %s): %s", spot_id, e)
             # weighted_score
             if metrics["weighted_score"] > 0:
                         "raw_data": {},
                     })
                 except Exception as e:
+                    logger.warning("spot_trends 저장 실패 (weighted_score, %s): %s", spot_id, e)
         logger.info("Instagram DB 저장: %d건 (%d 스팟, source=%s)", saved, len(spot_metrics), source)
         return saved
+# ══════════════════════════════════════════════
+# AI 멀티모달 유틸리티
+# ══════════════════════════════════════════════
def _build_spot_prompt(
    content_type: str, caption: str, hashtags: str, media_instruction: str,
) -> str:
    """Build the Gemini prompt that asks for concrete place names.

    Args:
        content_type: Label for the kind of post, interpolated into the
            opening sentence.
        caption: Post caption; an empty value is rendered as ``(없음)``.
        hashtags: Hashtag string; an empty value is rendered as ``(없음)``.
        media_instruction: Sentence telling the model what to inspect.

    Returns:
        The full prompt text, which requests a JSON array of place names.
    """
    caption_text = caption or "(없음)"
    hashtag_text = hashtags or "(없음)"
    # One list entry per output line; empty entries are the blank separators.
    lines = [
        f"제주도의 인스타그램 {content_type}을 분석하여 구체적인 장소명(상호명)을 추출하세요.",
        "",
        f"캡션: {caption_text}",
        f"해시태그: {hashtag_text}",
        "",
        f"{media_instruction}",
        "",
        "추출 대상:",
        "- 카페, 식당, 베이커리 등 상호명 (예: 카페 레이어드, 봄날의 테이블)",
        "- 관광지, 해변, 오름 고유명사 (예: 새별오름, 협재해수욕장, 군산오름)",
        "- 공원, 마을, 거리 고유명사 (예: 한담해안산책로, 곽지과물해변)",
        "",
        "제외 대상:",
        "- '애월', '제주', '제주도', '한림', '서귀포' 같은 광역 지명",
        "- '카페', '맛집', '해변' 같은 일반 카테고리명",
        "",
        "중요: thinking 없이 JSON 배열만 출력하세요.",
        '응답 형식: ["장소명1", "장소명2"]',
        "장소가 없으면: []",
    ]
    return "\n".join(lines)
+def _download_media(url: str, max_bytes: int) -> bytes | None:
+    """미디어 URL에서 바이트를 다운로드한다."""
+    if not url:
+        return None
+    try:
+        import httpx
+        with httpx.stream(
+            "GET", url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True,
+        ) as resp:
+            if resp.status_code != 200:
+                return None
+            chunks: list[bytes] = []
+            total = 0
+            for chunk in resp.iter_bytes(chunk_size=8192):
+                total += len(chunk)
+                if total > max_bytes:
+                    logger.debug("미디어 크기 초과 (%d > %d): %s", total, max_bytes, url[:60])
+                    return None
+                chunks.append(chunk)
+            return b"".join(chunks)
+    except Exception as e:
+        logger.debug("미디어 다운로드 실패: %s — %s", url[:60], e)
+        return None
+def _parse_ai_places(text: str) -> list[str]:
+    """Gemini 응답에서 장소명 배열을 파싱한다."""
+    if not text:
+        logger.debug("AI 응답 비어있음")
+        return []
+    raw_text = text  # 디버깅용 원본 보존
+    text = text.strip()
+    # Gemini 2.5 Flash thinking 블록 제거
+    if "<think>" in text:
+        # thinking 블록 이후의 실제 응답만 추출
+        parts = text.split("</think>")
+        text = parts[-1].strip() if len(parts) > 1 else text
+    # 마크다운 코드 블록 제거
+    if text.startswith("```"):
+        text = text.split("\n", 1)[-1]
+    if text.endswith("```"):
+        text = text.rsplit("```", 1)[0]
+    text = text.strip()
+    # JSON 배열 직접 파싱 시도
+    try:
+        result = json.loads(text)
+        if isinstance(result, list):
+            return [p.strip() for p in result if isinstance(p, str) and p.strip()]
+    except (json.JSONDecodeError, ValueError):
+        pass
+    # 텍스트 안에 JSON 배열이 포함된 경우 추출
+    match = re.search(r'\[.*?\]', text, re.DOTALL)
+    if match:
+        try:
+            result = json.loads(match.group())
+            if isinstance(result, list):
+                return [p.strip() for p in result if isinstance(p, str) and p.strip()]
+        except (json.JSONDecodeError, ValueError):
+            pass
+    logger.debug("AI 응답 파싱 실패: %.200s", raw_text)
+    return []
 # ══════════════════════════════════════════════
 # Shared Utility
 # ══════════════════════════════════════════════
def _add_to_metrics(
    spot_metrics: dict[str, dict],
    spot_id: str,
    post: dict,
    capped_engagement: int,
    method: str,
) -> None:
    """Fold a single post into the per-spot aggregate.

    Args:
        spot_metrics: Aggregate keyed by spot id; mutated in place. A fresh
            entry is created the first time a spot id is seen.
        spot_id: Spot the post was matched to.
        post: Normalized post dict.
        capped_engagement: Engagement value added to ``weighted_score``.
        method: Name of the matching method that produced this match.
    """
    entry = spot_metrics.setdefault(
        spot_id,
        {
            "post_count": 0,
            "total_likes": 0,
            "total_comments": 0,
            "weighted_score": 0,
            "match_methods": set(),
            "hashtags": set(),
            "source_accounts": set(),
        },
    )

    entry["post_count"] += 1
    entry["total_likes"] += post.get("likes_count", 0) or 0
    entry["total_comments"] += post.get("comments_count", 0) or 0
    entry["weighted_score"] += capped_engagement
    entry["match_methods"].add(method)

    # Tags are stored without surrounding whitespace or leading '#';
    # anything shorter than two characters is dropped.
    cleaned_tags = (raw.strip().lstrip("#") for raw in post.get("hashtags", []))
    entry["hashtags"].update(tag for tag in cleaned_tags if len(tag) >= 2)

    source_account = post.get("_source_account", "")
    if source_account:
        entry["source_accounts"].add(source_account)
def _finalize_metrics(spot_metrics: dict[str, dict]) -> None:
    """Prepare aggregated metrics for storage, in place.

    For every spot this adds ``avg_engagement`` (likes plus comments divided
    by the post count, rounded to an int; a post count below 1 is treated as
    1) and replaces the ``match_methods``, ``hashtags`` and
    ``source_accounts`` sets with sorted lists.

    Args:
        spot_metrics: Aggregate keyed by spot id; mutated in place.
    """
    set_fields = ("match_methods", "hashtags", "source_accounts")
    for entry in spot_metrics.values():
        # Guard the division against a zero post count.
        divisor = max(entry["post_count"], 1)
        interactions = entry["total_likes"] + entry["total_comments"]
        entry["avg_engagement"] = int(round(interactions / divisor))
        for field in set_fields:
            entry[field] = sorted(entry[field])
 def _dedup_posts_by_url(posts: list[dict]) -> list[dict]:
     """URL 기반 중복 제거."""
     seen_urls: set[str] = set()

trend_engine/trend_scorer.py CHANGED Viewed

@@ -236,12 +236,12 @@ def generate_weekly_ranking(supabase: Client | None = None) -> dict:
         mt = row["metric_type"]
         spots_last.setdefault(sid, {}).setdefault(src, {})[mt] = row["metric_value"]
-    # -- instagram_apify/instagram_ed → "instagram" 키로 통합 --
-    # DB 마이그레이션 후 source가 instagram_apify/instagram_ed로 분리되었으나
     # 스코어러는 "instagram" 키로 참조하므로 가장 최근 백엔드 데이터를 사용
     for spots_dict in (spots_this, spots_last):
         for sid in list(spots_dict.keys()):
-            for ig_src in ("instagram_apify", "instagram_ed"):
                 if ig_src in spots_dict[sid]:
                     spots_dict[sid]["instagram"] = spots_dict[sid].pop(ig_src)
                     break  # 첫 번째 발견된 백엔드 사용 (하나만 활성)

         mt = row["metric_type"]
         spots_last.setdefault(sid, {}).setdefault(src, {})[mt] = row["metric_value"]
+    # -- instagram_* → "instagram" 키로 통합 --
+    # DB source가 instagram_apify/instagram_ed/instagram_influencer로 분리되었으나
     # 스코어러는 "instagram" 키로 참조하므로 가장 최근 백엔드 데이터를 사용
     for spots_dict in (spots_this, spots_last):
         for sid in list(spots_dict.keys()):
+            for ig_src in ("instagram_influencer", "instagram_apify", "instagram_ed"):
                 if ig_src in spots_dict[sid]:
                     spots_dict[sid]["instagram"] = spots_dict[sid].pop(ig_src)
                     break  # 첫 번째 발견된 백엔드 사용 (하나만 활성)

utils/trending_builder.py CHANGED Viewed

@@ -65,8 +65,8 @@ CHANNEL_THEMES: dict[str, dict] = {
         "metric_type": "post_count",
         "sort_by": "metric_value",
         "min_spots": 3,
-        # DB 소스명이 instagram_apify/instagram_ed로 분리되어 있음
-        "db_sources": ["instagram_apify", "instagram_ed"],
     },
 }
@@ -103,7 +103,7 @@ class TrendingBuilder:
         period_start, _ = get_week_period()
         # 이번 주 spot_trends에서 해당 채널+metric_type 조회
-        # instagram은 DB 소스명이 instagram_apify/instagram_ed로 분리됨 → in_ 쿼리
         sort_ascending = theme.get("sort_ascending", False)
         db_sources = theme.get("db_sources", [channel])
         query = (
@@ -127,7 +127,7 @@ class TrendingBuilder:
         if not trend_rows:
             return []
-        # 같은 spot_id가 여러 소스(예: instagram_apify + instagram_ed)에서
         # 올 수 있으므로 spot_id 기준 중복 제거 (metric_value가 큰 행 우선)
         if len(db_sources) > 1:
             seen: dict[str, dict] = {}

         "metric_type": "post_count",
         "sort_by": "metric_value",
         "min_spots": 3,
+        # DB 소스명이 instagram_influencer (v5.0) + 레거시 분리 소스
+        "db_sources": ["instagram_influencer", "instagram_apify", "instagram_ed"],
     },
 }
         period_start, _ = get_week_period()
         # 이번 주 spot_trends에서 해당 채널+metric_type 조회
+        # instagram은 DB 소스명이 instagram_influencer/instagram_apify/instagram_ed로 분리됨 → in_ 쿼리
         sort_ascending = theme.get("sort_ascending", False)
         db_sources = theme.get("db_sources", [channel])
         query = (
         if not trend_rows:
             return []
+        # 같은 spot_id가 여러 소스(예: instagram_influencer + 레거시)에서
         # 올 수 있으므로 spot_id 기준 중복 제거 (metric_value가 큰 행 우선)
         if len(db_sources) > 1:
             seen: dict[str, dict] = {}