# eodi-mcp / scripts/sync_to_supabase.py
"""
Supabase ๋™๊ธฐํ™” ์Šคํฌ๋ฆฝํŠธ (๋ฆฌํŒฉํ† ๋ง ๋ฒ„์ „)
========================================
YAML ๋ฐ์ดํ„ฐ๋ฅผ ์ฒญํฌ๋กœ ๋ณ€ํ™˜ํ•˜์—ฌ Supabase์— ์ €์žฅ.
ํ•ธ๋“ค๋Ÿฌ ๊ธฐ๋ฐ˜ ์•„ํ‚คํ…์ฒ˜๋กœ ์ƒˆ YAML ๊ตฌ์กฐ ์ง€์›์ด ์‰ฝ์Šต๋‹ˆ๋‹ค.
์‚ฌ์šฉ๋ฒ•:
python scripts/sync_to_supabase.py
python scripts/sync_to_supabase.py --chain MARRIOTT
python scripts/sync_to_supabase.py --dry-run
"""
import os
import sys
import yaml
import hashlib
import click
from pathlib import Path
from typing import List, Dict, Any, Optional
# ํ”„๋กœ์ ํŠธ ๋ฃจํŠธ๋ฅผ Python ๊ฒฝ๋กœ์— ์ถ”๊ฐ€
sys.path.insert(0, str(Path(__file__).parent.parent))
from dotenv import load_dotenv
load_dotenv()
# ์ฒญํฌ ํ•ธ๋“ค๋Ÿฌ ๋ชจ๋“ˆ ์ž„ํฌํŠธ
from chunk_handlers import (
CHUNK_HANDLERS,
NESTED_HANDLERS,
IGNORED_KEYS,
get_handler,
get_nested_handler,
is_ignored,
get_all_handler_keys,
)
# ===========================================================================
# ์ฒญํฌ ID ์ƒ์„ฑ
# ===========================================================================
def generate_chunk_id(doc_id: str, chunk_index: int) -> str:
"""์ฒญํฌ ID ์ƒ์„ฑ"""
return f"{doc_id}_chunk_{chunk_index:04d}"
# ===========================================================================
# ์ฒญํฌ ์ƒ์„ฑ (ํ•ธ๋“ค๋Ÿฌ ๊ธฐ๋ฐ˜)
# ===========================================================================
def create_chunks_from_knowledge(
    doc_id: str,
    chain: str,
    extracted_knowledge: Dict[str, Any],
    verbose: bool = False
) -> List[Dict[str, Any]]:
    """
    Create searchable chunks from extracted_knowledge.

    Each top-level key is dispatched to a registered handler (see
    chunk_handlers.get_handler); dotted keys such as
    "facts.pricing_analysis" are routed through NESTED_HANDLERS.
    Handler output is filtered/normalized by the local add_chunk helper.

    Args:
        doc_id: Stable document identifier (see generate_doc_id).
        chain: Chain/domain code detected from the file path; may be
            overridden by identity.chain in the payload.
        extracted_knowledge: Parsed YAML front-matter payload.
        verbose: When True, print handler errors and unhandled keys.

    Returns:
        List of chunk dicts with keys chunk_id/doc_id/chain/content/metadata.
    """
    chunks = []
    chunk_index = 0
    unhandled_keys = []

    # Pre-extract hotel info (used as handler context)
    hotel_name = "Unknown Hotel"
    hotel_name_ko = None
    hotel_id_map = {}  # hotel_id -> hotel info mapping
    hotel_properties = extracted_knowledge.get("hotel_properties", [])
    if hotel_properties and isinstance(hotel_properties, list):
        for hotel in hotel_properties:
            if isinstance(hotel, dict):
                h_id = hotel.get("hotel_id")
                h_name = hotel.get("name", "Unknown")
                h_name_localized = hotel.get("name_localized", {})
                h_name_ko = h_name_localized.get("ko") if isinstance(h_name_localized, dict) else None
                # Register the hotel_id mapping (skipped when hotel_id is missing/falsy)
                if h_id:
                    hotel_id_map[h_id] = {
                        "name": h_name,
                        "name_ko": h_name_ko,
                        "country": hotel.get("location", {}).get("country", "") if isinstance(hotel.get("location"), dict) else "",
                        "city": hotel.get("location", {}).get("city", "") if isinstance(hotel.get("location"), dict) else ""
                    }
        # Use the first hotel as the default hotel for the context
        first_hotel = hotel_properties[0]
        if isinstance(first_hotel, dict):
            hotel_name = first_hotel.get("name", "Unknown Hotel")
            name_localized = first_hotel.get("name_localized", {})
            hotel_name_ko = name_localized.get("ko") if isinstance(name_localized, dict) else None

    # Extract extra info from the identity section (prompt-output compatibility)
    identity = extracted_knowledge.get("identity", {})
    if identity and isinstance(identity, dict):
        # Backfill missing info from identity
        if identity.get("title") and hotel_name == "Unknown Hotel":
            hotel_name = identity.get("title")
        if identity.get("chain"):
            chain = identity.get("chain", chain)

    # Extract provenance info from the source section
    source = extracted_knowledge.get("source", {})
    source_type = source.get("source_type") if isinstance(source, dict) else None
    source_url = source.get("canonical_url") if isinstance(source, dict) else None
    retrieved_at = source.get("retrieved_at") if isinstance(source, dict) else None

    # Extract validity-period info from the version section
    version = extracted_knowledge.get("version", {})
    effective_date = version.get("effective_date") if isinstance(version, dict) else None
    last_updated = version.get("last_updated") if isinstance(version, dict) else None

    # Build the handler context (extended with source/version info)
    context = {
        "chain": chain,
        "hotel_name": hotel_name,
        "hotel_name_ko": hotel_name_ko,
        "hotel_id_map": hotel_id_map,  # hotel_id -> hotel info mapping
        "doc_id": doc_id,
        # extra identity info
        "document_category": identity.get("category") if identity else None,
        "document_type": identity.get("doc_type", identity.get("document_type")) if identity else None,
        # source info (used for search ranking/filtering)
        "source_type": source_type,  # OFFICIAL, USER_GENERATED, NEWS, ...
        "source_url": source_url,
        "retrieved_at": retrieved_at,
        # version info (used for time-based sorting)
        "effective_date": effective_date,
        "last_updated": last_updated,
    }

    # Common metadata (automatically merged into every chunk)
    common_metadata = {}
    if source_type:
        common_metadata["source_type"] = source_type
    if effective_date:
        common_metadata["effective_date"] = effective_date
    if last_updated:
        common_metadata["last_updated"] = last_updated

    def add_chunk(content: str, metadata: Dict[str, Any]) -> None:
        """Append one chunk, auto-merging common metadata; drops blank/short content."""
        nonlocal chunk_index
        # Require non-blank content longer than 50 chars (length checked on the raw string)
        if content and content.strip() and len(content) > 50:
            # Handler metadata wins over common metadata on key collisions
            merged_metadata = {**common_metadata, **metadata}
            chunks.append({
                "chunk_id": generate_chunk_id(doc_id, chunk_index),
                "doc_id": doc_id,
                "chain": chain,
                "content": content.strip()[:5000],  # stored content capped at 5000 chars
                "metadata": merged_metadata
            })
            chunk_index += 1

    # 1. Process top-level keys via registered handlers
    for key, value in extracted_knowledge.items():
        if is_ignored(key):
            continue
        handler = get_handler(key)
        if handler:
            try:
                result_chunks = handler(value, context)
                for rc in result_chunks:
                    add_chunk(rc["content"], rc["metadata"])
            except Exception as e:
                # Best-effort: a broken handler must not abort the whole document
                if verbose:
                    print(f" โš ๏ธ ํ•ธ๋“ค๋Ÿฌ ์˜ค๋ฅ˜ ({key}): {e}")
        else:
            unhandled_keys.append(key)

    # 2. Process nested keys (e.g. facts.pricing_analysis)
    for nested_key, handler in NESTED_HANDLERS.items():
        if handler is None:
            continue
        parts = nested_key.split(".")
        # Only single-level "parent.child" keys are supported
        if len(parts) == 2:
            parent_key, child_key = parts
            parent_data = extracted_knowledge.get(parent_key)
            if isinstance(parent_data, dict) and child_key in parent_data:
                try:
                    result_chunks = handler(parent_data[child_key], context)
                    for rc in result_chunks:
                        add_chunk(rc["content"], rc["metadata"])
                except Exception as e:
                    if verbose:
                        print(f" โš ๏ธ ์ค‘์ฒฉ ํ•ธ๋“ค๋Ÿฌ ์˜ค๋ฅ˜ ({nested_key}): {e}")

    # 3. Warn about keys no handler claimed
    if unhandled_keys and verbose:
        print(f" โš ๏ธ ๋ฏธ์ฒ˜๋ฆฌ ํ‚ค: {', '.join(unhandled_keys)}")
    return chunks
# ===========================================================================
# ๋ฌธ์„œ ID ์ƒ์„ฑ
# ===========================================================================
def generate_doc_id(file_path: str) -> str:
"""ํŒŒ์ผ ๊ฒฝ๋กœ์—์„œ ๊ณ ์œ  ๋ฌธ์„œ ID ์ƒ์„ฑ"""
return hashlib.md5(file_path.encode()).hexdigest()[:12]
# ===========================================================================
# YAML ํŒŒ์ผ ๋กœ๋“œ
# ===========================================================================
def load_yaml_from_md(file_path: Path) -> Optional[Dict[str, Any]]:
    """Extract and parse the YAML front-matter block from a markdown file.

    Scans line-by-line for the first pair of lines that are exactly ``---``
    (after stripping surrounding whitespace) and parses everything between
    them with ``yaml.safe_load``.  Note the opening ``---`` does not have to
    be the very first line — the first delimiter found anywhere starts the
    block.

    Args:
        file_path: Path to the ``.md`` file (read as UTF-8).

    Returns:
        The parsed value (typically a dict), or None when there is no
        complete ``---``-delimited block, the block parses to a falsy value,
        or any read/parse error occurs.
    """
    try:
        content = file_path.read_text(encoding='utf-8')
        lines = content.split('\n')
        # Locate the start and end of the YAML block line-by-line
        yaml_start = None
        yaml_end = None
        for i, line in enumerate(lines):
            stripped = line.strip()
            if stripped == '---':
                if yaml_start is None:
                    yaml_start = i + 1  # block starts on the line after the opening ---
                else:
                    yaml_end = i  # block ends on the line before the closing ---
                    break
        if yaml_start is None or yaml_end is None:
            return None
        # Extract and parse the YAML block
        yaml_lines = lines[yaml_start:yaml_end]
        yaml_part = '\n'.join(yaml_lines)
        data = yaml.safe_load(yaml_part)
        return data if data else None
    except Exception:
        # Deliberately broad: this is a best-effort loader. Callers treat
        # None as "no usable YAML" (unreadable file, bad encoding,
        # malformed YAML, ...), and the caller quarantines such files.
        return None
def detect_chain(file_path: Path) -> str:
    """Detect the chain/domain code from a file path.

    Covers hotel chains, airlines, card issuers, and news/deal feeds
    (Phase 1 expansion).  Matching is case-insensitive substring search on
    the upper-cased path; branch order matters (earlier matches win).

    Args:
        file_path: Path of the markdown data file.

    Returns:
        An upper-case chain/domain code, or "OTHER" when nothing matches.
    """
    # path_str is upper-cased once here; all checks below compare against it.
    # (Fixed: redundant repeat .upper() calls on the already-upper string removed.)
    path_str = str(file_path).upper()
    # --- Hotel chains ---
    if "MARRIOTT" in path_str:
        return "MARRIOTT"
    elif "HILTON" in path_str:
        return "HILTON"
    elif "IHG" in path_str:
        return "IHG"
    elif "ACCOR" in path_str or "ACCO" in path_str:
        return "ACCOR"
    elif "HYATT" in path_str:
        return "HYATT"
    # Lotte Hotel (only under a hotel path, to distinguish from Lotte Card)
    elif "LOTTE" in path_str and "/HOTEL/" in path_str:
        return "LOTTE"
    # Jumeirah Hotels & Resorts (Dubai luxury chain)
    elif "JUMEIRAH" in path_str and "/HOTEL/" in path_str:
        return "JUMEIRAH"
    # --- Airlines (Phase 1 expansion) ---
    elif "KOREAN_AIR" in path_str or "KOREANAIR" in path_str:
        return "KOREAN_AIR"
    elif "ASIANA" in path_str:
        return "ASIANA"
    elif "DELTA" in path_str:
        return "DELTA"
    elif "UNITED" in path_str:
        return "UNITED"
    elif "ALLIANCE" in path_str or "ONEWORLD" in path_str or "STAR_ALLIANCE" in path_str or "SKYTEAM" in path_str:
        return "ALLIANCE"
    elif "/AIRLINE/" in path_str:
        return "AIRLINE"
    # --- Card issuers (Phase 1 expansion) ---
    elif "AMEX" in path_str or "FHR" in path_str or "THC" in path_str:
        return "AMEX"
    elif "SHINHAN" in path_str:
        return "SHINHAN"
    elif "HYUNDAI" in path_str and "CARD" in path_str:
        return "HYUNDAI"
    elif "HANA" in path_str and ("CARD" in path_str or "/CREDITCARD/" in path_str):
        return "HANA"
    elif "LOTTE" in path_str and ("CARD" in path_str or "/CREDITCARD/" in path_str):
        return "LOTTE"
    elif "WOORI" in path_str and ("CARD" in path_str or "/CREDITCARD/" in path_str):
        return "WOORI"
    elif "KB" in path_str and "CARD" in path_str:
        return "KB"
    elif "SAMSUNG" in path_str and "CARD" in path_str:
        return "SAMSUNG"
    elif "/CREDITCARD/" in path_str:
        return "CARD"
    # --- News/deals (Phase 1 expansion) ---
    elif "/NEWS/" in path_str or "/DEAL" in path_str:
        return "NEWS"
    # --- Other ---
    elif "BENEFIT_RATE" in path_str:
        return "BENEFIT"
    else:
        return "OTHER"
# ===========================================================================
# ๋ฉ”์ธ ๋™๊ธฐํ™” ํ•จ์ˆ˜
# ===========================================================================
@click.command()
@click.option('--chain', '-c', type=str, default=None, help='ํŠน์ • ์ฒด์ธ๋งŒ ๋™๊ธฐํ™”')
@click.option('--domain', '-d', type=click.Choice(['hotel', 'airline', 'card', 'news', 'all']),
              default='all', help='๋™๊ธฐํ™”ํ•  ๋„๋ฉ”์ธ (hotel/airline/card/news/all)')
@click.option('--dry-run', is_flag=True, help='์‹ค์ œ ์ €์žฅํ•˜์ง€ ์•Š๊ณ  ํ™•์ธ๋งŒ')
@click.option('--skip-embeddings', is_flag=True, help='์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ๊ฑด๋„ˆ๋›ฐ๊ธฐ')
@click.option('--verbose', '-v', is_flag=True, help='์ž์„ธํ•œ ์ถœ๋ ฅ')
@click.option('--file', '-f', type=str, default=None, help='ํŠน์ • ํŒŒ์ผ๋งŒ ์ฒ˜๋ฆฌ')
def main(chain: Optional[str], domain: str, dry_run: bool, skip_embeddings: bool, verbose: bool, file: Optional[str]):
    """Sync YAML data to Supabase (extended: hotels + airlines + cards + news)."""
    print("๐Ÿš€ Supabase ๋™๊ธฐํ™” (์—ฌํ–‰ ํ”Œ๋žซํผ ํ†ตํ•ฉ ๋ฒ„์ „)")
    print("=" * 60)

    # Domain -> data directory mapping
    domain_dirs = {
        'hotel': Path("data/raw/Hotel"),
        'airline': Path("data/raw/Airline"),
        'card': Path("data/raw/CreditCard"),
        'news': Path("data/raw/News"),
    }

    # Decide which directories to scan (only ones that exist on disk)
    if domain == 'all':
        data_dirs = [d for d in domain_dirs.values() if d.exists()]
    else:
        data_dirs = [domain_dirs[domain]] if domain_dirs[domain].exists() else []
    if not data_dirs:
        print(f"โŒ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค")
        return
    print(f"๐Ÿ“‚ ์Šค์บ” ๋””๋ ‰ํ† ๋ฆฌ: {', '.join(str(d) for d in data_dirs)}")

    # Collect candidate markdown files (--file overrides the directory scan)
    if file:
        md_files = [Path(file)]
    else:
        md_files = []
        for data_dir in data_dirs:
            md_files.extend(list(data_dir.rglob("*.md")))
    print(f"๐Ÿ“ ์ด {len(md_files)}๊ฐœ ํŒŒ์ผ ๋ฐœ๊ฒฌ")

    # Chain filter (normalized to upper-case to match detect_chain output)
    if chain:
        chain = chain.upper()
        print(f"๐Ÿ” ํ•„ํ„ฐ: {chain}")

    # Run statistics
    # NOTE(review): "error" is never incremented below — possibly vestigial.
    stats = {
        "total": 0,
        "success": 0,
        "skipped": 0,
        "error": 0,
        "chunks": 0,
    }
    # Quarantine report (tracks problem files for manual review)
    # NOTE(review): "missing_fields" is declared but never populated below.
    quarantine = {
        "no_yaml": [],  # missing YAML front matter
        "no_knowledge": [],  # missing extracted_knowledge
        "no_chunks": [],  # chunk creation produced nothing
        "missing_fields": [],  # required-field warnings
    }
    all_chunks = []
    all_docs = []

    for md_file in md_files:
        stats["total"] += 1

        # Detect chain/domain from the path
        file_chain = detect_chain(md_file)
        if chain and file_chain != chain:
            stats["skipped"] += 1
            continue

        # Load YAML front matter
        data = load_yaml_from_md(md_file)
        if not data or not isinstance(data, dict):
            if verbose:
                print(f" โš ๏ธ {md_file.name} (YAML ์—†์Œ)")
            quarantine["no_yaml"].append(str(md_file.name))
            stats["skipped"] += 1
            continue

        # Pull extracted_knowledge (falls back to the top-level data)
        extracted_knowledge = data.get("extracted_knowledge")
        if not extracted_knowledge or not isinstance(extracted_knowledge, dict):
            # The top-level data may itself be the knowledge payload;
            # probe for core keys across all supported domains.
            core_keys = {
                # hotel
                "hotel_properties", "loyalty_programs", "loyalty_program",
                "membership_tiers", "tier_implementations", "hotel_brands",
                "best_rate_guarantee", "channel_benefit_packages",
                # airline
                "airline_programs", "airline_program", "airline_tiers",
                "award_charts", "airline_earning_rules",
                # card
                "credit_cards",
                # promotions/news
                "deal_alerts", "news_updates", "promotions",
                # misc
                "points_systems", "member_rates", "dining_programs",
            }
            if any(key in data for key in core_keys):
                extracted_knowledge = data
            else:
                if verbose:
                    print(f" โš ๏ธ {md_file.name} (extracted_knowledge ์—†์Œ)")
                quarantine["no_knowledge"].append(str(md_file.name))
                stats["skipped"] += 1
                continue

        # Build the document ID from a path made relative to data/raw,
        # so the ID is stable regardless of where the script is run from.
        # (--file may pass a path outside any scanned data_dir.)
        try:
            data_raw = Path("data/raw")
            if md_file.is_relative_to(data_raw):
                rel_path = str(md_file.relative_to(data_raw.parent))
            elif "data/raw" in str(md_file):
                # Absolute path: take the part from "data/raw" onward
                path_str = str(md_file)
                idx = path_str.find("data/raw")
                rel_path = path_str[idx:] if idx >= 0 else str(md_file.name)
            else:
                rel_path = str(md_file)
        except Exception:
            rel_path = str(md_file)
        doc_id = generate_doc_id(rel_path)

        # Create chunks via the handler pipeline
        chunks = create_chunks_from_knowledge(
            doc_id=doc_id,
            chain=file_chain,
            extracted_knowledge=extracted_knowledge,
            verbose=verbose
        )
        if chunks:
            print(f" โœ… {md_file.name} ({len(chunks)}๊ฐœ ์ฒญํฌ)")
            stats["success"] += 1
            stats["chunks"] += len(chunks)
            # Per-document summary row
            all_docs.append({
                "doc_id": doc_id,
                "source_file": rel_path,
                "chain": file_chain,
                "chunk_count": len(chunks),
            })
            all_chunks.extend(chunks)
        else:
            if verbose:
                print(f" โš ๏ธ {md_file.name} (์ฒญํฌ ์—†์Œ)")
            quarantine["no_chunks"].append(str(md_file.name))
            stats["skipped"] += 1

    print()
    print("=" * 60)
    print(f"๐Ÿ“Š ๊ฒฐ๊ณผ: {stats['success']}๊ฐœ ์„ฑ๊ณต, {stats['skipped']}๊ฐœ ๊ฑด๋„ˆ๋œ€")
    print(f" ์ด ์ฒญํฌ: {stats['chunks']}๊ฐœ")

    # Quarantine report (printed only when problem files exist;
    # at most 5 names per category are listed)
    total_quarantined = sum(len(v) for v in quarantine.values())
    if total_quarantined > 0:
        print(f"\nโš ๏ธ Quarantine ๋ฆฌํฌํŠธ ({total_quarantined}๊ฐœ ํŒŒ์ผ):")
        if quarantine["no_yaml"]:
            print(f" ๐Ÿ“„ YAML ์—†์Œ ({len(quarantine['no_yaml'])}๊ฐœ): {', '.join(quarantine['no_yaml'][:5])}")
        if quarantine["no_knowledge"]:
            print(f" ๐Ÿ“„ extracted_knowledge ์—†์Œ ({len(quarantine['no_knowledge'])}๊ฐœ): {', '.join(quarantine['no_knowledge'][:5])}")
        if quarantine["no_chunks"]:
            print(f" ๐Ÿ“„ ์ฒญํฌ ์ƒ์„ฑ ์‹คํŒจ ({len(quarantine['no_chunks'])}๊ฐœ): {', '.join(quarantine['no_chunks'][:5])}")
        print(" โ†’ ์œ„ ํŒŒ์ผ๋“ค์€ ์ˆ˜๋™ ๊ฒ€ํ† ๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.")

    # Dry-run stops before any writes
    if dry_run:
        print("\n๐Ÿ” Dry-run ๋ชจ๋“œ - ์ €์žฅํ•˜์ง€ ์•Š์Œ")
        return

    # Persist to Supabase
    print("\n๐Ÿ’พ Supabase์— ์ €์žฅ ์ค‘...")
    try:
        from src.db import SupabaseAdapter
        adapter = SupabaseAdapter()

        # Upsert document rows (required fields only)
        for doc in all_docs:
            try:
                doc_data = {
                    "doc_id": doc["doc_id"],
                    "source_file": doc["source_file"],
                    "chain": doc["chain"],
                    "extracted_knowledge": {},  # empty dict to satisfy NOT NULL column
                }
                adapter.client.table("kb_documents").upsert(doc_data, on_conflict='doc_id').execute()
            except Exception as e:
                # Per-document failures are tolerated; chunk upsert still runs
                if verbose:
                    print(f" โš ๏ธ ๋ฌธ์„œ ์ €์žฅ ์˜ค๋ฅ˜: {e}")

        # Upsert chunks (with embeddings unless --skip-embeddings)
        saved = adapter.upsert_chunks(
            chunks=all_chunks,
            generate_embeddings=not skip_embeddings
        )
        print(f"\nโœ… Supabase ๋™๊ธฐํ™” ์™„๋ฃŒ!")
        print(f" ๋ฌธ์„œ: {len(all_docs)}๊ฐœ")
        print(f" ์ฒญํฌ: {saved}๊ฐœ")
    except Exception as e:
        # Any failure in adapter setup or bulk upsert lands here
        print(f"\nโŒ Supabase ์ €์žฅ ์˜ค๋ฅ˜: {e}")
        import traceback
        if verbose:
            traceback.print_exc()


if __name__ == "__main__":
    main()