eodi-mcp / scripts /reembed_with_bge.py
lovelymango's picture
Upload 19 files
4c3c97b verified
#!/usr/bin/env python
"""
bge-m3 re-embedding script
==========================
Re-embeds the existing chunks with the bge-m3 model.
Runs locally, so embeddings are generated without any API cost.
Usage:
python scripts/reembed_with_bge.py
python scripts/reembed_with_bge.py --dry-run # test mode
"""
import os
import sys
import json
from pathlib import Path
from typing import List, Dict, Any
# Add the project root to the Python import path
sys.path.insert(0, str(Path(__file__).parent.parent))
from dotenv import load_dotenv
load_dotenv()
import click
def _fetch_chunks(client, limit, page_size=1000):
    """Fetch ``chunk_id``/``content`` rows from ``kb_chunks`` page by page.

    A single request can return fewer rows than the table holds (Supabase
    caps the rows per response, 1,000 by default), so the table is read in
    ``range()`` pages until the server returns an empty page or ``limit``
    rows have been collected.

    Args:
        client: Supabase client as returned by ``create_client``.
        limit: Maximum number of rows to return; ``None`` means every row.
        page_size: Number of rows requested per round trip.

    Returns:
        A list of row dicts with the keys ``chunk_id`` and ``content``.
    """
    rows = []
    while limit is None or len(rows) < limit:
        wanted = page_size if limit is None else min(page_size, limit - len(rows))
        # Advance by what was actually received, so a server-side cap smaller
        # than page_size cannot make the loop skip rows.
        offset = len(rows)
        page = (
            client.table("kb_chunks")
            .select("chunk_id, content")
            .order("chunk_id")  # stable order so pages neither overlap nor skip
            .range(offset, offset + wanted - 1)  # range() bounds are inclusive
            .execute()
        ).data or []
        if not page:
            break
        rows.extend(page)
    return rows if limit is None else rows[:limit]


@click.command()
@click.option('--dry-run', is_flag=True, help='์‹ค์ œ ์ €์žฅ ์—†์ด ํ…Œ์ŠคํŠธ')
@click.option('--limit', '-l', type=click.IntRange(min=0), default=None, help='์ฒ˜๋ฆฌํ•  ์ฒญํฌ ์ˆ˜ ์ œํ•œ')
@click.option('--batch-size', '-b', type=click.IntRange(min=1), default=32, help='๋ฐฐ์น˜ ํฌ๊ธฐ')
def main(dry_run: bool, limit: "int | None", batch_size: int):
    """Re-embed the existing kb_chunks rows with the bge-m3 model.

    \f
    Everything after the form feed above is hidden from ``--help`` by click.

    Steps: load ``BGEEmbeddings``, connect to Supabase using the
    ``SUPABASE_URL`` / ``SUPABASE_KEY`` environment variables, read
    ``chunk_id`` and ``content`` from ``kb_chunks``, embed the contents in
    batches, write each vector to the row's ``embedding`` column, print a
    summary, and finally run one sample ``match_kb_chunks`` RPC search.

    Args:
        dry_run: When true, embeddings are computed but nothing is written
            and the final search check is skipped.
        limit: Maximum number of chunks to process; ``None`` means all.
            ``0`` now processes nothing (it used to mean "no limit").
        batch_size: Number of chunks embedded per ``embed_texts`` call;
            must be at least 1.

    Returns:
        None. Progress and errors are reported on stdout only.
    """
    print("๐Ÿ”„ bge-m3 ์žฌ์ž„๋ฒ ๋”ฉ ์‹œ์ž‘")
    print("=" * 60)
    if dry_run:
        print("โš ๏ธ DRY-RUN ๋ชจ๋“œ: ์‹ค์ œ ์ €์žฅํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.\n")

    # 1. Load the bge-m3 model. Imported inside the function, so importing
    #    this module by itself does not import the embedding code.
    print("๐Ÿ“ฆ bge-m3 ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
    from src.db.bge_embeddings import BGEEmbeddings
    embedder = BGEEmbeddings()
    print(f"โœ… ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ (์ฐจ์›: {embedder.EMBEDDING_DIMENSION})")

    # 2. Connect to Supabase.
    print("\n๐Ÿ”Œ Supabase ์—ฐ๊ฒฐ ์ค‘...")
    from supabase import create_client
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    if not url or not key:
        print("โŒ SUPABASE_URL ๋˜๋Š” SUPABASE_KEY๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
        return
    client = create_client(url, key)
    print("โœ… Supabase ์—ฐ๊ฒฐ๋จ")

    # 3. Load the existing chunks (paginated; see _fetch_chunks).
    print("\n๐Ÿ“ฅ ๊ธฐ์กด ์ฒญํฌ ๋กœ๋”ฉ ์ค‘...")
    chunks = _fetch_chunks(client, limit)
    print(f" ์ด ์ฒญํฌ: {len(chunks)}๊ฐœ")
    if not chunks:
        print("โŒ ์ฒ˜๋ฆฌํ•  ์ฒญํฌ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
        return

    # 4. Embed in batches and update each row.
    print(f"\n๐Ÿ“Š ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ์ค‘... (๋ฐฐ์น˜ ํฌ๊ธฐ: {batch_size})")
    total_processed = 0
    total_errors = 0
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        batch_contents = [c["content"] for c in batch]
        batch_ids = [c["chunk_id"] for c in batch]
        try:
            # One embedding call per batch.
            embeddings = embedder.embed_texts(batch_contents)
            # One UPDATE per chunk; a failed row is counted and skipped so
            # the remaining rows of the batch are still written.
            for chunk_id, embedding in zip(batch_ids, embeddings):
                if not dry_run:
                    try:
                        client.table("kb_chunks").update({
                            "embedding": embedding
                        }).eq("chunk_id", chunk_id).execute()
                    except Exception as e:
                        print(f" โš ๏ธ {chunk_id} ์—…๋ฐ์ดํŠธ ์‹คํŒจ: {e}")
                        total_errors += 1
                        continue
                total_processed += 1
            # Progress report.
            progress = (i + len(batch)) / len(chunks) * 100
            print(f" {i + len(batch):4d}/{len(chunks)} ({progress:5.1f}%) ์™„๋ฃŒ")
        except Exception as e:
            # Reached when the batch as a whole fails (e.g. the embedding
            # call); every chunk of the batch is counted as an error.
            print(f" โš ๏ธ ๋ฐฐ์น˜ {i//batch_size + 1} ์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
            total_errors += len(batch)

    # 5. Summary.
    print("\n" + "=" * 60)
    print("๐Ÿ“Š ์žฌ์ž„๋ฒ ๋”ฉ ์™„๋ฃŒ")
    print(f" ์ฒ˜๋ฆฌ๋จ: {total_processed}๊ฐœ")
    print(f" ์˜ค๋ฅ˜: {total_errors}๊ฐœ")
    if dry_run:
        print("\nโš ๏ธ DRY-RUN ์™„๋ฃŒ. ์‹ค์ œ ์ €์žฅํ•˜๋ ค๋ฉด --dry-run ์˜ต์…˜์„ ์ œ๊ฑฐํ•˜์„ธ์š”.")
    else:
        print("\nโœ… bge-m3 ์žฌ์ž„๋ฒ ๋”ฉ ์™„๋ฃŒ!")

    # 6. Verification: run one sample search through the match_kb_chunks RPC.
    if not dry_run and total_processed > 0:
        print("\n๐Ÿงช ๊ฒ€์ฆ ํ…Œ์ŠคํŠธ...")
        # Korean-language sample query.
        test_query = "์›จ์Šคํ‹ด ํŒŒ๋ฅด๋‚˜์Šค"
        query_embedding = embedder.embed_query(test_query)
        try:
            search_result = client.rpc(
                "match_kb_chunks",
                {
                    "query_embedding": query_embedding,
                    "match_threshold": 0.3,
                    "match_count": 3,
                    "filter_chain": None
                }
            ).execute()
            print(f" ๊ฒ€์ƒ‰์–ด: {test_query}")
            if search_result.data:
                for r in search_result.data[:3]:
                    # `or` fallbacks: a NULL column arrives as None, which
                    # would otherwise raise and be misreported below as a
                    # failed search.
                    sim = r.get("similarity") or 0
                    content = (r.get("content") or "")[:50].replace("\n", " ")
                    print(f" - {sim:.4f}: {content}...")
            else:
                print(" โš ๏ธ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์—†์Œ (RPC ํ•จ์ˆ˜ ์—…๋ฐ์ดํŠธ ํ•„์š”ํ•  ์ˆ˜ ์žˆ์Œ)")
        except Exception as e:
            print(f" โš ๏ธ ๊ฒ€์ƒ‰ ํ…Œ์ŠคํŠธ ์‹คํŒจ: {e}")
            print(" โ†’ Supabase์—์„œ match_kb_chunks ํ•จ์ˆ˜๋ฅผ 1024์ฐจ์›์œผ๋กœ ์—…๋ฐ์ดํŠธํ•˜์„ธ์š”.")
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()