Spaces:
Running
Running
| #!/usr/bin/env python | |
| """ | |
| bge-m3 re-embedding script | |
| ========================== | |
| Re-embeds the existing chunks with the bge-m3 model. | |
| Runs locally, so the embeddings are generated without any API cost. | |
| Usage: | |
| python scripts/reembed_with_bge.py | |
| python scripts/reembed_with_bge.py --dry-run # test mode | |
| """ | |
| import os | |
| import sys | |
| import json | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
| # Add the project root to the Python path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| import click | |
def main(dry_run: bool, limit: int, batch_size: int):
    """Re-embed the rows of the ``kb_chunks`` table with the bge-m3 model.

    Steps: load the BGE embedder, connect to Supabase using the
    ``SUPABASE_URL`` / ``SUPABASE_KEY`` environment variables, load
    ``chunk_id`` / ``content`` for the chunks, embed them in batches, write
    each embedding back to its row (unless ``dry_run``), print a summary and,
    after a real run, issue one sample ``match_kb_chunks`` search.

    Args:
        dry_run: When true, embeddings are computed but nothing is written
            back and the final search check is skipped.
        limit: Maximum number of chunks to load. A falsy or non-positive
            value loads every chunk.
        batch_size: Number of chunks passed to ``embed_texts`` per call.
            Must be a positive integer.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Checked before the
            model is loaded so a bad value fails fast instead of after the
            expensive setup.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    print("๐ bge-m3 ์ฌ์๋ฒ ๋ฉ ์์")
    print("=" * 60)
    if dry_run:
        print("โ ๏ธ DRY-RUN ๋ชจ๋: ์ค์ ์ ์ฅํ์ง ์์ต๋๋ค.\n")

    # 1. Load the bge-m3 model (imported lazily: the import itself is heavy).
    print("๐ฆ bge-m3 ๋ชจ๋ธ ๋ก๋ฉ ์ค...")
    from src.db.bge_embeddings import BGEEmbeddings
    embedder = BGEEmbeddings()
    print(f"โ ๋ชจ๋ธ ๋ก๋ ์๋ฃ (์ฐจ์: {embedder.EMBEDDING_DIMENSION})")

    # 2. Connect to Supabase.
    print("\n๐ Supabase ์ฐ๊ฒฐ ์ค...")
    from supabase import create_client
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    if not url or not key:
        print("โ SUPABASE_URL ๋๋ SUPABASE_KEY๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค.")
        return
    client = create_client(url, key)
    print("โ Supabase ์ฐ๊ฒฐ๋จ")

    # 3. Load the existing chunks. Paginated, because a single select is
    #    silently truncated at the server's max-rows setting.
    print("\n๐ฅ ๊ธฐ์กด ์ฒญํฌ ๋ก๋ฉ ์ค...")
    chunks = _fetch_chunks(client, limit)
    total = len(chunks)
    print(f" ์ด ์ฒญํฌ: {total}๊ฐ")
    if not chunks:
        print("โ ์ฒ๋ฆฌํ ์ฒญํฌ๊ฐ ์์ต๋๋ค.")
        return

    # 4. Embed in batches and write each vector back to its row.
    print(f"\n๐ ์๋ฒ ๋ฉ ์์ฑ ์ค... (๋ฐฐ์น ํฌ๊ธฐ: {batch_size})")
    total_processed = 0
    total_errors = 0
    for start in range(0, total, batch_size):
        batch = chunks[start:start + batch_size]
        batch_contents = [c["content"] for c in batch]
        batch_ids = [c["chunk_id"] for c in batch]
        done = start + len(batch)
        try:
            embeddings = embedder.embed_texts(batch_contents)
            # zip() would silently drop trailing chunks on a short result;
            # treat a count mismatch as a failure of the whole batch instead.
            if len(embeddings) != len(batch):
                raise RuntimeError(
                    f"expected {len(batch)} embeddings, got {len(embeddings)}"
                )
            for chunk_id, embedding in zip(batch_ids, embeddings):
                if not dry_run:
                    try:
                        client.table("kb_chunks").update({
                            "embedding": embedding
                        }).eq("chunk_id", chunk_id).execute()
                    except Exception as e:
                        # One failed row must not abort the rest of the batch.
                        print(f" โ ๏ธ {chunk_id} ์ ๋ฐ์ดํธ ์คํจ: {e}")
                        total_errors += 1
                        continue
                total_processed += 1
            # Progress report.
            progress = done / total * 100
            print(f" {done:4d}/{total} ({progress:5.1f}%) ์๋ฃ")
        except Exception as e:
            print(f" โ ๏ธ ๋ฐฐ์น {start // batch_size + 1} ์ฒ๋ฆฌ ์คํจ: {e}")
            total_errors += len(batch)

    # 5. Summary.
    print("\n" + "=" * 60)
    print("๐ ์ฌ์๋ฒ ๋ฉ ์๋ฃ")
    print(f" ์ฒ๋ฆฌ๋จ: {total_processed}๊ฐ")
    print(f" ์ค๋ฅ: {total_errors}๊ฐ")
    if dry_run:
        print("\nโ ๏ธ DRY-RUN ์๋ฃ. ์ค์ ์ ์ฅํ๋ ค๋ฉด --dry-run ์ต์ ์ ์ ๊ฑฐํ์ธ์.")
    else:
        print("\nโ bge-m3 ์ฌ์๋ฒ ๋ฉ ์๋ฃ!")

    # 6. Sanity-check search, only after a real run that wrote something.
    if not dry_run and total_processed > 0:
        _verify_search(client, embedder)


def _fetch_chunks(client, limit, page_size=1000):
    """Return ``chunk_id``/``content`` rows from ``kb_chunks``, paging past the server row cap."""
    cap = limit if limit and limit > 0 else None
    rows = []
    while cap is None or len(rows) < cap:
        want = page_size if cap is None else min(page_size, cap - len(rows))
        offset = len(rows)
        # A stable order keeps pages from overlapping or skipping rows.
        # range() bounds are inclusive on both ends.
        page = (
            client.table("kb_chunks")
            .select("chunk_id, content")
            .order("chunk_id")
            .range(offset, offset + want - 1)
            .execute()
        ).data or []
        if not page:
            break
        # Advance by what was actually returned, so a server-side cap that is
        # smaller than page_size cannot be mistaken for the last page.
        rows.extend(page)
    return rows


def _verify_search(client, embedder):
    """Run one sample ``match_kb_chunks`` RPC query and print the top hits."""
    print("\n๐งช ๊ฒ์ฆ ํ ์คํธ...")
    # Korean-language sample query.
    test_query = "์จ์คํด ํ๋ฅด๋์ค"
    query_embedding = embedder.embed_query(test_query)
    try:
        search_result = client.rpc(
            "match_kb_chunks",
            {
                "query_embedding": query_embedding,
                "match_threshold": 0.3,
                "match_count": 3,
                "filter_chain": None,
            },
        ).execute()
        print(f" ๊ฒ์์ด: {test_query}")
        if search_result.data:
            for r in search_result.data[:3]:
                sim = r.get("similarity", 0)
                # `content` may be present but null; avoid slicing None, which
                # would be misreported below as an RPC problem.
                content = (r.get("content") or "")[:50].replace("\n", " ")
                print(f" - {sim:.4f}: {content}...")
        else:
            print(" โ ๏ธ ๊ฒ์ ๊ฒฐ๊ณผ ์์ (RPC ํจ์ ์ ๋ฐ์ดํธ ํ์ํ ์ ์์)")
    except Exception as e:
        print(f" โ ๏ธ ๊ฒ์ ํ ์คํธ ์คํจ: {e}")
        print(" โ Supabase์์ match_kb_chunks ํจ์๋ฅผ 1024์ฐจ์์ผ๋ก ์ ๋ฐ์ดํธํ์ธ์.")
# Script entry point: run the re-embedding job when executed directly.
# NOTE(review): `main` is called with no arguments although its signature has
# three required parameters; presumably it is wrapped as a click command whose
# decorators are not visible in this view -- TODO confirm.
if __name__ == "__main__":
    main()