File size: 2,595 Bytes
f6b05db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Vector DB ์ƒํƒœ ํ™•์ธ ์Šคํฌ๋ฆฝํŠธ

์ธ๋ฑ์‹ฑ์ด ์™„๋ฃŒ๋œ ํ›„ ๋ฒกํ„ฐ DB์˜ ๋‚ด์šฉ์„ ํ™•์ธํ•ฉ๋‹ˆ๋‹ค.
"""

import sys
from pathlib import Path

# Put the project root (the parent of this script's directory) at the front of
# sys.path; presumably so the `services` and `utils` packages imported below
# resolve when this file is run directly as a script.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv
from services.vector_store import VectorStore
from utils.config import settings


def main():
    """Vector DB ์ƒํƒœ ํ™•์ธ"""
    load_dotenv()

    print("=" * 80)
    print("Vector DB ์ƒํƒœ ํ™•์ธ")
    print("=" * 80)

    # Vector Store ์ดˆ๊ธฐํ™”
    vector_store = VectorStore(
        persist_directory=settings.chroma_persist_directory,
        collection_name=settings.collection_name
    )

    # ํ†ต๊ณ„ ์ •๋ณด
    stats = vector_store.get_collection_stats()
    print(f"\n๐Ÿ“Š ๊ธฐ๋ณธ ์ •๋ณด:")
    print(f"   ์ปฌ๋ ‰์…˜๋ช…: {stats['collection_name']}")
    print(f"   ์ €์žฅ ๊ฒฝ๋กœ: {stats['persist_directory']}")
    print(f"   ์ „์ฒด ๋ฌธ์„œ: {stats['total_documents']}๊ฐœ")
    print(f"   ๋ฐ์ดํ„ฐ ์กด์žฌ: {'โœ… ์˜ˆ' if stats['has_data'] else 'โŒ ์•„๋‹ˆ์˜ค'}")

    if not stats['has_data']:
        print("\nโš ๏ธ Vector DB๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค!")
        print("   python scripts/index_pdfs.py ๋ฅผ ๋จผ์ € ์‹คํ–‰ํ•˜์„ธ์š”.")
        return

    # ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ ํ™•์ธ
    print(f"\n๐Ÿ“š ์ƒ˜ํ”Œ ๋ฌธ์„œ ํ™•์ธ:")
    sample = vector_store.collection.peek(limit=3)

    for i, (doc_id, doc, metadata) in enumerate(zip(
        sample['ids'],
        sample['documents'],
        sample['metadatas']
    ), 1):
        print(f"\n[{i}] {doc_id}")
        print(f"   ์ถœ์ฒ˜: {metadata.get('source_filename', 'unknown')}")
        print(f"   ์ œ๋ชฉ: {metadata.get('title', 'N/A')}")
        print(f"   ์ €์ž: {metadata.get('author', 'N/A')}")
        print(f"   ๋‚ด์šฉ: {doc[:150]}...")

    # ๊ฐ„๋‹จํ•œ ๊ฒ€์ƒ‰ ํ…Œ์ŠคํŠธ
    print(f"\n๐Ÿ” ๊ฒ€์ƒ‰ ํ…Œ์ŠคํŠธ:")
    test_query = "financial crisis"
    print(f"   ์ฟผ๋ฆฌ: '{test_query}'")

    results = vector_store.search_by_text(test_query, top_k=3)

    print(f"   ๊ฒฐ๊ณผ: {len(results['documents'])}๊ฐœ ๋ฌธ์„œ ๋ฐœ๊ฒฌ")
    for i, (doc, metadata, distance) in enumerate(zip(
        results['documents'],
        results['metadatas'],
        results['distances']
    ), 1):
        similarity = 1 - distance
        print(f"\n   [{i}] {metadata.get('source_filename', 'unknown')}")
        print(f"       ์œ ์‚ฌ๋„: {similarity:.3f}")
        print(f"       ๋‚ด์šฉ: {doc[:100]}...")

    print("\n" + "=" * 80)
    print("โœ… Vector DB ํ™•์ธ ์™„๋ฃŒ!")
    print("=" * 80)


# Run the status check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()