File size: 10,342 Bytes
64ad66f
 
 
 
 
 
 
 
 
 
 
 
 
 
622f700
64ad66f
 
 
6fecdf0
f2d1de7
 
 
 
6fecdf0
 
64ad66f
 
 
 
 
7f57ffc
64ad66f
 
 
 
 
7f57ffc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64ad66f
 
 
 
 
c64138a
64ad66f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c64138a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64ad66f
 
 
c64138a
64ad66f
 
c64138a
64ad66f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47e7138
64ad66f
47e7138
 
 
ff0395c
 
47e7138
ff0395c
47e7138
 
 
64ad66f
 
47e7138
64ad66f
47e7138
 
 
64ad66f
 
47e7138
64ad66f
47e7138
 
 
64ad66f
 
 
 
 
 
47e7138
64ad66f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""
smoke_test_rag.py β€” GraphRAG 3λŒ€ μ‹œλ‚˜λ¦¬μ˜€ ν˜„μž₯ 검증 슀크립트
=============================================================
지원동기 μž‘μ„± 지원 μ±—λ΄‡μœΌλ‘œμ„œμ˜ μ„œλΉ„μŠ€ λͺ©μ μ„ κ²€μ¦ν•©λ‹ˆλ‹€.

μ‹œλ‚˜λ¦¬μ˜€:
  1. νŠΉμ • κΈ°μ—…  - "카카였의 AI μ„œλΉ„μŠ€ νŠΈλ Œλ“œλŠ”?"
  2. νŠΉμ • 기술  - "LLM κΈ°μˆ μ„ κ°œλ°œν•˜λŠ” 기업듀은?"
  3. 전체 νŠΈλ Œλ“œ - "금육AI λΆ„μ•Όμ—μ„œ κ°€μž₯ 적극적인 κΈ°μ—… TOP 3와 λŒ€ν‘œ μ„œλΉ„μŠ€"

μ‹€ν–‰ 방법:
    python3 tests/smoke_test_rag.py
"""

import io
import os
import sys
import time

# Put the project root (the parent of this file's directory) on sys.path so
# project packages can be imported when the script is run directly, avoiding
# ModuleNotFoundError.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))


# Force UTF-8 on stdout so printing emoji does not raise UnicodeEncodeError on
# Windows consoles that default to cp949. reconfigure() switches the encoding
# in place and keeps the stream's existing buffering settings; wrapping
# sys.stdout.buffer in a brand-new TextIOWrapper (the fallback below) creates a
# stream with line buffering off, which can delay progress output.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

import dotenv

# Load environment variables (e.g. the NEO4J_* settings read below) from .env.
dotenv.load_dotenv()


# -- 0. Graph composition pre-check (Neo4j node/relationship statistics) -------
def check_graph_structure():
    """Print Neo4j graph statistics and abort the run if the graph looks unusable.

    Connects to the database at ``NEO4J_URI`` (default
    ``neo4j://localhost:7687``). When both ``NEO4J_CLIENT_ID`` and
    ``NEO4J_CLIENT_SECRET`` are set they are tried first; if that attempt
    fails, or they are not set, ``NEO4J_USERNAME`` / ``NEO4J_PASSWORD`` are
    used instead.

    The following are printed and checked:
      * node counts per label and MENTIONS / DEVELOPS relationship counts
        (any zero count fails the check),
      * counts of direct entity-to-entity relationship types (a total of zero
        fails the check),
      * the share of isolated non-Content nodes (reported only, never fails),
      * average entity relationships per Article (below 3.0 fails the check).

    The driver is always closed, including when a query or the connectivity
    check raises.

    Raises:
        SystemExit: with status 1 when any failing condition above is met.
        Exception: whatever the driver raises if the fallback connection or a
            query fails.
    """
    import neo4j

    def _count(session, cypher):
        """Run a single-row ``... as cnt`` query; a missing row counts as 0."""
        record = session.run(cypher).single()
        return record["cnt"] if record else 0

    uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
    client_id = os.getenv("NEO4J_CLIENT_ID")
    client_secret = os.getenv("NEO4J_CLIENT_SECRET")

    driver = None
    if client_id and client_secret:
        try:
            driver = neo4j.GraphDatabase.driver(uri, auth=(client_id, client_secret))
            driver.verify_connectivity()
        except Exception:
            # Best-effort attempt: release the failed driver (if it was
            # created) and fall back to username/password below.
            if driver is not None:
                driver.close()
            driver = None

    if driver is None:
        username = os.getenv("NEO4J_USERNAME", "neo4j")
        password = os.getenv("NEO4J_PASSWORD", "password")
        driver = neo4j.GraphDatabase.driver(uri, auth=(username, password))
        try:
            driver.verify_connectivity()
        except Exception:
            # Do not leak the driver when the fallback credentials fail too.
            driver.close()
            raise

    try:
        print("\n" + "=" * 60)
        print("πŸ“Š [사전 점검] Neo4j κ·Έλž˜ν”„ ꡬ성 ν˜„ν™©")
        print("=" * 60)

        # -- Node and basic relationship counts --------------------------------
        queries = {
            "Article (기사)":        "MATCH (n:Article) RETURN count(n) as cnt",
            "AICompany (κΈ°μ—…)":      "MATCH (n:AICompany) RETURN count(n) as cnt",
            "AITechnology (기술)":   "MATCH (n:AITechnology) RETURN count(n) as cnt",
            "AIService (μ„œλΉ„μŠ€)":    "MATCH (n:AIService) RETURN count(n) as cnt",
            "AIField (λΆ„μ•Ό)":        "MATCH (n:AIField) RETURN count(n) as cnt",
            "Content (청크+벑터)":   "MATCH (n:Content) RETURN count(n) as cnt",
            "MENTIONS 관계":         "MATCH ()-[r:MENTIONS]->() RETURN count(r) as cnt",
            "DEVELOPS 관계":         "MATCH ()-[r:DEVELOPS]->() RETURN count(r) as cnt",
        }

        all_ok = True
        # A single session is reused for every read-only count query.
        with driver.session() as s:
            for label, cypher in queries.items():
                cnt = _count(s, cypher)
                status = "βœ…" if cnt > 0 else "⚠️ λΉ„μ–΄μžˆμŒ"
                if cnt == 0:
                    all_ok = False
                print(f"  {status}  {label}: {cnt}개")

            # -- In-depth check of direct entity-to-entity relationships ------
            print()
            print("  [μ—”ν‹°ν‹° κ°„ 직접 관계 μ—°κ²°μ„± 점검]")
            entity_rel_types = ["DEVELOPS", "INVESTS_IN", "PARTNERS_WITH", "APPLIES", "USED_IN", "RELATED_TO"]
            total_entity_rels = 0
            for rel_type in entity_rel_types:
                # rel_type comes from the fixed list above, never from user input.
                cnt = _count(s, f"MATCH ()-[r:{rel_type}]->() RETURN count(r) as cnt")
                total_entity_rels += cnt
                status = "βœ…" if cnt > 0 else "⚠️"
                print(f"    {status} {rel_type}: {cnt}개")

            # Share of isolated nodes (no relationships at all), Content excluded.
            isolated = _count(
                s, "MATCH (n) WHERE NOT (n)--() AND NOT n:Content RETURN count(n) as cnt"
            )
            total_nodes = _count(
                s, "MATCH (n) WHERE NOT n:Content RETURN count(n) as cnt"
            )
            article_cnt = _count(s, "MATCH (n:Article) RETURN count(n) as cnt")

        isolation_rate = (isolated / total_nodes * 100) if total_nodes > 0 else 0
        iso_status = "βœ…" if isolation_rate < 20 else "⚠️ 고립 λ…Έλ“œ κ³Όλ‹€"
        print(f"\n    {iso_status} 고립 λ…Έλ“œ(Content μ œμ™Έ): {isolated}개 / 전체: {total_nodes}개 ({isolation_rate:.1f}%)")
        print(f"    μ—”ν‹°ν‹° κ°„ 직접 관계 합계: {total_entity_rels}개")

        # No entity-to-entity relationships at all is treated as a failure.
        if total_entity_rels == 0:
            print("\n  β›” μ—”ν‹°ν‹° κ°„ 직접 관계(DEVELOPS/APPLIES λ“±)κ°€ 0κ°œμž…λ‹ˆλ‹€. finGraph.py μž¬μ‹€ν–‰ ν•„μš”.")
            all_ok = False

        # Minimum density: at least 3.0 direct entity relationships per article
        # (the same figure that the message below reports as the recommendation).
        if article_cnt > 0:
            rels_per_article = total_entity_rels / article_cnt
            threshold_ok = rels_per_article >= 3.0
            t_status = "βœ…" if threshold_ok else "⚠️ 관계 밀도 λΆ€μ‘±"
            print(f"    {t_status} 기사당 평균 μ—”ν‹°ν‹° 관계: {rels_per_article:.1f}개 (ꢌ고: 3.0개 이상)")
            if not threshold_ok:
                all_ok = False
    finally:
        # Release the connection pool on success and on any failure above.
        driver.close()

    print()
    if not all_ok:
        print("β›” 일뢀 λ…Έλ“œ/관계가 λΉ„μ–΄μžˆκ±°λ‚˜ 연결성이 λΆ€μ‘±ν•©λ‹ˆλ‹€. finGraph.py μ‹€ν–‰μœΌλ‘œ κ·Έλž˜ν”„λ₯Ό μ±„μ›Œμ£Όμ„Έμš”.\n")
        sys.exit(1)
    else:
        print("βœ… κ·Έλž˜ν”„ ꡬ성 및 μ—°κ²°μ„± 정상 β€” RAG ν…ŒμŠ€νŠΈλ₯Ό μ‹œμž‘ν•©λ‹ˆλ‹€.\n")


# -- 1. GraphRAG answer quality verification ----------------------------------
def run_scenario(label: str, query: str, expected_keywords: list[str]):
    """Run one GraphRAG query, print the answer and score it heuristically.

    Args:
        label: Human-readable scenario name shown in the banner.
        query: Question passed to ``graphrag.search`` as ``query_text``.
        expected_keywords: Keywords looked up in the answer; hits and misses
            are reported but do not affect the verdict.

    Returns:
        True when the stripped answer is longer than 50 characters and
        contains at least one source indicator; False otherwise.
    """
    from src.retrieval.finRetrieval import graphrag

    banner = "=" * 60
    print(banner)
    print(f"πŸ” μ‹œλ‚˜λ¦¬μ˜€: {label}")
    print(f"   질문: {query}")
    print(banner)

    # Time only the search call itself.
    start = time.time()
    result = graphrag.search(query_text=query)
    elapsed = time.time() - start

    # A missing result or an empty answer is normalised to "".
    answer = ""
    if result and result.answer:
        answer = result.answer

    print(f"\nπŸ“ GraphRAG 응닡 ({elapsed:.1f}초):\n")
    print(answer)

    # Quality checks
    print("\nπŸ”Ž ν’ˆμ§ˆ 체크:")

    # 1) Is the answer non-trivially long?
    long_enough = len(answer.strip()) > 50
    if not long_enough:
        print(f"  ❌ 응닡이 λ„ˆλ¬΄ 짧음 ({len(answer.strip())}자)")
    else:
        print("  βœ… 응닡 길이 μΆ©λΆ„ (50자 이상)")

    # 2) Which expected keywords appear (informational only)?
    found, missing = [], []
    for keyword in expected_keywords:
        bucket = found if keyword in answer else missing
        bucket.append(keyword)
    if found:
        print(f"  βœ… 핡심 ν‚€μ›Œλ“œ 포함: {found}")
    if missing:
        print(f"  ⚠️  미포함 ν‚€μ›Œλ“œ: {missing}")

    # 3) Does the answer mention any source/evidence marker?
    source_indicators = ["기사", "좜처", "λ‰΄μŠ€", "보도", "λ”°λ₯΄λ©΄", "λ°œν‘œ", "http"]
    has_source = False
    for indicator in source_indicators:
        if indicator in answer:
            has_source = True
            break
    if not has_source:
        print("  ⚠️  좜처/κ·Όκ±° ν‘œκΈ° μ—†μŒ (RAG μ‘λ‹΅μ΄μ§€λ§Œ κ·Όκ±°κ°€ 뢈λͺ…ν™•)")
    else:
        print("  βœ… 좜처/κ·Όκ±° ν‘œκΈ° 있음")

    # Only the length and source checks decide the verdict.
    all_pass = long_enough and has_source

    if all_pass:
        overall = "βœ… PASS"
    else:
        overall = "⚠️  PARTIAL (κ°œμ„  μ—¬μ§€ 있음)"
    print(f"\n  β†’ μ΅œμ’… νŒμ •: {overall}")
    print()
    return all_pass


# -- Main entry point -----------------------------------------------------------
if __name__ == "__main__":
    # 0. Pre-check the graph; this exits the process if the graph is unusable.
    check_graph_structure()

    # One row per scenario:
    # (banner label, query text, expected keywords, short label for the summary)
    scenarios = (
        # Scenario 1: Shinhan Bank AI SOL portfolio
        (
            "β‘  μ‹ ν•œμ€ν–‰ β€” μ‹ ν•œμ€ν–‰μ˜ 'μ‹ ν•œ AI 쏠 포트폴리였' λ‘œλ³΄μ–΄λ“œλ°”μ΄μ € 기술과 개인 λ§žμΆ€ν˜• μ„œλΉ„μŠ€μ˜ νŠΉμ§•μ„ μ„€λͺ…ν•΄μ€˜",
            "μ‹ ν•œμ€ν–‰μ˜ 'μ‹ ν•œ AI 쏠 포트폴리였' λ‘œλ³΄μ–΄λ“œλ°”μ΄μ € 기술과 개인 λ§žμΆ€ν˜• μ„œλΉ„μŠ€μ˜ νŠΉμ§•μ„ μ„€λͺ…ν•΄μ€˜",
            ["μ‹ ν•œ", "λ‘œλ³΄μ–΄λ“œλ°”μ΄μ €"],
            "β‘  μ‹ ν•œ AI 쏠 포트폴리였",
        ),
        # Scenario 2: KakaoPay AI alternative credit scoring
        (
            "β‘‘ 카카였페이 β€” μΉ΄μΉ΄μ˜€νŽ˜μ΄κ°€ μ”¬νŒŒμΌλŸ¬λ₯Ό μœ„ν•΄ κ°œλ°œν•œ 'AI λŒ€μ•ˆμ‹ μš©ν‰κ°€' λͺ¨λΈμ˜ μž₯점과 λŒ€μΆœ 승인 νš¨κ³ΌλŠ” λ¬΄μ—‡μΈκ°€μš”?",
            "μΉ΄μΉ΄μ˜€νŽ˜μ΄κ°€ μ”¬νŒŒμΌλŸ¬λ₯Ό μœ„ν•΄ κ°œλ°œν•œ 'AI λŒ€μ•ˆμ‹ μš©ν‰κ°€' λͺ¨λΈμ˜ μž₯점과 λŒ€μΆœ 승인 νš¨κ³ΌλŠ” λ¬΄μ—‡μΈκ°€μš”?",
            ["카카였페이", "λŒ€μ•ˆμ‹ μš©ν‰κ°€"],
            "β‘‘ 카카였페이 AI μ‹ μš©ν‰κ°€",
        ),
        # Scenario 3: Toss Bank AI FDS
        (
            "β‘’ ν† μŠ€λ±…ν¬ β€” ν† μŠ€λ±…ν¬μ˜ μ‹€μ‹œκ°„ λ³΄μ΄μŠ€ν”Όμ‹± 탐지 기술인 'ν† μŠ€ AI FDS'의 μž‘λ™ 원리와 μ°¨λ‹¨μœ¨μ„ μ•Œλ €μ€˜",
            "ν† μŠ€λ±…ν¬μ˜ μ‹€μ‹œκ°„ λ³΄μ΄μŠ€ν”Όμ‹± 탐지 기술인 'ν† μŠ€ AI FDS'의 μž‘λ™ 원리와 μ°¨λ‹¨μœ¨μ„ μ•Œλ €μ€˜",
            ["ν† μŠ€", "FDS"],
            "β‘’ ν† μŠ€ AI FDS",
        ),
        # Scenario 4: Naver Pay AI financial assistant
        (
            "β‘£ λ„€μ΄λ²„νŽ˜μ΄ β€” λ„€μ΄λ²„νŽ˜μ΄κ°€ μΆœμ‹œν•œ 'AI 금육 λΉ„μ„œ'κ°€ λ§ˆμ΄λ°μ΄ν„°μ™€ κ²°ν•©ν•˜μ—¬ μ œκ³΅ν•˜λŠ” 맞좀 μžμ‚° κ°€μ΄λ“œλŠ” μ–΄λ–€ κ²ƒμΈκ°€μš”?",
            "λ„€μ΄λ²„νŽ˜μ΄κ°€ μΆœμ‹œν•œ 'AI 금육 λΉ„μ„œ'κ°€ λ§ˆμ΄λ°μ΄ν„°μ™€ κ²°ν•©ν•˜μ—¬ μ œκ³΅ν•˜λŠ” 맞좀 μžμ‚° κ°€μ΄λ“œλŠ” μ–΄λ–€ κ²ƒμΈκ°€μš”?",
            ["λ„€μ΄λ²„νŽ˜μ΄", "λ§ˆμ΄λ°μ΄ν„°"],
            "β‘£ λ„€μ΄λ²„νŽ˜μ΄ AI 금육 λΉ„μ„œ",
        ),
    )

    # Run the scenarios in order and keep each boolean verdict.
    results = []
    for banner_label, question, keywords, _summary_label in scenarios:
        verdict = run_scenario(
            label=banner_label,
            query=question,
            expected_keywords=keywords,
        )
        results.append(verdict)

    # Final summary
    divider = "=" * 60
    print(divider)
    print("πŸ“‹ μ΅œμ’… μš”μ•½")
    print(divider)
    labels = [row[3] for row in scenarios]
    for label, passed in zip(labels, results):
        print(f"  {'βœ… PASS' if passed else '⚠️  PARTIAL'} | {label}")
    print()
    # True counts as 1, so the sum is the number of fully passing scenarios.
    pass_count = sum(results)
    print(f"  총 {pass_count}/{len(results)}개 μ‹œλ‚˜λ¦¬μ˜€ μ™„μ „ 톡과")