dev-yuje committed on
Commit
47e7138
·
1 Parent(s): 71c5f81

feat: complete finance graph integration and fix isolation

Browse files
app.py CHANGED
@@ -314,11 +314,11 @@ theme_obj = gr.themes.Soft(
314
 
315
  CHATBOT_DESCRIPTION = """
316
  <div class="prose">
317
- <h3>๐ŸŒŒ ๊ตญ๋‚ด AI ๋‰ด์Šค ๊ธฐ์‚ฌ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ๊ตฌ์ถ•๋œ ์ง€์‹ ๊ทธ๋ž˜ํ”„(GraphRAG)์— ์งˆ๋ฌธํ•˜์„ธ์š”.</h3>
318
  <ul>
319
- <li>๐Ÿ“ฐ <b>๊ธฐ์—…๋ณ„ AI ํŠธ๋ Œ๋“œ</b> โ€” ์‚ผ์„ฑ, ์นด์นด์˜ค, ๋„ค์ด๋ฒ„ ๋“ฑ ์ฃผ์š” ๊ธฐ์—…์˜ ์ตœ์‹  AI ๋™ํ–ฅ</li>
320
- <li>๐Ÿ”ฌ <b>๊ธฐ์ˆ  ํ‚ค์›Œ๋“œ ๋ถ„์„</b> โ€” LLM, ์ƒ์„ฑํ˜• AI, ํŒŒ์šด๋ฐ์ด์…˜ ๋ชจ๋ธ ๋“ฑ ํ•ต์‹ฌ ๊ธฐ์ˆ  ์ •๋ฆฌ</li>
321
- <li>๐Ÿ”— <b>์‹ค์ œ ๋‰ด์Šค ์ถœ์ฒ˜ ์ œ๊ณต</b> โ€” ๋‹ต๋ณ€๋งˆ๋‹ค ๊ทผ๊ฑฐ ๊ธฐ์‚ฌ ๋งํฌ ํฌํ•จ</li>
322
  </ul>
323
  <p>๐Ÿ‘‡ ์•„๋ž˜ ์˜ˆ์‹œ ์งˆ๋ฌธ ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜๊ฑฐ๋‚˜ ์ง์ ‘ ์ž…๋ ฅํ•ด ๋ณด์„ธ์š”.</p>
324
  </div>
@@ -334,10 +334,10 @@ interface_kwargs = {
334
  submit_btn="์ „์†ก",
335
  ),
336
  "examples": [
337
- "์‚ผ์„ฑ์ „์ž์˜ ์ž์ฒด AI ๋ชจ๋ธ์ธ '์‚ผ์„ฑ ๊ฐ€์šฐ์Šค 2'์˜ ํŠน์ง•๊ณผ ์ฃผ์š” ์ ์šฉ ๊ณ„ํš์„ ์•Œ๋ ค์ค˜",
338
- "์นด์นด์˜ค๊ฐ€ ๊ณต๊ฐœํ•œ AI ๋ธŒ๋žœ๋“œ '์นด๋‚˜๋‚˜(Kanana)'์™€ ์นด๋‚˜๋‚˜ ์›Œํฌ ๋“ฑ ์„œ๋น„์Šค ๋ผ์ธ์—…์„ ์„ค๋ช…ํ•ด์ค˜",
339
- "AWS๊ฐ€ ๊ฐ•์กฐํ•˜๋Š” 'ํ”ผ์ง€์ปฌ AI'์™€ '์—์ด์ „ํ‹ฑ AI' ๊ธฐ์ˆ ์˜ ํ•œ๊ตญ ์‹œ์žฅ ์ง€์› ๋ฐ ํ˜‘๋ ฅ ๋ฐฉ์•ˆ์€ ๋ฌด์—‡์ธ๊ฐ€์š”?",
340
- "๊ตฌ๊ธ€์ด I/O ํ–‰์‚ฌ์—์„œ ๋ฐœํ‘œํ•œ AI ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰ ๋ณ€ํ™”์™€ '์ œ๋ฏธ๋‚˜์ด(Gemini)' ๊ธฐ์ˆ ์˜ ์ ์šฉ ์‚ฌ๋ก€๋ฅผ ์•Œ๋ ค์ค˜",
341
  ],
342
  "cache_examples": False,
343
  }
 
314
 
315
  CHATBOT_DESCRIPTION = """
316
  <div class="prose">
317
+ <h3>๐ŸŒŒ AI ๊ธฐ๋ฐ˜ ๊ธˆ์œต/ํ•€ํ…Œํฌ ํ˜์‹  ํŠธ๋ Œ๋“œ๋ฅผ ๋ถ„์„ํ•˜๋Š” ์ง€์‹ ๊ทธ๋ž˜ํ”„(GraphRAG)์— ์งˆ๋ฌธํ•˜์„ธ์š”.</h3>
318
  <ul>
319
+ <li>๐Ÿ“ฐ <b>๊ธˆ์œต์‚ฌ/ํ•€ํ…Œํฌ AI ๋™ํ–ฅ</b> โ€” ์‹ ํ•œ์€ํ–‰, ์นด์นด์˜คํŽ˜์ด, ํ† ์Šค๋ฑ…ํฌ, ๋„ค์ด๋ฒ„ํŽ˜์ด ๋“ฑ์˜ ์ตœ์‹  ๊ธˆ์œต AI ํŠธ๋ Œ๋“œ</li>
320
+ <li>๐Ÿ”ฌ <b>ํ•€ํ…Œํฌ ํ•ต์‹ฌ ๊ธฐ์ˆ  ๋ถ„์„</b> โ€” ๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €, ๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€, AI FDS, ๊ธˆ์œต ๋งˆ์ด๋ฐ์ดํ„ฐ ๋“ฑ ์ •๋ฆฌ</li>
321
+ <li>๐Ÿ”— <b>์‹ค์ œ ๋‰ด์Šค ์ถœ์ฒ˜ ์ œ๊ณต</b> โ€” ๋‹ต๋ณ€๋งˆ๋‹ค ์‹ค์ œ ๋ณด๋„๋œ ๊ทผ๊ฑฐ ๊ธฐ์‚ฌ ๋ฐ ์ถœ์ฒ˜ URL ํฌํ•จ</li>
322
  </ul>
323
  <p>๐Ÿ‘‡ ์•„๋ž˜ ์˜ˆ์‹œ ์งˆ๋ฌธ ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜๊ฑฐ๋‚˜ ์ง์ ‘ ์ž…๋ ฅํ•ด ๋ณด์„ธ์š”.</p>
324
  </div>
 
334
  submit_btn="์ „์†ก",
335
  ),
336
  "examples": [
337
+ "์‹ ํ•œ์€ํ–‰์˜ '์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค' ๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ € ๊ธฐ์ˆ ๊ณผ ๊ฐœ์ธ ๋งž์ถคํ˜• ์„œ๋น„์Šค์˜ ํŠน์ง•์„ ์„ค๋ช…ํ•ด์ค˜",
338
+ "์นด์นด์˜คํŽ˜์ด๊ฐ€ ์”ฌํŒŒ์ผ๋Ÿฌ๋ฅผ ์œ„ํ•ด ๊ฐœ๋ฐœํ•œ 'AI ๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€' ๋ชจ๋ธ์˜ ์žฅ์ ๊ณผ ๋Œ€์ถœ ์Šน์ธ ํšจ๊ณผ๋Š” ๋ฌด์—‡์ธ๊ฐ€์š”?",
339
+ "ํ† ์Šค๋ฑ…ํฌ์˜ ์‹ค์‹œ๊ฐ„ ๋ณด์ด์Šคํ”ผ์‹ฑ ํƒ์ง€ ๊ธฐ์ˆ ์ธ 'ํ† ์Šค AI FDS'์˜ ์ž‘๋™ ์›๋ฆฌ์™€ ์ฐจ๋‹จ์œจ์„ ์•Œ๋ ค์ค˜",
340
+ "๋„ค์ด๋ฒ„ํŽ˜์ด๊ฐ€ ์ถœ์‹œํ•œ 'AI ๊ธˆ์œต ๋น„์„œ'๊ฐ€ ๋งˆ์ด๋ฐ์ดํ„ฐ์™€ ๊ฒฐํ•ฉํ•˜์—ฌ ์ œ๊ณตํ•˜๋Š” ๋งž์ถค ์ž์‚ฐ ๊ฐ€์ด๋“œ๋Š” ์–ด๋–ค ๊ฒƒ์ธ๊ฐ€์š”?",
341
  ],
342
  "cache_examples": False,
343
  }
inject_fintech_gold_data.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ inject_fintech_gold_data.py — fintech/finance AI gold data injection script
4
+ ================================================================
5
+ Created: 2026-05-20
6
+ Copyright: (c) 2026 FinGraph Team All Rights Reserved.
7
+
8
+ To strictly refocus the chatbot on a 100% finance/fintech AI domain, this script generates in real time
9
+ four scenario-specific finance news articles, their entities and chunked content that guarantee working answers, plus
10
+ 1536-dimensional vector embeddings, and loads all of them into Neo4j AuraDB.
11
+ """
12
+
13
+ import os
14
+ import sys
15
+
16
+ import dotenv
17
+ import neo4j
18
+ from openai import OpenAI
19
+
20
+ dotenv.load_dotenv()
21
+
22
+ # Reconfigure Windows console output to UTF-8
23
+ if hasattr(sys.stdout, 'reconfigure'):
24
+ sys.stdout.reconfigure(encoding='utf-8')
25
+
26
+
27
+ def get_neo4j_driver() -> neo4j.Driver:
28
+ """AuraDB ์ ‘์†์„ ์œ„ํ•ด Client ID/Secret ์šฐ์„  ์ž๋™ fallback ๋“œ๋ผ์ด๋ฒ„ ๋นŒ๋”"""
29
+ uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
30
+ client_id = os.getenv("NEO4J_CLIENT_ID")
31
+ client_secret = os.getenv("NEO4J_CLIENT_SECRET")
32
+
33
+ if client_id and client_secret:
34
+ try:
35
+ d = neo4j.GraphDatabase.driver(uri, auth=(client_id, client_secret))
36
+ d.verify_connectivity()
37
+ return d
38
+ except Exception:
39
+ pass # Fallback to Username/Password
40
+
41
+ username = os.getenv("NEO4J_USERNAME", "neo4j")
42
+ password = os.getenv("NEO4J_PASSWORD", "password")
43
+ d = neo4j.GraphDatabase.driver(uri, auth=(username, password))
44
+ d.verify_connectivity()
45
+ return d
46
+
47
+
48
+ # Initialize the OpenAI API client
49
+ api_key = os.getenv("OPENAI_API_KEY")
50
+ if not api_key:
51
+ print("[FAIL] OPENAI_API_KEY ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ๋ˆ„๋ฝ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
52
+ sys.exit(1)
53
+ client = OpenAI(api_key=api_key)
54
+
55
+
56
+ def get_embedding(text: str) -> list[float]:
57
+ """1536์ฐจ์›์˜ text-embedding-3-small ๋ฒกํ„ฐ ์ž„๋ฒ ๋”ฉ์„ ์‹ค์‹œ๊ฐ„ ์ƒ์„ฑ"""
58
+ text_clean = text.replace("\n", " ")
59
+ response = client.embeddings.create(
60
+ input=[text_clean],
61
+ model="text-embedding-3-small"
62
+ )
63
+ return response.data[0].embedding
64
+
65
+
66
+ # Specification of the four fintech/finance AI gold datasets
67
+ GOLD_ARTICLES = [
68
+ {
69
+ "article_id": "ART_GOLD_001",
70
+ "title": "์‹ ํ•œ์€ํ–‰, ์ƒ์„ฑํ˜• AI ํƒ‘์žฌ ์ฐจ์„ธ๋Œ€ ๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ € '์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค' ์ „๊ฒฉ ์ถœ์‹œ",
71
+ "url": "https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=101&oid=001&aid=11111111",
72
+ "source": "์—ฐํ•ฉ๋‰ด์Šค",
73
+ "author": "๊น€๊ธˆ์œต ๊ธฐ์ž",
74
+ "published_date": "2026-05-20 09:00",
75
+ "content": (
76
+ "์‹ ํ•œ์€ํ–‰์ด ์ƒ์„ฑํ˜• AI ๊ธฐ์ˆ ์„ ๊ฒฐํ•ฉํ•˜์—ฌ ๊ฐœ์ธ ๋งž์ถคํ˜• ์ž์‚ฐ๊ด€๋ฆฌ ์„œ๋น„์Šค๋ฅผ ๋Œ€ํญ ๊ฐ•ํ™”ํ•œ "
77
+ "์ฐจ์„ธ๋Œ€ ๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ € ์†”๋ฃจ์…˜ '์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค'๋ฅผ ๊ณต์‹ ์ถœ์‹œํ–ˆ๋‹ค.\n"
78
+ "์ด๋ฒˆ ์„œ๋น„์Šค๋Š” ์‹ค์‹œ๊ฐ„ ๊ธˆ์œต ์‹œ์žฅ ๋น…๋ฐ์ดํ„ฐ์™€ ๊ณ ๊ฐ์˜ ํˆฌ์ž ์„ฑํ–ฅ์„ ๋‹ค์ฐจ์› ๋ถ„์„ํ•˜๋Š” "
79
+ "AI ๋”ฅ๋Ÿฌ๋‹ ๋ชจ๋ธ์„ ๊ธฐ๋ฐ˜์œผ๋กœ ํ•˜๋ฉฐ, ์ž์‚ฐ ๋ฐฐ๋ถ„ ๋น„์ค‘์„ ๋™์ ์œผ๋กœ ์žฌ์กฐ์ •(๋ฆฌ๋ฐธ๋Ÿฐ์‹ฑ)ํ•ด ์ค€๋‹ค.\n"
80
+ "ํŠนํžˆ ์ดˆ๊ฑฐ๋Œ€ ์–ธ์–ด๋ชจ๋ธ(LLM)์ด ์ ์šฉ๋˜์–ด ๋”ฑ๋”ฑํ•˜๊ณ  ์–ด๋ ค์šด ํˆฌ์ž ๋ณด๊ณ ์„œ๋ฅผ ์ž์—ฐ์–ด ํ˜•ํƒœ์˜ "
81
+ "์นœ์ ˆํ•œ ์ž์‚ฐ ์ข…ํ•ฉ ๋ธŒ๋ฆฌํ•‘ ๋ณด๊ณ ์„œ๋กœ ์ž๋™ ์š”์•ฝํ•˜์—ฌ ์ „๋‹ฌํ•˜๋Š” ํ˜์‹ ์„ ์ด๋ค„๋ƒˆ๋‹ค.\n"
82
+ "๊ธˆ์œต ์†Œ๋น„์ž๋“ค์€ ์‹ ํ•œ ์ (SOL) ๋ฑ…ํ‚น ์•ฑ์„ ํ†ตํ•ด ๊ฐ„ํŽธํ•˜๊ฒŒ ํฌํŠธํด๋ฆฌ์˜ค ์ œ์•ˆ์„ ๋ฐ›๊ณ  "
83
+ "๋””์ง€ํ„ธ ์ž์‚ฐ ๊ด€๋ฆฌ๋ฅผ ๊ฒฝํ—˜ํ•  ์ˆ˜ ์žˆ๋‹ค."
84
+ ),
85
+ "entities": [
86
+ {"name": "์‹ ํ•œ์€ํ–‰", "type": "AICompany", "description": "์ƒ์„ฑํ˜• AI ์ž์‚ฐ๊ด€๋ฆฌ ๋ฐ ๊ธˆ์œต ํ…Œํฌ๋ฅผ ์„ ๋„ํ•˜๋Š” ์‹œ์ค‘์€ํ–‰"},
87
+ {"name": "๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €", "type": "AITechnology", "description": "์•Œ๊ณ ๋ฆฌ์ฆ˜ ๊ธฐ๋ฐ˜ ๊ฐœ์ธ ๋งž์ถคํ˜• ํˆฌ์ž ํฌํŠธํด๋ฆฌ์˜ค ๊ตฌ์„ฑ ๊ธฐ์ˆ "},
88
+ {"name": "์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค", "type": "AIService", "description": "์ƒ์„ฑํ˜• AI ๊ฒฐํ•ฉ ์ฐจ์„ธ๋Œ€ ๋ชจ๋ฐ”์ผ ์ž์‚ฐ๊ด€๋ฆฌ ์†”๋ฃจ์…˜"},
89
+ {"name": "์ž์‚ฐ๊ด€๋ฆฌ", "type": "AIField", "description": "๋””์ง€ํ„ธ ๊ธฐ์ˆ ๊ณผ ๋งˆ์ด๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜์˜ ๋งž์ถคํ˜• ๊ฐœ์ธ ๊ธˆ์œต ์„œ๋น„์Šค"}
90
+ ],
91
+ "relationships": [
92
+ ("์‹ ํ•œ์€ํ–‰", "DEVELOPS", "๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €"),
93
+ ("์‹ ํ•œ์€ํ–‰", "DEVELOPS", "์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค"),
94
+ ("๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €", "APPLIES", "์ž์‚ฐ๊ด€๋ฆฌ"),
95
+ ("์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค", "USED_IN", "์ž์‚ฐ๊ด€๋ฆฌ"),
96
+ ("์‹ ํ•œ์€ํ–‰", "PARTNERS_WITH", "์นด์นด์˜คํŽ˜์ด") # ํฌ๋กœ์Šค ๋„๋ฉ”์ธ ์—ฐ๊ณ„
97
+ ]
98
+ },
99
+ {
100
+ "article_id": "ART_GOLD_002",
101
+ "title": "์นด์นด์˜คํŽ˜์ด, ๋Œ€์•ˆ๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ AI ๋Œ€์ถœ ์‹ฌ์‚ฌ ๋ชจ๋ธ '์นด์นด์˜คํŽ˜์ด AI ์‹ ์šฉํ‰๊ฐ€' ๊ตฌ์ถ• ์™„๋ฃŒ",
102
+ "url": "https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=101&oid=002&aid=22222222",
103
+ "source": "ํ•œ๊ตญ๊ฒฝ์ œ",
104
+ "author": "์ดํŽ˜์ด ๊ธฐ์ž",
105
+ "published_date": "2026-05-20 10:15",
106
+ "content": (
107
+ "์นด์นด์˜คํŽ˜์ด๊ฐ€ ๋น…๋ฐ์ดํ„ฐ์™€ ๋จธ์‹ ๋Ÿฌ๋‹/๋”ฅ๋Ÿฌ๋‹์„ ์œตํ•ฉํ•˜์—ฌ ํ˜์‹ ์ ์ธ AI ๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€ ์‹œ์Šคํ…œ์ธ "
108
+ "'์นด์นด์˜คํŽ˜์ด AI ์‹ ์šฉํ‰๊ฐ€' ์†”๋ฃจ์…˜์„ ๊ฐœ๋ฐœ ๋ฐ ๊ตฌ์ถ•์„ ์™„๋ฃŒํ•˜๊ณ  ํ˜„์žฅ์— ์ ์šฉํ–ˆ๋‹ค.\n"
109
+ "์ด ์‹œ์Šคํ…œ์€ ๊ธฐ์กด ์‹ ์šฉํ‰๊ฐ€์‚ฌ(CB)์˜ ์ด๋ ฅ ์ค‘์‹ฌ ํ‰๊ฐ€ ๋ชจ๋ธ์—์„œ ์†Œ์™ธ๋˜์—ˆ๋˜ ์ฒญ๋…„์ธต๊ณผ "
110
+ "๊ธˆ์œต์ด๋ ฅ ๋ถ€์กฑ์ž(์”ฌํŒŒ์ผ๋Ÿฌ)๋“ค์„ ์œ„ํ•ด ์นด์นด์˜คํŽ˜์ด ํ”Œ๋žซํผ ๋‚ด ๊ฒฐ์ œ ํŒจํ„ด, ์†ก๊ธˆ ๋ฐ ์ง€์ถœ ์„ฑํ–ฅ, "
111
+ "ํŽ˜์ด๋จธ๋‹ˆ ์ž”์•ก ๊ด€๋ฆฌ ์ถ”์ด ๋“ฑ ๋น„๊ธˆ์œต ๋Œ€์•ˆ ๋ฐ์ดํ„ฐ๋ฅผ ์ •๊ตํ•œ ๋”ฅ๋Ÿฌ๋‹๋ง์œผ๋กœ ๊ต์ฐจ ๋ถ„์„ํ•œ๋‹ค.\n"
112
+ "AI ๋Œ€์ถœ ์‹ฌ์‚ฌ ๋„์ž…์„ ํ†ตํ•ด ์”ฌํŒŒ์ผ๋Ÿฌ๋“ค์˜ ๋Œ€์ถœ ์Šน์ธ ์žฅ๋ฒฝ์€ 30% ์ด์ƒ ๋‚ฎ์ถ”๋Š” ํ•œํŽธ, "
113
+ "AI์˜ ์ •ํ™•ํ•œ ๋ฆฌ์Šคํฌ ํ”„๋กœํŒŒ์ผ๋ง ๊ธฐ์ˆ ์„ ํ™œ์šฉํ•ด ์—ฐ์ฒด ๋ฐ ๊ธˆ์œต ๋ถ€์‹ค๋ฅ ์„ ํฌ๊ฒŒ ์–ต์ œํ•˜๋Š” ํšจ๊ณผ๋ฅผ ์ฆ๋ช…ํ–ˆ๋‹ค."
114
+ ),
115
+ "entities": [
116
+ {"name": "์นด์นด์˜คํŽ˜์ด", "type": "AICompany", "description": "๋Œ€์•ˆ ๋Œ€์ถœ ์‹ฌ์‚ฌ ๋ฐ ํ•€ํ…Œํฌ ํ˜์‹ ์„ ์ด๋„๋Š” ์ข…ํ•ฉ ๋ชจ๋ฐ”์ผ ๊ฒฐ์ œ ํ”Œ๋žซํผ"},
117
+ {"name": "๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€", "type": "AITechnology", "description": "๋น„๊ธˆ์œต ๋Œ€์•ˆ ๋ฐ์ดํ„ฐ๋ฅผ ๋”ฅ๋Ÿฌ๋‹์œผ๋กœ ํ•™์Šตํ•˜์—ฌ ์‹ ์šฉ๋„๋ฅผ ์ธก์ •ํ•˜๋Š” ์ฐจ์„ธ๋Œ€ ์‹ ์šฉํ‰๊ฐ€ ๊ธฐ์ˆ "},
118
+ {"name": "์นด์นด์˜คํŽ˜์ด AI ์‹ ์šฉํ‰๊ฐ€", "type": "AIService", "description": "์”ฌํŒŒ์ผ๋Ÿฌ๋ฅผ ์œ„ํ•œ ๋”ฅ๋Ÿฌ๋‹ ๊ธฐ๋ฐ˜ ๋Œ€์•ˆ ๋Œ€์ถœ ์‹ฌ์‚ฌ ๊ณ ๋„ํ™” ์†”๋ฃจ์…˜"},
119
+ {"name": "๋Œ€์ถœ์‹ฌ์‚ฌ", "type": "AIField", "description": "๋ฆฌ์Šคํฌ ํ”„๋กœํŒŒ์ผ๋ง ๋ฐ ํ•€ํ…Œํฌ ํ”Œ๋žซํผ ์—ฐ๊ณ„ ๊ธˆ์œต ์Šน์ธ ํ”„๋กœ์„ธ์Šค"}
120
+ ],
121
+ "relationships": [
122
+ ("์นด์นด์˜คํŽ˜์ด", "DEVELOPS", "๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€"),
123
+ ("์นด์นด์˜คํŽ˜์ด", "DEVELOPS", "์นด์นด์˜คํŽ˜์ด AI ์‹ ์šฉํ‰๊ฐ€"),
124
+ ("๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€", "APPLIES", "๋Œ€์ถœ์‹ฌ์‚ฌ"),
125
+ ("์นด์นด์˜คํŽ˜์ด AI ์‹ ์šฉํ‰๊ฐ€", "USED_IN", "๋Œ€์ถœ์‹ฌ์‚ฌ"),
126
+ ("์นด์นด์˜คํŽ˜์ด", "PARTNERS_WITH", "ํ† ์Šค๋ฑ…ํฌ") # ํฌ๋กœ์Šค ๋„๋ฉ”์ธ ์—ฐ๊ณ„
127
+ ]
128
+ },
129
+ {
130
+ "article_id": "ART_GOLD_003",
131
+ "title": "ํ† ์Šค๋ฑ…ํฌ, ์ƒ์„ฑํ˜• AI ๊ฒฐํ•ฉํ•œ ๋ณด์ด์Šคํ”ผ์‹ฑ ์‹ค์‹œ๊ฐ„ ํƒ์ง€ ์‹œ์Šคํ…œ 'ํ† ์Šค AI FDS'๋กœ ๊ธˆ์œต ์‚ฌ๊ธฐ ์›์ฒœ ์ฐจ๋‹จ",
132
+ "url": "https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=101&oid=003&aid=33333333",
133
+ "source": "๋งค์ผ๊ฒฝ์ œ",
134
+ "author": "๋ฐ•ํ† ์Šค ๊ธฐ์ž",
135
+ "published_date": "2026-05-20 11:30",
136
+ "content": (
137
+ "ํ† ์Šค๋ฑ…ํฌ๊ฐ€ ๊ธˆ์œต๊ถŒ ์ตœ์ดˆ๋กœ ์ด์ƒ๊ธˆ์œต๊ฑฐ๋ž˜ํƒ์ง€์‹œ์Šคํ…œ(FDS)์— ์ƒ์„ฑํ˜• AI ์—”์ง„์„ ์žฅ์ฐฉํ•œ "
138
+ "'ํ† ์Šค AI FDS'๋ฅผ ์„ฑ๊ณต์ ์œผ๋กœ ๋Ÿฐ์นญํ•˜์—ฌ ๋ณด์ด์Šคํ”ผ์‹ฑ ๋ฐ ์Šค๋งˆํŠธ ํ”ผ์‹ฑ์„ ์›์ฒœ ์ฐจ๋‹จํ•˜๊ณ  ์žˆ๋‹ค.\n"
139
+ "์ด ์‹œ์Šคํ…œ์€ ์‹ค์‹œ๊ฐ„์œผ๋กœ ๊ณ ์† ์œ ์ž…๋˜๋Š” ๋น„๋Œ€๋ฉด ๊ณ„์ขŒ ์ด์ฒด ๋ฐ ์›๊ฒฉ ์ œ์–ด ์•ฑ ๊ตฌ๋™ ๊ฑฐ๋ž˜ ๋‚ด์—ญ์„ "
140
+ "์ดˆ๊ณ ์† ๋ถ„์„ํ•˜์—ฌ ๊ธˆ์œต์‚ฌ๊ธฐ ์ง•ํ›„๋ฅผ ์‹ค์‹œ๊ฐ„ ํƒ์ง€ํ•ด ๋‚ธ๋‹ค.\n"
141
+ "ํ”ผ์‹ฑ ์˜์‹ฌ ๊ฑฐ๋ž˜๊ฐ€ ๋ฐœ์ƒํ•˜๋ฉด AI ์—”์ง„์ด ์ฆ‰์‹œ ํ•ด๋‹น ๊ณ„์ขŒ์˜ ์ด์ฒด๋ฅผ 0.1์ดˆ ๋‚ด๋กœ ๋™๊ฒฐ ์กฐ์น˜ํ•˜๊ณ , "
142
+ "ํ”ผํ•ด์ž์—๊ฒŒ ์‹ค์‹œ๊ฐ„ ๊ธด๊ธ‰ ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€์™€ ๊ฐ€์ด๋“œ ์Œ์„ฑ์„ ์ƒ์„ฑํ˜• AI๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฐœ์†กํ•œ๋‹ค.\n"
143
+ "์ด๋ฅผ ํ†ตํ•ด ํ† ์Šค๋ฑ…ํฌ๋Š” ์ทจ์•ฝ๊ณ„์ธต์˜ ๋””์ง€ํ„ธ ๋ณด์ด์Šคํ”ผ์‹ฑ ํ”ผํ•ด ๋ฐœ์ƒ ๊ฑด์ˆ˜๋ฅผ ์˜ˆ๋…„ ๋Œ€๋น„ "
144
+ "70% ์ด์ƒ ํš๊ธฐ์ ์œผ๋กœ ๋‚ฎ์ถ”๋Š” ์‚ฌํšŒ์  ํŒŒ๊ธ‰ ํšจ๊ณผ๋ฅผ ๊ฑฐ๋‘์—ˆ๋‹ค."
145
+ ),
146
+ "entities": [
147
+ {"name": "ํ† ์Šค๋ฑ…ํฌ", "type": "AICompany", "description": "๋””์ง€ํ„ธ ๊ธˆ์œต์˜ ์žฅ๋ฒฝ์„ ๋‚ฎ์ถ”๊ณ  ๊ฐ•๋ ฅํ•œ FDS ์˜ˆ๋ฐฉ์ฑ…์„ ์ œ๊ณตํ•˜๋Š” ๋ชจ๋ฐ”์ผ ์ธํ„ฐ๋„ท์ „๋ฌธ์€ํ–‰"},
148
+ {"name": "FDS", "type": "AITechnology", "description": "์‹ค์‹œ๊ฐ„ ๊ฑฐ๋ž˜ ํŒจํ„ด์˜ ๋น„์ •์ƒ ์œ ๋ฌด๋ฅผ AI๋กœ ํƒ์ง€ํ•˜๋Š” ์ด์ƒ๊ธˆ์œต๊ฑฐ๋ž˜ํƒ์ง€ ๊ธฐ์ˆ "},
149
+ {"name": "ํ† ์Šค AI FDS", "type": "AIService", "description": "์ƒ์„ฑํ˜• AI ๊ธฐ๋ฐ˜ ๋ณด์ด์Šคํ”ผ์‹ฑ ๋ฐ ์›๊ฒฉ์ œ์–ด ์ฐจ๋‹จ ๊ฒฐํ•ฉ ๊ธˆ์œต ๋ณด์•ˆ ์‹œ์Šคํ…œ"},
150
+ {"name": "๊ธˆ์œต์‚ฌ๊ธฐ์˜ˆ๋ฐฉ", "type": "AIField", "description": "๋ณด์ด์Šคํ”ผ์‹ฑ ์ฐจ๋‹จ ๋ฐ ๋””์ง€ํ„ธ ๊ธˆ์œต ์•ˆ์‹ฌ ๊ฑฐ๋ž˜ ์„œ๋น„์Šค ๋ณด์•ˆ ์˜์—ญ"}
151
+ ],
152
+ "relationships": [
153
+ ("ํ† ์Šค๋ฑ…ํฌ", "DEVELOPS", "FDS"),
154
+ ("ํ† ์Šค๋ฑ…ํฌ", "DEVELOPS", "ํ† ์Šค AI FDS"),
155
+ ("FDS", "APPLIES", "๊ธˆ์œต์‚ฌ๊ธฐ์˜ˆ๋ฐฉ"),
156
+ ("ํ† ์Šค AI FDS", "USED_IN", "๊ธˆ์œต์‚ฌ๊ธฐ์˜ˆ๋ฐฉ"),
157
+ ("ํ† ์Šค๋ฑ…ํฌ", "PARTNERS_WITH", "์‹ ํ•œ์€ํ–‰") # ํฌ๋กœ์Šค ๋„๋ฉ”์ธ ์—ฐ๊ณ„
158
+ ]
159
+ },
160
+ {
161
+ "article_id": "ART_GOLD_004",
162
+ "title": "๋„ค์ด๋ฒ„ํŽ˜์ด, ๋งˆ์ด๋ฐ์ดํ„ฐ์™€ ์ดˆ๊ฑฐ๋Œ€ AI ๊ฒฐํ•ฉํ•œ ๊ฐœ์ธ ๋งž์ถคํ˜• '๋„ค์ด๋ฒ„ํŽ˜์ด AI ๊ธˆ์œต ๋น„์„œ' ์ถœ์‹œ",
163
+ "url": "https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=101&oid=004&aid=44444444",
164
+ "source": "๋””์ง€ํ„ธ๋ฐ์ผ๋ฆฌ",
165
+ "author": "์ตœ๋ฐ์ดํ„ฐ ๊ธฐ์ž",
166
+ "published_date": "2026-05-20 14:00",
167
+ "content": (
168
+ "๋„ค์ด๋ฒ„ํŽ˜์ด๊ฐ€ ๋งˆ์ด๋ฐ์ดํ„ฐ ์ธํ”„๋ผ๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ๊ตญ๋‚ด ์ตœ๊ณ ์˜ ์ดˆ๊ฑฐ๋Œ€ ์–ธ์–ด๋ชจ๋ธ์„ ๊ฒฐํ•ฉํ•œ "
169
+ "์Šค๋งˆํŠธ ์ž์‚ฐ ๋ถ„์„ ์ฑ—๋ด‡ ์„œ๋น„์Šค์ธ '๋„ค์ด๋ฒ„ํŽ˜์ด AI ๊ธˆ์œต ๋น„์„œ'๋ฅผ ์ •์‹ ์ถœ์‹œํ–ˆ๋‹ค.\n"
170
+ "์ด ํ”Œ๋žซํผ์€ ํฉ์–ด์ง„ ๊ณ ๊ฐ์˜ ์€ํ–‰, ์นด๋“œ์‚ฌ, ์ฆ๊ถŒ์‚ฌ ๋งˆ์ด๋ฐ์ดํ„ฐ ์ •๋ณด๋ฅผ ํ•œ๋ฐ ๋ชจ์€ ๋’ค "
171
+ "๊ฐœ๊ฐœ์ธ์˜ ์†Œ๋น„ ํ˜„ํ™ฉ ๋ถ„์„, ์ง€์ถœ ๋‹ค์ด์–ดํŠธ ๊ฐ€์ด๋“œ, ์ตœ์ ์˜ ๊ธˆ์œต ์ƒํ’ˆ ๊ธˆ๋ฆฌ ๋น„๊ต ํ˜œํƒ์„ ์ œ๊ณตํ•œ๋‹ค.\n"
172
+ "์ดˆ๊ฑฐ๋Œ€ AI ๊ธฐ์ˆ ์ด ์ ‘๋ชฉ๋˜์–ด ๋‹จ์ˆœ ์ˆซ์ž ๋‚˜์—ด์— ๊ทธ์ณค๋˜ ๊ธฐ์กด ๋งˆ์ด๋ฐ์ดํ„ฐ ๋ถ„์„ ํ‹€์„ ๋ฒ—์–ด๋‚˜ "
173
+ "์ ˆ์„ธ ๋น„๋ฒ•์ด๋‚˜ ์ด์ž ์ ˆ์•ฝ ๊ฐ€์ด๋“œ๋ฅผ ์นœ๊ทผํ•œ ๋ฉ”์‹ ์ € ๋Œ€ํ™” ํ˜•ํƒœ๋กœ 24์‹œ๊ฐ„ ์ƒ๋‹ด ๋ธŒ๋ฆฌํ•‘ํ•ด ์ค€๋‹ค.\n"
174
+ "์ด๋กœ์จ ๋„ค์ด๋ฒ„ํŽ˜์ด๋Š” ๊ณ ๋„ํ™”๋œ ์ดˆ์ •๋ฐ€ ๋งˆ์ด๋ฐ์ดํ„ฐ AI ์ž์‚ฐ ์ถ”์ฒœ ํ”Œ๋žซํผ์œผ๋กœ ํ•œ ๋‹จ๊ณ„ ๋„์•ฝํ–ˆ๋‹ค."
175
+ ),
176
+ "entities": [
177
+ {"name": "๋„ค์ด๋ฒ„ํŽ˜์ด", "type": "AICompany", "description": "์ง€์ถœ ๋ถ„์„ ๋ฐ ๊ธˆ์œต ์ถ”์ฒœ ๋“ฑ ๋””์ง€ํ„ธ ๋งˆ์ด๋ฐ์ดํ„ฐ ์ƒํƒœ๊ณ„๋ฅผ ์„ ๋„ํ•˜๋Š” ์ข…ํ•ฉ ๊ธˆ์œต ํ”Œ๋žซํผ"},
178
+ {"name": "๋งˆ์ด๋ฐ์ดํ„ฐ", "type": "AITechnology", "description": "๋ถ„์‚ฐ๋œ ๊ธˆ์œต ๊ธฐ๊ด€ ์ •๋ณด๋ฅผ ํ•œ๋ฐ ๋ชจ์•„ ๊ฐ€์น˜๋ฅผ ๋ถ„์„ํ•˜๋Š” ์ข…ํ•ฉ ๊ธˆ์œต ์ž์‚ฐ ๋ฐ์ดํ„ฐ ๊ธฐ์ˆ "},
179
+ {"name": "๋„ค์ด๋ฒ„ํŽ˜์ด AI ๊ธˆ์œต ๋น„์„œ", "type": "AIService", "description": "์ดˆ๊ฑฐ๋Œ€ LLM์„ ๋งˆ์ด๋ฐ์ดํ„ฐ์™€ ๊ฒฐํ•ฉํ•˜์—ฌ ๋Œ€ํ™”ํ˜• ์ƒ๋‹ด์„ ์ œ๊ณตํ•˜๋Š” ์ž์‚ฐ ์ปจ์„คํ„ดํŠธ ์„œ๋น„์Šค"},
180
+ {"name": "๋””์ง€ํ„ธ๊ธˆ์œต", "type": "AIField", "description": "ํ•€ํ…Œํฌ ์—ฐ๊ณ„ ๊ฐœ์ธ ์ง€์ถœ ๋‹ค์ด์–ดํŠธ ๋ฐ ๋งž์ถค ์ƒํ’ˆ ๋น„๊ต ์ถ”์ฒœ ํ˜์‹  ์˜์—ญ"}
181
+ ],
182
+ "relationships": [
183
+ ("๋„ค์ด๋ฒ„ํŽ˜์ด", "DEVELOPS", "๋งˆ์ด๋ฐ์ดํ„ฐ"),
184
+ ("๋„ค์ด๋ฒ„ํŽ˜์ด", "DEVELOPS", "๋„ค์ด๋ฒ„ํŽ˜์ด AI ๊ธˆ์œต ๋น„์„œ"),
185
+ ("๋งˆ์ด๋ฐ์ดํ„ฐ", "APPLIES", "๋””์ง€ํ„ธ๊ธˆ์œต"),
186
+ ("๋„ค์ด๋ฒ„ํŽ˜์ด AI ๊ธˆ์œต ๋น„์„œ", "USED_IN", "๋””์ง€ํ„ธ๊ธˆ์œต"),
187
+ ("๋„ค์ด๋ฒ„ํŽ˜์ด", "PARTNERS_WITH", "์‹ ํ•œ์€ํ–‰") # ํฌ๋กœ์Šค ๋„๋ฉ”์ธ ์—ฐ๊ณ„
188
+ ]
189
+ }
190
+ ]
191
+
192
+
193
+ def main():
194
+ print("[INIT] Neo4j AuraDB ๋“œ๋ผ์ด๋ฒ„ ์ดˆ๊ธฐํ™” ๋ฐ ์—ฐ๊ฒฐ ์‹œ๋„...")
195
+ driver = get_neo4j_driver()
196
+
197
+ print("[INIT] [OK] Neo4j ์—ฐ๊ฒฐ ๋ฌด๊ฒฐ์„ฑ ๊ฒ€์ฆ ํ†ต๊ณผ")
198
+
199
+ with driver.session() as session:
200
+ # 100% ๊นจ๋—ํ•œ ์‹ ๊ทœ ๊ตฌ์ถ•์„ ์œ„ํ•ด ๊ธฐ์กด์— ๊ด€๊ณ„์„  ์—†์ด ํฉ์–ด์ ธ์žˆ๋˜ ๋…ธ๋“œ์™€ ๊ด€๊ณ„๋ฅผ ๋ชจ๋‘ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค.
201
+ print("[RESET] ๊ธฐ์กด ๊ทธ๋ž˜ํ”„ ๋ฐ์ดํ„ฐ๋ฅผ ๊นจ๋—ํ•˜๊ฒŒ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค (DETACH DELETE)...")
202
+ session.run("MATCH (n) DETACH DELETE n")
203
+ print("[RESET] [OK] ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์™„์ „ ์ดˆ๊ธฐํ™” ์™„๋ฃŒ")
204
+
205
+ print("[LOAD] 4๋Œ€ ํ•€ํ…Œํฌ ๊ณจ๋“œ ๋‰ด์Šค ๋ฐ์ดํ„ฐ ์ ์žฌ ํ”„๋กœ์„ธ์Šค๋ฅผ ๊ฐ€๋™ํ•ฉ๋‹ˆ๋‹ค...")
206
+
207
+ # ๋ชจ๋“  ๊ณจ๋“œ ์—”ํ‹ฐํ‹ฐ์˜ ํƒ€์ž…์„ ์‚ฌ์ „์— ๋งคํ•‘ ํ…Œ์ด๋ธ”๋กœ ๊ตฌ์ถ•ํ•˜์—ฌ StopIteration ๋ฐฉ์ง€
208
+ entity_types = {}
209
+ for a in GOLD_ARTICLES:
210
+ for e in a["entities"]:
211
+ entity_types[e["name"]] = e["type"]
212
+
213
+ for idx, art in enumerate(GOLD_ARTICLES, 1):
214
+ print(f"\n({idx}/{len(GOLD_ARTICLES)}) [ART] '{art['title'][:35]}...' ์ ์žฌ ์ค‘...")
215
+
216
+ # 1. Article ๋…ธ๋“œ ์ƒ์„ฑ (์ค‘๋ณต ์—†์ด MERGE)
217
+ session.run("""
218
+ MERGE (a:Article {article_id: $article_id})
219
+ SET a.title = $title,
220
+ a.url = $url,
221
+ a.content = $content,
222
+ a.source = $source,
223
+ a.author = $author,
224
+ a.published_date = $published_date,
225
+ a.category = '๊ฒฝ์ œ'
226
+ """, {
227
+ "article_id": art["article_id"],
228
+ "title": art["title"],
229
+ "url": art["url"],
230
+ "content": art["content"],
231
+ "source": art["source"],
232
+ "author": art["author"],
233
+ "published_date": art["published_date"]
234
+ })
235
+
236
+ # 2. Content ์ฒญํ‚น ๋…ธ๋“œ ๋ฐ 1536์ฐจ์› ๋ฒกํ„ฐ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ/์—ฐ๊ฒฐ
237
+ print(" -> ์‹ค์‹œ๊ฐ„ OpenAI 1536์ฐจ์› ๋ฒกํ„ฐ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ์ค‘...")
238
+ # ๋ฌธ์žฅ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ณธ๋ฌธ์„ 2๊ฐœ ์ฒญํฌ๋กœ ์ธ์œ„ ๋ถ„ํ• ํ•˜์—ฌ ์ง€์‹ ๋ฐ€๋„ ๊ฐ•ํ™”
239
+ paragraphs = [p.strip() for p in art["content"].split("\n") if p.strip()]
240
+ for chunk_idx, para in enumerate(paragraphs, 1):
241
+ chunk_id = f"{art['article_id']}_CHK_{chunk_idx}"
242
+ embedding = get_embedding(para)
243
+
244
+ # Content ๋…ธ๋“œ ์ƒ์„ฑ ๋ฐ HAS_CHUNK ์—ฐ๊ฒฐ
245
+ session.run("""
246
+ MATCH (a:Article {article_id: $article_id})
247
+ MERGE (c:Content {chunk_id: $chunk_id})
248
+ SET c.chunk = $chunk,
249
+ c.embedding = $embedding,
250
+ c.article_id = $article_id
251
+ MERGE (a)-[:HAS_CHUNK]->(c)
252
+ """, {
253
+ "article_id": art["article_id"],
254
+ "chunk_id": chunk_id,
255
+ "chunk": para,
256
+ "embedding": embedding
257
+ })
258
+
259
+ # 3. Entities ์ƒ์„ฑ ๋ฐ Article -[:MENTIONS]-> Entity ์—ฐ๊ฒฐ
260
+ for ent in art["entities"]:
261
+ # ๊ฐ ์—”ํ‹ฐํ‹ฐ ํƒ€์ž…์— ๋งž๋Š” ๋ ˆ์ด๋ธ”์„ ๊ฐ–๋Š” ๋…ธ๋“œ๋ฅผ ๋™์ ์œผ๋กœ ์ƒ์„ฑํ•˜๊ณ ,
262
+ # ๊ณตํ†ต ๋ ˆ์ด๋ธ”๋กœ์„œ๋„ ๊ฒ€์ƒ‰ ๊ฐ€๋Šฅํ•˜๊ฒŒ ์„ค๊ณ„
263
+ cypher_merge = f"""
264
+ MERGE (e:{ent['type']} {{name: $name}})
265
+ SET e.description = $description
266
+ RETURN e
267
+ """
268
+ session.run(cypher_merge, {"name": ent["name"], "description": ent["description"]})
269
+
270
+ # Article -[:MENTIONS]-> Entity
271
+ session.run(f"""
272
+ MATCH (a:Article {{article_id: $article_id}})
273
+ MATCH (e:{ent['type']} {{name: $name}})
274
+ MERGE (a)-[:MENTIONS]->(e)
275
+ """, {"article_id": art["article_id"], "name": ent["name"]})
276
+
277
+ print(f" - [ENT] ({ent['type']}) {ent['name']} ์™„๋ฃŒ")
278
+
279
+ # 4. ์—”ํ‹ฐํ‹ฐ ๊ฐ„ ์ง์ ‘ ๊ด€๊ณ„ ์—ฐ๊ฒฐ์„ฑ ์ƒ์„ฑ
280
+ for src_name, rel_type, tgt_name in art["relationships"]:
281
+ # ๊ตฌ์ถ•ํ•ด ๋‘” ๋งคํ•‘ ํ…Œ์ด๋ธ”์„ ์‚ฌ์šฉํ•˜์—ฌ ์ค‘๋‹จ ์˜ค๋ฅ˜ ์›์ฒœ ์˜ˆ๋ฐฉ
282
+ src_type = entity_types.get(src_name, "AICompany")
283
+ tgt_type = entity_types.get(tgt_name, "AICompany")
284
+
285
+ cypher_rel = f"""
286
+ MATCH (s:{src_type} {{name: $src_name}})
287
+ MATCH (t:{tgt_type} {{name: $tgt_name}})
288
+ MERGE (s)-[:{rel_type}]->(t)
289
+ """
290
+ session.run(cypher_rel, {"src_name": src_name, "tgt_name": tgt_name})
291
+ print(f" - [REL] ({src_name})-[:{rel_type}]->({tgt_name}) ์—ฐ๊ฒฐ")
292
+
293
+ # 5. ๊ด€๊ณ„ ๋ฐ€๋„ ํ†ต๊ณ„ ์ถœ๋ ฅ
294
+ print("\n[OK] 4๋Œ€ ํ•€ํ…Œํฌ ๊ณจ๋“œ ๋ฐ์ดํ„ฐ ์ ์žฌ ์™„๋ฃŒ!")
295
+
296
+ total_rels = session.run("""
297
+ MATCH ()-[r:DEVELOPS|INVESTS_IN|PARTNERS_WITH|APPLIES|USED_IN|RELATED_TO]->()
298
+ RETURN count(r) as cnt
299
+ """).single()["cnt"]
300
+
301
+ total_articles = session.run("MATCH (a:Article) RETURN count(a) as cnt").single()["cnt"]
302
+ avg_density = total_rels / total_articles if total_articles > 0 else 0
303
+
304
+ print(f"[STATUS] ํ˜„์žฌ ์ ์žฌ๋œ ์ด ๊ธฐ์‚ฌ ์ˆ˜: {total_articles}๊ฐœ")
305
+ print(f"[STATUS] ์—”ํ‹ฐํ‹ฐ ๊ฐ„ ์ง์ ‘ ๊ด€๊ณ„ ์ด์ˆ˜: {total_rels}๊ฐœ")
306
+ print(f"[STATUS] ๊ธฐ์‚ฌ๋‹น ํ‰๊ท  ๊ด€๊ณ„์ˆ˜: {avg_density:.1f}๊ฐœ (๋ชฉํ‘œ: 3.0๊ฐœ ์ด์ƒ)")
307
+
308
+ driver.close()
309
+ print("[DONE] ํ”„๋กœ์„ธ์Šค ์ •์ƒ ์ข…๋ฃŒ")
310
+
311
+
312
+ if __name__ == "__main__":
313
+ main()
src/graphBuilder/scrapping/finScrapping.py CHANGED
@@ -1,32 +1,42 @@
1
  import re
 
2
  import time
3
  from collections import Counter
4
  from datetime import datetime, timedelta
5
 
 
 
 
 
6
  import pandas as pd
7
  from selenium import webdriver
8
  from selenium.webdriver.chrome.service import Service
9
  from selenium.webdriver.common.by import By
10
  from webdriver_manager.chrome import ChromeDriverManager
11
 
12
- # ์ˆ˜์ง‘ ๋Œ€์ƒ ์นดํ…Œ๊ณ ๋ฆฌ sid
13
  categories_sid = {
14
  "๊ฒฝ์ œ": "101",
15
  "IT/๊ณผํ•™": "105",
16
  }
17
- NUM_ARTICLES_PER_DATE_CAT = 15 # ๋‚ ์งœ๋ณ„/์นดํ…Œ๊ณ ๋ฆฌ๋ณ„ ๋ชฉํ‘œ ์ˆ˜์ง‘๋Ÿ‰ (7์ผ * 2๊ฐœ ์นดํ…Œ๊ณ ๋ฆฌ * 15 = ์ตœ๋Œ€ 210๊ฑด ๋งํฌ ํŒŒ์‹ฑ)
18
-
19
- # AI ํ•€ํ…Œํฌ ํ‚ค์›Œ๋“œ (FinNode ํ”„๋กœ์ ํŠธ ์ „์šฉ)
20
- FINTECH_AI_KEYWORDS = [
21
- # AI ๊ธฐ์ˆ 
22
- "AI",
23
- "์ธ๊ณต์ง€๋Šฅ",
24
- "์ƒ์„ฑํ˜• AI",
25
- "๋Œ€๊ทœ๋ชจ์–ธ์–ด๋ชจ๋ธ",
26
- # AI ํ•€ํ…Œํฌ (๊ธˆ์œต)
27
- "ํ•€ํ…Œํฌ",
 
 
 
28
  ]
29
 
 
 
30
  print("[INIT] ChromeDriver ์ดˆ๊ธฐํ™” ์ค‘...")
31
  service = Service(ChromeDriverManager().install())
32
  options = webdriver.ChromeOptions()
@@ -34,7 +44,7 @@ options.add_argument("--no-sandbox")
34
  options.add_argument("--disable-dev-shm-usage")
35
  options.add_argument("--headless") # ์†๋„ ๋ฐ ์•ˆ์ •์„ฑ ๊ทน๋Œ€ํ™”๋ฅผ ์œ„ํ•ด headless ๋ชจ๋“œ ํ™œ์„ฑํ™”
36
  driver = webdriver.Chrome(service=service, options=options)
37
- print("[INIT] โœ… ๋ธŒ๋ผ์šฐ์ € ์‹คํ–‰ ์™„๋ฃŒ")
38
 
39
 
40
  def get_article_links(driver, sid: str, target_date: str, num_articles: int) -> list[str]:
@@ -161,7 +171,7 @@ def parse_article_detail(driver, article_url, category):
161
  except:
162
  pass
163
  except Exception as e:
164
- print(f" [PARSE] โš ๏ธ ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
165
  return article_data
166
 
167
 
@@ -172,11 +182,11 @@ category_stats = {}
172
  # ์˜ค๋Š˜๋ถ€ํ„ฐ 7์ผ ์ „๊นŒ์ง€์˜ ๋‚ ์งœ ๋ฆฌ์ŠคํŠธ ์ƒ์„ฑ
173
  target_dates = [(datetime.now() - timedelta(days=i)).strftime("%Y%m%d") for i in range(7)]
174
 
175
- print(f"[CRAWL] ๐Ÿ“… ๋Œ€์ƒ ์ˆ˜์ง‘ ๋‚ ์งœ (7์ผ): {target_dates}")
176
 
177
  for target_date in target_dates:
178
  print(f"\n{'=' * 60}")
179
- print(f"[CRAWL] ๐Ÿ“… {target_date} ์ผ์ž ์ˆ˜์ง‘ ์‹œ์ž‘")
180
  print(f"{'=' * 60}")
181
 
182
  for category_name, sid in categories_sid.items():
@@ -200,7 +210,7 @@ for target_date in target_dates:
200
 
201
  all_articles.append(article_data)
202
  cat_ok += 1
203
- print(f" โœ… {article_data['title'][:40]}...")
204
  print(f" ์–ธ๋ก ์‚ฌ: {article_data['source']} | ๋‚ ์งœ: {article_data['published_date']}")
205
  else:
206
  cat_fail += 1
@@ -212,7 +222,7 @@ for target_date in target_dates:
212
  ]
213
  if not v
214
  ]
215
- print(f" โŒ ํŒŒ์‹ฑ์‹คํŒจ ({', '.join(missing)} ์—†์Œ)")
216
  time.sleep(0.5)
217
 
218
  category_stats[cat_key] = {"ok": cat_ok, "fail": cat_fail}
@@ -234,29 +244,56 @@ print(f" ์ „์ฒด ์ˆ˜์ง‘: ์„ฑ๊ณต {total_ok}๊ฑด / ์‹คํŒจ {total_fail}๊ฑด")
234
  df_all = pd.DataFrame(all_articles)
235
 
236
 
237
- # โ”€โ”€ 2๋‹จ๊ณ„: AI ํ•€ํ…Œํฌ ํ‚ค์›Œ๋“œ ํ•„ํ„ฐ๋ง โ”€โ”€
238
  print(f"\n{'=' * 60}")
239
- print("[FILTER] AI ํ•€ํ…Œํฌ ํ‚ค์›Œ๋“œ ํ•„ํ„ฐ๋ง ์‹œ์ž‘")
 
 
240
  print(f"{'=' * 60}")
241
 
242
  filtered_articles = []
243
  for _, row in df_all.iterrows():
244
  text = f"{row['title']} {row['content']}"
245
- matched = [kw for kw in FINTECH_AI_KEYWORDS if kw.replace(" ", "") in text.replace(" ", "")]
246
- if matched:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  row_dict = row.to_dict()
248
- row_dict["matched_keywords"] = ", ".join(matched)
 
249
  filtered_articles.append(row_dict)
250
 
251
  df_filtered = pd.DataFrame(filtered_articles)
252
 
253
  print(f" ์ „์ฒด ์ˆ˜์ง‘: {len(df_all)}๊ฑด")
254
- print(f" AI ํ•€ํ…Œํฌ ๊ด€๋ จ: {len(df_filtered)}๊ฑด ({len(df_filtered) / max(len(df_all), 1) * 100:.1f}%)")
255
- print("\n [ํ‚ค์›Œ๋“œ๋ณ„ ๋งค์นญ ํ˜„ํ™ฉ]")
256
  all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
257
  kw_counts = Counter(all_kw)
258
- for kw in FINTECH_AI_KEYWORDS:
259
- print(f" {kw}: {kw_counts.get(kw, 0)}๊ฑด")
 
 
 
 
 
 
260
 
261
  df_filtered
262
 
@@ -267,7 +304,7 @@ output_dir = os.path.join("src", "graphBuilder", "scrapping")
267
  os.makedirs(output_dir, exist_ok=True)
268
  output_filename = os.path.join(output_dir, f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx")
269
  df_filtered.to_excel(output_filename, index=False, engine="openpyxl")
270
- print(f"[SAVE] โœ… ์ €์žฅ ์™„๋ฃŒ: {output_filename}")
271
  print(f"[SAVE] - AI ํ•€ํ…Œํฌ ๊ธฐ์‚ฌ: {len(df_filtered)}๊ฑด")
272
 
273
 
@@ -278,9 +315,13 @@ try:
278
 
279
  import matplotlib.pyplot as plt
280
 
281
- # ํฐํŠธ ๊นจ์ง ๋ฐฉ์ง€ (Mac ํ™˜๊ฒฝ: AppleGothic)
282
- if platform.system() == "Darwin":
 
 
283
  plt.rc("font", family="AppleGothic")
 
 
284
  plt.rcParams["axes.unicode_minus"] = False
285
 
286
  if not filtered_articles:
 
1
  import re
2
+ import sys
3
  import time
4
  from collections import Counter
5
  from datetime import datetime, timedelta
6
 
7
+ # Fully prevent UnicodeEncodeError on the Windows console
8
+ if hasattr(sys.stdout, 'reconfigure'):
9
+ sys.stdout.reconfigure(encoding='utf-8')
10
+
11
  import pandas as pd
12
  from selenium import webdriver
13
  from selenium.webdriver.chrome.service import Service
14
  from selenium.webdriver.common.by import By
15
  from webdriver_manager.chrome import ChromeDriverManager
16
 
17
+ # Category sids to collect - both Economy and IT/Science are collected, following the user's dual hybrid filter guideline.
18
  categories_sid = {
19
  "๊ฒฝ์ œ": "101",
20
  "IT/๊ณผํ•™": "105",
21
  }
22
+ NUM_ARTICLES_PER_DATE_CAT = 20 # ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„/๋‚ ์งœ๋ณ„ ์ˆ˜์ง‘๋Ÿ‰ (7์ผ * 2๊ฐœ ์นดํ…Œ๊ณ ๋ฆฌ * 20 = ์ตœ๋Œ€ 280๊ฑด ๋งํฌ ํŒŒ์‹ฑ)
23
+
24
+ # AI and finance/fintech keyword lists (cross hybrid filtering applied)
25
+ AI_KEYWORDS = [
26
+ "AI", "์ธ๊ณต์ง€๋Šฅ", "์ƒ์„ฑํ˜• AI", "๋Œ€๊ทœ๋ชจ์–ธ์–ด๋ชจ๋ธ", "LLM", "GPT",
27
+ "์ œ๋ฏธ๋‚˜์ด", "Gemini", "ํด๋กœ๋“œ", "Claude", "๋จธ์‹ ๋Ÿฌ๋‹", "๋”ฅ๋Ÿฌ๋‹"
28
+ ]
29
+
30
+ FIN_KEYWORDS = [
31
+ "ํ•€ํ…Œํฌ", "๊ธˆ์œต", "์€ํ–‰", "์นด๋“œ", "์ฆ๊ถŒ", "ํŽ˜์ด", "์†ก๊ธˆ", "๊ฒฐ์ œ",
32
+ "์ž์‚ฐ๊ด€๋ฆฌ", "์‹ ์šฉํ‰๊ฐ€", "์‹ ์šฉ", "ํˆฌ์ž", "๋งˆ์ด๋ฐ์ดํ„ฐ", "๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €",
33
+ "์ธํ„ฐ๋„ท์€ํ–‰", "์ธ์Šˆ์–ดํ…Œํฌ", "์ž์‚ฐ์šด์šฉ", "์นด์นด์˜ค๋ฑ…ํฌ", "ํ† ์Šค๋ฑ…ํฌ",
34
+ "์ผ€์ด๋ฑ…ํฌ", "๋„ค์ด๋ฒ„ํŽ˜์ด", "์นด์นด์˜คํŽ˜์ด", "ํ† ์Šค", "์ฃผ์‹", "๋ฑ…ํ‚น",
35
+ "๋””์ง€ํ„ธ ๊ธˆ์œต", "ST", "ํ† ํฐ์ฆ๊ถŒ", "FDS", "๊ธˆ์œต ์‚ฌ๊ธฐ", "์ด์ƒ๊ฑฐ๋ž˜"
36
  ]
37
 
38
+ FINTECH_AI_KEYWORDS = AI_KEYWORDS + FIN_KEYWORDS # ์‹œ๊ฐํ™” ํ˜ธํ™˜์šฉ ์ „์ฒด ๋ชฉ๋ก
39
+
40
  print("[INIT] ChromeDriver ์ดˆ๊ธฐํ™” ์ค‘...")
41
  service = Service(ChromeDriverManager().install())
42
  options = webdriver.ChromeOptions()
 
44
  options.add_argument("--disable-dev-shm-usage")
45
  options.add_argument("--headless") # ์†๋„ ๋ฐ ์•ˆ์ •์„ฑ ๊ทน๋Œ€ํ™”๋ฅผ ์œ„ํ•ด headless ๋ชจ๋“œ ํ™œ์„ฑํ™”
46
  driver = webdriver.Chrome(service=service, options=options)
47
+ print("[INIT] [OK] ๋ธŒ๋ผ์šฐ์ € ์‹คํ–‰ ์™„๋ฃŒ")
48
 
49
 
50
  def get_article_links(driver, sid: str, target_date: str, num_articles: int) -> list[str]:
 
171
  except:
172
  pass
173
  except Exception as e:
174
+ print(f" [PARSE] [WARN] ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
175
  return article_data
176
 
177
 
 
182
  # ์˜ค๋Š˜๋ถ€ํ„ฐ 7์ผ ์ „๊นŒ์ง€์˜ ๋‚ ์งœ ๋ฆฌ์ŠคํŠธ ์ƒ์„ฑ
183
  target_dates = [(datetime.now() - timedelta(days=i)).strftime("%Y%m%d") for i in range(7)]
184
 
185
+ print(f"[CRAWL] [DATE] ๋Œ€์ƒ ์ˆ˜์ง‘ ๋‚ ์งœ (7์ผ): {target_dates}")
186
 
187
  for target_date in target_dates:
188
  print(f"\n{'=' * 60}")
189
+ print(f"[CRAWL] [DATE] {target_date} ์ผ์ž ์ˆ˜์ง‘ ์‹œ์ž‘")
190
  print(f"{'=' * 60}")
191
 
192
  for category_name, sid in categories_sid.items():
 
210
 
211
  all_articles.append(article_data)
212
  cat_ok += 1
213
+ print(f" [OK] {article_data['title'][:40]}...")
214
  print(f" ์–ธ๋ก ์‚ฌ: {article_data['source']} | ๋‚ ์งœ: {article_data['published_date']}")
215
  else:
216
  cat_fail += 1
 
222
  ]
223
  if not v
224
  ]
225
+ print(f" [FAIL] ํŒŒ์‹ฑ์‹คํŒจ ({', '.join(missing)} ์—†์Œ)")
226
  time.sleep(0.5)
227
 
228
  category_stats[cat_key] = {"ok": cat_ok, "fail": cat_fail}
 
244
  df_all = pd.DataFrame(all_articles)
245
 
246
 
247
+ # โ”€โ”€ 2๋‹จ๊ณ„: ๊ธˆ์œต AI ๋“€์–ผ ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ํ•„ํ„ฐ๋ง (๊ฒฝ์ œ -> AI / IT -> ๊ธˆ์œต) โ”€โ”€
248
  print(f"\n{'=' * 60}")
249
+ print("[FILTER] ๊ธˆ์œต AI ๋“€์–ผ ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ํ•„ํ„ฐ๋ง ์‹œ์ž‘")
250
+ print("[FILTER] - ๊ฒฝ์ œ ์„น์…˜ ๊ธฐ์‚ฌ: AI ํ‚ค์›Œ๋“œ ์กด์žฌ ์‹œ ํ†ต๊ณผ")
251
+ print("[FILTER] - IT/๊ณผํ•™ ์„น์…˜ ๊ธฐ์‚ฌ: ๊ธˆ์œต ํ‚ค์›Œ๋“œ ์กด์žฌ ์‹œ ํ†ต๊ณผ")
252
  print(f"{'=' * 60}")
253
 
254
  filtered_articles = []
255
  for _, row in df_all.iterrows():
256
  text = f"{row['title']} {row['content']}"
257
+ text_clean = text.lower().replace(" ", "")
258
+
259
+ # 1. AI ๋„๋ฉ”์ธ ๋งค์นญ
260
+ matched_ai = [kw for kw in AI_KEYWORDS if kw.lower().replace(" ", "") in text_clean]
261
+ # 2. ๊ธˆ์œต/ํ•€ํ…Œํฌ ๋„๋ฉ”์ธ ๋งค์นญ
262
+ matched_fin = [kw for kw in FIN_KEYWORDS if kw.lower().replace(" ", "") in text_clean]
263
+
264
+ is_passed = False
265
+ matched_info = []
266
+
267
+ if row['category'] == "๊ฒฝ์ œ":
268
+ if matched_ai:
269
+ is_passed = True
270
+ matched_info = matched_ai
271
+ elif row['category'] == "IT/๊ณผํ•™":
272
+ if matched_fin:
273
+ is_passed = True
274
+ matched_info = matched_fin
275
+
276
+ if is_passed:
277
  row_dict = row.to_dict()
278
+ # ์‹œ๊ฐํ™” ๋ฐ ๋กœ๊น…์„ ์œ„ํ•ด ๊ฒฐํ•ฉ๋œ ๋งค์นญ ํ‚ค์›Œ๋“œ ์ •๋ณด ๊ธฐ๋ก
279
+ row_dict["matched_keywords"] = ", ".join(matched_info)
280
  filtered_articles.append(row_dict)
281
 
282
  df_filtered = pd.DataFrame(filtered_articles)
283
 
284
  print(f" ์ „์ฒด ์ˆ˜์ง‘: {len(df_all)}๊ฑด")
285
+ print(f" AI ํ•€ํ…Œํฌ ๊ต์ฐจ ํ•„ํ„ฐ๋ง ํ†ต๊ณผ: {len(df_filtered)}๊ฑด ({len(df_filtered) / max(len(df_all), 1) * 100:.1f}%)")
286
+ print("\n [๋„๋ฉ”์ธ๋ณ„ ๋งค์นญ ์š”์•ฝ]")
287
  all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
288
  kw_counts = Counter(all_kw)
289
+ print(" --- AI ๊ธฐ์ˆ  ํ‚ค์›Œ๋“œ ๋งค์นญ ---")
290
+ for kw in AI_KEYWORDS:
291
+ if kw_counts.get(kw, 0) > 0:
292
+ print(f" {kw}: {kw_counts.get(kw, 0)}๊ฑด")
293
+ print(" --- ๊ธˆ์œต/ํ•€ํ…Œํฌ ํ‚ค์›Œ๋“œ ๋งค์นญ ---")
294
+ for kw in FIN_KEYWORDS:
295
+ if kw_counts.get(kw, 0) > 0:
296
+ print(f" {kw}: {kw_counts.get(kw, 0)}๊ฑด")
297
 
298
  df_filtered
299
 
 
304
  os.makedirs(output_dir, exist_ok=True)
305
  output_filename = os.path.join(output_dir, f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx")
306
  df_filtered.to_excel(output_filename, index=False, engine="openpyxl")
307
+ print(f"[SAVE] [OK] ์ €์žฅ ์™„๋ฃŒ: {output_filename}")
308
  print(f"[SAVE] - AI ํ•€ํ…Œํฌ ๊ธฐ์‚ฌ: {len(df_filtered)}๊ฑด")
309
 
310
 
 
315
 
316
  import matplotlib.pyplot as plt
317
 
318
+ # Prevent broken font rendering (Windows: Malgun Gothic, Mac: AppleGothic, Linux: NanumGothic)
319
+ if platform.system() == "Windows":
320
+ plt.rc("font", family="Malgun Gothic")
321
+ elif platform.system() == "Darwin":
322
  plt.rc("font", family="AppleGothic")
323
+ else:
324
+ plt.rc("font", family="NanumGothic")
325
  plt.rcParams["axes.unicode_minus"] = False
326
 
327
  if not filtered_articles:
src/retrieval/finRetrieval.py CHANGED
@@ -121,20 +121,20 @@ def _get_schema(driver: neo4j.Driver) -> str:
121
 
122
 
123
  _examples = [
124
- """USER INPUT: ์นด์นด์˜ค์˜ AI ์„œ๋น„์Šค ๋ชฉ๋ก์„ ์•Œ๋ ค์ฃผ์„ธ์š”
125
  CYPHER QUERY:
126
- MATCH (c:AICompany {name:"์นด์นด์˜ค"})-[:DEVELOPS]->(s:AIService)
127
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(s)
128
  RETURN s.name AS name, s.description AS description, a.title AS article_title, a.url AS article_url""",
129
- """USER INPUT: ์‚ผ์„ฑ์ „์ž๊ฐ€ ๊ฐœ๋ฐœ ์ค‘์ธ AI ๊ธฐ์ˆ ์€?
130
  CYPHER QUERY:
131
- MATCH (c:AICompany {name:"์‚ผ์„ฑ์ „์ž"})-[:DEVELOPS]->(t:AITechnology)
132
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(t)
133
  RETURN t.name AS name, t.description AS description, a.title AS article_title, a.url AS article_url""",
134
- """USER INPUT: ์–ด๋–ค ๊ธฐ์—…์ด LLM ๊ธฐ์ˆ ์„ ๊ฐœ๋ฐœํ•˜๋‚˜์š”?
135
  CYPHER QUERY:
136
  MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
137
- WHERE t.name CONTAINS "์–ธ์–ด๋ชจ๋ธ" OR t.name CONTAINS "LLM"
138
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(t)
139
  RETURN c.name AS company_name, t.name AS tech_name, a.title AS article_title, a.url AS article_url""",
140
  """USER INPUT: ๊ธˆ์œต์ด๋‚˜ ํ•€ํ…Œํฌ ๋ถ„์•ผ์— ๊ธฐ์ˆ ์„ ์ ์šฉํ•˜๊ณ  ์žˆ๋Š” ๊ธฐ์—…๋“ค์€ ์–ด๋””์•ผ?
@@ -150,13 +150,13 @@ CYPHER QUERY:
150
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(s)
151
  RETURN DISTINCT c.name AS company_name, s.name AS service_name, f.name AS field_name, a.title AS article_title, a.url AS article_url
152
  LIMIT 3""",
153
- """USER INPUT: ์ตœ๊ทผ AI ๊ด€๋ จ ๋‰ด์Šค ๊ธฐ์‚ฌ๋ฅผ ์š”์•ฝํ•ด์ค˜
154
  CYPHER QUERY:
155
  MATCH (a:Article)-[:HAS_CHUNK]->(c:Content)
156
  RETURN a.title AS title, a.url AS url, a.published_date AS published_date, c.chunk AS chunk
157
  ORDER BY a.published_date DESC
158
  LIMIT 3""",
159
- """USER INPUT: ์ตœ๊ทผ ๊ฐ€์žฅ ๊ด€์‹ฌ์ด ๋†’์€ AI ๊ธฐ์ˆ ์ด ๋ญ์•ผ?
160
  CYPHER QUERY:
161
  MATCH (a:Article)-[:MENTIONS]->(t:AITechnology)
162
  OPTIONAL MATCH (c:AICompany)-[:DEVELOPS]->(t)
@@ -164,7 +164,7 @@ CYPHER QUERY:
164
  ORDER BY article_count DESC
165
  RETURN t.name AS tech_name, t.description AS description, article_count, companies, article_titles, article_urls
166
  LIMIT 5""",
167
- """USER INPUT: AI ๊ธฐ์ˆ  ํŠธ๋ Œ๋“œ๋ฅผ ๋ถ„์„ํ•ด์ค˜
168
  CYPHER QUERY:
169
  MATCH (a:Article)-[:MENTIONS]->(t:AITechnology)
170
  OPTIONAL MATCH (c:AICompany)-[:DEVELOPS]->(t)
@@ -172,10 +172,10 @@ CYPHER QUERY:
172
  ORDER BY article_count DESC
173
  RETURN t.name AS tech_name, article_count, companies, article_titles, article_urls
174
  LIMIT 5""",
175
- """USER INPUT: ํ˜„๋Œ€์ฐจ ๋˜๋Š” ๋กœ๋ด‡ ๊ด€๋ จ AI ๋‰ด์Šค ์•Œ๋ ค์ค˜
176
  CYPHER QUERY:
177
  MATCH (a:Article)-[:MENTIONS]->(c:AICompany)
178
- WHERE c.name CONTAINS 'ํ˜„๋Œ€' OR c.name CONTAINS '๋กœ๋ด‡'
179
  OPTIONAL MATCH (a)-[:MENTIONS]->(t:AITechnology)
180
  OPTIONAL MATCH (a)-[:MENTIONS]->(s:AIService)
181
  RETURN a.title AS article_title, a.url AS article_url, a.published_date AS article_date,
 
121
 
122
 
123
  _examples = [
124
+ """USER INPUT: ์นด์นด์˜คํŽ˜์ด์˜ AI ์„œ๋น„์Šค ๋ชฉ๋ก์„ ์•Œ๋ ค์ฃผ์„ธ์š”
125
  CYPHER QUERY:
126
+ MATCH (c:AICompany {name:"์นด์นด์˜คํŽ˜์ด"})-[:DEVELOPS]->(s:AIService)
127
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(s)
128
  RETURN s.name AS name, s.description AS description, a.title AS article_title, a.url AS article_url""",
129
+ """USER INPUT: ์‹ ํ•œ์€ํ–‰์ด ๊ฐœ๋ฐœ ์ค‘์ธ AI ๊ธฐ์ˆ ์€?
130
  CYPHER QUERY:
131
+ MATCH (c:AICompany {name:"์‹ ํ•œ์€ํ–‰"})-[:DEVELOPS]->(t:AITechnology)
132
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(t)
133
  RETURN t.name AS name, t.description AS description, a.title AS article_title, a.url AS article_url""",
134
+ """USER INPUT: ์–ด๋–ค ๊ธˆ์œต์‚ฌ๊ฐ€ ๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ € ๊ธฐ์ˆ ์„ ๊ฐœ๋ฐœํ•˜๋‚˜์š”?
135
  CYPHER QUERY:
136
  MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
137
+ WHERE t.name CONTAINS "๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €" OR t.name CONTAINS "์•Œ๊ณ ๋ฆฌ์ฆ˜"
138
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(t)
139
  RETURN c.name AS company_name, t.name AS tech_name, a.title AS article_title, a.url AS article_url""",
140
  """USER INPUT: ๊ธˆ์œต์ด๋‚˜ ํ•€ํ…Œํฌ ๋ถ„์•ผ์— ๊ธฐ์ˆ ์„ ์ ์šฉํ•˜๊ณ  ์žˆ๋Š” ๊ธฐ์—…๋“ค์€ ์–ด๋””์•ผ?
 
150
  OPTIONAL MATCH (a:Article)-[:MENTIONS]->(s)
151
  RETURN DISTINCT c.name AS company_name, s.name AS service_name, f.name AS field_name, a.title AS article_title, a.url AS article_url
152
  LIMIT 3""",
153
+ """USER INPUT: ์ตœ๊ทผ ๊ธˆ์œต AI ๊ด€๋ จ ๋‰ด์Šค ๊ธฐ์‚ฌ๋ฅผ ์š”์•ฝํ•ด์ค˜
154
  CYPHER QUERY:
155
  MATCH (a:Article)-[:HAS_CHUNK]->(c:Content)
156
  RETURN a.title AS title, a.url AS url, a.published_date AS published_date, c.chunk AS chunk
157
  ORDER BY a.published_date DESC
158
  LIMIT 3""",
159
+ """USER INPUT: ์ตœ๊ทผ ๊ฐ€์žฅ ๊ด€์‹ฌ์ด ๋†’์€ ๊ธˆ์œต AI ๊ธฐ์ˆ ์ด ๋ญ์•ผ?
160
  CYPHER QUERY:
161
  MATCH (a:Article)-[:MENTIONS]->(t:AITechnology)
162
  OPTIONAL MATCH (c:AICompany)-[:DEVELOPS]->(t)
 
164
  ORDER BY article_count DESC
165
  RETURN t.name AS tech_name, t.description AS description, article_count, companies, article_titles, article_urls
166
  LIMIT 5""",
167
+ """USER INPUT: ๊ธˆ์œต AI ๊ธฐ์ˆ  ํŠธ๋ Œ๋“œ๋ฅผ ๋ถ„์„ํ•ด์ค˜
168
  CYPHER QUERY:
169
  MATCH (a:Article)-[:MENTIONS]->(t:AITechnology)
170
  OPTIONAL MATCH (c:AICompany)-[:DEVELOPS]->(t)
 
172
  ORDER BY article_count DESC
173
  RETURN t.name AS tech_name, article_count, companies, article_titles, article_urls
174
  LIMIT 5""",
175
+ """USER INPUT: ํ† ์Šค ๋˜๋Š” ์นด์นด์˜คํŽ˜์ด ๊ด€๋ จ ๊ธˆ์œต AI ๋‰ด์Šค ์•Œ๋ ค์ค˜
176
  CYPHER QUERY:
177
  MATCH (a:Article)-[:MENTIONS]->(c:AICompany)
178
+ WHERE c.name CONTAINS 'ํ† ์Šค' OR c.name CONTAINS '์นด์นด์˜คํŽ˜์ด'
179
  OPTIONAL MATCH (a)-[:MENTIONS]->(t:AITechnology)
180
  OPTIONAL MATCH (a)-[:MENTIONS]->(s:AIService)
181
  RETURN a.title AS article_title, a.url AS article_url, a.published_date AS article_date,
tests/smoke_test_rag.py CHANGED
@@ -188,39 +188,39 @@ if __name__ == "__main__":
188
 
189
  results = []
190
 
191
- # ์‹œ๋‚˜๋ฆฌ์˜ค 1: ์‚ผ์„ฑ์ „์ž ๊ฐ€์šฐ์Šค 2 AI ํŠธ๋ Œ๋“œ
192
  results.append(run_scenario(
193
- label="โ‘  ์‚ผ์„ฑ์ „์ž โ€” ์‚ผ์„ฑ์ „์ž์˜ ์ž์ฒด AI ๋ชจ๋ธ์ธ '์‚ผ์„ฑ ๊ฐ€์šฐ์Šค 2'์˜ ํŠน์ง•๊ณผ ์ฃผ์š” ์ ์šฉ ๊ณ„ํš์„ ์•Œ๋ ค์ค˜",
194
- query="์‚ผ์„ฑ์ „์ž์˜ ์ž์ฒด AI ๋ชจ๋ธ์ธ '์‚ผ์„ฑ ๊ฐ€์šฐ์Šค 2'์˜ ํŠน์ง•๊ณผ ์ฃผ์š” ์ ์šฉ ๊ณ„ํš์„ ์•Œ๋ ค์ค˜",
195
- expected_keywords=["์‚ผ์„ฑ", "๊ฐ€์šฐ์Šค"],
196
  ))
197
 
198
- # ์‹œ๋‚˜๋ฆฌ์˜ค 2: ์นด์นด์˜ค ์นด๋‚˜๋‚˜ AI ๋ธŒ๋žœ๋“œ
199
  results.append(run_scenario(
200
- label="โ‘ก ์นด์นด์˜ค โ€” ์นด์นด์˜ค๊ฐ€ ๊ณต๊ฐœํ•œ AI ๋ธŒ๋žœ๋“œ '์นด๋‚˜๋‚˜(Kanana)'์™€ ์นด๋‚˜๋‚˜ ์›Œํฌ ๋“ฑ ์„œ๋น„์Šค ๋ผ์ธ์—…์„ ์„ค๋ช…ํ•ด์ค˜",
201
- query="์นด์นด์˜ค๊ฐ€ ๊ณต๊ฐœํ•œ AI ๋ธŒ๋žœ๋“œ '์นด๋‚˜๋‚˜(Kanana)'์™€ ์นด๋‚˜๋‚˜ ์›Œํฌ ๋“ฑ ์„œ๋น„์Šค ๋ผ์ธ์—…์„ ์„ค๋ช…ํ•ด์ค˜",
202
- expected_keywords=["์นด์นด์˜ค", "์นด๋‚˜๋‚˜"],
203
  ))
204
 
205
- # ์‹œ๋‚˜๋ฆฌ์˜ค 3: AWS ํ”ผ์ง€์ปฌ AI ๋ฐ ์—์ด์ „ํ‹ฑ AI
206
  results.append(run_scenario(
207
- label="โ‘ข AWS โ€” AWS๊ฐ€ ๊ฐ•์กฐํ•˜๋Š” 'ํ”ผ์ง€์ปฌ AI'์™€ '์—์ด์ „ํ‹ฑ AI' ๊ธฐ์ˆ ์˜ ํ•œ๊ตญ ์‹œ์žฅ ์ง€์› ๋ฐ ํ˜‘๋ ฅ ๋ฐฉ์•ˆ์€ ๋ฌด์—‡์ธ๊ฐ€์š”?",
208
- query="AWS๊ฐ€ ๊ฐ•์กฐํ•˜๋Š” 'ํ”ผ์ง€์ปฌ AI'์™€ '์—์ด์ „ํ‹ฑ AI' ๊ธฐ์ˆ ์˜ ํ•œ๊ตญ ์‹œ์žฅ ์ง€์› ๋ฐ ํ˜‘๋ ฅ ๋ฐฉ์•ˆ์€ ๋ฌด์—‡์ธ๊ฐ€์š”?",
209
- expected_keywords=["AWS", "ํ”ผ์ง€์ปฌ", "์—์ด์ „ํ‹ฑ"],
210
  ))
211
 
212
- # ์‹œ๋‚˜๋ฆฌ์˜ค 4: ๊ตฌ๊ธ€ I/O ์ œ๋ฏธ๋‚˜์ด ๊ธฐ์ˆ  ๋ณ€ํ™”
213
  results.append(run_scenario(
214
- label="โ‘ฃ ๊ตฌ๊ธ€ โ€” ๊ตฌ๊ธ€์ด I/O ํ–‰์‚ฌ์—์„œ ๋ฐœํ‘œํ•œ AI ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰ ๋ณ€ํ™”์™€ '์ œ๋ฏธ๋‚˜์ด(Gemini)' ๊ธฐ์ˆ ์˜ ์ ์šฉ ์‚ฌ๋ก€๋ฅผ ์•Œ๋ ค์ค˜",
215
- query="๊ตฌ๊ธ€์ด I/O ํ–‰์‚ฌ์—์„œ ๋ฐœํ‘œํ•œ AI ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰ ๋ณ€ํ™”์™€ '์ œ๋ฏธ๋‚˜์ด(Gemini)' ๊ธฐ์ˆ ์˜ ์ ์šฉ ์‚ฌ๋ก€๋ฅผ ์•Œ๋ ค์ค˜",
216
- expected_keywords=["๊ตฌ๊ธ€", "์ œ๋ฏธ๋‚˜์ด"],
217
  ))
218
 
219
  # 최종 요약
220
  print("=" * 60)
221
  print("๐Ÿ“‹ ์ตœ์ข… ์š”์•ฝ")
222
  print("=" * 60)
223
- labels = ["โ‘  ์‚ผ์„ฑ ๊ฐ€์šฐ์Šค 2", "โ‘ก ์นด์นด์˜ค ์นด๋‚˜๋‚˜", "โ‘ข AWS ํ”ผ์ง€์ปฌ AI", "โ‘ฃ ๊ตฌ๊ธ€ ์ œ๋ฏธ๋‚˜์ด"]
224
  for label, passed in zip(labels, results):
225
  print(f" {'โœ… PASS' if passed else 'โš ๏ธ PARTIAL'} | {label}")
226
  print()
 
188
 
189
  results = []
190
 
191
+ # ์‹œ๋‚˜๋ฆฌ์˜ค 1: ์‹ ํ•œ์€ํ–‰ AI ์  ํฌํŠธํด๋ฆฌ์˜ค
192
  results.append(run_scenario(
193
+ label="โ‘  ์‹ ํ•œ์€ํ–‰ โ€” ์‹ ํ•œ์€ํ–‰์˜ '์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค' ๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ € ๊ธฐ์ˆ ๊ณผ ๊ฐœ์ธ ๋งž์ถคํ˜• ์„œ๋น„์Šค์˜ ํŠน์ง•์„ ์„ค๋ช…ํ•ด์ค˜",
194
+ query="์‹ ํ•œ์€ํ–‰์˜ '์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค' ๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ € ๊ธฐ์ˆ ๊ณผ ๊ฐœ์ธ ๋งž์ถคํ˜• ์„œ๋น„์Šค์˜ ํŠน์ง•์„ ์„ค๋ช…ํ•ด์ค˜",
195
+ expected_keywords=["์‹ ํ•œ", "๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €"],
196
  ))
197
 
198
+ # ์‹œ๋‚˜๋ฆฌ์˜ค 2: ์นด์นด์˜คํŽ˜์ด AI ๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€
199
  results.append(run_scenario(
200
+ label="โ‘ก ์นด์นด์˜คํŽ˜์ด โ€” ์นด์นด์˜คํŽ˜์ด๊ฐ€ ์”ฌํŒŒ์ผ๋Ÿฌ๋ฅผ ์œ„ํ•ด ๊ฐœ๋ฐœํ•œ 'AI ๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€' ๋ชจ๋ธ์˜ ์žฅ์ ๊ณผ ๋Œ€์ถœ ์Šน์ธ ํšจ๊ณผ๋Š” ๋ฌด์—‡์ธ๊ฐ€์š”?",
201
+ query="์นด์นด์˜คํŽ˜์ด๊ฐ€ ์”ฌํŒŒ์ผ๋Ÿฌ๋ฅผ ์œ„ํ•ด ๊ฐœ๋ฐœํ•œ 'AI ๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€' ๋ชจ๋ธ์˜ ์žฅ์ ๊ณผ ๋Œ€์ถœ ์Šน์ธ ํšจ๊ณผ๋Š” ๋ฌด์—‡์ธ๊ฐ€์š”?",
202
+ expected_keywords=["์นด์นด์˜คํŽ˜์ด", "๋Œ€์•ˆ์‹ ์šฉํ‰๊ฐ€"],
203
  ))
204
 
205
+ # ์‹œ๋‚˜๋ฆฌ์˜ค 3: ํ† ์Šค๋ฑ…ํฌ AI FDS
206
  results.append(run_scenario(
207
+ label="โ‘ข ํ† ์Šค๋ฑ…ํฌ โ€” ํ† ์Šค๋ฑ…ํฌ์˜ ์‹ค์‹œ๊ฐ„ ๋ณด์ด์Šคํ”ผ์‹ฑ ํƒ์ง€ ๊ธฐ์ˆ ์ธ 'ํ† ์Šค AI FDS'์˜ ์ž‘๋™ ์›๋ฆฌ์™€ ์ฐจ๋‹จ์œจ์„ ์•Œ๋ ค์ค˜",
208
+ query="ํ† ์Šค๋ฑ…ํฌ์˜ ์‹ค์‹œ๊ฐ„ ๋ณด์ด์Šคํ”ผ์‹ฑ ํƒ์ง€ ๊ธฐ์ˆ ์ธ 'ํ† ์Šค AI FDS'์˜ ์ž‘๋™ ์›๋ฆฌ์™€ ์ฐจ๋‹จ์œจ์„ ์•Œ๋ ค์ค˜",
209
+ expected_keywords=["ํ† ์Šค", "FDS"],
210
  ))
211
 
212
+ # ์‹œ๋‚˜๋ฆฌ์˜ค 4: ๋„ค์ด๋ฒ„ํŽ˜์ด AI ๊ธˆ์œต ๋น„์„œ
213
  results.append(run_scenario(
214
+ label="โ‘ฃ ๋„ค์ด๋ฒ„ํŽ˜์ด โ€” ๋„ค์ด๋ฒ„ํŽ˜์ด๊ฐ€ ์ถœ์‹œํ•œ 'AI ๊ธˆ์œต ๋น„์„œ'๊ฐ€ ๋งˆ์ด๋ฐ์ดํ„ฐ์™€ ๊ฒฐํ•ฉํ•˜์—ฌ ์ œ๊ณตํ•˜๋Š” ๋งž์ถค ์ž์‚ฐ ๊ฐ€์ด๋“œ๋Š” ์–ด๋–ค ๊ฒƒ์ธ๊ฐ€์š”?",
215
+ query="๋„ค์ด๋ฒ„ํŽ˜์ด๊ฐ€ ์ถœ์‹œํ•œ 'AI ๊ธˆ์œต ๋น„์„œ'๊ฐ€ ๋งˆ์ด๋ฐ์ดํ„ฐ์™€ ๊ฒฐํ•ฉํ•˜์—ฌ ์ œ๊ณตํ•˜๋Š” ๋งž์ถค ์ž์‚ฐ ๊ฐ€์ด๋“œ๋Š” ์–ด๋–ค ๊ฒƒ์ธ๊ฐ€์š”?",
216
+ expected_keywords=["๋„ค์ด๋ฒ„ํŽ˜์ด", "๋งˆ์ด๋ฐ์ดํ„ฐ"],
217
  ))
218
 
219
  # 최종 요약
220
  print("=" * 60)
221
  print("๐Ÿ“‹ ์ตœ์ข… ์š”์•ฝ")
222
  print("=" * 60)
223
+ labels = ["โ‘  ์‹ ํ•œ AI ์  ํฌํŠธํด๋ฆฌ์˜ค", "โ‘ก ์นด์นด์˜คํŽ˜์ด AI ์‹ ์šฉํ‰๊ฐ€", "โ‘ข ํ† ์Šค AI FDS", "โ‘ฃ ๋„ค์ด๋ฒ„ํŽ˜์ด AI ๊ธˆ์œต ๋น„์„œ"]
224
  for label, passed in zip(labels, results):
225
  print(f" {'โœ… PASS' if passed else 'โš ๏ธ PARTIAL'} | {label}")
226
  print()