dev-yuje committed on
Commit
08fb91a
·
1 Parent(s): 3e720d0

style: 린트 포맷팅 자동 적용 및 프로젝트 기획안(AGENTS.md, README.md) 보완

Browse files
AGENTS.md CHANGED
@@ -1,9 +1,12 @@
1
  ###### 참고: https://wikidocs.net/340866
 
 
 
2
 
3
  # AGENTS.md
4
 
5
  ## 프로젝트 개요
6
- - 목적:
7
  - 언어: Python 3.10
8
  - 기술스택: GraphRAG, LangChain, LangGraph, Neo4j, HuggingFace, Gradio
9
 
 
1
  ###### 참고: https://wikidocs.net/340866
2
+ ###### 하네스 엔지니어링: Global 지침, Skills와 Workflow를 모두 포함하는 지침
3
+ ###### 개발 시작부터 배포까지 모든 것은 AGENTS.md에 기록한다.
4
+ ###### 예를 들어 개발 단계에서 체크리스트를 만들어서 개발을 할 때마다 하나씩 체크하도록 지시한다.
5
 
6
  # AGENTS.md
7
 
8
  ## 프로젝트 개요
9
+ - 목적: AI 기반 핀테크 기술의 트렌드를 파악하도록 돕는 챗봇
10
  - 언어: Python 3.10
11
  - 기술스택: GraphRAG, LangChain, LangGraph, Neo4j, HuggingFace, Gradio
12
 
README.md CHANGED
@@ -1,3 +1,13 @@
 
 
 
 
 
 
 
 
 
 
1
  # FinNode 🕸️
2
 
3
  **Neo4j GraphRAG 기반 AI 뉴스 지식 그래프 플랫폼**
 
1
+ ---
2
+ title: FinGraph
3
+ emoji: 🕸️
4
+ colorFrom: indigo
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
  # FinNode 🕸️
12
 
13
  **Neo4j GraphRAG 기반 AI 뉴스 지식 그래프 플랫폼**
app.py CHANGED
@@ -8,11 +8,12 @@ Gradio ChatInterface + LangGraph 기반 대화 흐름 제어.
8
  python app.py
9
  """
10
 
11
- import os
 
12
  import dotenv
13
  import gradio as gr
14
- from typing import TypedDict, List
15
- from langgraph.graph import StateGraph, END
16
  from src.retrieval.finRetrieval import graphrag
17
 
18
  dotenv.load_dotenv()
 
8
  python app.py
9
  """
10
 
11
+ from typing import List, TypedDict
12
+
13
  import dotenv
14
  import gradio as gr
15
+ from langgraph.graph import END, StateGraph
16
+
17
  from src.retrieval.finRetrieval import graphrag
18
 
19
  dotenv.load_dotenv()
run_pipeline.py CHANGED
@@ -1,6 +1,8 @@
1
  import json
 
 
2
  from pipeline.workflow import pipeline
3
- from pipeline.db_writer import write_graph_to_neo4j, chunk_and_embed_article
4
 
5
  def run_test():
6
  # 1. 모의 테스트용 뉴스 기사 데이터 준비
 
1
  import json
2
+
3
+ from pipeline.db_writer import chunk_and_embed_article, write_graph_to_neo4j
4
  from pipeline.workflow import pipeline
5
+
6
 
7
  def run_test():
8
  # 1. 모의 테스트용 뉴스 기사 데이터 준비
src/graphBuilder/neo4j/finGraph.py CHANGED
@@ -10,27 +10,31 @@ finGraph.py — AI 뉴스 지식 그래프 빌더
10
  MENTIONS, HAS_CHUNK, PUBLISHED
11
  """
12
 
13
- import os
14
  import glob
15
  import json
16
- import pandas as pd
17
- import neo4j
 
18
  import dotenv
19
- from typing import TypedDict, List, Dict
 
20
  from langchain_openai import ChatOpenAI
21
- from langgraph.graph import StateGraph, END
22
- from neo4j_graphrag.llm import OpenAILLM
23
  from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
24
  from neo4j_graphrag.indexes import create_vector_index
 
25
 
26
  dotenv.load_dotenv()
27
 
28
- URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
29
- AUTH = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
30
- driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
 
 
 
31
 
32
  chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
33
- rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
34
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
35
 
36
  INDEX_NAME = "content_vector_index"
@@ -39,6 +43,7 @@ INDEX_NAME = "content_vector_index"
39
  # 1. LangGraph 파이프라인 정의 (엔티티/관계 추출)
40
  # ──────────────────────────────────────────
41
 
 
42
  class ArticleState(TypedDict):
43
  article_id: str
44
  title: str
@@ -55,7 +60,10 @@ def check_ai_relevance(state: ArticleState) -> ArticleState:
55
  f"{state['text'][:400]}\n\n답변(yes/no):"
56
  )
57
  res = chat_llm.invoke(prompt)
58
- return {**state, "is_ai_related": res.content.strip().lower().startswith("yes")}
 
 
 
59
 
60
 
61
  def extract_entities(state: ArticleState) -> ArticleState:
@@ -67,8 +75,8 @@ def extract_entities(state: ArticleState) -> ArticleState:
67
  - AIService: 서비스/제품 (예: ChatGPT, HyperCLOVA X)
68
  - AIField: 적용 분야 (예: 금융AI, AI 반도체)
69
 
70
- 제목: {state['title']}
71
- 본문: {state['text'][:900]}
72
 
73
  JSON으로만 응답:{{"entities":[{{"name":"...","type":"AICompany|AITechnology|AIService|AIField","description":"..."}}]}}"""
74
  res = chat_llm.invoke(prompt)
@@ -156,19 +164,28 @@ def upsert_entity(tx, e: Dict) -> None:
156
  f"MERGE (n:{ntype} {{name:$name}}) "
157
  "ON CREATE SET n.description=$desc "
158
  "ON MATCH SET n.description=COALESCE(n.description,$desc)",
159
- name=e["name"], desc=e.get("description", ""),
 
160
  )
161
 
162
 
163
  def upsert_relation(tx, r: Dict) -> None:
164
  rel = r.get("relation", "RELATED_TO").upper().replace(" ", "_")
165
- allowed = {"DEVELOPS", "INVESTS_IN", "PARTNERS_WITH", "APPLIES", "USED_IN", "RELATED_TO"}
 
 
 
 
 
 
 
166
  if rel not in allowed:
167
  return
168
  try:
169
  tx.run(
170
  f"MATCH (s {{name:$src}}) MATCH (t {{name:$tgt}}) MERGE (s)-[:{rel}]->(t)",
171
- src=r["source"], tgt=r["target"],
 
172
  )
173
  except Exception:
174
  pass
@@ -176,24 +193,25 @@ def upsert_relation(tx, r: Dict) -> None:
176
 
177
  def upsert_article_and_mentions(tx, row: pd.Series, entities: List[Dict]) -> None:
178
  tx.run(
179
- "MERGE (a:Article {article_id:$aid}) "
180
- "SET a.title=$title, a.url=$url, a.published_date=$date",
181
- aid=row.get("article_id", ""), title=row.get("title", ""),
182
- url=row.get("url", ""), date=str(row.get("published_date", "")),
 
183
  )
184
  if pd.notna(row.get("source", "")):
185
  tx.run(
186
- "MERGE (m:Media {name:$src}) "
187
- "WITH m MATCH (a:Article {article_id:$aid}) MERGE (m)-[:PUBLISHED]->(a)",
188
- src=row["source"], aid=row.get("article_id", ""),
189
  )
190
  for e in entities:
191
  ntype = ENTITY_TYPE_MAP.get(e.get("type", "AICompany"), "AICompany")
192
  try:
193
  tx.run(
194
- f"MATCH (a:Article {{article_id:$aid}}) "
195
- f"MATCH (n:{ntype} {{name:$name}}) MERGE (a)-[:MENTIONS]->(n)",
196
- aid=row.get("article_id", ""), name=e["name"],
197
  )
198
  except Exception:
199
  pass
@@ -203,17 +221,14 @@ def chunk_text(text: str, size: int = 500, overlap: int = 50) -> List[str]:
203
  if not text or pd.isna(text):
204
  return []
205
  text = str(text)
206
- return [
207
- text[i:i + size].strip()
208
- for i in range(0, len(text), size - overlap)
209
- if text[i:i + size].strip()
210
- ]
211
 
212
 
213
  # ──────────────────────────────────────────
214
  # 3. 메인 실행 (스크립트로 직접 호출 시)
215
  # ──────────────────────────────────────────
216
 
 
217
  def main() -> None:
218
  # 최신 엑셀 로드
219
  xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
@@ -232,12 +247,16 @@ def main() -> None:
232
  # 엔티티/관계 추출 및 적재
233
  print(f"총 {len(df)}건 처리 시작...")
234
  for idx, row in df.iterrows():
235
- aid = str(row.get("article_id", f"ART_{idx}"))
236
  title = str(row.get("title", ""))
237
- text = title + "\n" + str(row.get("content", ""))
238
  state: ArticleState = dict(
239
- article_id=aid, title=title, text=text,
240
- is_ai_related=False, entities=[], relations=[],
 
 
 
 
241
  )
242
  out = pipeline.invoke(state)
243
  if out["is_ai_related"]:
@@ -247,15 +266,15 @@ def main() -> None:
247
  for r in out["relations"]:
248
  s.execute_write(upsert_relation, r)
249
  s.execute_write(upsert_article_and_mentions, row, out["entities"])
250
- print(f" ✅ [{idx+1}/{len(df)}] {title[:35]}... | 엔티티: {[e['name'] for e in out['entities'][:4]]}")
251
  else:
252
- print(f" ⏭️ [{idx+1}/{len(df)}] AI 비관련: {title[:35]}...")
253
  print("\n✅ 엔티티/관계 추출 및 Neo4j 적재 완료")
254
 
255
  # Content 청킹 + 임베딩
256
  print("Content 노드 생성 및 임베딩 시작...")
257
  for idx, row in df.iterrows():
258
- aid = str(row.get("article_id", f"ART_{idx}"))
259
  chunks = chunk_text(str(row.get("content", "")))
260
  with driver.session() as s:
261
  for i, chunk in enumerate(chunks):
@@ -265,13 +284,23 @@ def main() -> None:
265
  "MERGE (c:Content {content_id:$cid}) "
266
  "SET c.chunk=$chunk, c.article_id=$aid, c.chunk_index=$i, c.embedding=$vec "
267
  "WITH c MATCH (a:Article {article_id:$aid}) MERGE (a)-[:HAS_CHUNK]->(c)",
268
- cid=cid, chunk=chunk, aid=aid, i=i, vec=vec,
 
 
 
 
269
  )
270
  print("✅ Content 노드 임베딩 완료")
271
 
272
  # 벡터 인덱스 생성
273
- create_vector_index(driver, INDEX_NAME, label="Content",
274
- embedding_property="embedding", dimensions=1536, similarity_fn="cosine")
 
 
 
 
 
 
275
  print(f"✅ 벡터 인덱스 [{INDEX_NAME}] 생성 완료")
276
 
277
 
 
10
  MENTIONS, HAS_CHUNK, PUBLISHED
11
  """
12
 
 
13
  import glob
14
  import json
15
+ import os
16
+ from typing import Dict, List, TypedDict
17
+
18
  import dotenv
19
+ import neo4j
20
+ import pandas as pd
21
  from langchain_openai import ChatOpenAI
22
+ from langgraph.graph import END, StateGraph
 
23
  from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
24
  from neo4j_graphrag.indexes import create_vector_index
25
+ from neo4j_graphrag.llm import OpenAILLM
26
 
27
  dotenv.load_dotenv()
28
 
29
+ URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
30
+ AUTH = (
31
+ os.getenv("NEO4J_USERNAME", "neo4j"),
32
+ os.getenv("NEO4J_PASSWORD", "password"),
33
+ )
34
+ driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
35
 
36
  chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
37
+ rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
38
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
39
 
40
  INDEX_NAME = "content_vector_index"
 
43
  # 1. LangGraph 파이프라인 정의 (엔티티/관계 추출)
44
  # ──────────────────────────────────────────
45
 
46
+
47
  class ArticleState(TypedDict):
48
  article_id: str
49
  title: str
 
60
  f"{state['text'][:400]}\n\n답변(yes/no):"
61
  )
62
  res = chat_llm.invoke(prompt)
63
+ return {
64
+ **state,
65
+ "is_ai_related": res.content.strip().lower().startswith("yes"),
66
+ }
67
 
68
 
69
  def extract_entities(state: ArticleState) -> ArticleState:
 
75
  - AIService: 서비스/제품 (예: ChatGPT, HyperCLOVA X)
76
  - AIField: 적용 분야 (예: 금융AI, AI 반도체)
77
 
78
+ 제목: {state["title"]}
79
+ 본문: {state["text"][:900]}
80
 
81
  JSON으로만 응답:{{"entities":[{{"name":"...","type":"AICompany|AITechnology|AIService|AIField","description":"..."}}]}}"""
82
  res = chat_llm.invoke(prompt)
 
164
  f"MERGE (n:{ntype} {{name:$name}}) "
165
  "ON CREATE SET n.description=$desc "
166
  "ON MATCH SET n.description=COALESCE(n.description,$desc)",
167
+ name=e["name"],
168
+ desc=e.get("description", ""),
169
  )
170
 
171
 
172
  def upsert_relation(tx, r: Dict) -> None:
173
  rel = r.get("relation", "RELATED_TO").upper().replace(" ", "_")
174
+ allowed = {
175
+ "DEVELOPS",
176
+ "INVESTS_IN",
177
+ "PARTNERS_WITH",
178
+ "APPLIES",
179
+ "USED_IN",
180
+ "RELATED_TO",
181
+ }
182
  if rel not in allowed:
183
  return
184
  try:
185
  tx.run(
186
  f"MATCH (s {{name:$src}}) MATCH (t {{name:$tgt}}) MERGE (s)-[:{rel}]->(t)",
187
+ src=r["source"],
188
+ tgt=r["target"],
189
  )
190
  except Exception:
191
  pass
 
193
 
194
  def upsert_article_and_mentions(tx, row: pd.Series, entities: List[Dict]) -> None:
195
  tx.run(
196
+ "MERGE (a:Article {article_id:$aid}) SET a.title=$title, a.url=$url, a.published_date=$date",
197
+ aid=row.get("article_id", ""),
198
+ title=row.get("title", ""),
199
+ url=row.get("url", ""),
200
+ date=str(row.get("published_date", "")),
201
  )
202
  if pd.notna(row.get("source", "")):
203
  tx.run(
204
+ "MERGE (m:Media {name:$src}) WITH m MATCH (a:Article {article_id:$aid}) MERGE (m)-[:PUBLISHED]->(a)",
205
+ src=row["source"],
206
+ aid=row.get("article_id", ""),
207
  )
208
  for e in entities:
209
  ntype = ENTITY_TYPE_MAP.get(e.get("type", "AICompany"), "AICompany")
210
  try:
211
  tx.run(
212
+ f"MATCH (a:Article {{article_id:$aid}}) MATCH (n:{ntype} {{name:$name}}) MERGE (a)-[:MENTIONS]->(n)",
213
+ aid=row.get("article_id", ""),
214
+ name=e["name"],
215
  )
216
  except Exception:
217
  pass
 
221
  if not text or pd.isna(text):
222
  return []
223
  text = str(text)
224
+ return [text[i : i + size].strip() for i in range(0, len(text), size - overlap) if text[i : i + size].strip()]
 
 
 
 
225
 
226
 
227
  # ──────────────────────────────────────────
228
  # 3. 메인 실행 (스크립트로 직접 호출 시)
229
  # ──────────────────────────────────────────
230
 
231
+
232
  def main() -> None:
233
  # 최신 엑셀 로드
234
  xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
 
247
  # 엔티티/관계 추출 및 적재
248
  print(f"총 {len(df)}건 처리 시작...")
249
  for idx, row in df.iterrows():
250
+ aid = str(row.get("article_id", f"ART_{idx}"))
251
  title = str(row.get("title", ""))
252
+ text = title + "\n" + str(row.get("content", ""))
253
  state: ArticleState = dict(
254
+ article_id=aid,
255
+ title=title,
256
+ text=text,
257
+ is_ai_related=False,
258
+ entities=[],
259
+ relations=[],
260
  )
261
  out = pipeline.invoke(state)
262
  if out["is_ai_related"]:
 
266
  for r in out["relations"]:
267
  s.execute_write(upsert_relation, r)
268
  s.execute_write(upsert_article_and_mentions, row, out["entities"])
269
+ print(f" ✅ [{idx + 1}/{len(df)}] {title[:35]}... | 엔티티: {[e['name'] for e in out['entities'][:4]]}")
270
  else:
271
+ print(f" ⏭️ [{idx + 1}/{len(df)}] AI 비관련: {title[:35]}...")
272
  print("\n✅ 엔티티/관계 추출 및 Neo4j 적재 완료")
273
 
274
  # Content 청킹 + 임베딩
275
  print("Content 노드 생성 및 임베딩 시작...")
276
  for idx, row in df.iterrows():
277
+ aid = str(row.get("article_id", f"ART_{idx}"))
278
  chunks = chunk_text(str(row.get("content", "")))
279
  with driver.session() as s:
280
  for i, chunk in enumerate(chunks):
 
284
  "MERGE (c:Content {content_id:$cid}) "
285
  "SET c.chunk=$chunk, c.article_id=$aid, c.chunk_index=$i, c.embedding=$vec "
286
  "WITH c MATCH (a:Article {article_id:$aid}) MERGE (a)-[:HAS_CHUNK]->(c)",
287
+ cid=cid,
288
+ chunk=chunk,
289
+ aid=aid,
290
+ i=i,
291
+ vec=vec,
292
  )
293
  print("✅ Content 노드 임베딩 완료")
294
 
295
  # 벡터 인덱스 생성
296
+ create_vector_index(
297
+ driver,
298
+ INDEX_NAME,
299
+ label="Content",
300
+ embedding_property="embedding",
301
+ dimensions=1536,
302
+ similarity_fn="cosine",
303
+ )
304
  print(f"✅ 벡터 인덱스 [{INDEX_NAME}] 생성 완료")
305
 
306
 
src/graphBuilder/scrapping/finScrapping.py CHANGED
@@ -1,166 +1,208 @@
 
 
 
 
1
 
2
-
3
-
4
  from selenium import webdriver
 
5
  from selenium.webdriver.common.by import By
6
  from webdriver_manager.chrome import ChromeDriverManager
7
- from selenium.webdriver.chrome.service import Service
8
- import pandas as pd
9
- import time
10
- from datetime import datetime
11
- import re
12
- from collections import Counter
13
 
14
  # 수집 대상 카테고리
15
  categories = {
16
- '경제': 'https://news.naver.com/section/101',
17
- 'IT/과학': 'https://news.naver.com/section/105',
18
  }
19
  NUM_ARTICLES_PER_CATEGORY = 80
20
 
21
  # AI 핀테크 키워드 (FinNode 프로젝트 전용)
22
  FINTECH_AI_KEYWORDS = [
23
  # AI 기술
24
- 'AI', '인공지능', '생성형 AI', '대규모언어모델',
 
 
 
25
  # AI 핀테크 (금융)
26
- '핀테크',
27
  ]
28
 
29
- print('[INIT] ChromeDriver 초기화 중...')
30
  service = Service(ChromeDriverManager().install())
31
  options = webdriver.ChromeOptions()
32
- options.add_argument('--no-sandbox')
33
- options.add_argument('--disable-dev-shm-usage')
34
  driver = webdriver.Chrome(service=service, options=options)
35
- print('[INIT] ✅ 브라우저 실행 완료')
 
36
 
37
  def get_article_links(driver, category_url, num_articles):
38
- print(f' [LINK] 페이지 이동: {category_url}')
39
  driver.get(category_url)
40
  time.sleep(3)
41
- print(f' [LINK] 로드 완료 (title: {driver.title})')
42
 
43
  article_links = []
44
  selectors = [
45
- 'a.sa_text_title', 'a.sa_text_lede', 'a.sa_text_strong',
46
- '.sa_text a', '.cluster_text_headline a', '.cluster_text_lede a'
 
 
 
 
47
  ]
48
 
49
  for selector in selectors:
50
  elements = driver.find_elements(By.CSS_SELECTOR, selector)
51
  print(f" [LINK] 셀렉터 '{selector}' -> {len(elements)}개 발견")
52
  for element in elements:
53
- url = element.get_attribute('href')
54
- if (url and 'news.naver.com' in url and '/article/' in url
55
- and '/comment/' not in url and url not in article_links):
 
 
 
 
 
56
  article_links.append(url)
57
  if len(article_links) >= num_articles:
58
  break
59
  if len(article_links) >= num_articles:
60
  break
61
 
62
- print(f' [LINK] ✅ 총 {len(article_links)}개 링크 확보\n')
63
  return article_links[:num_articles]
64
 
 
65
  def parse_article_detail(driver, article_url, category):
66
  driver.get(article_url)
67
  time.sleep(1.5)
68
  article_data = {
69
- 'article_id': '', 'title': '', 'content': '', 'url': article_url,
70
- 'published_date': '', 'source': '', 'author': '', 'category': category
 
 
 
 
 
 
71
  }
72
  try:
73
- match = re.search(r'article/(\d+)/(\d+)', article_url)
74
- article_data['article_id'] = (
75
- f"ART_{match.group(1)}_{match.group(2)}" if match
76
- else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
77
  )
78
- for sel in ['#title_area span', '#ct .media_end_head_headline',
79
- '.media_end_head_headline', 'h2#title_area', '.news_end_title']:
 
 
 
 
 
80
  try:
81
  el = driver.find_element(By.CSS_SELECTOR, sel)
82
  if el.text.strip():
83
- article_data['title'] = el.text.strip(); break
84
- except: continue
85
- for sel in ['#dic_area', 'article#dic_area',
86
- '.go_trans._article_content', '._article_body_contents']:
 
 
 
 
 
 
87
  try:
88
  el = driver.find_element(By.CSS_SELECTOR, sel)
89
  if el.text.strip():
90
- article_data['content'] = el.text.strip(); break
91
- except: continue
 
 
92
  try:
93
- el = driver.find_element(By.CSS_SELECTOR, 'a.media_end_head_top_logo img')
94
- article_data['source'] = el.get_attribute('alt')
95
  except:
96
  try:
97
- el = driver.find_element(By.CSS_SELECTOR, '.media_end_head_top_logo_text')
98
- article_data['source'] = el.text.strip()
99
- except: pass
 
100
  try:
101
- el = driver.find_element(By.CSS_SELECTOR,
102
- 'span.media_end_head_info_datestamp_time, span[data-date-time]')
103
- article_data['published_date'] = (el.get_attribute('data-date-time') or el.text).strip()
 
 
104
  except:
105
- article_data['published_date'] = datetime.now().strftime('%Y-%m-%d %H:%M')
106
  try:
107
- el = driver.find_element(By.CSS_SELECTOR,
108
- 'em.media_end_head_journalist_name, span.byline_s')
109
- article_data['author'] = el.text.strip()
110
- except: pass
 
 
 
111
  except Exception as e:
112
- print(f' [PARSE] ⚠️ 파싱 오류: {e}')
113
  return article_data
114
 
 
115
  # ── 1단계: 전체 기사 수집 ──
116
  all_articles = []
117
  category_stats = {}
118
 
119
  for category_name, category_url in categories.items():
120
- print(f"\n{'='*60}")
121
- print(f'[CRAWL] [{category_name}] 카테고리 수집 시작')
122
- print(f"{'='*60}")
123
 
124
  article_links = get_article_links(driver, category_url, NUM_ARTICLES_PER_CATEGORY)
125
 
126
  cat_ok, cat_fail = 0, 0
127
  for idx, article_url in enumerate(article_links, 1):
128
- print(f' [PARSE] ({idx}/{len(article_links)}) {article_url[:70]}...')
129
  article_data = parse_article_detail(driver, article_url, category_name)
130
 
131
- if article_data['title'] and article_data['content']:
132
  all_articles.append(article_data)
133
  cat_ok += 1
134
  print(f" ✅ {article_data['title'][:40]}...")
135
  print(f" 언론사: {article_data['source']} | 날짜: {article_data['published_date']}")
136
  else:
137
  cat_fail += 1
138
- missing = [x for x, v in [('제목', article_data['title']), ('본문', article_data['content'])] if not v]
 
 
 
 
 
 
 
139
  print(f" ❌ 파싱실패 ({', '.join(missing)} 없음)")
140
  time.sleep(0.5)
141
 
142
- category_stats[category_name] = {'ok': cat_ok, 'fail': cat_fail}
143
  print(f"\n [CRAWL] [{category_name}] 완료: 성공 {cat_ok}개 / 실패 {cat_fail}개")
144
 
145
  driver.quit()
146
- print(f'\n[DONE] 브라우저 종료')
147
- print(f"\n{'='*60}")
148
- print(f'[SUMMARY] 수집 결과 요약')
149
- print(f"{'='*60}")
150
  for cat, s in category_stats.items():
151
- print(f' {cat}: 성공 {s["ok"]}건 / 실패 {s["fail"]}건')
152
- print(f' 전체 수집: {len(all_articles)}건')
153
 
154
  df_all = pd.DataFrame(all_articles)
155
  df_all
156
 
157
 
158
-
159
-
160
  # ── 2단계: AI 핀테크 키워드 필터링 ──
161
- print(f"\n{'='*60}")
162
- print('[FILTER] AI 핀테크 키워드 필터링 시작')
163
- print(f"{'='*60}")
164
 
165
  filtered_articles = []
166
  for _, row in df_all.iterrows():
@@ -168,68 +210,74 @@ for _, row in df_all.iterrows():
168
  matched = [kw for kw in FINTECH_AI_KEYWORDS if kw.replace(" ", "") in text.replace(" ", "")]
169
  if matched:
170
  row_dict = row.to_dict()
171
- row_dict['matched_keywords'] = ', '.join(matched)
172
  filtered_articles.append(row_dict)
173
 
174
  df_filtered = pd.DataFrame(filtered_articles)
175
 
176
- print(f' 전체 수집: {len(df_all)}건')
177
- print(f' AI 핀테크 관련: {len(df_filtered)}건 ({len(df_filtered)/max(len(df_all),1)*100:.1f}%)')
178
- print(f'\n [키워드별 매칭 현황]')
179
- all_kw = [kw for row in filtered_articles for kw in row['matched_keywords'].split(', ')]
180
  kw_counts = Counter(all_kw)
181
  for kw in FINTECH_AI_KEYWORDS:
182
- print(f' {kw}: {kw_counts.get(kw, 0)}건')
183
 
184
  df_filtered
185
 
186
  # ── 3단계: 저장 ──
187
  output_filename = f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
188
- df_filtered.to_excel(output_filename, index=False, engine='openpyxl')
189
- print(f'[SAVE] ✅ 저장 완료: {output_filename}')
190
- print(f'[SAVE] - AI 핀테크 기사: {len(df_filtered)}건')
191
-
192
-
193
 
194
 
195
  # ── 4단계: 키워드 빈도 시각화 ──
196
- import matplotlib.pyplot as plt
197
  import platform
198
  from collections import Counter
199
 
 
 
200
  # 폰트 깨짐 방지 (Mac 환경: AppleGothic)
201
- if platform.system() == 'Darwin':
202
- plt.rc('font', family='AppleGothic')
203
- plt.rcParams['axes.unicode_minus'] = False
204
 
205
  if not filtered_articles:
206
- print('시각화할 데이터가 없습니다.')
207
  else:
208
  # 빈도수 계산
209
- all_kw = [kw for row in filtered_articles for kw in row['matched_keywords'].split(', ')]
210
  kw_counts = Counter(all_kw)
211
-
212
  # 📌 변경 포인트: FINTECH_AI_KEYWORDS 전체 목록을 순서대로 그래프에 강제 표시 (0건 포함)
213
  keywords = FINTECH_AI_KEYWORDS
214
  counts = [kw_counts.get(kw, 0) for kw in keywords]
215
-
216
  plt.figure(figsize=(12, 6))
217
-
218
  # 막대 그래프 생성
219
- bars = plt.bar(keywords, counts, color='skyblue', edgecolor='white')
220
-
221
  # 막대 위에 숫자(빈도수) 표시
222
  for bar in bars:
223
  height = bar.get_height()
224
  # 막대의 중앙(x), 막대의 높이(y) 위치에 텍스트를 배치
225
- plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{height}',
226
- ha='center', va='bottom', size=11, fontweight='bold', color='black')
 
 
 
 
 
 
 
 
227
 
228
- plt.title('수집된 AI 핀테크 기사 키워드 출현 빈도 (전체)', fontsize=15, pad=15)
229
- plt.xlabel('키워드', fontsize=12)
230
- plt.ylabel('출현 횟수 (건)', fontsize=12)
231
- plt.grid(axis='y', linestyle='--', alpha=0.7)
232
  plt.xticks(rotation=45)
233
  plt.tight_layout()
234
  plt.show()
235
-
 
1
+ import re
2
+ import time
3
+ from collections import Counter
4
+ from datetime import datetime
5
 
6
+ import pandas as pd
 
7
  from selenium import webdriver
8
+ from selenium.webdriver.chrome.service import Service
9
  from selenium.webdriver.common.by import By
10
  from webdriver_manager.chrome import ChromeDriverManager
 
 
 
 
 
 
11
 
12
  # 수집 대상 카테고리
13
  categories = {
14
+ "경제": "https://news.naver.com/section/101",
15
+ "IT/과학": "https://news.naver.com/section/105",
16
  }
17
  NUM_ARTICLES_PER_CATEGORY = 80
18
 
19
  # AI 핀테크 키워드 (FinNode 프로젝트 전용)
20
  FINTECH_AI_KEYWORDS = [
21
  # AI 기술
22
+ "AI",
23
+ "인공지능",
24
+ "생성형 AI",
25
+ "대규모언어모델",
26
  # AI 핀테크 (금융)
27
+ "핀테크",
28
  ]
29
 
30
+ print("[INIT] ChromeDriver 초기화 중...")
31
  service = Service(ChromeDriverManager().install())
32
  options = webdriver.ChromeOptions()
33
+ options.add_argument("--no-sandbox")
34
+ options.add_argument("--disable-dev-shm-usage")
35
  driver = webdriver.Chrome(service=service, options=options)
36
+ print("[INIT] ✅ 브라우저 실행 완료")
37
+
38
 
39
  def get_article_links(driver, category_url, num_articles):
40
+ print(f" [LINK] 페이지 이동: {category_url}")
41
  driver.get(category_url)
42
  time.sleep(3)
43
+ print(f" [LINK] 로드 완료 (title: {driver.title})")
44
 
45
  article_links = []
46
  selectors = [
47
+ "a.sa_text_title",
48
+ "a.sa_text_lede",
49
+ "a.sa_text_strong",
50
+ ".sa_text a",
51
+ ".cluster_text_headline a",
52
+ ".cluster_text_lede a",
53
  ]
54
 
55
  for selector in selectors:
56
  elements = driver.find_elements(By.CSS_SELECTOR, selector)
57
  print(f" [LINK] 셀렉터 '{selector}' -> {len(elements)}개 발견")
58
  for element in elements:
59
+ url = element.get_attribute("href")
60
+ if (
61
+ url
62
+ and "news.naver.com" in url
63
+ and "/article/" in url
64
+ and "/comment/" not in url
65
+ and url not in article_links
66
+ ):
67
  article_links.append(url)
68
  if len(article_links) >= num_articles:
69
  break
70
  if len(article_links) >= num_articles:
71
  break
72
 
73
+ print(f" [LINK] ✅ 총 {len(article_links)}개 링크 확보\n")
74
  return article_links[:num_articles]
75
 
76
+
77
  def parse_article_detail(driver, article_url, category):
78
  driver.get(article_url)
79
  time.sleep(1.5)
80
  article_data = {
81
+ "article_id": "",
82
+ "title": "",
83
+ "content": "",
84
+ "url": article_url,
85
+ "published_date": "",
86
+ "source": "",
87
+ "author": "",
88
+ "category": category,
89
  }
90
  try:
91
+ match = re.search(r"article/(\d+)/(\d+)", article_url)
92
+ article_data["article_id"] = (
93
+ f"ART_{match.group(1)}_{match.group(2)}" if match else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
 
94
  )
95
+ for sel in [
96
+ "#title_area span",
97
+ "#ct .media_end_head_headline",
98
+ ".media_end_head_headline",
99
+ "h2#title_area",
100
+ ".news_end_title",
101
+ ]:
102
  try:
103
  el = driver.find_element(By.CSS_SELECTOR, sel)
104
  if el.text.strip():
105
+ article_data["title"] = el.text.strip()
106
+ break
107
+ except:
108
+ continue
109
+ for sel in [
110
+ "#dic_area",
111
+ "article#dic_area",
112
+ ".go_trans._article_content",
113
+ "._article_body_contents",
114
+ ]:
115
  try:
116
  el = driver.find_element(By.CSS_SELECTOR, sel)
117
  if el.text.strip():
118
+ article_data["content"] = el.text.strip()
119
+ break
120
+ except:
121
+ continue
122
  try:
123
+ el = driver.find_element(By.CSS_SELECTOR, "a.media_end_head_top_logo img")
124
+ article_data["source"] = el.get_attribute("alt")
125
  except:
126
  try:
127
+ el = driver.find_element(By.CSS_SELECTOR, ".media_end_head_top_logo_text")
128
+ article_data["source"] = el.text.strip()
129
+ except:
130
+ pass
131
  try:
132
+ el = driver.find_element(
133
+ By.CSS_SELECTOR,
134
+ "span.media_end_head_info_datestamp_time, span[data-date-time]",
135
+ )
136
+ article_data["published_date"] = (el.get_attribute("data-date-time") or el.text).strip()
137
  except:
138
+ article_data["published_date"] = datetime.now().strftime("%Y-%m-%d %H:%M")
139
  try:
140
+ el = driver.find_element(
141
+ By.CSS_SELECTOR,
142
+ "em.media_end_head_journalist_name, span.byline_s",
143
+ )
144
+ article_data["author"] = el.text.strip()
145
+ except:
146
+ pass
147
  except Exception as e:
148
+ print(f" [PARSE] ⚠️ 파싱 오류: {e}")
149
  return article_data
150
 
151
+
152
  # ── 1단계: 전체 기사 수집 ──
153
  all_articles = []
154
  category_stats = {}
155
 
156
  for category_name, category_url in categories.items():
157
+ print(f"\n{'=' * 60}")
158
+ print(f"[CRAWL] [{category_name}] 카테고리 수집 시작")
159
+ print(f"{'=' * 60}")
160
 
161
  article_links = get_article_links(driver, category_url, NUM_ARTICLES_PER_CATEGORY)
162
 
163
  cat_ok, cat_fail = 0, 0
164
  for idx, article_url in enumerate(article_links, 1):
165
+ print(f" [PARSE] ({idx}/{len(article_links)}) {article_url[:70]}...")
166
  article_data = parse_article_detail(driver, article_url, category_name)
167
 
168
+ if article_data["title"] and article_data["content"]:
169
  all_articles.append(article_data)
170
  cat_ok += 1
171
  print(f" ✅ {article_data['title'][:40]}...")
172
  print(f" 언론사: {article_data['source']} | 날짜: {article_data['published_date']}")
173
  else:
174
  cat_fail += 1
175
+ missing = [
176
+ x
177
+ for x, v in [
178
+ ("제목", article_data["title"]),
179
+ ("본문", article_data["content"]),
180
+ ]
181
+ if not v
182
+ ]
183
  print(f" ❌ 파싱실패 ({', '.join(missing)} 없음)")
184
  time.sleep(0.5)
185
 
186
+ category_stats[category_name] = {"ok": cat_ok, "fail": cat_fail}
187
  print(f"\n [CRAWL] [{category_name}] 완료: 성공 {cat_ok}개 / 실패 {cat_fail}개")
188
 
189
  driver.quit()
190
+ print("\n[DONE] 브라우저 종료")
191
+ print(f"\n{'=' * 60}")
192
+ print("[SUMMARY] 수집 결과 요약")
193
+ print(f"{'=' * 60}")
194
  for cat, s in category_stats.items():
195
+ print(f" {cat}: 성공 {s['ok']}건 / 실패 {s['fail']}건")
196
+ print(f" 전체 수집: {len(all_articles)}건")
197
 
198
  df_all = pd.DataFrame(all_articles)
199
  df_all
200
 
201
 
 
 
202
  # ── 2단계: AI 핀테크 키워드 필터링 ──
203
+ print(f"\n{'=' * 60}")
204
+ print("[FILTER] AI 핀테크 키워드 필터링 시작")
205
+ print(f"{'=' * 60}")
206
 
207
  filtered_articles = []
208
  for _, row in df_all.iterrows():
 
210
  matched = [kw for kw in FINTECH_AI_KEYWORDS if kw.replace(" ", "") in text.replace(" ", "")]
211
  if matched:
212
  row_dict = row.to_dict()
213
+ row_dict["matched_keywords"] = ", ".join(matched)
214
  filtered_articles.append(row_dict)
215
 
216
  df_filtered = pd.DataFrame(filtered_articles)
217
 
218
+ print(f" 전체 수집: {len(df_all)}건")
219
+ print(f" AI 핀테크 관련: {len(df_filtered)}건 ({len(df_filtered) / max(len(df_all), 1) * 100:.1f}%)")
220
+ print("\n [키워드별 매칭 현황]")
221
+ all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
222
  kw_counts = Counter(all_kw)
223
  for kw in FINTECH_AI_KEYWORDS:
224
+ print(f" {kw}: {kw_counts.get(kw, 0)}건")
225
 
226
  df_filtered
227
 
228
  # ── 3단계: 저장 ──
229
  output_filename = f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
230
+ df_filtered.to_excel(output_filename, index=False, engine="openpyxl")
231
+ print(f"[SAVE] ✅ 저장 완료: {output_filename}")
232
+ print(f"[SAVE] - AI 핀테크 기사: {len(df_filtered)}건")
 
 
233
 
234
 
235
  # ── 4단계: 키워드 빈도 시각화 ──
 
236
  import platform
237
  from collections import Counter
238
 
239
+ import matplotlib.pyplot as plt
240
+
241
  # 폰트 깨짐 방지 (Mac 환경: AppleGothic)
242
+ if platform.system() == "Darwin":
243
+ plt.rc("font", family="AppleGothic")
244
+ plt.rcParams["axes.unicode_minus"] = False
245
 
246
  if not filtered_articles:
247
+ print("시각화할 데이터가 없습니다.")
248
  else:
249
  # 빈도수 계산
250
+ all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
251
  kw_counts = Counter(all_kw)
252
+
253
  # 📌 변경 포인트: FINTECH_AI_KEYWORDS 전체 목록을 순서대로 그래프에 강제 표시 (0건 포함)
254
  keywords = FINTECH_AI_KEYWORDS
255
  counts = [kw_counts.get(kw, 0) for kw in keywords]
256
+
257
  plt.figure(figsize=(12, 6))
258
+
259
  # 막대 그래프 생성
260
+ bars = plt.bar(keywords, counts, color="skyblue", edgecolor="white")
261
+
262
  # 막대 위에 숫자(빈도수) 표시
263
  for bar in bars:
264
  height = bar.get_height()
265
  # 막대의 중앙(x), 막대의 높이(y) 위치에 텍스트를 배치
266
+ plt.text(
267
+ bar.get_x() + bar.get_width() / 2.0,
268
+ height,
269
+ f"{height}",
270
+ ha="center",
271
+ va="bottom",
272
+ size=11,
273
+ fontweight="bold",
274
+ color="black",
275
+ )
276
 
277
+ plt.title("수집된 AI 핀테크 기사 키워드 출현 빈도 (전체)", fontsize=15, pad=15)
278
+ plt.xlabel("키워드", fontsize=12)
279
+ plt.ylabel("출현 횟수 (건)", fontsize=12)
280
+ plt.grid(axis="y", linestyle="--", alpha=0.7)
281
  plt.xticks(rotation=45)
282
  plt.tight_layout()
283
  plt.show()
 
src/retrieval/finRetrieval.py CHANGED
@@ -11,17 +11,18 @@ app.py에서 import하여 Gradio 챗봇과 연동합니다.
11
  """
12
 
13
  import os
 
14
  import dotenv
15
  import neo4j
16
- from neo4j_graphrag.llm import OpenAILLM
17
  from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
 
 
18
  from neo4j_graphrag.retrievers import (
19
- VectorRetriever,
20
- VectorCypherRetriever,
21
  Text2CypherRetriever,
22
  ToolsRetriever,
 
 
23
  )
24
- from neo4j_graphrag.generation import RagTemplate, GraphRAG
25
 
26
  dotenv.load_dotenv()
27
 
@@ -29,11 +30,14 @@ dotenv.load_dotenv()
29
  # 1. DB / LLM / Embedder 초기화
30
  # ──────────────────────────────────────────
31
 
32
- URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
33
- AUTH = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
34
- driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
 
 
 
35
 
36
- rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
37
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
38
 
39
  INDEX_NAME = "content_vector_index"
@@ -76,6 +80,7 @@ vector_cypher_retriever = VectorCypherRetriever(
76
  embedder=embedder,
77
  )
78
 
 
79
  # (3) 자연어 → Cypher 자동 변환 검색
80
  def _get_schema() -> str:
81
  with driver.session() as s:
@@ -85,9 +90,7 @@ def _get_schema() -> str:
85
  "RETURN nodeType, collect(propertyName) as props"
86
  ).data()
87
  rels = s.run(
88
- "MATCH (n)-[r]->(m) "
89
- "RETURN DISTINCT labels(n)[0] as src, type(r) as rel, labels(m)[0] as tgt "
90
- "LIMIT 30"
91
  ).data()
92
  txt = "=== Neo4j Schema ===\n노드:\n"
93
  for n in nodes:
@@ -103,18 +106,15 @@ _examples = [
103
  CYPHER QUERY:
104
  MATCH (c:AICompany {name:"카카오"})-[:DEVELOPS]->(s:AIService)
105
  RETURN s.name, s.description""",
106
-
107
  """USER INPUT: 삼성전자가 개발 중인 AI 기술은?
108
  CYPHER QUERY:
109
  MATCH (c:AICompany {name:"삼성전자"})-[:DEVELOPS]->(t:AITechnology)
110
  RETURN t.name, t.description""",
111
-
112
  """USER INPUT: 최근 AI 관련 기사 5개
113
  CYPHER QUERY:
114
  MATCH (a:Article)-[:MENTIONS]->(:AICompany)
115
  RETURN DISTINCT a.article_id, a.title, a.url, a.published_date
116
  ORDER BY a.published_date DESC LIMIT 5""",
117
-
118
  """USER INPUT: 어떤 기업이 LLM 기술을 개발하나요?
119
  CYPHER QUERY:
120
  MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
 
11
  """
12
 
13
  import os
14
+
15
  import dotenv
16
  import neo4j
 
17
  from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
18
+ from neo4j_graphrag.generation import GraphRAG, RagTemplate
19
+ from neo4j_graphrag.llm import OpenAILLM
20
  from neo4j_graphrag.retrievers import (
 
 
21
  Text2CypherRetriever,
22
  ToolsRetriever,
23
+ VectorCypherRetriever,
24
+ VectorRetriever,
25
  )
 
26
 
27
  dotenv.load_dotenv()
28
 
 
30
  # 1. DB / LLM / Embedder 초기화
31
  # ──────────────────────────────────────────
32
 
33
+ URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
34
+ AUTH = (
35
+ os.getenv("NEO4J_USERNAME", "neo4j"),
36
+ os.getenv("NEO4J_PASSWORD", "password"),
37
+ )
38
+ driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
39
 
40
+ rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
41
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
42
 
43
  INDEX_NAME = "content_vector_index"
 
80
  embedder=embedder,
81
  )
82
 
83
+
84
  # (3) 자연어 → Cypher 자동 변환 검색
85
  def _get_schema() -> str:
86
  with driver.session() as s:
 
90
  "RETURN nodeType, collect(propertyName) as props"
91
  ).data()
92
  rels = s.run(
93
+ "MATCH (n)-[r]->(m) RETURN DISTINCT labels(n)[0] as src, type(r) as rel, labels(m)[0] as tgt LIMIT 30"
 
 
94
  ).data()
95
  txt = "=== Neo4j Schema ===\n노드:\n"
96
  for n in nodes:
 
106
  CYPHER QUERY:
107
  MATCH (c:AICompany {name:"카카오"})-[:DEVELOPS]->(s:AIService)
108
  RETURN s.name, s.description""",
 
109
  """USER INPUT: 삼성전자가 개발 중인 AI 기술은?
110
  CYPHER QUERY:
111
  MATCH (c:AICompany {name:"삼성전자"})-[:DEVELOPS]->(t:AITechnology)
112
  RETURN t.name, t.description""",
 
113
  """USER INPUT: 최근 AI 관련 기사 5개
114
  CYPHER QUERY:
115
  MATCH (a:Article)-[:MENTIONS]->(:AICompany)
116
  RETURN DISTINCT a.article_id, a.title, a.url, a.published_date
117
  ORDER BY a.published_date DESC LIMIT 5""",
 
118
  """USER INPUT: 어떤 기업이 LLM 기술을 개발하나요?
119
  CYPHER QUERY:
120
  MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
tests/test_chunk_text.py CHANGED
@@ -1,5 +1,6 @@
1
  from src.graphBuilder.neo4j.finGraph import chunk_text
2
 
 
3
  def test_chunk_text_empty_returns_empty_list():
4
  assert chunk_text("") == []
5
 
 
1
  from src.graphBuilder.neo4j.finGraph import chunk_text
2
 
3
+
4
  def test_chunk_text_empty_returns_empty_list():
5
  assert chunk_text("") == []
6
 
tests/test_retrieval.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
 
2
  import pytest
 
3
  from src.retrieval.finRetrieval import graphrag
4
 
5
  # API 키와 Neo4j 연결정보가 없을 경우 테스트를 건너뜁니다.
 
1
  import os
2
+
3
  import pytest
4
+
5
  from src.retrieval.finRetrieval import graphrag
6
 
7
  # API 키와 Neo4j 연결정보가 없을 경우 테스트를 건너뜁니다.