style: 린트 포맷팅 자동 적용 및 프로젝트 기획안(AGENTS.md, README.md) 보완
Browse files
- AGENTS.md +4 -1
- README.md +10 -0
- app.py +4 -3
- run_pipeline.py +3 -1
- src/graphBuilder/neo4j/finGraph.py +70 -41
- src/graphBuilder/scrapping/finScrapping.py +147 -99
- src/retrieval/finRetrieval.py +14 -14
- tests/test_chunk_text.py +1 -0
- tests/test_retrieval.py +2 -0
AGENTS.md
CHANGED
|
@@ -1,9 +1,12 @@
|
|
| 1 |
###### 참고: https://wikidocs.net/340866
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
# AGENTS.md
|
| 4 |
|
| 5 |
## 프로젝트 개요
|
| 6 |
-
- 목적:
|
| 7 |
- 언어: Python 3.10
|
| 8 |
- 기술스택: GraphRAG, LangChain, LangGraph, Neo4j, HuggingFace, Gradio
|
| 9 |
|
|
|
|
| 1 |
###### 참고: https://wikidocs.net/340866
|
| 2 |
+
###### 하네스 엔지니어링: Global 지침, Skills와 Workflow를 모두 포함하는 지침
|
| 3 |
+
###### 개발 시작부터 배포까지 모든 것은 AGENTS.md에 기록한다.
|
| 4 |
+
###### 예를 들어 개발 단계에서 체크리스트를 만들어서 개발을 할 때마다 하나씩 체크하도록 지시한다.
|
| 5 |
|
| 6 |
# AGENTS.md
|
| 7 |
|
| 8 |
## 프로젝트 개요
|
| 9 |
+
- 목적: AI 기반 핀테크 기술의 트렌드를 파악하도록 돕는 챗봇
|
| 10 |
- 언어: Python 3.10
|
| 11 |
- 기술스택: GraphRAG, LangChain, LangGraph, Neo4j, HuggingFace, Gradio
|
| 12 |
|
README.md
CHANGED
|
@@ -1,3 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# FinNode 🕸️
|
| 2 |
|
| 3 |
**Neo4j GraphRAG 기반 AI 뉴스 지식 그래프 플랫폼**
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: FinGraph
|
| 3 |
+
emoji: 🕸️
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
# FinNode 🕸️
|
| 12 |
|
| 13 |
**Neo4j GraphRAG 기반 AI 뉴스 지식 그래프 플랫폼**
|
app.py
CHANGED
|
@@ -8,11 +8,12 @@ Gradio ChatInterface + LangGraph 기반 대화 흐름 제어.
|
|
| 8 |
python app.py
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
import
|
|
|
|
| 12 |
import dotenv
|
| 13 |
import gradio as gr
|
| 14 |
-
from
|
| 15 |
-
|
| 16 |
from src.retrieval.finRetrieval import graphrag
|
| 17 |
|
| 18 |
dotenv.load_dotenv()
|
|
|
|
| 8 |
python app.py
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
from typing import List, TypedDict
|
| 12 |
+
|
| 13 |
import dotenv
|
| 14 |
import gradio as gr
|
| 15 |
+
from langgraph.graph import END, StateGraph
|
| 16 |
+
|
| 17 |
from src.retrieval.finRetrieval import graphrag
|
| 18 |
|
| 19 |
dotenv.load_dotenv()
|
run_pipeline.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import json
|
|
|
|
|
|
|
| 2 |
from pipeline.workflow import pipeline
|
| 3 |
-
|
| 4 |
|
| 5 |
def run_test():
|
| 6 |
# 1. 모의 테스트용 뉴스 기사 데이터 준비
|
|
|
|
| 1 |
import json
|
| 2 |
+
|
| 3 |
+
from pipeline.db_writer import chunk_and_embed_article, write_graph_to_neo4j
|
| 4 |
from pipeline.workflow import pipeline
|
| 5 |
+
|
| 6 |
|
| 7 |
def run_test():
|
| 8 |
# 1. 모의 테스트용 뉴스 기사 데이터 준비
|
src/graphBuilder/neo4j/finGraph.py
CHANGED
|
@@ -10,27 +10,31 @@ finGraph.py — AI 뉴스 지식 그래프 빌더
|
|
| 10 |
MENTIONS, HAS_CHUNK, PUBLISHED
|
| 11 |
"""
|
| 12 |
|
| 13 |
-
import os
|
| 14 |
import glob
|
| 15 |
import json
|
| 16 |
-
import
|
| 17 |
-
import
|
|
|
|
| 18 |
import dotenv
|
| 19 |
-
|
|
|
|
| 20 |
from langchain_openai import ChatOpenAI
|
| 21 |
-
from langgraph.graph import
|
| 22 |
-
from neo4j_graphrag.llm import OpenAILLM
|
| 23 |
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
|
| 24 |
from neo4j_graphrag.indexes import create_vector_index
|
|
|
|
| 25 |
|
| 26 |
dotenv.load_dotenv()
|
| 27 |
|
| 28 |
-
URI
|
| 29 |
-
AUTH
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
| 33 |
-
rag_llm
|
| 34 |
embedder = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 35 |
|
| 36 |
INDEX_NAME = "content_vector_index"
|
|
@@ -39,6 +43,7 @@ INDEX_NAME = "content_vector_index"
|
|
| 39 |
# 1. LangGraph 파이프라인 정의 (엔티티/관계 추출)
|
| 40 |
# ──────────────────────────────────────────
|
| 41 |
|
|
|
|
| 42 |
class ArticleState(TypedDict):
|
| 43 |
article_id: str
|
| 44 |
title: str
|
|
@@ -55,7 +60,10 @@ def check_ai_relevance(state: ArticleState) -> ArticleState:
|
|
| 55 |
f"{state['text'][:400]}\n\n답변(yes/no):"
|
| 56 |
)
|
| 57 |
res = chat_llm.invoke(prompt)
|
| 58 |
-
return {
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
|
| 61 |
def extract_entities(state: ArticleState) -> ArticleState:
|
|
@@ -67,8 +75,8 @@ def extract_entities(state: ArticleState) -> ArticleState:
|
|
| 67 |
- AIService: 서비스/제품 (예: ChatGPT, HyperCLOVA X)
|
| 68 |
- AIField: 적용 분야 (예: 금융AI, AI 반도체)
|
| 69 |
|
| 70 |
-
제목: {state[
|
| 71 |
-
본문: {state[
|
| 72 |
|
| 73 |
JSON으로만 응답:{{"entities":[{{"name":"...","type":"AICompany|AITechnology|AIService|AIField","description":"..."}}]}}"""
|
| 74 |
res = chat_llm.invoke(prompt)
|
|
@@ -156,19 +164,28 @@ def upsert_entity(tx, e: Dict) -> None:
|
|
| 156 |
f"MERGE (n:{ntype} {{name:$name}}) "
|
| 157 |
"ON CREATE SET n.description=$desc "
|
| 158 |
"ON MATCH SET n.description=COALESCE(n.description,$desc)",
|
| 159 |
-
name=e["name"],
|
|
|
|
| 160 |
)
|
| 161 |
|
| 162 |
|
| 163 |
def upsert_relation(tx, r: Dict) -> None:
|
| 164 |
rel = r.get("relation", "RELATED_TO").upper().replace(" ", "_")
|
| 165 |
-
allowed = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
if rel not in allowed:
|
| 167 |
return
|
| 168 |
try:
|
| 169 |
tx.run(
|
| 170 |
f"MATCH (s {{name:$src}}) MATCH (t {{name:$tgt}}) MERGE (s)-[:{rel}]->(t)",
|
| 171 |
-
src=r["source"],
|
|
|
|
| 172 |
)
|
| 173 |
except Exception:
|
| 174 |
pass
|
|
@@ -176,24 +193,25 @@ def upsert_relation(tx, r: Dict) -> None:
|
|
| 176 |
|
| 177 |
def upsert_article_and_mentions(tx, row: pd.Series, entities: List[Dict]) -> None:
|
| 178 |
tx.run(
|
| 179 |
-
"MERGE (a:Article {article_id:$aid}) "
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
url=row.get("url", ""),
|
|
|
|
| 183 |
)
|
| 184 |
if pd.notna(row.get("source", "")):
|
| 185 |
tx.run(
|
| 186 |
-
"MERGE (m:Media {name:$src}) "
|
| 187 |
-
|
| 188 |
-
|
| 189 |
)
|
| 190 |
for e in entities:
|
| 191 |
ntype = ENTITY_TYPE_MAP.get(e.get("type", "AICompany"), "AICompany")
|
| 192 |
try:
|
| 193 |
tx.run(
|
| 194 |
-
f"MATCH (a:Article {{article_id:$aid}}) "
|
| 195 |
-
|
| 196 |
-
|
| 197 |
)
|
| 198 |
except Exception:
|
| 199 |
pass
|
|
@@ -203,17 +221,14 @@ def chunk_text(text: str, size: int = 500, overlap: int = 50) -> List[str]:
|
|
| 203 |
if not text or pd.isna(text):
|
| 204 |
return []
|
| 205 |
text = str(text)
|
| 206 |
-
return [
|
| 207 |
-
text[i:i + size].strip()
|
| 208 |
-
for i in range(0, len(text), size - overlap)
|
| 209 |
-
if text[i:i + size].strip()
|
| 210 |
-
]
|
| 211 |
|
| 212 |
|
| 213 |
# ──────────────────────────────────────────
|
| 214 |
# 3. 메인 실행 (스크립트로 직접 호출 시)
|
| 215 |
# ──────────────────────────────────────────
|
| 216 |
|
|
|
|
| 217 |
def main() -> None:
|
| 218 |
# 최신 엑셀 로드
|
| 219 |
xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
|
|
@@ -232,12 +247,16 @@ def main() -> None:
|
|
| 232 |
# 엔티티/관계 추출 및 적재
|
| 233 |
print(f"총 {len(df)}건 처리 시작...")
|
| 234 |
for idx, row in df.iterrows():
|
| 235 |
-
aid
|
| 236 |
title = str(row.get("title", ""))
|
| 237 |
-
text
|
| 238 |
state: ArticleState = dict(
|
| 239 |
-
article_id=aid,
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
)
|
| 242 |
out = pipeline.invoke(state)
|
| 243 |
if out["is_ai_related"]:
|
|
@@ -247,15 +266,15 @@ def main() -> None:
|
|
| 247 |
for r in out["relations"]:
|
| 248 |
s.execute_write(upsert_relation, r)
|
| 249 |
s.execute_write(upsert_article_and_mentions, row, out["entities"])
|
| 250 |
-
print(f" ✅ [{idx+1}/{len(df)}] {title[:35]}... | 엔티티: {[e['name'] for e in out['entities'][:4]]}")
|
| 251 |
else:
|
| 252 |
-
print(f" ⏭️ [{idx+1}/{len(df)}] AI 비관련: {title[:35]}...")
|
| 253 |
print("\n✅ 엔티티/관계 추출 및 Neo4j 적재 완료")
|
| 254 |
|
| 255 |
# Content 청킹 + 임베딩
|
| 256 |
print("Content 노드 생성 및 임베딩 시작...")
|
| 257 |
for idx, row in df.iterrows():
|
| 258 |
-
aid
|
| 259 |
chunks = chunk_text(str(row.get("content", "")))
|
| 260 |
with driver.session() as s:
|
| 261 |
for i, chunk in enumerate(chunks):
|
|
@@ -265,13 +284,23 @@ def main() -> None:
|
|
| 265 |
"MERGE (c:Content {content_id:$cid}) "
|
| 266 |
"SET c.chunk=$chunk, c.article_id=$aid, c.chunk_index=$i, c.embedding=$vec "
|
| 267 |
"WITH c MATCH (a:Article {article_id:$aid}) MERGE (a)-[:HAS_CHUNK]->(c)",
|
| 268 |
-
cid=cid,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
)
|
| 270 |
print("✅ Content 노드 임베딩 완료")
|
| 271 |
|
| 272 |
# 벡터 인덱스 생성
|
| 273 |
-
create_vector_index(
|
| 274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
print(f"✅ 벡터 인덱스 [{INDEX_NAME}] 생성 완료")
|
| 276 |
|
| 277 |
|
|
|
|
| 10 |
MENTIONS, HAS_CHUNK, PUBLISHED
|
| 11 |
"""
|
| 12 |
|
|
|
|
| 13 |
import glob
|
| 14 |
import json
|
| 15 |
+
import os
|
| 16 |
+
from typing import Dict, List, TypedDict
|
| 17 |
+
|
| 18 |
import dotenv
|
| 19 |
+
import neo4j
|
| 20 |
+
import pandas as pd
|
| 21 |
from langchain_openai import ChatOpenAI
|
| 22 |
+
from langgraph.graph import END, StateGraph
|
|
|
|
| 23 |
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
|
| 24 |
from neo4j_graphrag.indexes import create_vector_index
|
| 25 |
+
from neo4j_graphrag.llm import OpenAILLM
|
| 26 |
|
| 27 |
dotenv.load_dotenv()
|
| 28 |
|
| 29 |
+
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
|
| 30 |
+
AUTH = (
|
| 31 |
+
os.getenv("NEO4J_USERNAME", "neo4j"),
|
| 32 |
+
os.getenv("NEO4J_PASSWORD", "password"),
|
| 33 |
+
)
|
| 34 |
+
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
|
| 35 |
|
| 36 |
chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
| 37 |
+
rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
|
| 38 |
embedder = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 39 |
|
| 40 |
INDEX_NAME = "content_vector_index"
|
|
|
|
| 43 |
# 1. LangGraph 파이프라인 정의 (엔티티/관계 추출)
|
| 44 |
# ──────────────────────────────────────────
|
| 45 |
|
| 46 |
+
|
| 47 |
class ArticleState(TypedDict):
|
| 48 |
article_id: str
|
| 49 |
title: str
|
|
|
|
| 60 |
f"{state['text'][:400]}\n\n답변(yes/no):"
|
| 61 |
)
|
| 62 |
res = chat_llm.invoke(prompt)
|
| 63 |
+
return {
|
| 64 |
+
**state,
|
| 65 |
+
"is_ai_related": res.content.strip().lower().startswith("yes"),
|
| 66 |
+
}
|
| 67 |
|
| 68 |
|
| 69 |
def extract_entities(state: ArticleState) -> ArticleState:
|
|
|
|
| 75 |
- AIService: 서비스/제품 (예: ChatGPT, HyperCLOVA X)
|
| 76 |
- AIField: 적용 분야 (예: 금융AI, AI 반도체)
|
| 77 |
|
| 78 |
+
제목: {state["title"]}
|
| 79 |
+
본문: {state["text"][:900]}
|
| 80 |
|
| 81 |
JSON으로만 응답:{{"entities":[{{"name":"...","type":"AICompany|AITechnology|AIService|AIField","description":"..."}}]}}"""
|
| 82 |
res = chat_llm.invoke(prompt)
|
|
|
|
| 164 |
f"MERGE (n:{ntype} {{name:$name}}) "
|
| 165 |
"ON CREATE SET n.description=$desc "
|
| 166 |
"ON MATCH SET n.description=COALESCE(n.description,$desc)",
|
| 167 |
+
name=e["name"],
|
| 168 |
+
desc=e.get("description", ""),
|
| 169 |
)
|
| 170 |
|
| 171 |
|
| 172 |
def upsert_relation(tx, r: Dict) -> None:
|
| 173 |
rel = r.get("relation", "RELATED_TO").upper().replace(" ", "_")
|
| 174 |
+
allowed = {
|
| 175 |
+
"DEVELOPS",
|
| 176 |
+
"INVESTS_IN",
|
| 177 |
+
"PARTNERS_WITH",
|
| 178 |
+
"APPLIES",
|
| 179 |
+
"USED_IN",
|
| 180 |
+
"RELATED_TO",
|
| 181 |
+
}
|
| 182 |
if rel not in allowed:
|
| 183 |
return
|
| 184 |
try:
|
| 185 |
tx.run(
|
| 186 |
f"MATCH (s {{name:$src}}) MATCH (t {{name:$tgt}}) MERGE (s)-[:{rel}]->(t)",
|
| 187 |
+
src=r["source"],
|
| 188 |
+
tgt=r["target"],
|
| 189 |
)
|
| 190 |
except Exception:
|
| 191 |
pass
|
|
|
|
| 193 |
|
| 194 |
def upsert_article_and_mentions(tx, row: pd.Series, entities: List[Dict]) -> None:
|
| 195 |
tx.run(
|
| 196 |
+
"MERGE (a:Article {article_id:$aid}) SET a.title=$title, a.url=$url, a.published_date=$date",
|
| 197 |
+
aid=row.get("article_id", ""),
|
| 198 |
+
title=row.get("title", ""),
|
| 199 |
+
url=row.get("url", ""),
|
| 200 |
+
date=str(row.get("published_date", "")),
|
| 201 |
)
|
| 202 |
if pd.notna(row.get("source", "")):
|
| 203 |
tx.run(
|
| 204 |
+
"MERGE (m:Media {name:$src}) WITH m MATCH (a:Article {article_id:$aid}) MERGE (m)-[:PUBLISHED]->(a)",
|
| 205 |
+
src=row["source"],
|
| 206 |
+
aid=row.get("article_id", ""),
|
| 207 |
)
|
| 208 |
for e in entities:
|
| 209 |
ntype = ENTITY_TYPE_MAP.get(e.get("type", "AICompany"), "AICompany")
|
| 210 |
try:
|
| 211 |
tx.run(
|
| 212 |
+
f"MATCH (a:Article {{article_id:$aid}}) MATCH (n:{ntype} {{name:$name}}) MERGE (a)-[:MENTIONS]->(n)",
|
| 213 |
+
aid=row.get("article_id", ""),
|
| 214 |
+
name=e["name"],
|
| 215 |
)
|
| 216 |
except Exception:
|
| 217 |
pass
|
|
|
|
| 221 |
if not text or pd.isna(text):
|
| 222 |
return []
|
| 223 |
text = str(text)
|
| 224 |
+
return [text[i : i + size].strip() for i in range(0, len(text), size - overlap) if text[i : i + size].strip()]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
|
| 227 |
# ──────────────────────────────────────────
|
| 228 |
# 3. 메인 실행 (스크립트로 직접 호출 시)
|
| 229 |
# ──────────────────────────────────────────
|
| 230 |
|
| 231 |
+
|
| 232 |
def main() -> None:
|
| 233 |
# 최신 엑셀 로드
|
| 234 |
xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
|
|
|
|
| 247 |
# 엔티티/관계 추출 및 적재
|
| 248 |
print(f"총 {len(df)}건 처리 시작...")
|
| 249 |
for idx, row in df.iterrows():
|
| 250 |
+
aid = str(row.get("article_id", f"ART_{idx}"))
|
| 251 |
title = str(row.get("title", ""))
|
| 252 |
+
text = title + "\n" + str(row.get("content", ""))
|
| 253 |
state: ArticleState = dict(
|
| 254 |
+
article_id=aid,
|
| 255 |
+
title=title,
|
| 256 |
+
text=text,
|
| 257 |
+
is_ai_related=False,
|
| 258 |
+
entities=[],
|
| 259 |
+
relations=[],
|
| 260 |
)
|
| 261 |
out = pipeline.invoke(state)
|
| 262 |
if out["is_ai_related"]:
|
|
|
|
| 266 |
for r in out["relations"]:
|
| 267 |
s.execute_write(upsert_relation, r)
|
| 268 |
s.execute_write(upsert_article_and_mentions, row, out["entities"])
|
| 269 |
+
print(f" ✅ [{idx + 1}/{len(df)}] {title[:35]}... | 엔티티: {[e['name'] for e in out['entities'][:4]]}")
|
| 270 |
else:
|
| 271 |
+
print(f" ⏭️ [{idx + 1}/{len(df)}] AI 비관련: {title[:35]}...")
|
| 272 |
print("\n✅ 엔티티/관계 추출 및 Neo4j 적재 완료")
|
| 273 |
|
| 274 |
# Content 청킹 + 임베딩
|
| 275 |
print("Content 노드 생성 및 임베딩 시작...")
|
| 276 |
for idx, row in df.iterrows():
|
| 277 |
+
aid = str(row.get("article_id", f"ART_{idx}"))
|
| 278 |
chunks = chunk_text(str(row.get("content", "")))
|
| 279 |
with driver.session() as s:
|
| 280 |
for i, chunk in enumerate(chunks):
|
|
|
|
| 284 |
"MERGE (c:Content {content_id:$cid}) "
|
| 285 |
"SET c.chunk=$chunk, c.article_id=$aid, c.chunk_index=$i, c.embedding=$vec "
|
| 286 |
"WITH c MATCH (a:Article {article_id:$aid}) MERGE (a)-[:HAS_CHUNK]->(c)",
|
| 287 |
+
cid=cid,
|
| 288 |
+
chunk=chunk,
|
| 289 |
+
aid=aid,
|
| 290 |
+
i=i,
|
| 291 |
+
vec=vec,
|
| 292 |
)
|
| 293 |
print("✅ Content 노드 임베딩 완료")
|
| 294 |
|
| 295 |
# 벡터 인덱스 생성
|
| 296 |
+
create_vector_index(
|
| 297 |
+
driver,
|
| 298 |
+
INDEX_NAME,
|
| 299 |
+
label="Content",
|
| 300 |
+
embedding_property="embedding",
|
| 301 |
+
dimensions=1536,
|
| 302 |
+
similarity_fn="cosine",
|
| 303 |
+
)
|
| 304 |
print(f"✅ 벡터 인덱스 [{INDEX_NAME}] 생성 완료")
|
| 305 |
|
| 306 |
|
src/graphBuilder/scrapping/finScrapping.py
CHANGED
|
@@ -1,166 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
-
|
| 3 |
-
|
| 4 |
from selenium import webdriver
|
|
|
|
| 5 |
from selenium.webdriver.common.by import By
|
| 6 |
from webdriver_manager.chrome import ChromeDriverManager
|
| 7 |
-
from selenium.webdriver.chrome.service import Service
|
| 8 |
-
import pandas as pd
|
| 9 |
-
import time
|
| 10 |
-
from datetime import datetime
|
| 11 |
-
import re
|
| 12 |
-
from collections import Counter
|
| 13 |
|
| 14 |
# 수집 대상 카테고리
|
| 15 |
categories = {
|
| 16 |
-
|
| 17 |
-
|
| 18 |
}
|
| 19 |
NUM_ARTICLES_PER_CATEGORY = 80
|
| 20 |
|
| 21 |
# AI 핀테크 키워드 (FinNode 프로젝트 전용)
|
| 22 |
FINTECH_AI_KEYWORDS = [
|
| 23 |
# AI 기술
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
| 25 |
# AI 핀테크 (금융)
|
| 26 |
-
|
| 27 |
]
|
| 28 |
|
| 29 |
-
print(
|
| 30 |
service = Service(ChromeDriverManager().install())
|
| 31 |
options = webdriver.ChromeOptions()
|
| 32 |
-
options.add_argument(
|
| 33 |
-
options.add_argument(
|
| 34 |
driver = webdriver.Chrome(service=service, options=options)
|
| 35 |
-
print(
|
|
|
|
| 36 |
|
| 37 |
def get_article_links(driver, category_url, num_articles):
|
| 38 |
-
print(f
|
| 39 |
driver.get(category_url)
|
| 40 |
time.sleep(3)
|
| 41 |
-
print(f
|
| 42 |
|
| 43 |
article_links = []
|
| 44 |
selectors = [
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
]
|
| 48 |
|
| 49 |
for selector in selectors:
|
| 50 |
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
| 51 |
print(f" [LINK] 셀렉터 '{selector}' -> {len(elements)}개 발견")
|
| 52 |
for element in elements:
|
| 53 |
-
url = element.get_attribute(
|
| 54 |
-
if (
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
article_links.append(url)
|
| 57 |
if len(article_links) >= num_articles:
|
| 58 |
break
|
| 59 |
if len(article_links) >= num_articles:
|
| 60 |
break
|
| 61 |
|
| 62 |
-
print(f
|
| 63 |
return article_links[:num_articles]
|
| 64 |
|
|
|
|
| 65 |
def parse_article_detail(driver, article_url, category):
|
| 66 |
driver.get(article_url)
|
| 67 |
time.sleep(1.5)
|
| 68 |
article_data = {
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
}
|
| 72 |
try:
|
| 73 |
-
match = re.search(r
|
| 74 |
-
article_data[
|
| 75 |
-
f"ART_{match.group(1)}_{match.group(2)}" if match
|
| 76 |
-
else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
|
| 77 |
)
|
| 78 |
-
for sel in [
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
try:
|
| 81 |
el = driver.find_element(By.CSS_SELECTOR, sel)
|
| 82 |
if el.text.strip():
|
| 83 |
-
article_data[
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
try:
|
| 88 |
el = driver.find_element(By.CSS_SELECTOR, sel)
|
| 89 |
if el.text.strip():
|
| 90 |
-
article_data[
|
| 91 |
-
|
|
|
|
|
|
|
| 92 |
try:
|
| 93 |
-
el = driver.find_element(By.CSS_SELECTOR,
|
| 94 |
-
article_data[
|
| 95 |
except:
|
| 96 |
try:
|
| 97 |
-
el = driver.find_element(By.CSS_SELECTOR,
|
| 98 |
-
article_data[
|
| 99 |
-
except:
|
|
|
|
| 100 |
try:
|
| 101 |
-
el = driver.find_element(
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
except:
|
| 105 |
-
article_data[
|
| 106 |
try:
|
| 107 |
-
el = driver.find_element(
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
except Exception as e:
|
| 112 |
-
print(f
|
| 113 |
return article_data
|
| 114 |
|
|
|
|
| 115 |
# ── 1단계: 전체 기사 수집 ──
|
| 116 |
all_articles = []
|
| 117 |
category_stats = {}
|
| 118 |
|
| 119 |
for category_name, category_url in categories.items():
|
| 120 |
-
print(f"\n{'='*60}")
|
| 121 |
-
print(f
|
| 122 |
-
print(f"{'='*60}")
|
| 123 |
|
| 124 |
article_links = get_article_links(driver, category_url, NUM_ARTICLES_PER_CATEGORY)
|
| 125 |
|
| 126 |
cat_ok, cat_fail = 0, 0
|
| 127 |
for idx, article_url in enumerate(article_links, 1):
|
| 128 |
-
print(f
|
| 129 |
article_data = parse_article_detail(driver, article_url, category_name)
|
| 130 |
|
| 131 |
-
if article_data[
|
| 132 |
all_articles.append(article_data)
|
| 133 |
cat_ok += 1
|
| 134 |
print(f" ✅ {article_data['title'][:40]}...")
|
| 135 |
print(f" 언론사: {article_data['source']} | 날짜: {article_data['published_date']}")
|
| 136 |
else:
|
| 137 |
cat_fail += 1
|
| 138 |
-
missing = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
print(f" ❌ 파싱실패 ({', '.join(missing)} 없음)")
|
| 140 |
time.sleep(0.5)
|
| 141 |
|
| 142 |
-
category_stats[category_name] = {
|
| 143 |
print(f"\n [CRAWL] [{category_name}] 완료: 성공 {cat_ok}개 / 실패 {cat_fail}개")
|
| 144 |
|
| 145 |
driver.quit()
|
| 146 |
-
print(
|
| 147 |
-
print(f"\n{'='*60}")
|
| 148 |
-
print(
|
| 149 |
-
print(f"{'='*60}")
|
| 150 |
for cat, s in category_stats.items():
|
| 151 |
-
print(f
|
| 152 |
-
print(f
|
| 153 |
|
| 154 |
df_all = pd.DataFrame(all_articles)
|
| 155 |
df_all
|
| 156 |
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
# ── 2단계: AI 핀테크 키워드 필터링 ──
|
| 161 |
-
print(f"\n{'='*60}")
|
| 162 |
-
print(
|
| 163 |
-
print(f"{'='*60}")
|
| 164 |
|
| 165 |
filtered_articles = []
|
| 166 |
for _, row in df_all.iterrows():
|
|
@@ -168,68 +210,74 @@ for _, row in df_all.iterrows():
|
|
| 168 |
matched = [kw for kw in FINTECH_AI_KEYWORDS if kw.replace(" ", "") in text.replace(" ", "")]
|
| 169 |
if matched:
|
| 170 |
row_dict = row.to_dict()
|
| 171 |
-
row_dict[
|
| 172 |
filtered_articles.append(row_dict)
|
| 173 |
|
| 174 |
df_filtered = pd.DataFrame(filtered_articles)
|
| 175 |
|
| 176 |
-
print(f
|
| 177 |
-
print(f
|
| 178 |
-
print(
|
| 179 |
-
all_kw = [kw for row in filtered_articles for kw in row[
|
| 180 |
kw_counts = Counter(all_kw)
|
| 181 |
for kw in FINTECH_AI_KEYWORDS:
|
| 182 |
-
print(f
|
| 183 |
|
| 184 |
df_filtered
|
| 185 |
|
| 186 |
# ── 3단계: 저장 ──
|
| 187 |
output_filename = f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
|
| 188 |
-
df_filtered.to_excel(output_filename, index=False, engine=
|
| 189 |
-
print(f
|
| 190 |
-
print(f
|
| 191 |
-
|
| 192 |
-
|
| 193 |
|
| 194 |
|
| 195 |
# ── 4단계: 키워드 빈도 시각화 ──
|
| 196 |
-
import matplotlib.pyplot as plt
|
| 197 |
import platform
|
| 198 |
from collections import Counter
|
| 199 |
|
|
|
|
|
|
|
| 200 |
# 폰트 깨짐 방지 (Mac 환경: AppleGothic)
|
| 201 |
-
if platform.system() ==
|
| 202 |
-
plt.rc(
|
| 203 |
-
plt.rcParams[
|
| 204 |
|
| 205 |
if not filtered_articles:
|
| 206 |
-
print(
|
| 207 |
else:
|
| 208 |
# 빈도수 계산
|
| 209 |
-
all_kw = [kw for row in filtered_articles for kw in row[
|
| 210 |
kw_counts = Counter(all_kw)
|
| 211 |
-
|
| 212 |
# 📌 변경 포인트: FINTECH_AI_KEYWORDS 전체 목록을 순서대로 그래프에 강제 표시 (0건 포함)
|
| 213 |
keywords = FINTECH_AI_KEYWORDS
|
| 214 |
counts = [kw_counts.get(kw, 0) for kw in keywords]
|
| 215 |
-
|
| 216 |
plt.figure(figsize=(12, 6))
|
| 217 |
-
|
| 218 |
# 막대 그래프 생성
|
| 219 |
-
bars = plt.bar(keywords, counts, color=
|
| 220 |
-
|
| 221 |
# 막대 위에 숫자(빈도수) 표시
|
| 222 |
for bar in bars:
|
| 223 |
height = bar.get_height()
|
| 224 |
# 막대의 중앙(x), 막대의 높이(y) 위치에 텍스트를 배치
|
| 225 |
-
plt.text(
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
plt.title(
|
| 229 |
-
plt.xlabel(
|
| 230 |
-
plt.ylabel(
|
| 231 |
-
plt.grid(axis=
|
| 232 |
plt.xticks(rotation=45)
|
| 233 |
plt.tight_layout()
|
| 234 |
plt.show()
|
| 235 |
-
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import time
|
| 3 |
+
from collections import Counter
|
| 4 |
+
from datetime import datetime
|
| 5 |
|
| 6 |
+
import pandas as pd
|
|
|
|
| 7 |
from selenium import webdriver
|
| 8 |
+
from selenium.webdriver.chrome.service import Service
|
| 9 |
from selenium.webdriver.common.by import By
|
| 10 |
from webdriver_manager.chrome import ChromeDriverManager
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# 수집 대상 카테고리
|
| 13 |
categories = {
|
| 14 |
+
"경제": "https://news.naver.com/section/101",
|
| 15 |
+
"IT/과학": "https://news.naver.com/section/105",
|
| 16 |
}
|
| 17 |
NUM_ARTICLES_PER_CATEGORY = 80
|
| 18 |
|
| 19 |
# AI 핀테크 키워드 (FinNode 프로젝트 전용)
|
| 20 |
FINTECH_AI_KEYWORDS = [
|
| 21 |
# AI 기술
|
| 22 |
+
"AI",
|
| 23 |
+
"인공지능",
|
| 24 |
+
"생성형 AI",
|
| 25 |
+
"대규모언어모델",
|
| 26 |
# AI 핀테크 (금융)
|
| 27 |
+
"핀테크",
|
| 28 |
]
|
| 29 |
|
| 30 |
+
print("[INIT] ChromeDriver 초기화 중...")
|
| 31 |
service = Service(ChromeDriverManager().install())
|
| 32 |
options = webdriver.ChromeOptions()
|
| 33 |
+
options.add_argument("--no-sandbox")
|
| 34 |
+
options.add_argument("--disable-dev-shm-usage")
|
| 35 |
driver = webdriver.Chrome(service=service, options=options)
|
| 36 |
+
print("[INIT] ✅ 브라우저 실행 완료")
|
| 37 |
+
|
| 38 |
|
| 39 |
def get_article_links(driver, category_url, num_articles):
|
| 40 |
+
print(f" [LINK] 페이지 이동: {category_url}")
|
| 41 |
driver.get(category_url)
|
| 42 |
time.sleep(3)
|
| 43 |
+
print(f" [LINK] 로드 완료 (title: {driver.title})")
|
| 44 |
|
| 45 |
article_links = []
|
| 46 |
selectors = [
|
| 47 |
+
"a.sa_text_title",
|
| 48 |
+
"a.sa_text_lede",
|
| 49 |
+
"a.sa_text_strong",
|
| 50 |
+
".sa_text a",
|
| 51 |
+
".cluster_text_headline a",
|
| 52 |
+
".cluster_text_lede a",
|
| 53 |
]
|
| 54 |
|
| 55 |
for selector in selectors:
|
| 56 |
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
| 57 |
print(f" [LINK] 셀렉터 '{selector}' -> {len(elements)}개 발견")
|
| 58 |
for element in elements:
|
| 59 |
+
url = element.get_attribute("href")
|
| 60 |
+
if (
|
| 61 |
+
url
|
| 62 |
+
and "news.naver.com" in url
|
| 63 |
+
and "/article/" in url
|
| 64 |
+
and "/comment/" not in url
|
| 65 |
+
and url not in article_links
|
| 66 |
+
):
|
| 67 |
article_links.append(url)
|
| 68 |
if len(article_links) >= num_articles:
|
| 69 |
break
|
| 70 |
if len(article_links) >= num_articles:
|
| 71 |
break
|
| 72 |
|
| 73 |
+
print(f" [LINK] ✅ 총 {len(article_links)}개 링크 확보\n")
|
| 74 |
return article_links[:num_articles]
|
| 75 |
|
| 76 |
+
|
| 77 |
def parse_article_detail(driver, article_url, category):
|
| 78 |
driver.get(article_url)
|
| 79 |
time.sleep(1.5)
|
| 80 |
article_data = {
|
| 81 |
+
"article_id": "",
|
| 82 |
+
"title": "",
|
| 83 |
+
"content": "",
|
| 84 |
+
"url": article_url,
|
| 85 |
+
"published_date": "",
|
| 86 |
+
"source": "",
|
| 87 |
+
"author": "",
|
| 88 |
+
"category": category,
|
| 89 |
}
|
| 90 |
try:
|
| 91 |
+
match = re.search(r"article/(\d+)/(\d+)", article_url)
|
| 92 |
+
article_data["article_id"] = (
|
| 93 |
+
f"ART_{match.group(1)}_{match.group(2)}" if match else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
|
|
|
|
| 94 |
)
|
| 95 |
+
for sel in [
|
| 96 |
+
"#title_area span",
|
| 97 |
+
"#ct .media_end_head_headline",
|
| 98 |
+
".media_end_head_headline",
|
| 99 |
+
"h2#title_area",
|
| 100 |
+
".news_end_title",
|
| 101 |
+
]:
|
| 102 |
try:
|
| 103 |
el = driver.find_element(By.CSS_SELECTOR, sel)
|
| 104 |
if el.text.strip():
|
| 105 |
+
article_data["title"] = el.text.strip()
|
| 106 |
+
break
|
| 107 |
+
except:
|
| 108 |
+
continue
|
| 109 |
+
for sel in [
|
| 110 |
+
"#dic_area",
|
| 111 |
+
"article#dic_area",
|
| 112 |
+
".go_trans._article_content",
|
| 113 |
+
"._article_body_contents",
|
| 114 |
+
]:
|
| 115 |
try:
|
| 116 |
el = driver.find_element(By.CSS_SELECTOR, sel)
|
| 117 |
if el.text.strip():
|
| 118 |
+
article_data["content"] = el.text.strip()
|
| 119 |
+
break
|
| 120 |
+
except:
|
| 121 |
+
continue
|
| 122 |
try:
|
| 123 |
+
el = driver.find_element(By.CSS_SELECTOR, "a.media_end_head_top_logo img")
|
| 124 |
+
article_data["source"] = el.get_attribute("alt")
|
| 125 |
except:
|
| 126 |
try:
|
| 127 |
+
el = driver.find_element(By.CSS_SELECTOR, ".media_end_head_top_logo_text")
|
| 128 |
+
article_data["source"] = el.text.strip()
|
| 129 |
+
except:
|
| 130 |
+
pass
|
| 131 |
try:
|
| 132 |
+
el = driver.find_element(
|
| 133 |
+
By.CSS_SELECTOR,
|
| 134 |
+
"span.media_end_head_info_datestamp_time, span[data-date-time]",
|
| 135 |
+
)
|
| 136 |
+
article_data["published_date"] = (el.get_attribute("data-date-time") or el.text).strip()
|
| 137 |
except:
|
| 138 |
+
article_data["published_date"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 139 |
try:
|
| 140 |
+
el = driver.find_element(
|
| 141 |
+
By.CSS_SELECTOR,
|
| 142 |
+
"em.media_end_head_journalist_name, span.byline_s",
|
| 143 |
+
)
|
| 144 |
+
article_data["author"] = el.text.strip()
|
| 145 |
+
except:
|
| 146 |
+
pass
|
| 147 |
except Exception as e:
|
| 148 |
+
print(f" [PARSE] ⚠️ 파싱 오류: {e}")
|
| 149 |
return article_data
|
| 150 |
|
| 151 |
+
|
| 152 |
# ── 1단계: 전체 기사 수집 ──
|
| 153 |
all_articles = []
|
| 154 |
category_stats = {}
|
| 155 |
|
| 156 |
for category_name, category_url in categories.items():
|
| 157 |
+
print(f"\n{'=' * 60}")
|
| 158 |
+
print(f"[CRAWL] [{category_name}] 카테고리 수집 시작")
|
| 159 |
+
print(f"{'=' * 60}")
|
| 160 |
|
| 161 |
article_links = get_article_links(driver, category_url, NUM_ARTICLES_PER_CATEGORY)
|
| 162 |
|
| 163 |
cat_ok, cat_fail = 0, 0
|
| 164 |
for idx, article_url in enumerate(article_links, 1):
|
| 165 |
+
print(f" [PARSE] ({idx}/{len(article_links)}) {article_url[:70]}...")
|
| 166 |
article_data = parse_article_detail(driver, article_url, category_name)
|
| 167 |
|
| 168 |
+
if article_data["title"] and article_data["content"]:
|
| 169 |
all_articles.append(article_data)
|
| 170 |
cat_ok += 1
|
| 171 |
print(f" ✅ {article_data['title'][:40]}...")
|
| 172 |
print(f" 언론사: {article_data['source']} | 날짜: {article_data['published_date']}")
|
| 173 |
else:
|
| 174 |
cat_fail += 1
|
| 175 |
+
missing = [
|
| 176 |
+
x
|
| 177 |
+
for x, v in [
|
| 178 |
+
("제목", article_data["title"]),
|
| 179 |
+
("본문", article_data["content"]),
|
| 180 |
+
]
|
| 181 |
+
if not v
|
| 182 |
+
]
|
| 183 |
print(f" ❌ 파싱실패 ({', '.join(missing)} 없음)")
|
| 184 |
time.sleep(0.5)
|
| 185 |
|
| 186 |
+
category_stats[category_name] = {"ok": cat_ok, "fail": cat_fail}
|
| 187 |
print(f"\n [CRAWL] [{category_name}] 완료: 성공 {cat_ok}개 / 실패 {cat_fail}개")
|
| 188 |
|
| 189 |
driver.quit()
|
| 190 |
+
print("\n[DONE] 브라우저 종료")
|
| 191 |
+
print(f"\n{'=' * 60}")
|
| 192 |
+
print("[SUMMARY] 수집 결과 요약")
|
| 193 |
+
print(f"{'=' * 60}")
|
| 194 |
for cat, s in category_stats.items():
|
| 195 |
+
print(f" {cat}: 성공 {s['ok']}건 / 실패 {s['fail']}건")
|
| 196 |
+
print(f" 전체 수집: {len(all_articles)}건")
|
| 197 |
|
| 198 |
df_all = pd.DataFrame(all_articles)
|
| 199 |
df_all
|
| 200 |
|
| 201 |
|
|
|
|
|
|
|
| 202 |
# ── 2단계: AI 핀테크 키워드 필터링 ──
|
| 203 |
+
print(f"\n{'=' * 60}")
|
| 204 |
+
print("[FILTER] AI 핀테크 키워드 필터링 시작")
|
| 205 |
+
print(f"{'=' * 60}")
|
| 206 |
|
| 207 |
filtered_articles = []
|
| 208 |
for _, row in df_all.iterrows():
|
|
|
|
| 210 |
matched = [kw for kw in FINTECH_AI_KEYWORDS if kw.replace(" ", "") in text.replace(" ", "")]
|
| 211 |
if matched:
|
| 212 |
row_dict = row.to_dict()
|
| 213 |
+
row_dict["matched_keywords"] = ", ".join(matched)
|
| 214 |
filtered_articles.append(row_dict)
|
| 215 |
|
| 216 |
df_filtered = pd.DataFrame(filtered_articles)
|
| 217 |
|
| 218 |
+
print(f" 전체 수집: {len(df_all)}건")
|
| 219 |
+
print(f" AI 핀테크 관련: {len(df_filtered)}건 ({len(df_filtered) / max(len(df_all), 1) * 100:.1f}%)")
|
| 220 |
+
print("\n [키워드별 매칭 현황]")
|
| 221 |
+
all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
|
| 222 |
kw_counts = Counter(all_kw)
|
| 223 |
for kw in FINTECH_AI_KEYWORDS:
|
| 224 |
+
print(f" {kw}: {kw_counts.get(kw, 0)}건")
|
| 225 |
|
| 226 |
df_filtered
|
| 227 |
|
| 228 |
# ── 3단계: 저장 ──
|
| 229 |
output_filename = f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
|
| 230 |
+
df_filtered.to_excel(output_filename, index=False, engine="openpyxl")
|
| 231 |
+
print(f"[SAVE] ✅ 저장 완료: {output_filename}")
|
| 232 |
+
print(f"[SAVE] - AI 핀테크 기사: {len(df_filtered)}건")
|
|
|
|
|
|
|
| 233 |
|
| 234 |
|
| 235 |
# ── 4단계: 키워드 빈도 시각화 ──
|
|
|
|
| 236 |
import platform
|
| 237 |
from collections import Counter
|
| 238 |
|
| 239 |
+
import matplotlib.pyplot as plt
|
| 240 |
+
|
| 241 |
# 폰트 깨짐 방지 (Mac 환경: AppleGothic)
|
| 242 |
+
if platform.system() == "Darwin":
|
| 243 |
+
plt.rc("font", family="AppleGothic")
|
| 244 |
+
plt.rcParams["axes.unicode_minus"] = False
|
| 245 |
|
| 246 |
if not filtered_articles:
|
| 247 |
+
print("시각화할 데이터가 없습니다.")
|
| 248 |
else:
|
| 249 |
# 빈도수 계산
|
| 250 |
+
all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
|
| 251 |
kw_counts = Counter(all_kw)
|
| 252 |
+
|
| 253 |
# 📌 변경 포인트: FINTECH_AI_KEYWORDS 전체 목록을 순서대로 그래프에 강제 표시 (0건 포함)
|
| 254 |
keywords = FINTECH_AI_KEYWORDS
|
| 255 |
counts = [kw_counts.get(kw, 0) for kw in keywords]
|
| 256 |
+
|
| 257 |
plt.figure(figsize=(12, 6))
|
| 258 |
+
|
| 259 |
# 막대 그래프 생성
|
| 260 |
+
bars = plt.bar(keywords, counts, color="skyblue", edgecolor="white")
|
| 261 |
+
|
| 262 |
# 막대 위에 숫자(빈도수) 표시
|
| 263 |
for bar in bars:
|
| 264 |
height = bar.get_height()
|
| 265 |
# 막대의 중앙(x), 막대의 높이(y) 위치에 텍스트를 배치
|
| 266 |
+
plt.text(
|
| 267 |
+
bar.get_x() + bar.get_width() / 2.0,
|
| 268 |
+
height,
|
| 269 |
+
f"{height}",
|
| 270 |
+
ha="center",
|
| 271 |
+
va="bottom",
|
| 272 |
+
size=11,
|
| 273 |
+
fontweight="bold",
|
| 274 |
+
color="black",
|
| 275 |
+
)
|
| 276 |
|
| 277 |
+
plt.title("수집된 AI 핀테크 기사 키워드 출현 빈도 (전체)", fontsize=15, pad=15)
|
| 278 |
+
plt.xlabel("키워드", fontsize=12)
|
| 279 |
+
plt.ylabel("출현 횟수 (건)", fontsize=12)
|
| 280 |
+
plt.grid(axis="y", linestyle="--", alpha=0.7)
|
| 281 |
plt.xticks(rotation=45)
|
| 282 |
plt.tight_layout()
|
| 283 |
plt.show()
|
|
|
src/retrieval/finRetrieval.py
CHANGED
|
@@ -11,17 +11,18 @@ app.py에서 import하여 Gradio 챗봇과 연동합니다.
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import os
|
|
|
|
| 14 |
import dotenv
|
| 15 |
import neo4j
|
| 16 |
-
from neo4j_graphrag.llm import OpenAILLM
|
| 17 |
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
|
|
|
|
|
|
|
| 18 |
from neo4j_graphrag.retrievers import (
|
| 19 |
-
VectorRetriever,
|
| 20 |
-
VectorCypherRetriever,
|
| 21 |
Text2CypherRetriever,
|
| 22 |
ToolsRetriever,
|
|
|
|
|
|
|
| 23 |
)
|
| 24 |
-
from neo4j_graphrag.generation import RagTemplate, GraphRAG
|
| 25 |
|
| 26 |
dotenv.load_dotenv()
|
| 27 |
|
|
@@ -29,11 +30,14 @@ dotenv.load_dotenv()
|
|
| 29 |
# 1. DB / LLM / Embedder 초기화
|
| 30 |
# ──────────────────────────────────────────
|
| 31 |
|
| 32 |
-
URI
|
| 33 |
-
AUTH
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
rag_llm
|
| 37 |
embedder = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 38 |
|
| 39 |
INDEX_NAME = "content_vector_index"
|
|
@@ -76,6 +80,7 @@ vector_cypher_retriever = VectorCypherRetriever(
|
|
| 76 |
embedder=embedder,
|
| 77 |
)
|
| 78 |
|
|
|
|
| 79 |
# (3) 자연어 → Cypher 자동 변환 검색
|
| 80 |
def _get_schema() -> str:
|
| 81 |
with driver.session() as s:
|
|
@@ -85,9 +90,7 @@ def _get_schema() -> str:
|
|
| 85 |
"RETURN nodeType, collect(propertyName) as props"
|
| 86 |
).data()
|
| 87 |
rels = s.run(
|
| 88 |
-
"MATCH (n)-[r]->(m) "
|
| 89 |
-
"RETURN DISTINCT labels(n)[0] as src, type(r) as rel, labels(m)[0] as tgt "
|
| 90 |
-
"LIMIT 30"
|
| 91 |
).data()
|
| 92 |
txt = "=== Neo4j Schema ===\n노드:\n"
|
| 93 |
for n in nodes:
|
|
@@ -103,18 +106,15 @@ _examples = [
|
|
| 103 |
CYPHER QUERY:
|
| 104 |
MATCH (c:AICompany {name:"카카오"})-[:DEVELOPS]->(s:AIService)
|
| 105 |
RETURN s.name, s.description""",
|
| 106 |
-
|
| 107 |
"""USER INPUT: 삼성전자가 개발 중인 AI 기술은?
|
| 108 |
CYPHER QUERY:
|
| 109 |
MATCH (c:AICompany {name:"삼성전자"})-[:DEVELOPS]->(t:AITechnology)
|
| 110 |
RETURN t.name, t.description""",
|
| 111 |
-
|
| 112 |
"""USER INPUT: 최근 AI 관련 기사 5개
|
| 113 |
CYPHER QUERY:
|
| 114 |
MATCH (a:Article)-[:MENTIONS]->(:AICompany)
|
| 115 |
RETURN DISTINCT a.article_id, a.title, a.url, a.published_date
|
| 116 |
ORDER BY a.published_date DESC LIMIT 5""",
|
| 117 |
-
|
| 118 |
"""USER INPUT: 어떤 기업이 LLM 기술을 개발하나요?
|
| 119 |
CYPHER QUERY:
|
| 120 |
MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import os
|
| 14 |
+
|
| 15 |
import dotenv
|
| 16 |
import neo4j
|
|
|
|
| 17 |
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
|
| 18 |
+
from neo4j_graphrag.generation import GraphRAG, RagTemplate
|
| 19 |
+
from neo4j_graphrag.llm import OpenAILLM
|
| 20 |
from neo4j_graphrag.retrievers import (
|
|
|
|
|
|
|
| 21 |
Text2CypherRetriever,
|
| 22 |
ToolsRetriever,
|
| 23 |
+
VectorCypherRetriever,
|
| 24 |
+
VectorRetriever,
|
| 25 |
)
|
|
|
|
| 26 |
|
| 27 |
dotenv.load_dotenv()
|
| 28 |
|
|
|
|
| 30 |
# 1. DB / LLM / Embedder 초기화
|
| 31 |
# ──────────────────────────────────────────
|
| 32 |
|
| 33 |
+
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
|
| 34 |
+
AUTH = (
|
| 35 |
+
os.getenv("NEO4J_USERNAME", "neo4j"),
|
| 36 |
+
os.getenv("NEO4J_PASSWORD", "password"),
|
| 37 |
+
)
|
| 38 |
+
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
|
| 39 |
|
| 40 |
+
rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
|
| 41 |
embedder = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 42 |
|
| 43 |
INDEX_NAME = "content_vector_index"
|
|
|
|
| 80 |
embedder=embedder,
|
| 81 |
)
|
| 82 |
|
| 83 |
+
|
| 84 |
# (3) 자연어 → Cypher 자동 변환 검색
|
| 85 |
def _get_schema() -> str:
|
| 86 |
with driver.session() as s:
|
|
|
|
| 90 |
"RETURN nodeType, collect(propertyName) as props"
|
| 91 |
).data()
|
| 92 |
rels = s.run(
|
| 93 |
+
"MATCH (n)-[r]->(m) RETURN DISTINCT labels(n)[0] as src, type(r) as rel, labels(m)[0] as tgt LIMIT 30"
|
|
|
|
|
|
|
| 94 |
).data()
|
| 95 |
txt = "=== Neo4j Schema ===\n노드:\n"
|
| 96 |
for n in nodes:
|
|
|
|
| 106 |
CYPHER QUERY:
|
| 107 |
MATCH (c:AICompany {name:"카카오"})-[:DEVELOPS]->(s:AIService)
|
| 108 |
RETURN s.name, s.description""",
|
|
|
|
| 109 |
"""USER INPUT: 삼성전자가 개발 중인 AI 기술은?
|
| 110 |
CYPHER QUERY:
|
| 111 |
MATCH (c:AICompany {name:"삼성전자"})-[:DEVELOPS]->(t:AITechnology)
|
| 112 |
RETURN t.name, t.description""",
|
|
|
|
| 113 |
"""USER INPUT: 최근 AI 관련 기사 5개
|
| 114 |
CYPHER QUERY:
|
| 115 |
MATCH (a:Article)-[:MENTIONS]->(:AICompany)
|
| 116 |
RETURN DISTINCT a.article_id, a.title, a.url, a.published_date
|
| 117 |
ORDER BY a.published_date DESC LIMIT 5""",
|
|
|
|
| 118 |
"""USER INPUT: 어떤 기업이 LLM 기술을 개발하나요?
|
| 119 |
CYPHER QUERY:
|
| 120 |
MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
|
tests/test_chunk_text.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from src.graphBuilder.neo4j.finGraph import chunk_text
|
| 2 |
|
|
|
|
| 3 |
def test_chunk_text_empty_returns_empty_list():
|
| 4 |
assert chunk_text("") == []
|
| 5 |
|
|
|
|
| 1 |
from src.graphBuilder.neo4j.finGraph import chunk_text
|
| 2 |
|
| 3 |
+
|
| 4 |
def test_chunk_text_empty_returns_empty_list():
|
| 5 |
assert chunk_text("") == []
|
| 6 |
|
tests/test_retrieval.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
import pytest
|
|
|
|
| 3 |
from src.retrieval.finRetrieval import graphrag
|
| 4 |
|
| 5 |
# API 키와 Neo4j 연결정보가 없을 경우 테스트를 건너뜁니다.
|
|
|
|
| 1 |
import os
|
| 2 |
+
|
| 3 |
import pytest
|
| 4 |
+
|
| 5 |
from src.retrieval.finRetrieval import graphrag
|
| 6 |
|
| 7 |
# API 키와 Neo4j 연결정보가 없을 경우 테스트를 건너뜁니다.
|