Spaces:

dev-yuje
/

FinGraph

Runtime error

App Files Files Community

dev-yuje committed on 16 days ago

Commit

08fb91a

1 Parent(s): 3e720d0

style: 린트 포맷팅 자동 적용 및 프로젝트 기획안(AGENTS.md, README.md) 보완

Browse files

Files changed (9) hide show

AGENTS.md +4 -1
README.md +10 -0
app.py +4 -3
run_pipeline.py +3 -1
src/graphBuilder/neo4j/finGraph.py +70 -41
src/graphBuilder/scrapping/finScrapping.py +147 -99
src/retrieval/finRetrieval.py +14 -14
tests/test_chunk_text.py +1 -0
tests/test_retrieval.py +2 -0

AGENTS.md CHANGED Viewed

@@ -1,9 +1,12 @@
 ###### 참고: https://wikidocs.net/340866
 # AGENTS.md
 ## 프로젝트 개요
-- 목적:
 - 언어: Python 3.10
 - 기술스택: GraphRAG, LangChain, LangGraph, Neo4j, Hugging Face, Gradio

 ###### 참고: https://wikidocs.net/340866
+###### 하네스 엔지니어링: Global지침, Skills와 Workflow를 모두 포함하는 지침
+###### 개발 시작부터 배포까지 모든 것은 AGENTS.md에 기록한다.
+###### 예를 들어 개발 단계에서 체크리스트를 만들어서 개발을 할 때마다 하나씩 체크하도록 지시한다.
 # AGENTS.md
 ## 프로젝트 개요
+- 목적: AI 기반 핀테크 기술의 트렌드를 파악하도록 돕는 챗봇
 - 언어: Python 3.10
 - 기술스택: GraphRAG, LangChain, LangGraph, Neo4j, Hugging Face, Gradio

README.md CHANGED Viewed

@@ -1,3 +1,13 @@
 # FinNode 🕸️
 **Neo4j GraphRAG 기반 AI 뉴스 지식 그래프 플랫폼**

+---
+title: FinGraph
+emoji: 🕸️
+colorFrom: indigo
+colorTo: indigo
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+---
 # FinNode 🕸️
 **Neo4j GraphRAG 기반 AI 뉴스 지식 그래프 플랫폼**

app.py CHANGED Viewed

@@ -8,11 +8,12 @@ Gradio ChatInterface + LangGraph 기반 대화 흐름 제어.
     python app.py
 """
-import os
 import dotenv
 import gradio as gr
-from typing import TypedDict, List
-from langgraph.graph import StateGraph, END
 from src.retrieval.finRetrieval import graphrag
 dotenv.load_dotenv()

     python app.py
 """
+from typing import List, TypedDict
 import dotenv
 import gradio as gr
+from langgraph.graph import END, StateGraph
 from src.retrieval.finRetrieval import graphrag
 dotenv.load_dotenv()

run_pipeline.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 from pipeline.workflow import pipeline
-from pipeline.db_writer import write_graph_to_neo4j, chunk_and_embed_article
 def run_test():
     # 1. 모의 테스트용 뉴스 기사 데이터 준비

 import json
+from pipeline.db_writer import chunk_and_embed_article, write_graph_to_neo4j
 from pipeline.workflow import pipeline
 def run_test():
     # 1. 모의 테스트용 뉴스 기사 데이터 준비

src/graphBuilder/neo4j/finGraph.py CHANGED Viewed

@@ -10,27 +10,31 @@ finGraph.py — AI 뉴스 지식 그래프 빌더
         MENTIONS, HAS_CHUNK, PUBLISHED
 """
-import os
 import glob
 import json
-import pandas as pd
-import neo4j
 import dotenv
-from typing import TypedDict, List, Dict
 from langchain_openai import ChatOpenAI
-from langgraph.graph import StateGraph, END
-from neo4j_graphrag.llm import OpenAILLM
 from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
 from neo4j_graphrag.indexes import create_vector_index
 dotenv.load_dotenv()
-URI      = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
-AUTH     = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
-driver   = neo4j.GraphDatabase.driver(URI, auth=AUTH)
 chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
-rag_llm  = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
 embedder = OpenAIEmbeddings(model="text-embedding-3-small")
 INDEX_NAME = "content_vector_index"
@@ -39,6 +43,7 @@ INDEX_NAME = "content_vector_index"
 # 1. LangGraph 파이프라인 정의 (엔티티/관계 추출)
 # ──────────────────────────────────────────
 class ArticleState(TypedDict):
     article_id: str
     title: str
@@ -55,7 +60,10 @@ def check_ai_relevance(state: ArticleState) -> ArticleState:
         f"{state['text'][:400]}\n\n답변(yes/no):"
     )
     res = chat_llm.invoke(prompt)
-    return {**state, "is_ai_related": res.content.strip().lower().startswith("yes")}
 def extract_entities(state: ArticleState) -> ArticleState:
@@ -67,8 +75,8 @@ def extract_entities(state: ArticleState) -> ArticleState:
 - AIService: 서비스/제품 (예: ChatGPT, HyperCLOVA X)
 - AIField: 적용 분야 (예: 금융AI, AI 반도체)
-제목: {state['title']}
-본문: {state['text'][:900]}
 JSON으로만 응답:{{"entities":[{{"name":"...","type":"AICompany|AITechnology|AIService|AIField","description":"..."}}]}}"""
     res = chat_llm.invoke(prompt)
@@ -156,19 +164,28 @@ def upsert_entity(tx, e: Dict) -> None:
         f"MERGE (n:{ntype} {{name:$name}}) "
         "ON CREATE SET n.description=$desc "
         "ON MATCH  SET n.description=COALESCE(n.description,$desc)",
-        name=e["name"], desc=e.get("description", ""),
     )
 def upsert_relation(tx, r: Dict) -> None:
     rel = r.get("relation", "RELATED_TO").upper().replace(" ", "_")
-    allowed = {"DEVELOPS", "INVESTS_IN", "PARTNERS_WITH", "APPLIES", "USED_IN", "RELATED_TO"}
     if rel not in allowed:
         return
     try:
         tx.run(
             f"MATCH (s {{name:$src}}) MATCH (t {{name:$tgt}}) MERGE (s)-[:{rel}]->(t)",
-            src=r["source"], tgt=r["target"],
         )
     except Exception:
         pass
@@ -176,24 +193,25 @@ def upsert_relation(tx, r: Dict) -> None:
 def upsert_article_and_mentions(tx, row: pd.Series, entities: List[Dict]) -> None:
     tx.run(
-        "MERGE (a:Article {article_id:$aid}) "
-        "SET a.title=$title, a.url=$url, a.published_date=$date",
-        aid=row.get("article_id", ""), title=row.get("title", ""),
-        url=row.get("url", ""), date=str(row.get("published_date", "")),
     )
     if pd.notna(row.get("source", "")):
         tx.run(
-            "MERGE (m:Media {name:$src}) "
-            "WITH m MATCH (a:Article {article_id:$aid}) MERGE (m)-[:PUBLISHED]->(a)",
-            src=row["source"], aid=row.get("article_id", ""),
         )
     for e in entities:
         ntype = ENTITY_TYPE_MAP.get(e.get("type", "AICompany"), "AICompany")
         try:
             tx.run(
-                f"MATCH (a:Article {{article_id:$aid}}) "
-                f"MATCH (n:{ntype} {{name:$name}}) MERGE (a)-[:MENTIONS]->(n)",
-                aid=row.get("article_id", ""), name=e["name"],
             )
         except Exception:
             pass
@@ -203,17 +221,14 @@ def chunk_text(text: str, size: int = 500, overlap: int = 50) -> List[str]:
     if not text or pd.isna(text):
         return []
     text = str(text)
-    return [
-        text[i:i + size].strip()
-        for i in range(0, len(text), size - overlap)
-        if text[i:i + size].strip()
-    ]
 # ──────────────────────────────────────────
 # 3. 메인 실행 (스크립트로 직접 호출 시)
 # ──────────────────────────────────────────
 def main() -> None:
     # 최신 엑셀 로드
     xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
@@ -232,12 +247,16 @@ def main() -> None:
     # 엔티티/관계 추출 및 적재
     print(f"총 {len(df)}건 처리 시작...")
     for idx, row in df.iterrows():
-        aid   = str(row.get("article_id", f"ART_{idx}"))
         title = str(row.get("title", ""))
-        text  = title + "\n" + str(row.get("content", ""))
         state: ArticleState = dict(
-            article_id=aid, title=title, text=text,
-            is_ai_related=False, entities=[], relations=[],
         )
         out = pipeline.invoke(state)
         if out["is_ai_related"]:
@@ -247,15 +266,15 @@ def main() -> None:
                 for r in out["relations"]:
                     s.execute_write(upsert_relation, r)
                 s.execute_write(upsert_article_and_mentions, row, out["entities"])
-            print(f"  ✅ [{idx+1}/{len(df)}] {title[:35]}... | 엔티티: {[e['name'] for e in out['entities'][:4]]}")
         else:
-            print(f"  ⏭️  [{idx+1}/{len(df)}] AI 비관련: {title[:35]}...")
     print("\n✅ 엔티티/관계 추출 및 Neo4j 적재 완료")
     # Content 청킹 + 임베딩
     print("Content 노드 생성 및 임베딩 시작...")
     for idx, row in df.iterrows():
-        aid    = str(row.get("article_id", f"ART_{idx}"))
         chunks = chunk_text(str(row.get("content", "")))
         with driver.session() as s:
             for i, chunk in enumerate(chunks):
@@ -265,13 +284,23 @@ def main() -> None:
                     "MERGE (c:Content {content_id:$cid}) "
                     "SET c.chunk=$chunk, c.article_id=$aid, c.chunk_index=$i, c.embedding=$vec "
                     "WITH c MATCH (a:Article {article_id:$aid}) MERGE (a)-[:HAS_CHUNK]->(c)",
-                    cid=cid, chunk=chunk, aid=aid, i=i, vec=vec,
                 )
     print("✅ Content 노드 임베딩 완료")
     # 벡터 인덱스 생성
-    create_vector_index(driver, INDEX_NAME, label="Content",
-                        embedding_property="embedding", dimensions=1536, similarity_fn="cosine")
     print(f"✅ 벡터 인덱스 [{INDEX_NAME}] 생성 완료")

         MENTIONS, HAS_CHUNK, PUBLISHED
 """
 import glob
 import json
+import os
+from typing import Dict, List, TypedDict
 import dotenv
+import neo4j
+import pandas as pd
 from langchain_openai import ChatOpenAI
+from langgraph.graph import END, StateGraph
 from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
 from neo4j_graphrag.indexes import create_vector_index
+from neo4j_graphrag.llm import OpenAILLM
 dotenv.load_dotenv()
+URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
+AUTH = (
+    os.getenv("NEO4J_USERNAME", "neo4j"),
+    os.getenv("NEO4J_PASSWORD", "password"),
+)
+driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
 chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
+rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
 embedder = OpenAIEmbeddings(model="text-embedding-3-small")
 INDEX_NAME = "content_vector_index"
 # 1. LangGraph 파이프라인 정의 (엔티티/관계 추출)
 # ──────────────────────────────────────────
 class ArticleState(TypedDict):
     article_id: str
     title: str
         f"{state['text'][:400]}\n\n답변(yes/no):"
     )
     res = chat_llm.invoke(prompt)
+    return {
+        **state,
+        "is_ai_related": res.content.strip().lower().startswith("yes"),
+    }
 def extract_entities(state: ArticleState) -> ArticleState:
 - AIService: 서비스/제품 (예: ChatGPT, HyperCLOVA X)
 - AIField: 적용 분야 (예: 금융AI, AI 반도체)
+제목: {state["title"]}
+본문: {state["text"][:900]}
 JSON으로만 응답:{{"entities":[{{"name":"...","type":"AICompany|AITechnology|AIService|AIField","description":"..."}}]}}"""
     res = chat_llm.invoke(prompt)
         f"MERGE (n:{ntype} {{name:$name}}) "
         "ON CREATE SET n.description=$desc "
         "ON MATCH  SET n.description=COALESCE(n.description,$desc)",
+        name=e["name"],
+        desc=e.get("description", ""),
     )
 def upsert_relation(tx, r: Dict) -> None:
     """Merge one extracted relation between two nodes matched by name.

     The relation type is normalised (trimmed, upper-cased, spaces turned into
     underscores) and checked against a fixed allow-list. The allow-list
     matters because the type is interpolated into the Cypher text rather
     than passed as a query parameter.

     Args:
         tx: Object exposing ``run(query, **params)``.
         r: Mapping with ``source`` and ``target`` node names and an optional
             ``relation`` type. A missing, null or blank ``relation`` falls
             back to ``RELATED_TO``.

     Returns:
         None. Relations with a disallowed type or a missing endpoint are
         skipped; a failing query is logged and otherwise ignored.
     """
     import logging

     allowed = {
         "DEVELOPS",
         "INVESTS_IN",
         "PARTNERS_WITH",
         "APPLIES",
         "USED_IN",
         "RELATED_TO",
     }
     # ``r.get("relation", default)`` returns None when the key is present but
     # null, which would crash on ``.upper()``; ``or`` covers that case too.
     rel = str(r.get("relation") or "RELATED_TO").strip().upper().replace(" ", "_")
     if rel not in allowed:
         return
     src = r.get("source")
     tgt = r.get("target")
     if not src or not tgt:
         # Without both endpoints the MATCH clauses cannot select anything.
         return
     try:
         tx.run(
             f"MATCH (s {{name:$src}}) MATCH (t {{name:$tgt}}) MERGE (s)-[:{rel}]->(t)",
             src=src,
             tgt=tgt,
         )
     except Exception:
         # Stay best-effort (one bad relation must not abort the load), but
         # leave a trace instead of discarding the error silently.
         logging.getLogger(__name__).warning(
             "upsert_relation failed for %r -[%s]-> %r", src, rel, tgt, exc_info=True
         )
 def upsert_article_and_mentions(tx, row: pd.Series, entities: List[Dict]) -> None:
     """Upsert an Article node, its publishing Media node and MENTIONS edges.

     Args:
         tx: Object exposing ``run(query, **params)``.
         row: Article record; ``article_id``, ``title``, ``url``,
             ``published_date`` and ``source`` are read with ``row.get``.
         entities: Extracted entity mappings; each needs a ``name`` and may
             carry a ``type`` that is looked up in ``ENTITY_TYPE_MAP``
             (defined elsewhere in the module; unknown types fall back to
             ``AICompany``).

     Returns:
         None. A failing MENTIONS merge is logged and does not stop the
         remaining entities from being processed.
     """
     import logging

     log = logging.getLogger(__name__)
     aid = row.get("article_id", "")
     tx.run(
         "MERGE (a:Article {article_id:$aid}) SET a.title=$title, a.url=$url, a.published_date=$date",
         aid=aid,
         title=row.get("title", ""),
         url=row.get("url", ""),
         date=str(row.get("published_date", "")),
     )
     source = row.get("source", "")
     # pd.notna("") is True, so a missing or blank source must be rejected
     # explicitly; otherwise a Media node with an empty name gets created.
     if pd.notna(source) and str(source).strip():
         tx.run(
             "MERGE (m:Media {name:$src}) WITH m MATCH (a:Article {article_id:$aid}) MERGE (m)-[:PUBLISHED]->(a)",
             src=source,
             aid=aid,
         )
     for e in entities:
         ntype = ENTITY_TYPE_MAP.get(e.get("type", "AICompany"), "AICompany")
         try:
             tx.run(
                 f"MATCH (a:Article {{article_id:$aid}}) MATCH (n:{ntype} {{name:$name}}) MERGE (a)-[:MENTIONS]->(n)",
                 aid=aid,
                 name=e["name"],
             )
         except Exception:
             # Best-effort per entity, but record the failure instead of
             # discarding it silently.
             log.warning("MENTIONS merge failed for article %r, entity %r", aid, e, exc_info=True)
     if not text or pd.isna(text):
         return []
     text = str(text)
+    return [text[i : i + size].strip() for i in range(0, len(text), size - overlap) if text[i : i + size].strip()]
 # ──────────────────────────────────────────
 # 3. 메인 실행 (스크립트로 직접 호출 시)
 # ──────────────────────────────────────────
 def main() -> None:
     # 최신 엑셀 로드
     xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
     # 엔티티/관계 추출 및 적재
     print(f"총 {len(df)}건 처리 시작...")
     for idx, row in df.iterrows():
+        aid = str(row.get("article_id", f"ART_{idx}"))
         title = str(row.get("title", ""))
+        text = title + "\n" + str(row.get("content", ""))
         state: ArticleState = dict(
+            article_id=aid,
+            title=title,
+            text=text,
+            is_ai_related=False,
+            entities=[],
+            relations=[],
         )
         out = pipeline.invoke(state)
         if out["is_ai_related"]:
                 for r in out["relations"]:
                     s.execute_write(upsert_relation, r)
                 s.execute_write(upsert_article_and_mentions, row, out["entities"])
+            print(f"  ✅ [{idx + 1}/{len(df)}] {title[:35]}... | 엔티티: {[e['name'] for e in out['entities'][:4]]}")
         else:
+            print(f"  ⏭️  [{idx + 1}/{len(df)}] AI 비관련: {title[:35]}...")
     print("\n✅ 엔티티/관계 추출 및 Neo4j 적재 완료")
     # Content 청킹 + 임베딩
     print("Content 노드 생성 및 임베딩 시작...")
     for idx, row in df.iterrows():
+        aid = str(row.get("article_id", f"ART_{idx}"))
         chunks = chunk_text(str(row.get("content", "")))
         with driver.session() as s:
             for i, chunk in enumerate(chunks):
                     "MERGE (c:Content {content_id:$cid}) "
                     "SET c.chunk=$chunk, c.article_id=$aid, c.chunk_index=$i, c.embedding=$vec "
                     "WITH c MATCH (a:Article {article_id:$aid}) MERGE (a)-[:HAS_CHUNK]->(c)",
+                    cid=cid,
+                    chunk=chunk,
+                    aid=aid,
+                    i=i,
+                    vec=vec,
                 )
     print("✅ Content 노드 임베딩 완료")
     # 벡터 인덱스 생성
+    create_vector_index(
+        driver,
+        INDEX_NAME,
+        label="Content",
+        embedding_property="embedding",
+        dimensions=1536,
+        similarity_fn="cosine",
+    )
     print(f"✅ 벡터 인덱스 [{INDEX_NAME}] 생성 완료")

src/graphBuilder/scrapping/finScrapping.py CHANGED Viewed

@@ -1,166 +1,208 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager
-from selenium.webdriver.chrome.service import Service
-import pandas as pd
-import time
-from datetime import datetime
-import re
-from collections import Counter
 # 수집 대상 카테고리
 categories = {
-    '경제': 'https://news.naver.com/section/101',
-    'IT/과학': 'https://news.naver.com/section/105',
 }
 NUM_ARTICLES_PER_CATEGORY = 80
 # AI 핀테크 키워드 (FinNode 프로젝트 전용)
 FINTECH_AI_KEYWORDS = [
     # AI 기술
-    'AI', '인공지능', '생성형 AI', '대규모언어모델',
     # AI 핀테크 (금융)
-    '핀테크',
 ]
-print('[INIT] ChromeDriver 초기화 중...')
 service = Service(ChromeDriverManager().install())
 options = webdriver.ChromeOptions()
-options.add_argument('--no-sandbox')
-options.add_argument('--disable-dev-shm-usage')
 driver = webdriver.Chrome(service=service, options=options)
-print('[INIT] ✅ 브라우저 실행 완료')
 def get_article_links(driver, category_url, num_articles):
-    print(f'  [LINK] 페이지 이동: {category_url}')
     driver.get(category_url)
     time.sleep(3)
-    print(f'  [LINK] 로드 완료 (title: {driver.title})')
     article_links = []
     selectors = [
-        'a.sa_text_title', 'a.sa_text_lede', 'a.sa_text_strong',
-        '.sa_text a', '.cluster_text_headline a', '.cluster_text_lede a'
     ]
     for selector in selectors:
         elements = driver.find_elements(By.CSS_SELECTOR, selector)
         print(f"  [LINK] 셀렉터 '{selector}' -> {len(elements)}개 발견")
         for element in elements:
-            url = element.get_attribute('href')
-            if (url and 'news.naver.com' in url and '/article/' in url
-                    and '/comment/' not in url and url not in article_links):
                 article_links.append(url)
                 if len(article_links) >= num_articles:
                     break
         if len(article_links) >= num_articles:
             break
-    print(f'  [LINK] ✅ 총 {len(article_links)}개 링크 확보\n')
     return article_links[:num_articles]
 def parse_article_detail(driver, article_url, category):
     driver.get(article_url)
     time.sleep(1.5)
     article_data = {
-        'article_id': '', 'title': '', 'content': '', 'url': article_url,
-        'published_date': '', 'source': '', 'author': '', 'category': category
     }
     try:
-        match = re.search(r'article/(\d+)/(\d+)', article_url)
-        article_data['article_id'] = (
-            f"ART_{match.group(1)}_{match.group(2)}" if match
-            else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
         )
-        for sel in ['#title_area span', '#ct .media_end_head_headline',
-                    '.media_end_head_headline', 'h2#title_area', '.news_end_title']:
             try:
                 el = driver.find_element(By.CSS_SELECTOR, sel)
                 if el.text.strip():
-                    article_data['title'] = el.text.strip(); break
-            except: continue
-        for sel in ['#dic_area', 'article#dic_area',
-                    '.go_trans._article_content', '._article_body_contents']:
             try:
                 el = driver.find_element(By.CSS_SELECTOR, sel)
                 if el.text.strip():
-                    article_data['content'] = el.text.strip(); break
-            except: continue
         try:
-            el = driver.find_element(By.CSS_SELECTOR, 'a.media_end_head_top_logo img')
-            article_data['source'] = el.get_attribute('alt')
         except:
             try:
-                el = driver.find_element(By.CSS_SELECTOR, '.media_end_head_top_logo_text')
-                article_data['source'] = el.text.strip()
-            except: pass
         try:
-            el = driver.find_element(By.CSS_SELECTOR,
-                'span.media_end_head_info_datestamp_time, span[data-date-time]')
-            article_data['published_date'] = (el.get_attribute('data-date-time') or el.text).strip()
         except:
-            article_data['published_date'] = datetime.now().strftime('%Y-%m-%d %H:%M')
         try:
-            el = driver.find_element(By.CSS_SELECTOR,
-                'em.media_end_head_journalist_name, span.byline_s')
-            article_data['author'] = el.text.strip()
-        except: pass
     except Exception as e:
-        print(f'    [PARSE] ⚠️  파싱 오류: {e}')
     return article_data
 # ── 1단계: 전체 기사 수집 ──
 all_articles = []
 category_stats = {}
 for category_name, category_url in categories.items():
-    print(f"\n{'='*60}")
-    print(f'[CRAWL] [{category_name}] 카테고리 수집 시작')
-    print(f"{'='*60}")
     article_links = get_article_links(driver, category_url, NUM_ARTICLES_PER_CATEGORY)
     cat_ok, cat_fail = 0, 0
     for idx, article_url in enumerate(article_links, 1):
-        print(f'  [PARSE] ({idx}/{len(article_links)}) {article_url[:70]}...')
         article_data = parse_article_detail(driver, article_url, category_name)
-        if article_data['title'] and article_data['content']:
             all_articles.append(article_data)
             cat_ok += 1
             print(f"    ✅ {article_data['title'][:40]}...")
             print(f"       언론사: {article_data['source']} | 날짜: {article_data['published_date']}")
         else:
             cat_fail += 1
-            missing = [x for x, v in [('제목', article_data['title']), ('본문', article_data['content'])] if not v]
             print(f"    ❌ 파싱실패 ({', '.join(missing)} 없음)")
         time.sleep(0.5)
-    category_stats[category_name] = {'ok': cat_ok, 'fail': cat_fail}
     print(f"\n  [CRAWL] [{category_name}] 완료: 성공 {cat_ok}개 / 실패 {cat_fail}개")
 driver.quit()
-print(f'\n[DONE] 브라우저 종료')
-print(f"\n{'='*60}")
-print(f'[SUMMARY] 수집 결과 요약')
-print(f"{'='*60}")
 for cat, s in category_stats.items():
-    print(f'  {cat}: 성공 {s["ok"]}건 / 실패 {s["fail"]}건')
-print(f'  전체 수집: {len(all_articles)}건')
 df_all = pd.DataFrame(all_articles)
 df_all
 # ── 2단계: AI 핀테크 키워드 필터링 ──
-print(f"\n{'='*60}")
-print('[FILTER] AI 핀테크 키워드 필터링 시작')
-print(f"{'='*60}")
 filtered_articles = []
 for _, row in df_all.iterrows():
@@ -168,68 +210,74 @@ for _, row in df_all.iterrows():
     matched = [kw for kw in FINTECH_AI_KEYWORDS if kw.replace(" ", "") in text.replace(" ", "")]
     if matched:
         row_dict = row.to_dict()
-        row_dict['matched_keywords'] = ', '.join(matched)
         filtered_articles.append(row_dict)
 df_filtered = pd.DataFrame(filtered_articles)
-print(f'  전체 수집: {len(df_all)}건')
-print(f'  AI 핀테크 관련: {len(df_filtered)}건 ({len(df_filtered)/max(len(df_all),1)*100:.1f}%)')
-print(f'\n  [키워드별 매칭 현황]')
-all_kw = [kw for row in filtered_articles for kw in row['matched_keywords'].split(', ')]
 kw_counts = Counter(all_kw)
 for kw in FINTECH_AI_KEYWORDS:
-    print(f'    {kw}: {kw_counts.get(kw, 0)}건')
 df_filtered
 # ── 3단계: 저장 ──
 output_filename = f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
-df_filtered.to_excel(output_filename, index=False, engine='openpyxl')
-print(f'[SAVE] ✅ 저장 완료: {output_filename}')
-print(f'[SAVE]    - AI 핀테크 기사: {len(df_filtered)}건')
 # ── 4단계: 키워드 빈도 시각화 ──
-import matplotlib.pyplot as plt
 import platform
 from collections import Counter
 # 폰트 깨짐 방지 (Mac 환경: AppleGothic)
-if platform.system() == 'Darwin':
-    plt.rc('font', family='AppleGothic')
-plt.rcParams['axes.unicode_minus'] = False
 if not filtered_articles:
-    print('시각화할 데이터가 없습니다.')
 else:
     # 빈도수 계산
-    all_kw = [kw for row in filtered_articles for kw in row['matched_keywords'].split(', ')]
     kw_counts = Counter(all_kw)
     # 📌 변경 포인트: FINTECH_AI_KEYWORDS 전체 목록을 순서대로 그래프에 강제 표시 (0건 포함)
     keywords = FINTECH_AI_KEYWORDS
     counts = [kw_counts.get(kw, 0) for kw in keywords]
     plt.figure(figsize=(12, 6))
     # 막대 그래프 생성
-    bars = plt.bar(keywords, counts, color='skyblue', edgecolor='white')
     # 막대 위에 숫자(빈도수) 표시
     for bar in bars:
         height = bar.get_height()
         # 막대의 중앙(x), 막대의 높이(y) 위치에 텍스트를 배치
-        plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{height}',
-                 ha='center', va='bottom', size=11, fontweight='bold', color='black')
-    plt.title('수집된 AI 핀테크 기사 키워드 출현 빈도 (전체)', fontsize=15, pad=15)
-    plt.xlabel('키워드', fontsize=12)
-    plt.ylabel('출현 횟수 (건)', fontsize=12)
-    plt.grid(axis='y', linestyle='--', alpha=0.7)
     plt.xticks(rotation=45)
     plt.tight_layout()
     plt.show()

+import re
+import time
+from collections import Counter
+from datetime import datetime
+import pandas as pd
 from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager
 # 수집 대상 카테고리
 categories = {
+    "경제": "https://news.naver.com/section/101",
+    "IT/과학": "https://news.naver.com/section/105",
 }
 NUM_ARTICLES_PER_CATEGORY = 80
 # AI 핀테크 키워드 (FinNode 프로젝트 전용)
 FINTECH_AI_KEYWORDS = [
     # AI 기술
+    "AI",
+    "인공지능",
+    "생성형 AI",
+    "대규모언어모델",
     # AI 핀테크 (금융)
+    "핀테크",
 ]
+print("[INIT] ChromeDriver 초기화 중...")
 service = Service(ChromeDriverManager().install())
 options = webdriver.ChromeOptions()
+options.add_argument("--no-sandbox")
+options.add_argument("--disable-dev-shm-usage")
 driver = webdriver.Chrome(service=service, options=options)
+print("[INIT] ✅ 브라우저 실행 완료")
 def get_article_links(driver, category_url, num_articles):
     """Collect unique article URLs from one news section page.

     Loads ``category_url``, waits three seconds, then tries a fixed list of
     CSS selectors in order. An ``href`` is kept when it contains both
     ``news.naver.com`` and ``/article/``, does not contain ``/comment/``,
     and has not been collected already. Scanning stops once
     ``num_articles`` links are gathered.

     Args:
         driver: Browser driver exposing ``get``, ``title`` and
             ``find_elements``.
         category_url: Section page to open.
         num_articles: Upper bound on the number of links returned.

     Returns:
         List of at most ``num_articles`` URLs in discovery order.
     """
     print(f"  [LINK] 페이지 이동: {category_url}")
     driver.get(category_url)
     time.sleep(3)
     print(f"  [LINK] 로드 완료 (title: {driver.title})")
     css_candidates = (
         "a.sa_text_title",
         "a.sa_text_lede",
         "a.sa_text_strong",
         ".sa_text a",
         ".cluster_text_headline a",
         ".cluster_text_lede a",
     )
     collected = []
     for css in css_candidates:
         anchors = driver.find_elements(By.CSS_SELECTOR, css)
         print(f"  [LINK] 셀렉터 '{css}' -> {len(anchors)}개 발견")
         for anchor in anchors:
             href = anchor.get_attribute("href")
             # Guard clauses: skip empty, duplicate and non-article links.
             if not href or href in collected:
                 continue
             if "news.naver.com" not in href or "/article/" not in href:
                 continue
             if "/comment/" in href:
                 continue
             collected.append(href)
             if len(collected) >= num_articles:
                 break
         # Checked after every selector so later selectors are not queried
         # once the quota is met.
         if len(collected) >= num_articles:
             break
     print(f"  [LINK] ✅ 총 {len(collected)}개 링크 확보\n")
     return collected[:num_articles]
 def _first_nonempty_text(driver, selectors):
     """Return the stripped text of the first selector that yields any, else ""."""
     for sel in selectors:
         try:
             found = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
         except Exception:
             # Selector absent on this page layout; try the next one.
             continue
         if found:
             return found
     return ""
 def parse_article_detail(driver, article_url, category):
     """Open one article page and extract its fields into a dict.

     Every field is best-effort: a selector that does not match leaves the
     field empty (or, for ``published_date``, set to the current local time)
     rather than failing the whole article.

     Lookup failures are caught with ``except Exception`` instead of a bare
     ``except`` so that ``KeyboardInterrupt`` and ``SystemExit`` still stop
     the crawl.

     Args:
         driver: Browser driver exposing ``get`` and ``find_element``.
         article_url: URL of the article page.
         category: Category label copied into the result unchanged.

     Returns:
         Dict with keys ``article_id``, ``title``, ``content``, ``url``,
         ``published_date``, ``source``, ``author`` and ``category``.
     """
     driver.get(article_url)
     time.sleep(1.5)
     article_data = {
         "article_id": "",
         "title": "",
         "content": "",
         "url": article_url,
         "published_date": "",
         "source": "",
         "author": "",
         "category": category,
     }
     try:
         # Prefer the two numeric path segments after "article/"; fall back
         # to a timestamp when the URL does not match.
         match = re.search(r"article/(\d+)/(\d+)", article_url)
         article_data["article_id"] = (
             f"ART_{match.group(1)}_{match.group(2)}" if match else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
         )
         article_data["title"] = _first_nonempty_text(
             driver,
             [
                 "#title_area span",
                 "#ct .media_end_head_headline",
                 ".media_end_head_headline",
                 "h2#title_area",
                 ".news_end_title",
             ],
         )
         article_data["content"] = _first_nonempty_text(
             driver,
             [
                 "#dic_area",
                 "article#dic_area",
                 ".go_trans._article_content",
                 "._article_body_contents",
             ],
         )
         # Publisher: logo image alt text first, text logo as fallback. The
         # fallback also applies when the image exists but has no alt text.
         source = ""
         try:
             el = driver.find_element(By.CSS_SELECTOR, "a.media_end_head_top_logo img")
             source = (el.get_attribute("alt") or "").strip()
         except Exception:
             source = ""
         if not source:
             source = _first_nonempty_text(driver, [".media_end_head_top_logo_text"])
         article_data["source"] = source
         try:
             el = driver.find_element(
                 By.CSS_SELECTOR,
                 "span.media_end_head_info_datestamp_time, span[data-date-time]",
             )
             article_data["published_date"] = (el.get_attribute("data-date-time") or el.text).strip()
         except Exception:
             article_data["published_date"] = datetime.now().strftime("%Y-%m-%d %H:%M")
         try:
             el = driver.find_element(
                 By.CSS_SELECTOR,
                 "em.media_end_head_journalist_name, span.byline_s",
             )
             article_data["author"] = el.text.strip()
         except Exception:
             pass  # author is optional
     except Exception as e:
         print(f"    [PARSE] ⚠️  파싱 오류: {e}")
     return article_data
 # ── 1단계: 전체 기사 수집 ──
 all_articles = []
 category_stats = {}
 for category_name, category_url in categories.items():
+    print(f"\n{'=' * 60}")
+    print(f"[CRAWL] [{category_name}] 카테고리 수집 시작")
+    print(f"{'=' * 60}")
     article_links = get_article_links(driver, category_url, NUM_ARTICLES_PER_CATEGORY)
     cat_ok, cat_fail = 0, 0
     for idx, article_url in enumerate(article_links, 1):
+        print(f"  [PARSE] ({idx}/{len(article_links)}) {article_url[:70]}...")
         article_data = parse_article_detail(driver, article_url, category_name)
+        if article_data["title"] and article_data["content"]:
             all_articles.append(article_data)
             cat_ok += 1
             print(f"    ✅ {article_data['title'][:40]}...")
             print(f"       언론사: {article_data['source']} | 날짜: {article_data['published_date']}")
         else:
             cat_fail += 1
+            missing = [
+                x
+                for x, v in [
+                    ("제목", article_data["title"]),
+                    ("본문", article_data["content"]),
+                ]
+                if not v
+            ]
             print(f"    ❌ 파싱실패 ({', '.join(missing)} 없음)")
         time.sleep(0.5)
+    category_stats[category_name] = {"ok": cat_ok, "fail": cat_fail}
     print(f"\n  [CRAWL] [{category_name}] 완료: 성공 {cat_ok}개 / 실패 {cat_fail}개")
 driver.quit()
+print("\n[DONE] 브라우저 종료")
+print(f"\n{'=' * 60}")
+print("[SUMMARY] 수집 결과 요약")
+print(f"{'=' * 60}")
 for cat, s in category_stats.items():
+    print(f"  {cat}: 성공 {s['ok']}건 / 실패 {s['fail']}건")
+print(f"  전체 수집: {len(all_articles)}건")
 df_all = pd.DataFrame(all_articles)
 df_all
 # ── 2단계: AI 핀테크 키워드 필터링 ──
+print(f"\n{'=' * 60}")
+print("[FILTER] AI 핀테크 키워드 필터링 시작")
+print(f"{'=' * 60}")
 filtered_articles = []
 for _, row in df_all.iterrows():
     matched = [kw for kw in FINTECH_AI_KEYWORDS if kw.replace(" ", "") in text.replace(" ", "")]
     if matched:
         row_dict = row.to_dict()
+        row_dict["matched_keywords"] = ", ".join(matched)
         filtered_articles.append(row_dict)
 df_filtered = pd.DataFrame(filtered_articles)
+print(f"  전체 수집: {len(df_all)}건")
+print(f"  AI 핀테크 관련: {len(df_filtered)}건 ({len(df_filtered) / max(len(df_all), 1) * 100:.1f}%)")
+print("\n  [키워드별 매칭 현황]")
+all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
 kw_counts = Counter(all_kw)
 for kw in FINTECH_AI_KEYWORDS:
+    print(f"    {kw}: {kw_counts.get(kw, 0)}건")
 df_filtered
 # ── 3단계: 저장 ──
 output_filename = f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
+df_filtered.to_excel(output_filename, index=False, engine="openpyxl")
+print(f"[SAVE] ✅ 저장 완료: {output_filename}")
+print(f"[SAVE]    - AI 핀테크 기사: {len(df_filtered)}건")
 # ── 4단계: 키워드 빈도 시각화 ──
 import platform
 from collections import Counter
+import matplotlib.pyplot as plt
 # 폰트 깨짐 방지 (Mac 환경: AppleGothic)
+if platform.system() == "Darwin":
+    plt.rc("font", family="AppleGothic")
+plt.rcParams["axes.unicode_minus"] = False
 if not filtered_articles:
+    print("시각화할 데이터가 없습니다.")
 else:
     # 빈도수 계산
+    all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
     kw_counts = Counter(all_kw)
     # 📌 변경 포인트: FINTECH_AI_KEYWORDS 전체 목록을 순서대로 그래프에 강제 표시 (0건 포함)
     keywords = FINTECH_AI_KEYWORDS
     counts = [kw_counts.get(kw, 0) for kw in keywords]
     plt.figure(figsize=(12, 6))
     # 막대 그래프 생성
+    bars = plt.bar(keywords, counts, color="skyblue", edgecolor="white")
     # 막대 위에 숫자(빈도수) 표시
     for bar in bars:
         height = bar.get_height()
         # 막대의 중앙(x), 막대의 높이(y) 위치에 텍스트를 배치
+        plt.text(
+            bar.get_x() + bar.get_width() / 2.0,
+            height,
+            f"{height}",
+            ha="center",
+            va="bottom",
+            size=11,
+            fontweight="bold",
+            color="black",
+        )
+    plt.title("수집된 AI 핀테크 기사 키워드 출현 빈도 (전체)", fontsize=15, pad=15)
+    plt.xlabel("키워드", fontsize=12)
+    plt.ylabel("출현 횟수 (건)", fontsize=12)
+    plt.grid(axis="y", linestyle="--", alpha=0.7)
     plt.xticks(rotation=45)
     plt.tight_layout()
     plt.show()

src/retrieval/finRetrieval.py CHANGED Viewed

@@ -11,17 +11,18 @@ app.py에서 import하여 Gradio 챗봇과 연동합니다.
 """
 import os
 import dotenv
 import neo4j
-from neo4j_graphrag.llm import OpenAILLM
 from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
 from neo4j_graphrag.retrievers import (
-    VectorRetriever,
-    VectorCypherRetriever,
     Text2CypherRetriever,
     ToolsRetriever,
 )
-from neo4j_graphrag.generation import RagTemplate, GraphRAG
 dotenv.load_dotenv()
@@ -29,11 +30,14 @@ dotenv.load_dotenv()
 # 1. DB / LLM / Embedder 초기화
 # ──────────────────────────────────────────
-URI      = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
-AUTH     = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
-driver   = neo4j.GraphDatabase.driver(URI, auth=AUTH)
-rag_llm  = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
 embedder = OpenAIEmbeddings(model="text-embedding-3-small")
 INDEX_NAME = "content_vector_index"
@@ -76,6 +80,7 @@ vector_cypher_retriever = VectorCypherRetriever(
     embedder=embedder,
 )
 # (3) 자연어 → Cypher 자동 변환 검색
 def _get_schema() -> str:
     with driver.session() as s:
@@ -85,9 +90,7 @@ def _get_schema() -> str:
             "RETURN nodeType, collect(propertyName) as props"
         ).data()
         rels = s.run(
-            "MATCH (n)-[r]->(m) "
-            "RETURN DISTINCT labels(n)[0] as src, type(r) as rel, labels(m)[0] as tgt "
-            "LIMIT 30"
         ).data()
     txt = "=== Neo4j Schema ===\n노드:\n"
     for n in nodes:
@@ -103,18 +106,15 @@ _examples = [
 CYPHER QUERY:
 MATCH (c:AICompany {name:"카카오"})-[:DEVELOPS]->(s:AIService)
 RETURN s.name, s.description""",
     """USER INPUT: 삼성전자가 개발 중인 AI 기술은?
 CYPHER QUERY:
 MATCH (c:AICompany {name:"삼성전자"})-[:DEVELOPS]->(t:AITechnology)
 RETURN t.name, t.description""",
     """USER INPUT: 최근 AI 관련 기사 5개
 CYPHER QUERY:
 MATCH (a:Article)-[:MENTIONS]->(:AICompany)
 RETURN DISTINCT a.article_id, a.title, a.url, a.published_date
 ORDER BY a.published_date DESC LIMIT 5""",
     """USER INPUT: 어떤 기업이 LLM 기술을 개발하나요?
 CYPHER QUERY:
 MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)

 """
 import os
 import dotenv
 import neo4j
 from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
+from neo4j_graphrag.generation import GraphRAG, RagTemplate
+from neo4j_graphrag.llm import OpenAILLM
 from neo4j_graphrag.retrievers import (
     Text2CypherRetriever,
     ToolsRetriever,
+    VectorCypherRetriever,
+    VectorRetriever,
 )
 dotenv.load_dotenv()
 # 1. DB / LLM / Embedder 초기화
 # ──────────────────────────────────────────
+URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
+AUTH = (
+    os.getenv("NEO4J_USERNAME", "neo4j"),
+    os.getenv("NEO4J_PASSWORD", "password"),
+)
+driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
+rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
 embedder = OpenAIEmbeddings(model="text-embedding-3-small")
 INDEX_NAME = "content_vector_index"
     embedder=embedder,
 )
 # (3) 자연어 → Cypher 자동 변환 검색
 def _get_schema() -> str:
     with driver.session() as s:
             "RETURN nodeType, collect(propertyName) as props"
         ).data()
         rels = s.run(
+            "MATCH (n)-[r]->(m) RETURN DISTINCT labels(n)[0] as src, type(r) as rel, labels(m)[0] as tgt LIMIT 30"
         ).data()
     txt = "=== Neo4j Schema ===\n노드:\n"
     for n in nodes:
 CYPHER QUERY:
 MATCH (c:AICompany {name:"카카오"})-[:DEVELOPS]->(s:AIService)
 RETURN s.name, s.description""",
     """USER INPUT: 삼성전자가 개발 중인 AI 기술은?
 CYPHER QUERY:
 MATCH (c:AICompany {name:"삼성전자"})-[:DEVELOPS]->(t:AITechnology)
 RETURN t.name, t.description""",
     """USER INPUT: 최근 AI 관련 기사 5개
 CYPHER QUERY:
 MATCH (a:Article)-[:MENTIONS]->(:AICompany)
 RETURN DISTINCT a.article_id, a.title, a.url, a.published_date
 ORDER BY a.published_date DESC LIMIT 5""",
     """USER INPUT: 어떤 기업이 LLM 기술을 개발하나요?
 CYPHER QUERY:
 MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)

tests/test_chunk_text.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from src.graphBuilder.neo4j.finGraph import chunk_text
 def test_chunk_text_empty_returns_empty_list():
     assert chunk_text("") == []

 from src.graphBuilder.neo4j.finGraph import chunk_text
 def test_chunk_text_empty_returns_empty_list():
     assert chunk_text("") == []

tests/test_retrieval.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 import pytest
 from src.retrieval.finRetrieval import graphrag
 # API 키와 Neo4j 연결정보가 없을 경우 테스트를 건너뜁니다.

 import os
 import pytest
 from src.retrieval.finRetrieval import graphrag
 # API 키와 Neo4j 연결정보가 없을 경우 테스트를 건너뜁니다.