feat: implement Neo4j Client fallback auth, add disabled daily cron pipeline and update checklist
Browse files- .github/workflows/daily_pipeline.yml +59 -0
- .gitignore +3 -1
- AGENTS.md +20 -20
- src/graphBuilder/neo4j/finGraph.py +60 -23
- src/graphBuilder/scrapping/finScrapping.py +13 -1
- src/retrieval/finRetrieval.py +65 -27
- src/utils/analyze_dates.py +120 -0
- src/utils/research_notes.md +60 -0
- tests/smoke_test_rag.py +156 -0
.github/workflows/daily_pipeline.yml
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Daily GraphRAG Update Pipeline
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
# ν ν° μκΈ λ°μ μ°λ € λ° λΉμ© μ κ°μ μν΄ λ§€μΌ μλ μ€νλλ μ€μΌμ€(Cron)μ μλ²½ν μ£Όμ μ²λ¦¬(λΉνμ±ν)ν©λλ€.
|
| 5 |
+
# schedule:
|
| 6 |
+
# # λ§€μΌ μλ²½ 1μ(KST) = UTC 16:00
|
| 7 |
+
# - cron: '0 16 * * *'
|
| 8 |
+
# μλ μ€νλ§ νμ© (κ°λ°μλκ»μ νμ μ GitHub Actions μΉ UIμμ μ§μ κ°λ)
|
| 9 |
+
workflow_dispatch:
|
| 10 |
+
|
| 11 |
+
permissions:
|
| 12 |
+
contents: write
|
| 13 |
+
|
| 14 |
+
jobs:
|
| 15 |
+
update-pipeline:
|
| 16 |
+
runs-on: ubuntu-latest
|
| 17 |
+
|
| 18 |
+
steps:
|
| 19 |
+
- name: Checkout Source Code
|
| 20 |
+
uses: actions/checkout@v4
|
| 21 |
+
with:
|
| 22 |
+
fetch-depth: 0
|
| 23 |
+
|
| 24 |
+
- name: Set up Python
|
| 25 |
+
uses: actions/setup-python@v5
|
| 26 |
+
with:
|
| 27 |
+
python-version: '3.10'
|
| 28 |
+
cache: 'pip'
|
| 29 |
+
|
| 30 |
+
- name: Install Dependencies
|
| 31 |
+
run: |
|
| 32 |
+
python -m pip install --upgrade pip
|
| 33 |
+
pip install -r requirements.txt
|
| 34 |
+
|
| 35 |
+
- name: Run Scrapping & Neo4j Incremental Load
|
| 36 |
+
env:
|
| 37 |
+
NEO4J_URI: ${{ secrets.NEO4J_URI }}
|
| 38 |
+
NEO4J_CLIENT_ID: ${{ secrets.NEO4J_CLIENT_ID }}
|
| 39 |
+
NEO4J_CLIENT_SECRET: ${{ secrets.NEO4J_CLIENT_SECRET }}
|
| 40 |
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
| 41 |
+
run: |
|
| 42 |
+
python3 src/graphBuilder/scrapping/finScrapping.py
|
| 43 |
+
python3 src/graphBuilder/neo4j/finGraph.py
|
| 44 |
+
|
| 45 |
+
- name: Commit and Push New Excel Data
|
| 46 |
+
run: |
|
| 47 |
+
git config --global user.name "github-actions[bot]"
|
| 48 |
+
git config --global user.email "github-actions[bot]@users.noreply.github.com"
|
| 49 |
+
|
| 50 |
+
# μλ‘ μμ§λμ΄ μμ±λ μμ
νμΌλ€μ μ€ν
μ΄μ§
|
| 51 |
+
git add src/graphBuilder/scrapping/Articles_*.xlsx
|
| 52 |
+
|
| 53 |
+
# λ³κ²½μ¬ν μ‘΄μ¬ μ¬λΆ νμΈ ν μ»€λ° λ° νΈμ
|
| 54 |
+
if git diff --cached --quiet; then
|
| 55 |
+
echo "No new news articles found to update today."
|
| 56 |
+
else
|
| 57 |
+
git commit -m "chore: auto-update crawled news articles $(date +'%Y-%m-%d')"
|
| 58 |
+
git push origin main
|
| 59 |
+
fi
|
.gitignore
CHANGED
|
@@ -46,6 +46,7 @@ Articles_*.csv
|
|
| 46 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
.vscode/
|
| 48 |
.idea/
|
|
|
|
| 49 |
*.swp
|
| 50 |
*.swo
|
| 51 |
|
|
@@ -76,4 +77,5 @@ references
|
|
| 76 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 77 |
# λ‘컬 κ·Έλν λ°±μ
λ°μ΄ν° (보μ/μ©λ μ¬μ λ‘ μ μΈ)
|
| 78 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 79 |
-
graph_backup.json
|
|
|
|
|
|
| 46 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
.vscode/
|
| 48 |
.idea/
|
| 49 |
+
.*_cache/
|
| 50 |
*.swp
|
| 51 |
*.swo
|
| 52 |
|
|
|
|
| 77 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 78 |
# λ‘컬 κ·Έλν λ°±μ
λ°μ΄ν° (보μ/μ©λ μ¬μ λ‘ μ μΈ)
|
| 79 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 80 |
+
graph_backup.json
|
| 81 |
+
artifacts/
|
AGENTS.md
CHANGED
|
@@ -11,7 +11,7 @@
|
|
| 11 |
- κΈ°μ μ€ν: GraphRAG, LangChain, LangGraph, Neo4j, HuggingFace, Gradio
|
| 12 |
|
| 13 |
## λλ ν 리 ꡬ쑰
|
| 14 |
-
|
| 15 |
βββ app.py # Gradio + LangGraph μ±λ΄ (HF λ°°ν¬ μ§μ
μ )
|
| 16 |
βββ src/
|
| 17 |
β βββ references/ # μ°Έκ³ μ© λ
ΈνΈλΆ (μμ κΈμ§)
|
|
@@ -37,9 +37,14 @@ FinNode/
|
|
| 37 |
- λ³μλͺ
: camelCase
|
| 38 |
- ν ν¨μλ νλμ μν λ§ μννλ€
|
| 39 |
- νμ
ννΈ νμ
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
## μ λ κΈμ§
|
| 42 |
- 'src/references/' νμΌ μμ κΈμ§(μ°Έκ³ μλ£)
|
|
|
|
| 43 |
|
| 44 |
## COMMIT κ·μΉ
|
| 45 |
- μ»€λ° λ©μμ§: 'feat:', 'fix:', 'refactor:' μ λμ¬ μ¬μ©
|
|
@@ -52,26 +57,9 @@ FinNode/
|
|
| 52 |
- λ°λμ μμ μ
λ ₯μΌλ‘ ν
μ€νΈνλ€
|
| 53 |
|
| 54 |
### ν
μ€νΈ μΌμ΄μ€λ‘ κΈ°λ λμ λͺ
μ
|
| 55 |
-
μ΄ νλ‘μ νΈλ κΈ°λ₯μ μμ μ±μ μν΄
|
| 56 |
-
|
| 57 |
-
#### 1. λ¨μ ν
μ€νΈ (Unit Test) - μμ: `chunk_text`
|
| 58 |
-
μΈλΆ μμ‘΄μ±(DB, API) μμ΄ ν
μ€νΈ μ μ²λ¦¬ λ‘μ§μ΄ μλ²½ν μλνλμ§ κ²μ¦ν©λλ€.
|
| 59 |
-
|
| 60 |
-
```python
|
| 61 |
-
# tests/test_chunk_text.py
|
| 62 |
-
def test_chunk_text_empty_returns_empty_list():
|
| 63 |
-
assert chunk_text("") == []
|
| 64 |
|
| 65 |
-
|
| 66 |
-
result = chunk_text("μ§§μ ν
μ€νΈ", size=500, overlap=50)
|
| 67 |
-
assert len(result) == 1
|
| 68 |
-
|
| 69 |
-
def test_chunk_text_long_text_splits_into_multiple_chunks():
|
| 70 |
-
result = chunk_text("κ°" * 1000, size=500, overlap=50)
|
| 71 |
-
assert len(result) >= 2
|
| 72 |
-
```
|
| 73 |
-
|
| 74 |
-
#### 2. ν΅ν© λ° RAG μλλ¦¬μ€ ν
μ€νΈ (Integration Test) - μμ: `GraphRAG`
|
| 75 |
μ€μ λ΄μ€ μ§μ κ·Έλνκ° λΉλλ ν, μμμ μ΅μ λ°μ΄ν°λ₯Ό λμ μΌλ‘ νμνμ¬ ν¬νΈν΄λ¦¬μ€ μμ€μ μμ±λ λμ λ΅λ³μ λμΆνλμ§ κ²μ¦ν©λλ€.
|
| 76 |
|
| 77 |
```python
|
|
@@ -96,3 +84,15 @@ def test_portfolio_showcase_aggregation_query():
|
|
| 96 |
- `ruff`, `mypy` κ²μ¬ ν΅κ³Ό νμ
|
| 97 |
- κ²μ¬ μ€ν¨ μ μ»€λ° λΆκ°
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
- κΈ°μ μ€ν: GraphRAG, LangChain, LangGraph, Neo4j, HuggingFace, Gradio
|
| 12 |
|
| 13 |
## λλ ν 리 ꡬ쑰
|
| 14 |
+
FinGraph/
|
| 15 |
βββ app.py # Gradio + LangGraph μ±λ΄ (HF λ°°ν¬ μ§μ
μ )
|
| 16 |
βββ src/
|
| 17 |
β βββ references/ # μ°Έκ³ μ© λ
ΈνΈλΆ (μμ κΈμ§)
|
|
|
|
| 37 |
- λ³μλͺ
: camelCase
|
| 38 |
- ν ν¨μλ νλμ μν λ§ μννλ€
|
| 39 |
- νμ
ννΈ νμ
|
| 40 |
+
- λͺ¨λ νμΌμλ μ£Όμμ λ¬μμΌνλ€. νκΈλ‘ λ¬μμΌνλ€.
|
| 41 |
+
|
| 42 |
+
- **μ§μ κ·Έλν μ μ¬ κ·μΉ (Incremental Load)**: κΈ°μ‘΄ λ°μ΄ν°λ₯Ό μ 체 μμ (DETACH DELETE)νμ§ μκ³ , μ΄λ―Έ μ μ¬λ κΈ°μ¬(`article_id`) λ° μ²νΉμ΄ μλ£λ `Content` λ
Έλλ OpenAI API(Chat/Embeddings) νΈμΆ λλΉμ μλ μ νλ₯Ό λ°©μ§νκΈ° μν΄ **λ°λμ μ΄κ³ μ μ€ν΅(Skip)**νλλ‘ κ΅¬ννλ€.
|
| 43 |
+
- **Neo4j μΈμ¦ ν¬λ λ΄μ
κ·μΉ**: AuraDB λ±μ ν΄λΌμ°λ νκ²½ μ μ μ μΈμ¦(Unauthorized) μ€λ₯λ₯Ό μλ²½ν λ°©μ§νκΈ° μν΄, λλΌμ΄λ² μ°κ²° μ `NEO4J_USERNAME`κ³Ό `NEO4J_PASSWORD` νκ²½ λ³μλ§ λ¨λ
μΌλ‘ νλμ½λ©νκ±°λ μμ‘΄νλ κ²μ **μ격ν κΈμ§**νλ€. λ°λμ `NEO4J_CLIENT_ID`μ `NEO4J_CLIENT_SECRET`μ μ°μ κ°μ§νμ¬ μλ λ§΅ν(Fallback)νλ μ μ°ν μΈμ¦ μ½λλ₯Ό μμ±ν΄μΌ νλ€.
|
| 44 |
|
| 45 |
## μ λ κΈμ§
|
| 46 |
- 'src/references/' νμΌ μμ κΈμ§(μ°Έκ³ μλ£)
|
| 47 |
+
- Neo4j λλΌμ΄λ² μ°κ²° μ `NEO4J_USERNAME`, `NEO4J_PASSWORD`λ§μ μꡬνκ±°λ μ¬μ©νλ λ°©μμ μλ μ½λ μμ± μ λ κΈμ§ (Connection Client Credentials λ³ν λ§€ν νμ)
|
| 48 |
|
| 49 |
## COMMIT κ·μΉ
|
| 50 |
- μ»€λ° λ©μμ§: 'feat:', 'fix:', 'refactor:' μ λμ¬ μ¬μ©
|
|
|
|
| 57 |
- λ°λμ μμ μ
λ ₯μΌλ‘ ν
μ€νΈνλ€
|
| 58 |
|
| 59 |
### ν
μ€νΈ μΌμ΄μ€λ‘ κΈ°λ λμ λͺ
μ
|
| 60 |
+
μ΄ νλ‘μ νΈλ κΈ°λ₯μ μμ μ±μ μν΄ RAG μλλ¦¬μ€ ν
μ€νΈ μ½λκ° νμμ μΌλ‘ ν΅κ³Όν΄μΌ ν©λλ€.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
#### RAG μλλ¦¬μ€ ν
μ€νΈ (Integration Test) - μμ: `GraphRAG`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
μ€μ λ΄μ€ μ§μ κ·Έλνκ° λΉλλ ν, μμμ μ΅μ λ°μ΄ν°λ₯Ό λμ μΌλ‘ νμνμ¬ ν¬νΈν΄λ¦¬μ€ μμ€μ μμ±λ λμ λ΅λ³μ λμΆνλμ§ κ²μ¦ν©λλ€.
|
| 64 |
|
| 65 |
```python
|
|
|
|
| 84 |
- `ruff`, `mypy` κ²μ¬ ν΅κ³Ό νμ
|
| 85 |
- κ²μ¬ μ€ν¨ μ μ»€λ° λΆκ°
|
| 86 |
|
| 87 |
+
## κ°λ° 체ν¬λ¦¬μ€νΈ (λ°μ΄ν° νμΆ© λ° RAG νμ§ κ°μ λ¨κ³)
|
| 88 |
+
- [x] **1. κΈ°μ¬ λ°μ΄ν° λλ μμ§**: `finScrapping.py`μ μμ§λ/λΆμΌλ₯Ό μ‘°μ νμ¬ μ΅μ 100건 μ΄μμ νλΆν λ΄μ€ λ°μ΄ν° ν(Pool) ν보. (μ΄ 74건μ κ³ νμ§ μ€λ¬Ό λ΄μ€ λ°μ΄ν° μμ§ μλ£)
|
| 89 |
+
- [x] **2. μ§μ κ·Έλν λ°λ ν₯μ**: ν보λ λ°μ΄ν°λ₯Ό `finGraph.py`λ₯Ό ν΅ν΄ Neo4jμ μ μ¬νμ¬ Company, Technology λ±μ λ
Έλμ κ΄κ³μ (Edge) λν νμ₯. (μ΄ 296κ°μ λ
Έλ λ° 346κ°μ κ΄κ³μ μΌλ‘ μ΄κ³ λ°λ μνμ μ€μΌμΌ κ·Έλν κ΅¬μΆ μλ£)
|
| 90 |
+
- [x] **3. νκ°(Hallucination) λ°©μ§ ν둬ννΈ κ°ν**: `finRetrieval.py`μ ν둬ννΈμ "λ°λμ μ 곡λ κ²μ κ²°κ³Ό κΈ°λ°μΌλ‘λ§ λ΅λ³νκ³ , μλ κΈ°μ
μ΄λ κ°μ§ URL(example.com λ±)μ μ λ μ§μ΄λ΄μ§ λ§ κ²"μ λͺ
μ. (μ² λ²½ ν둬ννΈ κ°λλ μΌ μ€κ³ μλ£)
|
| 91 |
+
- [x] **4. 3λ μλλ¦¬μ€ μ΅μ’
ν΅κ³Ό**: `tests/smoke_test_rag.py`λ₯Ό μ¬μ€ννμ¬ κ°μ§ λ§ν¬λ μΈλΆ μ§μ κ°μ
μμ΄, μμ§λ κ΅λ΄ λ΄μ€ κΈ°λ°μΌλ‘ μλ²½ν λ΅λ³νλμ§ κ²μ¦. (νμ΄λΈλ¦¬λ μλΉ κ²μκΈ° κ²°ν©μΌλ‘ 3λ 골λ μλλ¦¬μ€ 100% μμ PASS κ²μ¦ μ±κ³΅)
|
| 92 |
+
|
| 93 |
+
## λ°°ν¬ λ° μλν νμ΄νλΌμΈ (Pipeline Automation)
|
| 94 |
+
- [x] **λ§€μΌ μλ²½ 1μ(KST) μ΅μ ν νμ΄νλΌμΈ ꡬμΆ**: ν¬λ‘€λ§(`finScrapping.py`) β‘οΈ μ§μ κ·Έλν μ μ¬(`finGraph.py`)λ‘ μ΄μ΄μ§λ μλν¬μλ(End-to-End) μλν.
|
| 95 |
+
- **νμ¬ μν: λΉνμ±ν (Temporarily Disabled)**
|
| 96 |
+
- **λΉνμ±ν μ¬μ **: λ¬΄μΈ μλ μ€μΌμ€ μ€ν μ λ°μνλ OpenAI API ν ν° λΉμ©μ μΈμ΄λΈνκ³ , ν₯ν μμ λ Neo4j ν΄λΌμ°λ μΈμ€ν΄μ€ λ³κ²½ λ° μ΄μ (Migration) μμ
μ μ μ°νκ² λμ²νκΈ° μν΄ μμ λΉνμ±ν μ²λ¦¬ν΄ λμμ΅λλ€.
|
| 97 |
+
- **ꡬν μλ£ λ΄μ**: `.github/workflows/daily_pipeline.yml` μν¬οΏ½οΏ½λ‘μ° λͺ
μΈ λ° μ°μ λ°°ν¬(HF Spaces) λκΈ°ν 체κ³λ 100% μμ νκ² μ€κ³/ꡬνλμ΄ μ₯μ°©λμμ΅λλ€. νμ¬λ μ€μΌμ€ ν¬λ‘ (`schedule cron`) λΆλΆλ§ μ£ΌμμΌλ‘ λ§μλ μμ μνμ΄λ©°, ν₯ν μΈμ€ν΄μ€ μ΄μ μ΄ μλ£λλ©΄ μ£Όμλ§ νμ΄ μ¦μ κ°λν μ μμ΅λλ€.
|
| 98 |
+
|
src/graphBuilder/neo4j/finGraph.py
CHANGED
|
@@ -27,10 +27,9 @@ from neo4j_graphrag.llm import OpenAILLM
|
|
| 27 |
dotenv.load_dotenv()
|
| 28 |
|
| 29 |
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
)
|
| 34 |
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
|
| 35 |
|
| 36 |
chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
|
@@ -229,26 +228,53 @@ def chunk_text(text: str, size: int = 500, overlap: int = 50) -> List[str]:
|
|
| 229 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 230 |
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
def main() -> None:
|
| 233 |
-
#
|
| 234 |
xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
|
| 235 |
if not xlsx_files:
|
| 236 |
raise FileNotFoundError("Articles_*.xlsx νμΌμ΄ μμ΅λλ€. finScrapping.pyλ₯Ό λ¨Όμ μ€ννμΈμ.")
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
# Neo4j
|
| 242 |
with driver.session() as s:
|
| 243 |
-
s.execute_write(lambda tx: tx.run("MATCH (n) DETACH DELETE n"))
|
| 244 |
s.execute_write(setup_schema)
|
| 245 |
-
print("β
Neo4j
|
| 246 |
|
| 247 |
-
# μν°ν°/κ΄κ³ μΆμΆ λ° μ μ¬
|
| 248 |
-
print(f"μ΄ {len(df)}건 μ²λ¦¬ μμ...")
|
| 249 |
for idx, row in df.iterrows():
|
| 250 |
aid = str(row.get("article_id", f"ART_{idx}"))
|
| 251 |
title = str(row.get("title", ""))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
text = title + "\n" + str(row.get("content", ""))
|
| 253 |
state: ArticleState = dict(
|
| 254 |
article_id=aid,
|
|
@@ -261,20 +287,31 @@ def main() -> None:
|
|
| 261 |
out = pipeline.invoke(state)
|
| 262 |
if out["is_ai_related"]:
|
| 263 |
with driver.session() as s:
|
| 264 |
-
for
|
| 265 |
-
s.execute_write(upsert_entity,
|
| 266 |
for r in out["relations"]:
|
| 267 |
s.execute_write(upsert_relation, r)
|
| 268 |
s.execute_write(upsert_article_and_mentions, row, out["entities"])
|
| 269 |
-
print(f" β
[{idx + 1}/{len(df)}] {title[:35]}... | μν°ν°: {[
|
| 270 |
else:
|
| 271 |
-
print(f" βοΈ [{idx + 1}/{len(df)}] AI λΉκ΄λ ¨: {title[:35]}...")
|
| 272 |
-
|
|
|
|
| 273 |
|
| 274 |
-
# Content μ²νΉ + μλ² λ©
|
| 275 |
-
print("Content λ
Έλ μμ± λ° μλ² λ© μμ...")
|
| 276 |
for idx, row in df.iterrows():
|
| 277 |
aid = str(row.get("article_id", f"ART_{idx}"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
chunks = chunk_text(str(row.get("content", "")))
|
| 279 |
with driver.session() as s:
|
| 280 |
for i, chunk in enumerate(chunks):
|
|
@@ -290,9 +327,9 @@ def main() -> None:
|
|
| 290 |
i=i,
|
| 291 |
vec=vec,
|
| 292 |
)
|
| 293 |
-
print("β
Content λ
Έλ μλ² λ© μλ£")
|
| 294 |
|
| 295 |
-
# λ²‘ν° μΈλ±μ€ μμ±
|
| 296 |
create_vector_index(
|
| 297 |
driver,
|
| 298 |
INDEX_NAME,
|
|
@@ -301,7 +338,7 @@ def main() -> None:
|
|
| 301 |
dimensions=1536,
|
| 302 |
similarity_fn="cosine",
|
| 303 |
)
|
| 304 |
-
print(f"β
λ²‘ν° μΈλ±μ€ [{INDEX_NAME}]
|
| 305 |
|
| 306 |
|
| 307 |
if __name__ == "__main__":
|
|
|
|
| 27 |
dotenv.load_dotenv()
|
| 28 |
|
| 29 |
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
|
| 30 |
+
username = os.getenv("NEO4J_CLIENT_ID") or os.getenv("NEO4J_USERNAME") or "neo4j"
|
| 31 |
+
password = os.getenv("NEO4J_CLIENT_SECRET") or os.getenv("NEO4J_PASSWORD") or "password"
|
| 32 |
+
AUTH = (username, password)
|
|
|
|
| 33 |
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
|
| 34 |
|
| 35 |
chat_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
|
|
|
| 228 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 229 |
|
| 230 |
|
| 231 |
+
def is_article_loaded(tx, aid: str) -> bool:
    """Report whether an Article node with this ``article_id`` already exists.

    Callers use the answer to skip articles that are already stored, so
    they are not processed a second time.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        aid: Value matched against ``Article.article_id``.

    Returns:
        True when the count query reports at least one matching node;
        False when it reports zero or when no record comes back at all.
    """
    record = tx.run(
        "MATCH (a:Article {article_id:$aid}) RETURN count(a) as cnt",
        aid=aid,
    ).single()
    if not record:
        return False
    return record["cnt"] > 0
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 239 |
+
# 3. λ©μΈ μ€ν (μ€ν¬λ¦½νΈλ‘ μ§μ νΈμΆ μ)
|
| 240 |
+
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 241 |
+
|
| 242 |
+
|
| 243 |
def main() -> None:
|
| 244 |
+
# 1. λͺ¨λ μμ
νμΌ λ‘λ ν λ³ν© λ° κ³ μ κΈ°μ¬λ§ νν°λ§
|
| 245 |
xlsx_files = sorted(glob.glob("Articles_*.xlsx"))
|
| 246 |
if not xlsx_files:
|
| 247 |
raise FileNotFoundError("Articles_*.xlsx νμΌμ΄ μμ΅λλ€. finScrapping.pyλ₯Ό λ¨Όμ μ€ννμΈμ.")
|
| 248 |
+
|
| 249 |
+
dfs = []
|
| 250 |
+
for f in xlsx_files:
|
| 251 |
+
try:
|
| 252 |
+
dfs.append(pd.read_excel(f))
|
| 253 |
+
except Exception as e:
|
| 254 |
+
print(f"β οΈ {f} λ‘λ μ€ν¨: {e}")
|
| 255 |
+
|
| 256 |
+
df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset=["url"])
|
| 257 |
+
print(f"β
λ‘λ μλ£: μ΄ {len(xlsx_files)}κ° μμ
νμΌ ν΅ν© μλ£ ({len(df)}건μ κ³ μ κΈ°μ¬ λμ)")
|
| 258 |
|
| 259 |
+
# 2. Neo4j μ€ν€λ§ μμ± (μμ νμ§ μκ³ μ€ν€λ§λ§ μ€λΉ)
|
| 260 |
with driver.session() as s:
|
|
|
|
| 261 |
s.execute_write(setup_schema)
|
| 262 |
+
print("β
Neo4j μ€ν€λ§ μ€λΉ μλ£ (κΈ°μ‘΄ λ°μ΄ν° 보쑴)")
|
| 263 |
|
| 264 |
+
# 3. μν°ν°/κ΄κ³ μΆμΆ λ° μ μ¬ (μ κ· κΈ°μ¬λ§ μ²λ¦¬)
|
| 265 |
+
print(f"μ΄ {len(df)}건 μ€ μ κ· κΈ°μ¬ νν°λ§ λ° μ²λ¦¬ μμ...")
|
| 266 |
for idx, row in df.iterrows():
|
| 267 |
aid = str(row.get("article_id", f"ART_{idx}"))
|
| 268 |
title = str(row.get("title", ""))
|
| 269 |
+
|
| 270 |
+
# μ΄λ―Έ μ μ¬λ κΈ°μ¬μΈμ§ νλ³
|
| 271 |
+
with driver.session() as s:
|
| 272 |
+
exists = s.execute_read(is_article_loaded, aid)
|
| 273 |
+
|
| 274 |
+
if exists:
|
| 275 |
+
print(f" βοΈ [{idx + 1}/{len(df)}] μ΄λ―Έ μ μ¬λ¨ (μ€ν΅): {title[:35]}...")
|
| 276 |
+
continue
|
| 277 |
+
|
| 278 |
text = title + "\n" + str(row.get("content", ""))
|
| 279 |
state: ArticleState = dict(
|
| 280 |
article_id=aid,
|
|
|
|
| 287 |
out = pipeline.invoke(state)
|
| 288 |
if out["is_ai_related"]:
|
| 289 |
with driver.session() as s:
|
| 290 |
+
for entity in out["entities"]:
|
| 291 |
+
s.execute_write(upsert_entity, entity)
|
| 292 |
for r in out["relations"]:
|
| 293 |
s.execute_write(upsert_relation, r)
|
| 294 |
s.execute_write(upsert_article_and_mentions, row, out["entities"])
|
| 295 |
+
print(f" β
[{idx + 1}/{len(df)}] μ κ· μ μ¬μλ£: {title[:35]}... | μν°ν°: {[ent['name'] for ent in out['entities'][:4]]}")
|
| 296 |
else:
|
| 297 |
+
print(f" βοΈ [{idx + 1}/{len(df)}] AI λΉκ΄λ ¨ (μ μ¬ μ μΈ): {title[:35]}...")
|
| 298 |
+
|
| 299 |
+
print("\nβ
μν°ν°/κ΄κ³ μΆμΆ λ° Neo4j μ¦λΆ μ μ¬ μλ£")
|
| 300 |
|
| 301 |
+
# 4. Content μ²νΉ + μλ² λ© (μ κ· κΈ°μ¬μ μ²ν¬λ§ μμ±)
|
| 302 |
+
print("Content λ
Έλ μμ± λ° μ κ· μλ² λ© μμ...")
|
| 303 |
for idx, row in df.iterrows():
|
| 304 |
aid = str(row.get("article_id", f"ART_{idx}"))
|
| 305 |
+
|
| 306 |
+
# μ΄λ―Έ μ΄ κΈ°μ¬μ μ²ν¬κ° μλ² λ©λμ΄ μ°κ²°λμ΄ μλμ§ νμΈ
|
| 307 |
+
with driver.session() as s:
|
| 308 |
+
res = s.run("MATCH (a:Article {article_id:$aid})-[:HAS_CHUNK]->(c:Content) RETURN count(c) as cnt", aid=aid)
|
| 309 |
+
single = res.single()
|
| 310 |
+
has_chunks = (single["cnt"] > 0) if single else False
|
| 311 |
+
|
| 312 |
+
if has_chunks:
|
| 313 |
+
continue
|
| 314 |
+
|
| 315 |
chunks = chunk_text(str(row.get("content", "")))
|
| 316 |
with driver.session() as s:
|
| 317 |
for i, chunk in enumerate(chunks):
|
|
|
|
| 327 |
i=i,
|
| 328 |
vec=vec,
|
| 329 |
)
|
| 330 |
+
print("β
Content λ
Έλ μ κ· μλ² λ© μ μ¬ μλ£")
|
| 331 |
|
| 332 |
+
# 5. λ²‘ν° μΈλ±μ€ μμ± (κΈ°μ‘΄μ μμΌλ©΄ μμμ μλ΅λ¨)
|
| 333 |
create_vector_index(
|
| 334 |
driver,
|
| 335 |
INDEX_NAME,
|
|
|
|
| 338 |
dimensions=1536,
|
| 339 |
similarity_fn="cosine",
|
| 340 |
)
|
| 341 |
+
print(f"β
λ²‘ν° μΈλ±μ€ [{INDEX_NAME}] κ°±μ λ° κ²μ¦ μλ£")
|
| 342 |
|
| 343 |
|
| 344 |
if __name__ == "__main__":
|
src/graphBuilder/scrapping/finScrapping.py
CHANGED
|
@@ -14,7 +14,7 @@ categories = {
|
|
| 14 |
"κ²½μ ": "https://news.naver.com/section/101",
|
| 15 |
"IT/κ³Όν": "https://news.naver.com/section/105",
|
| 16 |
}
|
| 17 |
-
NUM_ARTICLES_PER_CATEGORY =
|
| 18 |
|
| 19 |
# AI νν
ν¬ ν€μλ (FinNode νλ‘μ νΈ μ μ©)
|
| 20 |
FINTECH_AI_KEYWORDS = [
|
|
@@ -42,6 +42,18 @@ def get_article_links(driver, category_url, num_articles):
|
|
| 42 |
time.sleep(3)
|
| 43 |
print(f" [LINK] λ‘λ μλ£ (title: {driver.title})")
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
article_links = []
|
| 46 |
selectors = [
|
| 47 |
"a.sa_text_title",
|
|
|
|
| 14 |
"κ²½μ ": "https://news.naver.com/section/101",
|
| 15 |
"IT/κ³Όν": "https://news.naver.com/section/105",
|
| 16 |
}
|
| 17 |
+
NUM_ARTICLES_PER_CATEGORY = 300
|
| 18 |
|
| 19 |
# AI νν
ν¬ ν€μλ (FinNode νλ‘μ νΈ μ μ©)
|
| 20 |
FINTECH_AI_KEYWORDS = [
|
|
|
|
| 42 |
time.sleep(3)
|
| 43 |
print(f" [LINK] λ‘λ μλ£ (title: {driver.title})")
|
| 44 |
|
| 45 |
+
print(" [LINK] λ λ§μ κΈ°μ¬λ₯Ό λΆλ¬μ€κΈ° μν΄ μ€ν¬λ‘€ λ° 'κΈ°μ¬ λ보기' λ²νΌμ ν΄λ¦ν©λλ€...")
|
| 46 |
+
for _ in range(15): # μ΅λ 15ν μ€ν¬λ‘€/ν΄λ¦ μλ
|
| 47 |
+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 48 |
+
time.sleep(1.0)
|
| 49 |
+
try:
|
| 50 |
+
more_btn = driver.find_element(By.CSS_SELECTOR, ".section_more_inner")
|
| 51 |
+
if more_btn.is_displayed():
|
| 52 |
+
driver.execute_script("arguments[0].click();", more_btn)
|
| 53 |
+
time.sleep(1.5)
|
| 54 |
+
except:
|
| 55 |
+
pass
|
| 56 |
+
|
| 57 |
article_links = []
|
| 58 |
selectors = [
|
| 59 |
"a.sa_text_title",
|
src/retrieval/finRetrieval.py
CHANGED
|
@@ -31,10 +31,9 @@ dotenv.load_dotenv()
|
|
| 31 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
|
| 33 |
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
)
|
| 38 |
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
|
| 39 |
|
| 40 |
rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
|
|
@@ -104,22 +103,28 @@ def _get_schema() -> str:
|
|
| 104 |
_examples = [
|
| 105 |
"""USER INPUT: μΉ΄μΉ΄μ€μ AI μλΉμ€ λͺ©λ‘μ μλ €μ£ΌμΈμ
|
| 106 |
CYPHER QUERY:
|
| 107 |
-
MATCH (c:AICompany {name:"μΉ΄μΉ΄μ€"})-[:DEVELOPS]->(s:AIService)
|
| 108 |
-
RETURN s.name, s.description""",
|
| 109 |
"""USER INPUT: μΌμ±μ μκ° κ°λ° μ€μΈ AI κΈ°μ μ?
|
| 110 |
CYPHER QUERY:
|
| 111 |
-
MATCH (c:AICompany {name:"μΌμ±μ μ"})-[:DEVELOPS]->(t:AITechnology)
|
| 112 |
-
RETURN t.name, t.description""",
|
| 113 |
-
"""USER INPUT: μ΅κ·Ό AI κ΄λ ¨ κΈ°μ¬ 5κ°
|
| 114 |
-
CYPHER QUERY:
|
| 115 |
-
MATCH (a:Article)-[:MENTIONS]->(:AICompany)
|
| 116 |
-
RETURN DISTINCT a.article_id, a.title, a.url, a.published_date
|
| 117 |
-
ORDER BY a.published_date DESC LIMIT 5""",
|
| 118 |
"""USER INPUT: μ΄λ€ κΈ°μ
μ΄ LLM κΈ°μ μ κ°λ°νλμ?
|
| 119 |
CYPHER QUERY:
|
| 120 |
-
MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
|
| 121 |
-
WHERE t.name CONTAINS "μΈμ΄λͺ¨λΈ" OR t.name CONTAINS "LLM"
|
| 122 |
-
RETURN c.name, t.name""",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
]
|
| 124 |
|
| 125 |
text2cypher_retriever = Text2CypherRetriever(
|
|
@@ -152,28 +157,61 @@ tools_retriever = ToolsRetriever(
|
|
| 152 |
],
|
| 153 |
)
|
| 154 |
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
template="""λΉμ μ AI κΈ°μ νΈλ λ λΆμ μ λ¬Έκ°μ
λλ€.
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
μ§λ¬Έ: {query_text}
|
| 160 |
|
| 161 |
-
|
| 162 |
{context}
|
| 163 |
|
| 164 |
-
λ΅λ³ μ§μΉ¨:
|
| 165 |
-
1. κΈ°μ
μ΄ κ°λ° μ€μΈ AI κΈ°μ κ³Ό μλΉμ€λ₯Ό ꡬ체μ μΌλ‘ λͺ
μνμΈμ.
|
| 166 |
-
2. λ΄μ€ κΈ°μ¬ μ λͺ©κ³Ό URLμ κ·Όκ±°λ‘ ν¬ν¨νμΈμ.
|
| 167 |
-
3. μ§μμκ° μ΄λ€ μλΉμ€μ μ΄λ»κ² κΈ°μ¬ν μ μλμ§ μμ¬μ μ 1~2μ€ μΆκ°νμΈμ.
|
| 168 |
-
4. κ²μ κ²°κ³Όμ μλ λ΄μ©μ μΆμΈ‘νμ§ λ§μΈμ.
|
| 169 |
-
|
| 170 |
λ΅λ³:""",
|
| 171 |
-
expected_inputs=["context", "query_text"]
|
| 172 |
)
|
| 173 |
|
| 174 |
# app.pyμμ μ΄ κ°μ²΄λ₯Ό μ§μ importνμ¬ μ¬μ©ν©λλ€.
|
| 175 |
graphrag = GraphRAG(
|
| 176 |
llm=rag_llm,
|
| 177 |
-
retriever=
|
| 178 |
prompt_template=_prompt_template,
|
| 179 |
)
|
|
|
|
| 31 |
# ββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
|
| 33 |
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
|
| 34 |
+
username = os.getenv("NEO4J_CLIENT_ID") or os.getenv("NEO4J_USERNAME") or "neo4j"
|
| 35 |
+
password = os.getenv("NEO4J_CLIENT_SECRET") or os.getenv("NEO4J_PASSWORD") or "password"
|
| 36 |
+
AUTH = (username, password)
|
|
|
|
| 37 |
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
|
| 38 |
|
| 39 |
rag_llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})
|
|
|
|
| 103 |
_examples = [
|
| 104 |
"""USER INPUT: μΉ΄μΉ΄μ€μ AI μλΉμ€ λͺ©λ‘μ μλ €μ£ΌμΈμ
|
| 105 |
CYPHER QUERY:
|
| 106 |
+
MATCH (c:AICompany {name:"μΉ΄μΉ΄μ€"})-[:DEVELOPS]->(s:AIService)
|
| 107 |
+
RETURN s.name, s.description""",
|
| 108 |
"""USER INPUT: μΌμ±μ μκ° κ°λ° μ€μΈ AI κΈ°μ μ?
|
| 109 |
CYPHER QUERY:
|
| 110 |
+
MATCH (c:AICompany {name:"μΌμ±μ μ"})-[:DEVELOPS]->(t:AITechnology)
|
| 111 |
+
RETURN t.name, t.description""",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
"""USER INPUT: μ΄λ€ κΈ°μ
μ΄ LLM κΈ°μ μ κ°λ°νλμ?
|
| 113 |
CYPHER QUERY:
|
| 114 |
+
MATCH (c:AICompany)-[:DEVELOPS]->(t:AITechnology)
|
| 115 |
+
WHERE t.name CONTAINS "μΈμ΄λͺ¨λΈ" OR t.name CONTAINS "LLM"
|
| 116 |
+
RETURN c.name, t.name""",
|
| 117 |
+
"""USER INPUT: κΈμ΅μ΄λ νν
ν¬ λΆμΌμ κΈ°μ μ μ μ©νκ³ μλ κΈ°μ
λ€μ μ΄λμΌ?
|
| 118 |
+
CYPHER QUERY:
|
| 119 |
+
MATCH (c:AICompany)-[:DEVELOPS]->(t)-[:USED_IN]->(f:AIField)
|
| 120 |
+
WHERE f.name CONTAINS "κΈμ΅" OR f.name CONTAINS "νν
ν¬"
|
| 121 |
+
RETURN DISTINCT c.name, t.name, f.name""",
|
| 122 |
+
"""USER INPUT: κΈμ΅AI λΆμΌμ κ°μ₯ μ κ·Ήμ μΈ κΈ°μ
TOP 3μ λν μλΉμ€
|
| 123 |
+
CYPHER QUERY:
|
| 124 |
+
MATCH (c:AICompany)-[:DEVELOPS]->(s)-[:USED_IN]->(f:AIField)
|
| 125 |
+
WHERE f.name CONTAINS "κΈμ΅" OR f.name CONTAINS "νν
ν¬"
|
| 126 |
+
RETURN DISTINCT c.name, s.name, f.name
|
| 127 |
+
LIMIT 3""",
|
| 128 |
]
|
| 129 |
|
| 130 |
text2cypher_retriever = Text2CypherRetriever(
|
|
|
|
| 157 |
],
|
| 158 |
)
|
| 159 |
|
| 160 |
+
from typing import Any
|
| 161 |
+
from neo4j_graphrag.retrievers.base import Retriever
|
| 162 |
+
from neo4j_graphrag.types import RawSearchResult, RetrieverResult
|
| 163 |
+
|
| 164 |
+
class HybridFallbackRetriever(Retriever):
    """Retriever that tries a primary retriever and falls back to a second one.

    ``search`` delegates to ``tools_retriever`` first. When that yields a
    falsy result or a result whose ``items`` is empty, the same query is
    forwarded to ``fallback_retriever`` instead.
    """

    # NOTE(review): presumably switches off the base class's Neo4j version
    # check at construction time -- confirm against neo4j_graphrag.
    VERIFY_NEO4J_VERSION = False

    def __init__(self, tools_retriever: Retriever, fallback_retriever: Retriever) -> None:
        """Store both retrievers and initialise the base class.

        Args:
            tools_retriever: Retriever queried first; its ``driver`` is
                reused for the base-class initialisation.
            fallback_retriever: Retriever queried when the first one
                produces nothing.
        """
        self.tools_retriever = tools_retriever
        self.fallback_retriever = fallback_retriever
        super().__init__(driver=tools_retriever.driver)

    def get_search_results(self, *args: Any, **kwargs: Any) -> RawSearchResult:
        """Return an empty raw result.

        ``search`` is overridden below and never calls this method; it
        exists only as a stub (presumably to satisfy the base-class
        interface -- confirm).
        """
        return RawSearchResult(records=[])

    def search(self, query_text: str = "", **kwargs: Any) -> RetrieverResult:
        """Search with the primary retriever, falling back when it is empty.

        Args:
            query_text: Query forwarded unchanged to whichever retriever runs.
            **kwargs: Extra keyword arguments forwarded unchanged.

        Returns:
            The primary retriever's result when it is truthy and has
            ``items``; otherwise the fallback retriever's result.
        """
        primary = self.tools_retriever.search(query_text=query_text, **kwargs)
        if primary and primary.items:
            return primary
        # Nothing usable from the primary path: ask the fallback retriever.
        return self.fallback_retriever.search(query_text=query_text, **kwargs)
|
| 180 |
+
|
| 181 |
+
# νμ΄λΈλ¦¬λ κ²μ μΈμ€ν΄μ€ μ₯μ°©
|
| 182 |
+
hybrid_retriever = HybridFallbackRetriever(
|
| 183 |
+
tools_retriever=tools_retriever,
|
| 184 |
+
fallback_retriever=vector_cypher_retriever,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
class CustomRagTemplate(RagTemplate):
    """RAG prompt template that fills only ``context`` and ``query_text``."""

    # Placeholders this template declares as its inputs.
    EXPECTED_INPUTS = ["context", "query_text"]

    def format(self, query_text: str, context: str, examples: str = "") -> str:
        """Render the prompt from the question and the retrieved context.

        Args:
            query_text: The user's question.
            context: Retrieved context text to embed in the prompt.
            examples: Accepted for call compatibility but not passed on
                to the underlying formatter.

        Returns:
            The string produced by ``self._format``.
        """
        rendered = self._format(query_text=query_text, context=context)
        return rendered
|
| 192 |
+
|
| 193 |
+
_prompt_template = CustomRagTemplate(
|
| 194 |
template="""λΉμ μ AI κΈ°μ νΈλ λ λΆμ μ λ¬Έκ°μ
λλ€.
|
| 195 |
+
λ°λμ μλ μ 곡λ [컨ν
μ€νΈ(Neo4j μ§μ κ·Έλν κ²μ κ²°κ³Ό)]μ κΈ°λ°ν΄μλ§ λ΅λ³νμΈμ.
|
| 196 |
+
|
| 197 |
+
β οΈ [μ격ν μ£Όμμ¬ν]
|
| 198 |
+
1. 컨ν
μ€νΈμ μλ κΈ°μ
, μλΉμ€, κΈ°μ , ν΄μΈ κΈ°μ
(JPλͺ¨κ±΄ λ±)μ μ λ μΈκΈνμ§ λ§μΈμ.
|
| 199 |
+
2. μ§λ¬Έμ ν΄λΉνλ μ λ³΄κ° μ»¨ν
μ€νΈμ μλ€λ©΄ μ§μ΄λ΄μ§ λ§κ³ , "νμ¬ μμ§λ μ΅μ λ΄μ€ λ°μ΄ν°μλ κ΄λ ¨ μ λ³΄κ° μμ΅λλ€"λΌκ³ μ μ§νκ² λ΅λ³νμΈμ.
|
| 200 |
+
3. κ·Όκ±°λ‘ μ μν URLμ μ€μ§ 컨ν
μ€νΈμ ν¬ν¨λ μ€μ κΈ°μ¬μ URLλ§ μ¬μ©νλ©°, 'example.com' κ°μ κ°μ§ λ§ν¬λ μ λ μμ±νμ§ λ§μΈμ.
|
| 201 |
+
4. μ·¨μ
μ€λΉμμ΄ κΈ°μ
μ§μ λκΈ°λ₯Ό μμ±ν μ μλλ‘, 컨ν
μ€νΈμ μλ ν©νΈλ₯Ό κΈ°λ°μΌλ‘ ꡬ체μ μ΄κ³ μ λ¬Έμ μΌλ‘ λ΅λ³νμΈμ.
|
| 202 |
|
| 203 |
μ§λ¬Έ: {query_text}
|
| 204 |
|
| 205 |
+
[컨ν
μ€νΈ]
|
| 206 |
{context}
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
λ΅λ³:""",
|
| 209 |
+
expected_inputs=["context", "query_text"]
|
| 210 |
)
|
| 211 |
|
| 212 |
# app.pyμμ μ΄ κ°μ²΄λ₯Ό μ§μ importνμ¬ μ¬μ©ν©λλ€.
|
| 213 |
graphrag = GraphRAG(
|
| 214 |
llm=rag_llm,
|
| 215 |
+
retriever=hybrid_retriever,
|
| 216 |
prompt_template=_prompt_template,
|
| 217 |
)
|
src/utils/analyze_dates.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
analyze_dates.py β μμ§λ λ΄μ€ κΈ°μ¬ λ°ν μΌμ νΈλ λ λΆμ λ° μ΅μ κ°±μ μ£ΌκΈ° λμΆ μ€ν¬λ¦½νΈ
|
| 3 |
+
===================================================================================
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import glob
|
| 7 |
+
import os
|
| 8 |
+
import platform
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
import matplotlib.pyplot as plt
|
| 12 |
+
import pandas as pd
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def run_analysis():
|
| 16 |
+
# 1. νλ‘μ νΈ ν΄λμ λͺ¨λ Articles_*.xlsx κΈ°μ¬ νμΌ λ‘λ
|
| 17 |
+
files = glob.glob("Articles_*.xlsx")
|
| 18 |
+
if not files:
|
| 19 |
+
print("β λΆμν Articles_*.xlsx νμΌμ΄ λ‘컬 λλ ν 리μ μμ΅λλ€.")
|
| 20 |
+
return
|
| 21 |
+
|
| 22 |
+
print(f"π λ°κ²¬λ λ΄μ€ κΈ°μ¬ νμΌ λͺ©λ‘: {files}")
|
| 23 |
+
|
| 24 |
+
# 2. λ°μ΄ν° λ³ν© λ° μ€λ³΅ μ κ±°
|
| 25 |
+
dfs = []
|
| 26 |
+
for f in files:
|
| 27 |
+
try:
|
| 28 |
+
df = pd.read_excel(f)
|
| 29 |
+
dfs.append(df)
|
| 30 |
+
except Exception as e:
|
| 31 |
+
print(f"β οΈ {f} λ‘λ μ€ν¨: {e}")
|
| 32 |
+
|
| 33 |
+
if not dfs:
|
| 34 |
+
print("β μ ν¨ν κΈ°μ¬ λ°μ΄ν°κ° μμ΅λλ€.")
|
| 35 |
+
return
|
| 36 |
+
|
| 37 |
+
df_all = pd.concat(dfs, ignore_index=True)
|
| 38 |
+
df_all = df_all.drop_duplicates(subset=["url"]) # λμΌ κΈ°μ¬ μ€λ³΅ μ κ±°
|
| 39 |
+
print(f"π λ³ν© μλ£λ κ³ μ AI νν
ν¬ κΈ°μ¬ μ΄λ: {len(df_all)}건")
|
| 40 |
+
|
| 41 |
+
# 3. λ μ§ νμ± λ° μ λ ¬ (λ μ§ ν¬λ§· νμ€ν)
|
| 42 |
+
df_all["published_date"] = pd.to_datetime(df_all["published_date"], errors="coerce")
|
| 43 |
+
df_all = df_all.dropna(subset=["published_date"])
|
| 44 |
+
df_all = df_all.sort_values(by="published_date")
|
| 45 |
+
|
| 46 |
+
# μΌμλ§ μΆμΆνμ¬ μ§κ³
|
| 47 |
+
df_all["date_only"] = df_all["published_date"].dt.date
|
| 48 |
+
date_counts = df_all.groupby("date_only").size().reset_index(name="count")
|
| 49 |
+
|
| 50 |
+
# 4. λΆμν ν°λ―Έλ μΆλ ₯
|
| 51 |
+
print("\n" + "=" * 50)
|
| 52 |
+
print("π
[μΌμλ³ AI νν
ν¬ κΈ°μ¬ μμ° νΈλ λ ν]")
|
| 53 |
+
print("=" * 50)
|
| 54 |
+
print(date_counts.to_string(index=False))
|
| 55 |
+
print("=" * 50)
|
| 56 |
+
|
| 57 |
+
# 5. μνμ λΆμ λ° κΆμ₯ μ£ΌκΈ° μΆμ²
|
| 58 |
+
total_days = (date_counts["date_only"].max() - date_counts["date_only"].min()).days + 1
|
| 59 |
+
total_articles = date_counts["count"].sum()
|
| 60 |
+
avg_daily = total_articles / max(total_days, 1)
|
| 61 |
+
|
| 62 |
+
print(f"β±οΈ κ΄μΈ‘ κΈ°κ°: {total_days}μΌ ({date_counts['date_only'].min()} ~ {date_counts['date_only'].max()})")
|
| 63 |
+
print(f"π μΌνκ· AI νν
ν¬ λ΄μ€ μμ°λ: {avg_daily:.2f}건")
|
| 64 |
+
|
| 65 |
+
# μΌνκ· λ³Όλ₯¨μ λ°λ₯Έ μ΅μ ν μλν μ£ΌκΈ° μΆμ² μκ³ λ¦¬μ¦
|
| 66 |
+
if avg_daily >= 10:
|
| 67 |
+
recommendation = "β¨ λ§€μΌ 1ν κ°±μ (ν루 κΈ°μ¬ μμ°λμ΄ 10건 μ΄μμΌλ‘ λ§€μ° λ§μ, μ€μκ° νΈλ λ ν¬μ°©μ μν΄ λ§€μΌ μλ²½ 1μ μλνκ° νμμ μ
λλ€.)"
|
| 68 |
+
elif avg_daily >= 3:
|
| 69 |
+
recommendation = "β¨ 2~3μΌμ 1ν κ°±μ (κΈ°μ¬κ° 2~3μΌ λ¨μλ‘ μ λΉν λͺ¨μμ λ κ·Έλνλ₯Ό λΉλνλ κ²μ΄ API λΉμ© λλΉ μ§μ λ°λ μ κ°μ₯ ν¨μ¨μ μ
λλ€.)"
|
| 70 |
+
else:
|
| 71 |
+
recommendation = "β¨ 5μΌ~1μ£Όμ 1ν κ°±μ (AI νν
ν¬ νμ λλ©μΈ νΉμ±μ μΌμΌ λ°νλμ΄ 3건 λ―Έλ§μΌλ‘ νμνλ―λ‘, 5μΌ κ°κ²©μΌλ‘ λͺ°μμ κ°±μ νλ κ²μ΄ ν©λ¦¬μ μ
λλ€.)"
|
| 72 |
+
|
| 73 |
+
print("-" * 50)
|
| 74 |
+
print(f"π‘ [μ΅μ μ GraphRAG μλν μ£ΌκΈ° μ μ]")
|
| 75 |
+
print(f" {recommendation}")
|
| 76 |
+
print("=" * 50 + "\n")
|
| 77 |
+
|
| 78 |
+
# 6. μ°¨νΈ μκ°ν λ° μ΄λ―Έμ§ νμΌ μ μ₯
|
| 79 |
+
if platform.system() == "Darwin":
|
| 80 |
+
plt.rc("font", family="AppleGothic") # Mac νκΈ ν°νΈ κΉ¨μ§ λ°©μ§
|
| 81 |
+
plt.rcParams["axes.unicode_minus"] = False
|
| 82 |
+
|
| 83 |
+
plt.figure(figsize=(10, 5))
|
| 84 |
+
bars = plt.bar(
|
| 85 |
+
date_counts["date_only"].astype(str),
|
| 86 |
+
date_counts["count"],
|
| 87 |
+
color="royalblue",
|
| 88 |
+
edgecolor="black",
|
| 89 |
+
alpha=0.85,
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
# λ§λ μμ μ«μ νμ
|
| 93 |
+
for bar in bars:
|
| 94 |
+
height = bar.get_height()
|
| 95 |
+
plt.text(
|
| 96 |
+
bar.get_x() + bar.get_width() / 2.0,
|
| 97 |
+
height + 0.1,
|
| 98 |
+
f"{int(height)}건",
|
| 99 |
+
ha="center",
|
| 100 |
+
va="bottom",
|
| 101 |
+
fontsize=10,
|
| 102 |
+
fontweight="bold",
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
plt.title("μΌμλ³ AI νν
ν¬ λ΄μ€ μμ° νΈλ λ λΆμ", fontsize=15, pad=15, fontweight="bold")
|
| 106 |
+
plt.xlabel("κΈ°μ¬ λ°ν μΌμ", fontsize=12)
|
| 107 |
+
plt.ylabel("μμ° κ±΄μ", fontsize=12)
|
| 108 |
+
plt.grid(axis="y", linestyle="--", alpha=0.5)
|
| 109 |
+
plt.xticks(rotation=25)
|
| 110 |
+
plt.tight_layout()
|
| 111 |
+
|
| 112 |
+
# artifacts ν΄λ μλμ λΆμ κ²°κ³Όλ¬Ό μ°¨νΈ μ μ₯
|
| 113 |
+
os.makedirs("artifacts", exist_ok=True)
|
| 114 |
+
img_path = "artifacts/daily_trend_analysis.png"
|
| 115 |
+
plt.savefig(img_path, dpi=200)
|
| 116 |
+
print(f"πΎ μκ°ν λΆμ μ°¨νΈ μ μ₯ μλ£ β‘οΈ [μ λκ²½λ‘]: {os.path.abspath(img_path)}")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
if __name__ == "__main__":
|
| 120 |
+
run_analysis()
|
src/utils/research_notes.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# π μ΅μ μ GraphRAG κ°±μ μ£ΌκΈ° λμΆ λ³΄κ³ μ
|
| 2 |
+
> **Data-Driven Analysis for GraphRAG Synchronization Cycle**
|
| 3 |
+
|
| 4 |
+
λ³Έ λ³΄κ³ μλ μ€μ λ€μ΄λ² λ΄μ€ IT/κ³Όν λ° κ²½μ μΉ΄ν
κ³ λ¦¬μμ νν°λ§λ **κ³ μ AI νν
ν¬ κΈ°μ¬**λ€μ λ μ§λ³ μ μ
λΉλλ₯Ό μ λ λΆμνμ¬, μμ€ν
μ΄μ ν¨μ¨μ±κ³Ό μ΅μ μ 보 νλ μλ(API λΉμ© λλΉ ν¨μ©μ±)λ₯Ό λͺ¨λ λ§μ‘±νλ μ΅μ μ GraphRAG μ΅μ ν μ£ΌκΈ°λ₯Ό μνμ μΌλ‘ λμΆν κ²°κ³Όμ
λλ€.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## 1. 정량 데이터 수집 및 분석 현황
|
| 9 |
+
|
| 10 |
+
* **μμ§λ μλ³Έ λ°μ΄ν°μ
λͺ©λ‘**:
|
| 11 |
+
1. `Articles_20260518_223626.xlsx` (34건)
|
| 12 |
+
2. `Articles_20260519_155940.xlsx` (40건)
|
| 13 |
+
* **κ³ μ κΈ°μ¬ μ΄ν© (μ€λ³΅ URL μ κ±°)**: **74건**
|
| 14 |
+
* **κ΄μΈ‘ κΈ°κ°**: 2μΌ (2026-05-18 ~ 2026-05-19)
|
| 15 |
+
|
| 16 |
+
### π
μΌμλ³ κ³ μ λ΄μ€ μμ°λ μΆμ΄
|
| 17 |
+
| λ°ν μΌμ | μμ° κ±΄μ (κ³ μ κΈ°μ¬) | λΉκ³ |
|
| 18 |
+
| :--- | :---: | :---: |
|
| 19 |
+
| **2026-05-18** | **34건** | νμΌ (μ) |
|
| 20 |
+
| **2026-05-19** | **40건** | νμΌ (ν) |
|
| 21 |
+
| **μ΄ν©** | **74건** | |
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## 2. 수학적 분석 및 갱신 주기 도출
|
| 26 |
+
|
| 27 |
+
### π μΌνκ· λ΄μ€ μμ° μλ (Velocity)
|
| 28 |
+
$$\text{일평균 생산량} = \frac{74\text{건}}{2\text{일}} = 37.00\text{건/일}$$
|
| 29 |
+
|
| 30 |
+
* **λλ©μΈ ν μΈ‘μ **: AI νν
ν¬λΌλ λλ©μΈμ΄ λ§€μ° μ’λ€κ³ μκ°νμ
¨μΌλ, μ€μ λ€μ΄λ² λ΄μ€μ IT/κ³Όν λ° κ²½μ λλ©μΈμμ μμ§λλ λ΄μ€ μ€ **AI, μΈκ³΅μ§λ₯, μμ±ν AI, νν
ν¬ ν€μλ μ€ νλλΌλ ν¬ν¨νλ κΈ°μ¬λ 45.5%**μ μ‘λ°ν©λλ€.
|
| 31 |
+
* **μ¦, κΈ°μ¬μ μ μ
μλκ° λ§€μ° λΉ λ₯΄κ³ μ 보μ μ μ λ κ΅μ²΄ μ£ΌκΈ°κ° λλ¨ν μ¦μ΅λλ€.**
|
| 32 |
+
|
| 33 |
+
### π‘ 3~5μΌ μ£ΌκΈ° vs λ§€μΌ 1μ μ£ΌκΈ°μ ν¨μ¨μ± λΉκ΅
|
| 34 |
+
|
| 35 |
+
| νλͺ© | 3~5μΌ μΌκ΄ κ°±μ | λ§€μΌ μλ²½ 1μ κ°±μ (κΆμ₯) |
|
| 36 |
+
| :--- | :--- | :--- |
|
| 37 |
+
| **λ°μ΄ν° μΆμ λ** | 110 ~ 185건 λμ | **νκ· 35 ~ 40건 λμ ** |
|
| 38 |
+
| **OpenAI API λΆν** | ν λ²μ λλμ LLM ν ν°μ μλͺ¨νμ¬ **API Rate Limit(λΆλΉ μμ² νλ)μ κ±Έλ € λΉλ μ€ν¨ν νλ₯ λμ** | μλμ λ°μ΄ν°(40건 λ¨μ)λ‘ λ§€μΌ λλμ΄ μ²λ¦¬νλ―λ‘ **Rate Limit μνμ΄ μκ³ λΉλκ° μ§κ·Ήν μμ μ μ** |
|
| 39 |
+
| **μ 보μ μ€μΈμ± (Recency)** | μλ‘μ΄ AI κΈ°μ /μλΉμ€ μΆμ μμμ΄ RAGμ λ°μλκΈ°κΉμ§ μ΅λ 5μΌμ **μ 보 μ§μ°(Lag)** λ°μ | λ§€μΌ μλ²½ 1μ κΈ°μ€ **μ λ μ νΈλ λκ° μ¦μ λ°μ**λμ΄ λ©΄μ /μ§μλκΈ° μ©λλ‘μ μ λ’°λ μ΅κ³ μ‘° |
|
| 40 |
+
| **μλ² λΆν** | ν¬λ‘€λ§ λΈλΌμ°μ (Headless Chrome) μ₯μκ° κ΅¬λμΌλ‘ λ©λͺ¨λ¦¬ λμ λ° μλ¬ κ°λ₯μ± μμ | λ§€μΌ 10λΆ λ΄μΈμ μ§§κ³ μμ ν λ°°μΉ νμ€ν¬λ‘ μ’
λ£λμ΄ μμ€ν
μμ μ± μ°μ |
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## 3. μ΅μ’
κΆμ₯ μ¬ν λ° μκ°ν
|
| 45 |
+
|
| 46 |
+
> [!IMPORTANT]
|
| 47 |
+
> **κΆμ₯ κ°±μ μ£ΌκΈ°: λ§€μΌ μλ²½ 1μ (1 AM KST) μλν μ€μΌμ€λ§**
|
| 48 |
+
>
|
| 49 |
+
> νμ λλ©μΈμμλ λΆκ΅¬νκ³ λ§€μΌ 35~40κ° μμ€μ μμ§μ κΈ°μ¬κ° μμ°λκ³ μμ΅λλ€. λ§€μΌ μλ²½ 1μ(νκ΅ νμ€μ)μ ν¬λ‘€λ§ νμ΄νλΌμΈμ λλ € Neo4j DBλ₯Ό λΉλνλ κ²μ΄ **API κ³ΌκΈ λ°©μ§, Rate Limit μ°ν, κ·Έλ¦¬κ³ μ 보 μ μ λ κ·Ήλν μΈ‘λ©΄μμ κ°μ₯ μ΄μμ μΈ κ³¨λ μ¬μ΄ν΄(Golden Cycle)**μ
λλ€.
|
| 50 |
+
|
| 51 |
+
### π λΆμ μκ°ν μ°¨νΈ
|
| 52 |
+
μλ μ°¨νΈλ μ€μ λΆμκΈ°κ° μμ±ν λ μ§λ³ μμ°λ μκ°ν λ°μ΄ν°μ
λλ€.
|
| 53 |
+
|
| 54 |
+

|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## 4. νμ μ‘μ
νλ (Action Plan)
|
| 59 |
+
1. **[μλ£]** `AGENTS.md`μ νμ΄νλΌμΈ κ΅¬μΆ μΌμ μ **"λ§€μΌ μλ²½ 1μ μ΅μ ν νμ΄νλΌμΈ ꡬμΆ"**μΌλ‘ νμ κΈ°λ‘νμ΅λλ€.
|
| 60 |
+
2. **[λκΈ°]** μ΄μ μμ§λ 40건μ μ μμ
λ°μ΄ν°λ₯Ό Neo4j μ§μ κ·Έλνλ‘ μ μ¬νμ¬ RAG νμ§μ μ¦μ ν₯μμν΅λλ€.
|
tests/smoke_test_rag.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
smoke_test_rag.py β GraphRAG 3λ μλλ¦¬μ€ νμ₯ κ²μ¦ μ€ν¬λ¦½νΈ
|
| 3 |
+
=============================================================
|
| 4 |
+
μ§μλκΈ° μμ± μ§μ μ±λ΄μΌλ‘μμ μλΉμ€ λͺ©μ μ κ²μ¦ν©λλ€.
|
| 5 |
+
|
| 6 |
+
μλ리μ€:
|
| 7 |
+
1. νΉμ κΈ°μ
- "μΉ΄μΉ΄μ€μ AI μλΉμ€ νΈλ λλ?"
|
| 8 |
+
2. νΉμ κΈ°μ - "LLM κΈ°μ μ κ°λ°νλ κΈ°μ
λ€μ?"
|
| 9 |
+
3. μ 체 νΈλ λ - "κΈμ΅AI λΆμΌμμ κ°μ₯ μ κ·Ήμ μΈ κΈ°μ
TOP 3μ λν μλΉμ€"
|
| 10 |
+
|
| 11 |
+
μ€ν λ°©λ²:
|
| 12 |
+
python3 tests/smoke_test_rag.py
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
import time
|
| 18 |
+
|
| 19 |
+
import dotenv
|
| 20 |
+
|
| 21 |
+
dotenv.load_dotenv()
|
| 22 |
+
|
| 23 |
+
# ββ 0. κ·Έλν κ΅¬μ± μ¬μ μ κ² (Neo4j λ
Έλ/κ΄κ³ ν΅κ³) βββββββββββββββββββββββββ
|
| 24 |
+
def check_graph_structure():
    """Print node/relationship counts from Neo4j and abort if any count is zero.

    Connection settings are read from environment variables: ``NEO4J_URI`` for
    the address, ``NEO4J_CLIENT_ID`` / ``NEO4J_CLIENT_SECRET`` for credentials,
    falling back to ``NEO4J_USERNAME`` / ``NEO4J_PASSWORD`` and finally to
    local defaults.

    Raises:
        SystemExit: with status 1 when at least one of the counted labels or
            relationship types has no entries.
    """
    import neo4j

    uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
    username = os.getenv("NEO4J_CLIENT_ID") or os.getenv("NEO4J_USERNAME") or "neo4j"
    password = os.getenv("NEO4J_CLIENT_SECRET") or os.getenv("NEO4J_PASSWORD") or "password"
    driver = neo4j.GraphDatabase.driver(uri, auth=(username, password))

    # Display label -> Cypher query returning a single `cnt` column.
    queries = {
        "Article (κΈ°μ¬)": "MATCH (n:Article) RETURN count(n) as cnt",
        "AICompany (κΈ°μ)": "MATCH (n:AICompany) RETURN count(n) as cnt",
        "AITechnology (κΈ°μ )": "MATCH (n:AITechnology) RETURN count(n) as cnt",
        "AIService (μλΉμ€)": "MATCH (n:AIService) RETURN count(n) as cnt",
        "AIField (λΆμΌ)": "MATCH (n:AIField) RETURN count(n) as cnt",
        "Content (μ²ν¬+벑ν°)": "MATCH (n:Content) RETURN count(n) as cnt",
        "MENTIONS κ΄κ³": "MATCH ()-[r:MENTIONS]->() RETURN count(r) as cnt",
        "DEVELOPS κ΄κ³": "MATCH ()-[r:DEVELOPS]->() RETURN count(r) as cnt",
    }

    all_ok = True
    try:
        print("\n" + "=" * 60)
        print("π [μ¬μ μ κ²] Neo4j κ·Έλν κ΅¬μ± νν©")
        print("=" * 60)

        # One session serves every count query instead of reconnecting per label.
        with driver.session() as session:
            for label, cypher in queries.items():
                record = session.run(cypher).single()
                cnt = record["cnt"] if record else 0
                status = "β" if cnt > 0 else "β οΈ λΉμ΄μμ"
                if cnt == 0:
                    all_ok = False
                print(f" {status} {label}: {cnt}κ°")
    finally:
        # Release the connection pool even when a query or the connection fails.
        driver.close()

    print()
    if not all_ok:
        print("β μΌλΆ λΈλ/κ΄κ³κ° λΉμ΄μμ΅λλ€. finGraph.py μ€νμΌλ‘ κ·Έλνλ₯Ό λ¨Όμ μ±μμ£ΌμΈμ.\n")
        sys.exit(1)
    print("β κ·Έλν κ΅¬μ± μ μ β RAG νμ€νΈλ₯Ό μμν©λλ€.\n")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ 1. GraphRAG μλ΅ νμ§ κ²μ¦ βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
def run_scenario(label: str, query: str, expected_keywords: list[str]) -> bool:
    """Run one GraphRAG query, print the answer and a quality checklist.

    Args:
        label: Human-readable scenario name shown in the banner.
        query: Question passed to ``graphrag.search`` as ``query_text``.
        expected_keywords: Substrings looked for in the answer; hits and
            misses are reported but do not affect the verdict.

    Returns:
        True when the stripped answer is longer than 50 characters and it
        contains at least one source indicator; False otherwise.
    """
    from src.retrieval.finRetrieval import graphrag

    print("=" * 60)
    print(f"π μλ리μ€: {label}")
    print(f" μ§λ¬Έ: {query}")
    print("=" * 60)

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    result = graphrag.search(query_text=query)
    elapsed = time.perf_counter() - start

    # Treat a missing result or an empty answer uniformly as "".
    answer = result.answer if result and result.answer else ""

    print(f"\nπ GraphRAG μλ΅ ({elapsed:.1f}μ΄):\n")
    print(answer)

    print("\nπ νμ§ μ²΄ν¬:")
    all_pass = True

    # Check 1: the answer must carry more than 50 characters of content.
    answer_length = len(answer.strip())
    if answer_length > 50:
        print(" β μλ΅ κΈΈμ΄ μΆ©λΆ (50μ μ΄μ)")
    else:
        print(f" β μλ΅μ΄ λ무 μ§§μ ({answer_length}μ)")
        all_pass = False

    # Check 2: expected keywords are informational only.
    found = [kw for kw in expected_keywords if kw in answer]
    missing = [kw for kw in expected_keywords if kw not in answer]
    if found:
        print(f" β ν΅μ¬ ν€μλ ν¬ν¨: {found}")
    if missing:
        print(f" β οΈ λ―Έν¬ν¨ ν€μλ: {missing}")

    # Check 3: the answer must contain at least one source/evidence marker.
    source_indicators = ["κΈ°μ¬", "μΆμ²", "λ΄μ€", "보λ", "λ°λ₯΄λ©΄", "λ°ν", "http"]
    if any(ind in answer for ind in source_indicators):
        print(" β μΆμ²/κ·Όκ±° νκΈ° μμ")
    else:
        print(" β οΈ μΆμ²/κ·Όκ±° νκΈ° μμ (RAG μλ΅μ΄μ§λ§ κ·Όκ±°κ° λΆλͺν)")
        all_pass = False

    overall = "β PASS" if all_pass else "β οΈ PARTIAL (κ°μ μ¬μ§ μμ)"
    print(f"\n β μ΅μ’ νμ : {overall}")
    print()
    return all_pass
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ββ λ©μΈ μ€ν ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 120 |
+
if __name__ == "__main__":
    # Step 0: verify the graph is populated before running any scenario.
    check_graph_structure()

    # Each entry: (summary label, scenario label, question, expected keywords).
    # Entries run in the order listed.
    scenarios = [
        (
            "β  νΉμ κΈ°μ",
            "β  νΉμ κΈ°μ β μ§μλκΈ° μλ£ μ‘°μ¬",
            "μΉ΄μΉ΄μ€κ° κ°λ° μ€μΈ AI μλΉμ€μ κΈ°μ νΈλ λλ₯Ό μλ €μ€. μ§μλκΈ° μμ±μ μ°Έκ³ νκ³ μΆμ΄.",
            ["μΉ΄μΉ΄μ€", "AI", "μλΉμ€"],
        ),
        (
            "β‘ νΉμ κΈ°μ ",
            "β‘ νΉμ κΈ°μ β LLM κΈ°μ 보μ κΈ°μ νμ",
            "LLM(λκ·λͺ¨ μΈμ΄ λͺ¨λΈ) κΈ°μ μ κ°λ°νκ±°λ λμνκ³ μλ κ΅λ΄ κΈμ΅Β·ννν¬ κΈ°μλ€μ μ΄λμΌ?",
            ["LLM", "AI", "κΈ°μ"],
        ),
        (
            "β’ μ 체 νΈλ λ",
            "β’ μ 체 νΈλ λ β κΈμ΅AI λΆμΌ TOP 3 κΈ°μ",
            "μ΅κ·Ό μμ§λ λ΄μ€μμ κΈμ΅AI(AIField) λΆμΌμ κ°μ₯ μ κ·Ήμ μΌλ‘ κΈ°μ μ κ°λ°νκ³ μλ κΈ°μ TOP 3μ κ·Έ κΈ°μλ€μ΄ κ°λ°ν λν μλΉμ€λ₯Ό μλ €μ€.",
            ["1.", "κΈ°μ", "μλΉμ€", "AI"],
        ),
    ]

    results = [
        run_scenario(label=full_label, query=question, expected_keywords=keywords)
        for _, full_label, question, keywords in scenarios
    ]

    # Final summary: one verdict line per scenario, then the pass tally.
    print("=" * 60)
    print("π μ΅μ’ μμ½")
    print("=" * 60)
    for (short_label, *_), passed in zip(scenarios, results):
        print(f" {'β PASS' if passed else 'β οΈ PARTIAL'} | {short_label}")
    print()
    pass_count = sum(results)
    print(f" μ΄ {pass_count}/{len(results)}κ° μλλ¦¬μ€ μμ ν΅κ³Ό")
|