Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- .gitattributes +3 -32
- .streamlit/config.toml +6 -0
- README.md +61 -13
- app.py +15 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/data_level0.bin +3 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/data_level0.bin:Zone.Identifier +0 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/header.bin +3 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/header.bin:Zone.Identifier +0 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle +3 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle:Zone.Identifier +0 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/length.bin +3 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/length.bin:Zone.Identifier +0 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/link_lists.bin +3 -0
- chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/link_lists.bin:Zone.Identifier +0 -0
- chroma/chroma.sqlite3 +3 -0
- chroma/chroma.sqlite3:Zone.Identifier +0 -0
- multimodal_rag_langgraph_gemini_st.py +492 -0
- news_text_scraper.py +315 -0
- packages.txt +1 -0
- requirements.txt +31 -3
.gitattributes
CHANGED
|
@@ -1,35 +1,6 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.db filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
| 5 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[server]
|
| 2 |
+
headless = true
|
| 3 |
+
port = 7860
|
| 4 |
+
|
| 5 |
+
[browser]
|
| 6 |
+
gatherUsageStats = false
|
README.md
CHANGED
|
@@ -1,19 +1,67 @@
|
|
| 1 |
---
|
| 2 |
-
title: Press Ethics
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
- streamlit
|
| 10 |
pinned: false
|
| 11 |
-
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Press Ethics Analyzer
|
| 3 |
+
emoji: 📰
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: "1.51.0"
|
| 8 |
+
app_file: multimodal_rag_langgraph_gemini_st.py
|
|
|
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 📰 뉴스 심의문 분석 시스템
|
| 14 |
|
| 15 |
+
한국신문윤리위원회 심의 기준을 적용한 AI 기반 뉴스 기사 윤리 분석 시스템입니다.
|
| 16 |
|
| 17 |
+
## ๐ ์ฃผ์ ๊ธฐ๋ฅ
|
| 18 |
+
|
| 19 |
+
- **๊ธฐ์ฌ ์๋ ์ถ์ถ**: URL๋ง ์
๋ ฅํ๋ฉด ๊ธฐ์ฌ ๋ด์ฉ ์๋ ์์ง
|
| 20 |
+
- **멀티모달 분석**: 텍스트와 이미지를 함께 분석 (Gemini 2.0 Flash)
|
| 21 |
+
- **์ ์ฌ ์ฌ๋ก ๊ฒ์**: ChromaDB ๊ธฐ๋ฐ ๋ฒกํฐ ๊ฒ์์ผ๋ก ๊ด๋ จ ์ฌ์ ์ฌ๋ก ํ์
|
| 22 |
+
- **์ฌ์๋ฌธ ์๋ ์์ฑ**: ์ ๋ฌธ์ค๋ฆฌ์ค์ฒ์๊ฐ 16๊ฐ ์กฐํญ ๊ธฐ์ค ์๋ ํ๋จ
|
| 23 |
+
- **๋จ๊ณ๋ณ ์งํ ํ์**: 5๋จ๊ณ ๋ถ์ ๊ณผ์ ์ค์๊ฐ ๋ชจ๋ํฐ๋ง
|
| 24 |
+
|
| 25 |
+
## ๐ ์ฌ์ฉ ๋ฐฉ๋ฒ
|
| 26 |
+
|
| 27 |
+
1. **API 키 입력**: 사이드바에서 Gemini API 키 입력 ([발급받기](https://makersuite.google.com/app/apikey))
|
| 28 |
+
2. **URL 입력**: 분석할 뉴스 기사 URL 입력
|
| 29 |
+
3. **분석 시작**: '분석 시작' 버튼 클릭
|
| 30 |
+
4. **결과 확인**: 5단계 진행 과정을 거쳐 최종 심의문 확인
|
| 31 |
+
|
| 32 |
+
## ๐ ๋ถ์ ๋จ๊ณ
|
| 33 |
+
|
| 34 |
+
1. **๊ธฐ์ฌ ์ถ์ถ**: URL์์ ์ ๋ชฉ, ๋ณธ๋ฌธ, ์ด๋ฏธ์ง ์ถ์ถ
|
| 35 |
+
2. **이미지 처리**: Gemini 멀티모달로 이미지 내용 분석
|
| 36 |
+
3. **์ ์ฌ ์ฌ๋ก ๊ฒ์**: ๋ฒกํฐ DB์์ ๊ด๋ จ ์ฌ์ ์ฌ๋ก 5๊ฐ ๊ฒ์
|
| 37 |
+
4. **์ฌ์๋ฌธ ์์ฑ**: LangGraph ์ํฌํ๋ก์ฐ๋ก ์ฌ์๋ฌธ ์ด์ ์์ฑ
|
| 38 |
+
5. **์ต์ข
๊ฒํ **: ์กฐํญ ์ ํ์ฑ ๋ฐ ๊ธฐ์ฌ ๊ด๋ จ์ฑ ๊ฒ์ฆ
|
| 39 |
+
|
| 40 |
+
## ๐ ์ฌ์ ๊ธฐ์ค
|
| 41 |
+
|
| 42 |
+
신문윤리실천요강 16개 조항을 기준으로 기사를 분석합니다:
|
| 43 |
+
|
| 44 |
+
- ์ 1์กฐ: ์ธ๋ก ์ ์์ , ์ฑ
์, ๋
๋ฆฝ
|
| 45 |
+
- ์ 2์กฐ: ์ทจ์ฌ ์ค์น
|
| 46 |
+
- ์ 3์กฐ: ๋ณด๋ ์ค์น
|
| 47 |
+
- ์ 7์กฐ: ๋ฒ์ฃ๋ณด๋์ ์ธ๊ถ์กด์ค
|
| 48 |
+
- ์ 10์กฐ: ํธ์ง ์ง์นจ
|
| 49 |
+
- ์ 12์กฐ: ์ฌ์ํ ๋ณดํธ
|
| 50 |
+
- ๊ทธ ์ธ 10๊ฐ ์กฐํญ
|
| 51 |
+
|
| 52 |
+
## ๐ ๏ธ ๊ธฐ์ ์คํ
|
| 53 |
+
|
| 54 |
+
- **AI ๋ชจ๋ธ**: Google Gemini 2.0 Flash
|
| 55 |
+
- **ํ๋ ์์ํฌ**: LangGraph, Streamlit
|
| 56 |
+
- **๋ฒกํฐ DB**: ChromaDB
|
| 57 |
+
- **์๋ฒ ๋ฉ**: Sentence Transformers (multilingual-e5-large-instruct)
|
| 58 |
+
|
| 59 |
+
## โ ๏ธ ์ฃผ์์ฌํญ
|
| 60 |
+
|
| 61 |
+
- 본인의 Gemini API 키가 필요합니다
|
| 62 |
+
- 분석에는 수 분이 소요될 수 있습니다
|
| 63 |
+
- 이 시스템은 참고용이며, 최종 판단은 전문가의 검토가 필요합니다
|
| 64 |
+
|
| 65 |
+
## ๐ ๋ผ์ด์ ์ค
|
| 66 |
+
|
| 67 |
+
이 프로젝트는 교육 및 연구 목적으로 사용됩니다.
|
app.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Hugging Face Spaces ์ง์
์
|
| 4 |
+
"""
|
| 5 |
+
import subprocess
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
# Streamlit ์ฑ ์คํ
|
| 9 |
+
if __name__ == "__main__":
|
| 10 |
+
subprocess.run([
|
| 11 |
+
sys.executable, "-m", "streamlit", "run",
|
| 12 |
+
"multimodal_rag_langgraph_gemini_st.py",
|
| 13 |
+
"--server.port=7860",
|
| 14 |
+
"--server.address=0.0.0.0"
|
| 15 |
+
])
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03daa563a12b4b05ab94546c01459890854373c93f5974309dd6b1f76051252c
|
| 3 |
+
size 59304000
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/data_level0.bin:Zone.Identifier
ADDED
|
Binary file (25 Bytes). View file
|
|
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34820ba6903ee9392713712f72d463ddfb43aa6c38e4f186fe0b4b97bfa330e7
|
| 3 |
+
size 100
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/header.bin:Zone.Identifier
ADDED
|
Binary file (25 Bytes). View file
|
|
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:daadb7ebf7d186886ffdc10b257cefb7b67e892356823e99c6df4ca12901b1ee
|
| 3 |
+
size 397964
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle:Zone.Identifier
ADDED
|
Binary file (25 Bytes). View file
|
|
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be6484ac6ebc61a1b74d796300c4aa351991b8326255ec0da95f9ef91c9e46b8
|
| 3 |
+
size 56000
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/length.bin:Zone.Identifier
ADDED
|
Binary file (25 Bytes). View file
|
|
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42587e10ca5555b2a136ff8f45b9b1fc8ca165b49f4fd92a6fa4c17722017ed5
|
| 3 |
+
size 120260
|
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/link_lists.bin:Zone.Identifier
ADDED
|
Binary file (25 Bytes). View file
|
|
|
chroma/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dcf796984f09d766f212a37735343b8fd149194ce36924b9ff6a25d9caf7c6ef
|
| 3 |
+
size 174977024
|
chroma/chroma.sqlite3:Zone.Identifier
ADDED
|
Binary file (25 Bytes). View file
|
|
|
multimodal_rag_langgraph_gemini_st.py
ADDED
|
@@ -0,0 +1,492 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Streamlit ๊ธฐ๋ฐ ๋ด์ค ์ฌ์๋ฌธ ๋ถ์ ์์คํ
(Gemini 2.0 Flash ๋ฒ์ )
|
| 4 |
+
"""
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from typing import TypedDict
|
| 9 |
+
from langgraph.graph import StateGraph, END
|
| 10 |
+
import google.generativeai as genai
|
| 11 |
+
import chromadb
|
| 12 |
+
from chromadb.config import Settings
|
| 13 |
+
from sentence_transformers import SentenceTransformer
|
| 14 |
+
from chromadb.utils.embedding_functions import EmbeddingFunction
|
| 15 |
+
from news_text_scraper import extract_article
|
| 16 |
+
import base64
|
| 17 |
+
import requests
|
| 18 |
+
from PIL import Image
|
| 19 |
+
from io import BytesIO
|
| 20 |
+
import time
|
| 21 |
+
|
| 22 |
+
# ========== ํ์ด์ง ์ค์ ==========
|
| 23 |
+
st.set_page_config(
|
| 24 |
+
page_title="๋ด์ค ์ฌ์๋ฌธ ๋ถ์ ์์คํ
",
|
| 25 |
+
page_icon="๐ฐ",
|
| 26 |
+
layout="wide"
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
# ========== ์ฌ์ด๋๋ฐ ์ค์ ==========
|
| 30 |
+
st.sidebar.title("โ๏ธ ์ค์ ")
|
| 31 |
+
st.sidebar.markdown("---")
|
| 32 |
+
|
| 33 |
+
gemini_api_key = st.sidebar.text_input(
|
| 34 |
+
"Gemini API Key",
|
| 35 |
+
type="password",
|
| 36 |
+
help="Google AI Studio์์ ๋ฐ๊ธ๋ฐ์ Gemini API ํค๋ฅผ ์
๋ ฅํ์ธ์."
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
st.sidebar.markdown("---")
|
| 40 |
+
st.sidebar.markdown("""
|
| 41 |
+
### ์ฌ์ฉ ๋ฐฉ๋ฒ
|
| 42 |
+
1. Gemini API Key ์
๋ ฅ
|
| 43 |
+
2. ๋ถ์ํ ๊ธฐ์ฌ URL ์
๋ ฅ
|
| 44 |
+
3. '๋ถ์ ์์' ๋ฒํผ ํด๋ฆญ
|
| 45 |
+
4. ๋จ๊ณ๋ณ ์งํ ์ํฉ ํ์ธ
|
| 46 |
+
5. ์ต์ข
๊ฒฐ๊ณผ ํ์ธ
|
| 47 |
+
|
| 48 |
+
### ์ฃผ์์ฌํญ
|
| 49 |
+
- API ํค๋ ์ธ์
์ข
๋ฃ ์ ์ญ์ ๋ฉ๋๋ค
|
| 50 |
+
- ๋ถ์์๋ ์ ๋ถ์ด ์์๋ ์ ์์ต๋๋ค
|
| 51 |
+
""")
|
| 52 |
+
|
| 53 |
+
# ========== ๋ฉ์ธ ํ๋ฉด ==========
|
| 54 |
+
st.title("๐ฐ ๋ด์ค ์ฌ์๋ฌธ ๋ถ์ ์์คํ
")
|
| 55 |
+
st.markdown("**Gemini 2.0 Flash ๊ธฐ๋ฐ - ํ๊ตญ์ ๋ฌธ์ค๋ฆฌ์์ํ ์ฌ์ ๊ธฐ์ค ์ ์ฉ**")
|
| 56 |
+
st.markdown("---")
|
| 57 |
+
|
| 58 |
+
# ========== ์ค์ ==========
|
| 59 |
+
CHROMA_PATH = "./chroma/"
|
| 60 |
+
COLLECTION_NAME = "press_ethics_e5_072025"
|
| 61 |
+
|
| 62 |
+
# ========== State ์ ์ ==========
|
| 63 |
+
class AnalysisState(TypedDict):
|
| 64 |
+
url: str
|
| 65 |
+
article: dict
|
| 66 |
+
image_desc: str
|
| 67 |
+
similar_cases: str
|
| 68 |
+
decision: str
|
| 69 |
+
review_result: dict
|
| 70 |
+
error: str
|
| 71 |
+
violation_count: int
|
| 72 |
+
|
| 73 |
+
# ========== ์๋ฒ ๋ฉ ํจ์ (์บ์ฑ) ==========
|
| 74 |
+
@st.cache_resource
|
| 75 |
+
def load_embedding_model():
|
| 76 |
+
"""์๋ฒ ๋ฉ ๋ชจ๋ธ ๋ก๋ (์บ์ฑ)"""
|
| 77 |
+
class CustomEmbedding(EmbeddingFunction):
|
| 78 |
+
def __init__(self):
|
| 79 |
+
self.model = SentenceTransformer("intfloat/multilingual-e5-large-instruct", device="cpu")
|
| 80 |
+
|
| 81 |
+
def __call__(self, input):
|
| 82 |
+
return self.model.encode(input).tolist()
|
| 83 |
+
|
| 84 |
+
return CustomEmbedding()
|
| 85 |
+
|
| 86 |
+
@st.cache_resource
|
| 87 |
+
def load_chroma_collection():
|
| 88 |
+
"""ChromaDB ์ปฌ๋ ์
๋ก๋ (์บ์ฑ)"""
|
| 89 |
+
try:
|
| 90 |
+
client = chromadb.PersistentClient(path=CHROMA_PATH, settings=Settings())
|
| 91 |
+
collection = client.get_collection(name=COLLECTION_NAME)
|
| 92 |
+
return collection
|
| 93 |
+
except Exception as e:
|
| 94 |
+
st.error(f"โ ChromaDB ๋ก๋ ์คํจ: {e}")
|
| 95 |
+
return None
|
| 96 |
+
|
| 97 |
+
# ========== ๊ท์ ๋ฐ ํ๋กฌํํธ ==========
|
| 98 |
+
REGULATION = """๋น์ ์ ํ๊ตญ์ ๋ฌธ์ค๋ฆฌ์์ํ ์ฌ์์์์
๋๋ค.
|
| 99 |
+
#์ ๋ฌธ์ค๋ฆฌ์ค์ฒ์๊ฐ:
|
| 100 |
+
์ 1์กฐใ์ธ๋ก ์ ์์ , ์ฑ
์, ๋
๋ฆฝใโ ์ ์น๊ถ๋ ฅ์ผ๋ก๋ถํฐ์ ์์ โก์ฌํยท๊ฒฝ์ ์ธ๋ ฅ์ผ๋ก๋ถํฐ์ ๋
๋ฆฝ โข์ฌํ์ ์ฑ
์ โฃ์ฐจ๋ณ๊ณผ ํธ๊ฒฌ ๊ธ์ง โค์ฌํ์ ์ฝ์ ๋ณดํธ
|
| 101 |
+
์ 2์กฐใ์ทจ์ฌ ์ค์นใโ ์ ๋ถ ์ฌ์นญยท์์ฅ ๊ธ์ง โก์๋ฃ ๋ฌด๋จ ์ด์ฉ ๊ธ์ง โข์ฌ๋ ๋ฐ ์ฌ๊ณ ์ทจ์ฌ โฃ์ ํ ๋ฐ ๋์งํธ ๊ธฐ๊ธฐ ํ์ฉ ์ทจ์ฌ โค๋์ฒญ ๋ฐ ๋น๋ฐ์ดฌ์ ๊ธ์ง โฅ๋ถ๋นํ ๊ธ์ ์ ๊ณต ๊ธ์ง
|
| 102 |
+
์ 3์กฐใ๋ณด๋ ์ค์นใโ ๋ณด๋๊ธฐ์ฌ์ ์ฌ์ค๊ณผ ์๊ฒฌ ๊ตฌ๋ถ โก๊ณต์ ๋ณด๋ โข๋ฐ๋ก ์ ๊ธฐํ โฃ๋ฏธํ์ธ ๋ณด๋ ๋ช
์ ์์น โค๋ณด๋์๋ฃ ๊ฒ์ฆ โฅ์ ์ ๋ณด๋ ๊ธ์ง โฆ์ฌ๋ ๋ณด๋์ ์ ์ค โง์์ด ๋ณด๋์ ์ฃผ์ โจํผ์์ฌ์ค ๋ณด๋ โฉํ์ค์ด ์ฌ์ฉ
|
| 103 |
+
์ 4์กฐใ์ฌ๋ฒ ๋ณด๋ ์ค์นใโ ์ฌํ ๋ถ๋น ์ํฅ ๊ธ์ง โกํ๊ฒฐ๋ฌธ ๋ฑ์ ์ฌ์ ๋ณด๋ ๊ธ์ง
|
| 104 |
+
์ 5์กฐใ์ทจ์ฌ์์ ๋ช
์์ ๋ณดํธใโ ์ทจ์ฌ์ ๋ณดํธ โก์ทจ์ฌ์ ๋ช
์์ ์ต๋ช
์กฐ๊ฑด โข์ 3์ ๋น๋ฐฉ๊ณผ ์ต๋ช
๋ณด๋ ๊ธ์ง โฃ์ทจ์ฌ์๊ณผ์ ๋น๋ณด๋ ์ฝ์
|
| 105 |
+
์ 6์กฐใ๋ณด๋์ ์ ์ํใโ ๋ณด๋์ ์ ์ํ ์ฐ์ฅ ๊ธ์ง โก๋ณด๋์ ์ ์ํ์ ํจ๋ ฅ ์์ค
|
| 106 |
+
์ 7์กฐใ๋ฒ์ฃ๋ณด๋์ ์ธ๊ถ์กด์คใโ ํผ์์ ๋ฐ ํผ๊ณ ์ธ์ ๋ช
์ ์กด์ค โกํผ์์ยทํผ๊ณ ์ธยท์ฐธ๊ณ ์ธ ๋ฑ ์ดฌ์ ์ ์ค โข๋ฒ์ฃ์ ๋ฌด๊ดํ ๊ฐ์กฑ ๋ณดํธ โฃ์ฑ๋ฒ์ฃ ๋ฑ์ 2์ฐจ ํผํด ๋ฐฉ์ง โค๋ฏธ์ฑ๋
ํผ์์ ์ ์ ๋ณดํธ
|
| 107 |
+
์ 8์กฐใ์ ์๋ฌผ์ ์ ์ฌ์ ์ธ์ฉใโ ํต์ ๊ธฐ์ฌ์ ์ถ์ฒ ๋ช
์ โกํ ์ธ๋ก ์ฌ ๋ณด๋ ๋ฑ์ ํ์ ๊ธ์ง โข์ถํ๋ฌผ ๋ฑ์ ํ์ ๊ธ์ง โฃ์ฌ์ง, ์์ ๋ฑ์ ์ ์๊ถ ๋ณดํธ
|
| 108 |
+
์ 9์กฐใํ๋ก ์ ์์นใโ ์ฌ์ค์ ์ ๋ก ์ฑ โกํ๋ก ์ ์์
|
| 109 |
+
์ 10์กฐใํธ์ง ์ง์นจใโ ์ ๋ชฉ์ ์์น โกํธ์ง ๋ณ๊ฒฝ ๊ธ์ง โข๊ธฐ๊ณ ๋ฌธ ๋ณ๊ฒฝ ๊ธ์ง โฃ๊ธฐ์ฌ ์ ์ โค๊ด๋ จ์ฌ์ง ๊ฒ์ฌ โฅ์ฌ์ง ๋ฐ ์์ ์กฐ์ ๊ธ์ง โฆ๊ธฐ์ฌ์ ๊ด๊ณ ์ ๊ตฌ๋ถ โง์ด์ฉ์์ ๊ถ๋ฆฌ ๋ณดํธ โจ๋ถ๋นํ ์ฌ์ ์ก ๊ธ์ง
|
| 110 |
+
์ 11์กฐใ๋ช
์์ ์ ์ฉ์กด์คใโ ๋ช
์ยท์ ์ฉ ํผ์ ๊ธ์ง โก์ฌ์์ ๋ช
์ ์กด์ค
|
| 111 |
+
์ 12์กฐใ์ฌ์ํ ๋ณดํธใโ ์ฌ์ํ ์นจํด ๊ธ์ง โก๊ฐ์ธ์ ๋ณด ๋ฌด๋จ ๊ฒ์ ๋ฑ ๊ธ์ง โข์ฌ์ํ ๋ฑ์ ์ดฌ์ ๋ฐ ๋ณด๋ ๊ธ์ง โฃ๊ณต์ธ์ ์ฌ์ํ ๋ณด๋
|
| 112 |
+
๏ฟฝ๏ฟฝ๏ฟฝ13์กฐใ์ฒญ์๋
๊ณผ ์ด๋ฆฐ์ด ๋ณดํธใโ ์ฒญ์๋
๊ณผ ์ด๋ฆฐ์ด ์ทจ์ฌ ๋ณด๋ โก๋ฒ์ฃ ๋ณด๋์ ์ฒญ์๋
, ์ด๋ฆฐ์ด ๋ณดํธ โข์ ํดํ๊ฒฝ์ผ๋ก๋ถํฐ์ ๋ณดํธ โฃ์ ๊ดดยท๋ฉ์น ๋ณด๋์ ํ ํ์กฐ
|
| 113 |
+
์ 14์กฐใ์ ๋ณด์ ๋ถ๋น์ด์ฉ๊ธ์งใโ ์์ ์ฃผ์ ๋ฑ์ ๊ดํ ๋ณด๋ ์ ํ โก์ฃผ์ยท๋ถ๋์ฐ ๋ฑ์ ๋ถ๋น ๊ฑฐ๋ ๊ธ์ง
|
| 114 |
+
์ 15์กฐใ์ธ๋ก ์ธ์ ํ์ใโ ๊ธํ์์ ๋ฐ ํฅ์, ์ฒญํ ๊ธ์ง โก๋ถ๋นํ ์ง๋จ ์ํฅ๋ ฅ ํ์ฌ ๊ธ์ง โข๊ด๊ณ ยทํ๋งค ๋ฑ ์์
ํ์ ๊ธ์ง
|
| 115 |
+
์ 16์กฐใ๊ณต์ต์ ์ ์ใโ ๊ตญ๊ฐ ์์ ๋ฑ โก๊ณต์ค ์๋
โข๋ฒ์ฃ์ ํญ๋ก โฃ๊ณต์ค์ ์ค๋ ๋ฐฉ์ง"""
|
| 116 |
+
|
| 117 |
+
def parse_regulation_dict():
|
| 118 |
+
"""REGULATION์ ํ์ฑํ์ฌ ์กฐํญ ๋์
๋๋ฆฌ ์์ฑ"""
|
| 119 |
+
articles = {}
|
| 120 |
+
lines = REGULATION.split('\n')
|
| 121 |
+
for line in lines:
|
| 122 |
+
if line.startswith('์ '):
|
| 123 |
+
match = re.match(r'์ (\d+)์กฐใ([^ใ]+)ใ(.+)', line)
|
| 124 |
+
if match:
|
| 125 |
+
num = match.group(1)
|
| 126 |
+
name = match.group(2)
|
| 127 |
+
items_text = match.group(3)
|
| 128 |
+
items = {}
|
| 129 |
+
item_pattern = r'([โ โกโขโฃโคโฅโฆโงโจโฉ])([^โ โกโขโฃโคโฅโฆโงโจโฉ]+)'
|
| 130 |
+
for item_match in re.finditer(item_pattern, items_text):
|
| 131 |
+
item_num = item_match.group(1)
|
| 132 |
+
item_content = item_match.group(2).strip()
|
| 133 |
+
items[item_num] = item_content
|
| 134 |
+
articles[num] = {'name': name, 'items': items}
|
| 135 |
+
return articles
|
| 136 |
+
|
| 137 |
+
REGULATION_DICT = parse_regulation_dict()
|
| 138 |
+
|
| 139 |
+
def correct_article_reference(text):
|
| 140 |
+
"""์ฌ์๋ฌธ์ ์กฐํญ ์ฐธ์กฐ๋ฅผ REGULATION_DICT์ ๋ง๊ฒ ์๋ ์์ """
|
| 141 |
+
pattern = r'์ (\d+)์กฐใ([^ใ]+)ใ([โ โกโขโฃโคโฅโฆโงโจโฉ])(?:ํญ|ํธ)?(?:\([^)]*\))*'
|
| 142 |
+
|
| 143 |
+
def replace_match(match):
|
| 144 |
+
article_num = match.group(1)
|
| 145 |
+
cited_name = match.group(2).strip()
|
| 146 |
+
item_num = match.group(3)
|
| 147 |
+
|
| 148 |
+
if article_num in REGULATION_DICT:
|
| 149 |
+
correct_name = REGULATION_DICT[article_num]['name']
|
| 150 |
+
items = REGULATION_DICT[article_num]['items']
|
| 151 |
+
|
| 152 |
+
if item_num in items:
|
| 153 |
+
item_content = items[item_num]
|
| 154 |
+
return f'์ {article_num}์กฐใ{correct_name}ใ{item_num}({item_content})'
|
| 155 |
+
else:
|
| 156 |
+
return f'์ {article_num}์กฐใ{correct_name}ใ{item_num}'
|
| 157 |
+
return match.group(0)
|
| 158 |
+
|
| 159 |
+
return re.sub(pattern, replace_match, text)
|
| 160 |
+
|
| 161 |
+
INST_PROMPT = """#์ฌ์ ์ง์นจ:
|
| 162 |
+
1. **๋ณด์์ ํ๋จ ์์น**: ์ ๋ฌธ์ค๋ฆฌ์ค์ฒ์๊ฐ์ ์ฒด๊ณ์ ์ผ๋ก ๊ฒํ ํ๋, ๋งค์ฐ ๋ณด์์ ์ผ๋ก ํ๋จ
|
| 163 |
+
2. **๋ช
๋ฐฑํ๊ณ ์ฌ๊ฐํ ์๋ฐ๋ง ์ง์ **: ์์ฌ์ค๋ฝ๊ฑฐ๋ ๊ฒฝ๋ฏธํ๊ฑฐ๋ ๋ถ๋ถ๋ช
ํ ์ฌ์์ ๋ชจ๋ "์๋ฐ ์์"
|
| 164 |
+
3. ์ ์ฌ ์ฌ๋ก๋ฅผ ์ฐธ๊ณ ํ๋, ํด๋น ๊ธฐ์ฌ์ ๊ตฌ์ฒด์ ๋ด์ฉ๊ณผ ๋งฅ๋ฝ์ ์ค์ฌ์ผ๋ก ๋
๋ฆฝ์ ์ผ๋ก ํ๋จ
|
| 165 |
+
4. **ํน๋ณ ์ฃผ์์ฌํญ**:
|
| 166 |
+
- ํน์ ๋จ์ฒด/๊ธฐ์
์ ํ๋์ ์ง๋์น๊ฒ ์นญ์ฐฌํ๊ณ ํ๋ณดํ๋ ๋ด์ฉ โ ์ 1์กฐโก(์ฌํยท๊ฒฝ์ ์ธ๋ ฅ์ผ๋ก๋ถํฐ์ ๋
๋ฆฝ) ๋๋ ์ 10์กฐโฆ(๊ธฐ์ฌ์ ๊ด๊ณ ์ ๊ตฌ๋ถ) ์ ์ฉ ๊ฒํ
|
| 167 |
+
- ๋จ์ํ ๋จ์ฒด ํ๋์ ์๊ฐํ๋ ์์ค์ ์๋ฐ ์๋. ๋ช
๋ฐฑํ ํ๋ณด/๊ด๊ณ ๋ชฉ์ ์ด์ด์ผ ํจ
|
| 168 |
+
|
| 169 |
+
#์์ฑ ํ์ (๋ฐ๋์ ์ ํํ ์ค์):
|
| 170 |
+
|
| 171 |
+
**[์๋ฐ ์์ ์] - ์ ๋ ์์:**
|
| 172 |
+
- ์ค์ง "์๋ฐ ์์" ๊ธ์๋ง ์ถ๋ ฅ. ์ด๋ ํ ์ถ๊ฐ ์ค๋ช
, ์ด์ , ์ฝ๋ฉํธ๋ ์ ๋ ๊ธ์ง
|
| 173 |
+
|
| 174 |
+
**[์๋ฐ ์] - ์ ํํ ์ค์:**
|
| 175 |
+
์๋ 4๋จ๊ณ๋ฅผ ๋ฐ๋์ ์์๋๋ก ๋ฐ๋ฅด๋, "1๋จ๊ณ", "2๋จ๊ณ" ๋ฑ์ ์์ ๋ชฉ ์์ด ์์ฐ์ค๋ฌ์ด ๋ฌธ์ฅ์ผ๋ก ์ฐ๊ฒฐ:
|
| 176 |
+
1๋จ๊ณ) ๊ธฐ์ฌ ์์ฝ 2~3๋ฌธ์ฅ
|
| 177 |
+
- "์ ๊ธฐ์ฌ๋ โโโ์ ๋ํด ๋ณด๋ํ๋ฉด์..." ํ์์ผ๋ก ์์
|
| 178 |
+
2๋จ๊ณ) ๋ฌธ์ ์ ์ง์ 1~2๋ฌธ์ฅ
|
| 179 |
+
- "๊ทธ๋ฌ๋ ์ด ๋ณด๋๋...", "ํ์ง๋ง..." ๋ฑ์ผ๋ก ๋ฌธ์ ์ ๋ช
ํํ ์ง์
|
| 180 |
+
3๋จ๊ณ) ๊ท์ ๊ทผ๊ฑฐ 1~2๋ฌธ์ฅ
|
| 181 |
+
- ์ ๋ฌธ์ค๋ฆฌ์ค์ฒ์๊ฐ์ ๋ฐํ์ผ๋ก ๋ฌธ์ ์ ์๋ฐ ์ ๋น์ฑ์ ์ ์
|
| 182 |
+
4๋จ๊ณ) ๊ฒฐ๋ก ๋ฌธ์ฅ (์ ํํ ์ด ํ์ ์ค์)
|
| 183 |
+
- "๋ฐ๋ผ์ ์ ๋ณด๋๋ ์ ๋ฌธ์ค๋ฆฌ์ค์ฒ์๊ฐ ์ โ์กฐใ์กฐํญ๋ช
ใโํญ(์ธ๋ถ๋ด์ฉ)์ ์๋ฐํ๋ค๊ณ ์ธ์ ํ์ฌ ์ฃผ๋ฌธ๊ณผ ๊ฐ์ด ๊ฒฐ์ ํ๋ค."
|
| 184 |
+
- ์ ์ฒด 6๋ฌธ์ฅ ์ด์
|
| 185 |
+
- ์ ์ฌ ์ฌ๋ก์ ์์ฐ์ค๋ฌ์ด ๋ฌธ์ฅ์ฒด ์ฐธ๊ณ
|
| 186 |
+
- "1)", "2)", "3)" ๋ฑ์ ๋ฒํธ๋ ์์ ๋ชฉ ์ ๋ ์ฌ์ฉ ๊ธ์ง"""
|
| 187 |
+
|
| 188 |
+
# ========== Gemini API ํธ์ถ ํจ์ ==========
|
| 189 |
+
def call_gemini(api_key: str, prompt: str, image_data: str = None, temperature: float = 0.0) -> str:
|
| 190 |
+
"""Gemini API ํธ์ถ"""
|
| 191 |
+
genai.configure(api_key=api_key)
|
| 192 |
+
model = genai.GenerativeModel('gemini-2.0-flash-exp')
|
| 193 |
+
|
| 194 |
+
generation_config = genai.GenerationConfig(
|
| 195 |
+
temperature=temperature,
|
| 196 |
+
max_output_tokens=8192,
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
if image_data:
|
| 200 |
+
image_part = {
|
| 201 |
+
"mime_type": "image/jpeg",
|
| 202 |
+
"data": base64.b64decode(image_data)
|
| 203 |
+
}
|
| 204 |
+
response = model.generate_content(
|
| 205 |
+
[prompt, image_part],
|
| 206 |
+
generation_config=generation_config
|
| 207 |
+
)
|
| 208 |
+
else:
|
| 209 |
+
response = model.generate_content(
|
| 210 |
+
prompt,
|
| 211 |
+
generation_config=generation_config
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
return response.text
|
| 215 |
+
|
| 216 |
+
# ========== ๋ถ์ ํจ์ ==========
|
| 217 |
+
def analyze_article_streamlit(url: str, api_key: str, progress_container, status_container):
|
| 218 |
+
"""Streamlit์ฉ ๊ธฐ์ฌ ๋ถ์ ํจ์"""
|
| 219 |
+
|
| 220 |
+
# ์งํ ์ํฉ ํ์
|
| 221 |
+
progress_bar = progress_container.progress(0)
|
| 222 |
+
|
| 223 |
+
# 1. ๊ธฐ์ฌ ์ถ์ถ
|
| 224 |
+
status_container.info("๐ 1๋จ๊ณ: ๊ธฐ์ฌ ์ถ์ถ ์ค...")
|
| 225 |
+
progress_bar.progress(10)
|
| 226 |
+
|
| 227 |
+
try:
|
| 228 |
+
article = extract_article(url)
|
| 229 |
+
if not article or not article.get('text'):
|
| 230 |
+
status_container.error("โ ๊ธฐ์ฌ ์ถ์ถ ์คํจ: ์ ํจํ ๊ธฐ์ฌ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 231 |
+
return None
|
| 232 |
+
status_container.success(f"โ
๊ธฐ์ฌ ์ถ์ถ ์๋ฃ: {article.get('title', '')[:50]}...")
|
| 233 |
+
except Exception as e:
|
| 234 |
+
status_container.error(f"โ ๊ธฐ์ฌ ์ถ์ถ ์ค๋ฅ: {e}")
|
| 235 |
+
return None
|
| 236 |
+
|
| 237 |
+
progress_bar.progress(20)
|
| 238 |
+
|
| 239 |
+
# 2. ์ด๋ฏธ์ง ์ฒ๋ฆฌ
|
| 240 |
+
status_container.info("๐ผ๏ธ 2๋จ๊ณ: ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์ค...")
|
| 241 |
+
image_desc = None
|
| 242 |
+
img_url = article.get('image_url')
|
| 243 |
+
|
| 244 |
+
if img_url:
|
| 245 |
+
try:
|
| 246 |
+
resp = requests.get(img_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
|
| 247 |
+
img = Image.open(BytesIO(resp.content))
|
| 248 |
+
if img.mode == 'RGBA':
|
| 249 |
+
bg = Image.new('RGB', img.size, (255, 255, 255))
|
| 250 |
+
bg.paste(img, mask=img.split()[-1])
|
| 251 |
+
img = bg
|
| 252 |
+
elif img.mode != 'RGB':
|
| 253 |
+
img = img.convert('RGB')
|
| 254 |
+
|
| 255 |
+
buffer = BytesIO()
|
| 256 |
+
img.save(buffer, format='JPEG', quality=85)
|
| 257 |
+
b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
| 258 |
+
|
| 259 |
+
image_desc = call_gemini(api_key, "์ด ์ด๋ฏธ์ง๋ฅผ ํ๊ตญ์ด๋ก ์์ธํ ์ค๋ช
ํด์ฃผ์ธ์.", image_data=b64, temperature=0.3)
|
| 260 |
+
status_container.success("โ
์ด๋ฏธ์ง ์ค๋ช
์์ฑ ์๋ฃ")
|
| 261 |
+
except Exception as e:
|
| 262 |
+
status_container.warning(f"โ ๏ธ ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์คํจ: {e}")
|
| 263 |
+
else:
|
| 264 |
+
status_container.info("โน๏ธ ์ด๋ฏธ์ง ์์")
|
| 265 |
+
|
| 266 |
+
progress_bar.progress(40)
|
| 267 |
+
|
| 268 |
+
# 3. ์ ์ฌ ์ฌ๋ก ๊ฒ์
|
| 269 |
+
status_container.info("๐ 3๋จ๊ณ: ์ ์ฌ ์ฌ๋ก ๊ฒ์ ์ค...")
|
| 270 |
+
similar_cases = ""
|
| 271 |
+
violation_count = 0
|
| 272 |
+
no_violation_count = 0
|
| 273 |
+
|
| 274 |
+
try:
|
| 275 |
+
ef = load_embedding_model()
|
| 276 |
+
collection = load_chroma_collection()
|
| 277 |
+
|
| 278 |
+
if collection:
|
| 279 |
+
text = f"{article.get('title', '')} {article.get('text', '')[:2000]}"
|
| 280 |
+
query_emb = ef([text])
|
| 281 |
+
results = collection.query(query_embeddings=query_emb, n_results=5)
|
| 282 |
+
cases = []
|
| 283 |
+
for i in range(len(results["documents"][0])):
|
| 284 |
+
reason = results['metadatas'][0][i]['reason']
|
| 285 |
+
cases.append(f"{i+1}. {reason}")
|
| 286 |
+
|
| 287 |
+
# ์๋ฐ ๊ฐ์ ์นด์ดํ
|
| 288 |
+
if '์๋ฐ' in reason and '์๋ฐ ์์' not in reason and '์๋ฐ์์' not in reason:
|
| 289 |
+
violation_count += 1
|
| 290 |
+
elif '์๋ฐ ์์' in reason or '์๋ฐ์์' in reason:
|
| 291 |
+
no_violation_count += 1
|
| 292 |
+
|
| 293 |
+
similar_cases = "\n".join(cases)
|
| 294 |
+
status_container.success(f"โ
์ ์ฌ ์ฌ๋ก {len(cases)}๊ฐ ๊ฒ์ ์๋ฃ (์๋ฐ {violation_count}/5, ์๋ฐ์์ {no_violation_count}/5)")
|
| 295 |
+
else:
|
| 296 |
+
status_container.warning("โ ๏ธ ์ ์ฌ ์ฌ๋ก ๊ฒ์ ์คํจ: ChromaDB ๋ก๋ ์ค๋ฅ")
|
| 297 |
+
except Exception as e:
|
| 298 |
+
status_container.warning(f"โ ๏ธ ์ ์ฌ ์ฌ๋ก ๊ฒ์ ์คํจ: {e}")
|
| 299 |
+
|
| 300 |
+
progress_bar.progress(60)
|
| 301 |
+
|
| 302 |
+
# 4. ์ฌ์๋ฌธ ์์ฑ
|
| 303 |
+
status_container.info("๐ 4๋จ๊ณ: ์ฌ์๋ฌธ ์์ฑ ์ค...")
|
| 304 |
+
|
| 305 |
+
try:
|
| 306 |
+
prompt = f"{REGULATION}\n\n{INST_PROMPT}\n\n#๊ธฐ์ฌ:\n{article.get('title', '')} {article.get('text', '')[:2000]}"
|
| 307 |
+
if image_desc:
|
| 308 |
+
prompt += f"\n\n#์ด๋ฏธ์ง:\n{image_desc}"
|
| 309 |
+
if similar_cases:
|
| 310 |
+
prompt += f"\n\n#์ ์ฌ์ฌ๋ก:\n{similar_cases}"
|
| 311 |
+
|
| 312 |
+
if no_violation_count >= 4:
|
| 313 |
+
prompt += f"\n\n**์ค์**: ์ ์ฌ ์ฌ๋ก 5๊ฐ ์ค {no_violation_count}๊ฐ๊ฐ '์๋ฐ ์์'์
๋๋ค. 4๊ฐ ์ด์์ด๋ฏ๋ก ์ด ๊ธฐ์ฌ๋ '์๋ฐ ์์'์ ๊ฐ๋ ฅํ๊ฒ ๊ณ ๋ คํ์ญ์์ค."
|
| 314 |
+
|
| 315 |
+
decision = call_gemini(api_key, prompt, temperature=0.0)
|
| 316 |
+
status_container.success("โ
์ฌ์๋ฌธ ์์ฑ ์๋ฃ")
|
| 317 |
+
except Exception as e:
|
| 318 |
+
status_container.error(f"โ ์ฌ์๋ฌธ ์์ฑ ์คํจ: {e}")
|
| 319 |
+
return None
|
| 320 |
+
|
| 321 |
+
progress_bar.progress(80)
|
| 322 |
+
|
| 323 |
+
# 5. ์ต์ข
๊ฒํ
|
| 324 |
+
status_container.info("๐ 5๋จ๊ณ: ์ต์ข
๊ฒํ ์ค...")
|
| 325 |
+
|
| 326 |
+
if "์๋ฐ ์์" in decision or "์๋ฐ์์" in decision:
|
| 327 |
+
final_decision = "์๋ฐ ์์"
|
| 328 |
+
status_container.success("โ
๊ฒํ ์๋ฃ: ์๋ฐ ์์")
|
| 329 |
+
else:
|
| 330 |
+
try:
|
| 331 |
+
review_prompt = f"""๋น์ ์ ์ ๋ฌธ์ค๋ฆฌ์์ํ ๊ฒํ ๋ด๋น์์
๋๋ค. ์์ฑ๋ ์ฌ์๋ฌธ์ ๊ฒํ ํ๊ณ ์์ ํ์ธ์.
|
| 332 |
+
|
| 333 |
+
#๋ถ์ ๋์ ๊ธฐ์ฌ:
|
| 334 |
+
์ ๋ชฉ: {article.get('title', '')}
|
| 335 |
+
๋ณธ๋ฌธ: {article.get('text', '')[:2000]}
|
| 336 |
+
|
| 337 |
+
#์์ฑ๋ ์ฌ์๋ฌธ:
|
| 338 |
+
{decision}
|
| 339 |
+
|
| 340 |
+
#์ ๋ฌธ์ค๋ฆฌ์ค์ฒ์๊ฐ:
|
| 341 |
+
{REGULATION}
|
| 342 |
+
|
| 343 |
+
#๊ฒํ ์๋ฌด (๋ฐ๋์ ์ค์):
|
| 344 |
+
1. **์กฐํญ ์ ํ์ฑ**: ์ธ์ฉ๋ ์กฐํญ์ด ์ ๋ฌธ์ค๋ฆฌ์ค์ฒ์๊ฐ์ ์ ํํ ์กด์ฌํ๋์ง ํ์ธ(์กฐํญ ๋ฒํธ, ์กฐํญ๋ช
๋์กฐ)ํ๊ณ ํ๋ฆฐ ๋ถ๋ถ ์์
|
| 345 |
+
2. **๊ธฐ์ฌ ๊ด๋ จ์ฑ**: ์ฌ์๋ฌธ์ด ์ค์ ๊ธฐ์ฌ ๋ด์ฉ๊ณผ ์ผ์นํ๋์ง ํ์ธ(ํ๊ฐ ๋ด์ฉ ์ญ์ )ํ๊ณ ํ์์ ์์
|
| 346 |
+
3. **ํ์ ๊ฒ์ฆ ๋ฐ ์์ **:
|
| 347 |
+
- "1)", "2)", "3)" ๋ฑ์ ๋ฒํธ๋ ์์ ๋ชฉ์ด ์์ผ๋ฉด ๋ชจ๋ ์ญ์ ํ๊ณ ์์ฐ์ค๋ฌ์ด ๋ฌธ์ฅ์ฒด๋ก ์์
|
| 348 |
+
- ๋ฐ๋์: ๊ธฐ์ฌ ์์ฝ(2~3๋ฌธ์ฅ) โ ๋ฌธ์ ์ (1~2๋ฌธ์ฅ) โ ๊ทผ๊ฑฐ(1~2๋ฌธ์ฅ) โ ๊ฒฐ๋ก ("๋ฐ๋ผ์ ์ ๋ณด๋๋...") ์์ ์ค์
|
| 349 |
+
4. **๊ฒํ ์๊ฒฌ ์์ ์ ๊ฑฐ**: "์ฌ์๋ฌธ์์ ์ธ๊ธ๋...", "ํ์ธ๋์ง ์์ต๋๋ค", "๊ฒํ ๊ฒฐ๊ณผ..." ๋ฑ์ ๊ฒํ ์๊ฒฌ์ ์ ๋ ํฌํจํ์ง ๋ง ๊ฒ
|
| 350 |
+
- ๊ฒํ ์์ ๋ฉํ์ ์ฝ๋ฉํธ๋ ๋ชจ๋ ์ญ์
|
| 351 |
+
- ์ค์ง ์ฌ์๋ฌธ ๋ณธ๋ฌธ๋ง ์ถ๋ ฅ
|
| 352 |
+
|
| 353 |
+
์์ ๋ ์ต์ข
์ฌ์๋ฌธ๋ง ์ถ๋ ฅํ์์ค (๊ฒํ ์๊ฒฌ ์ ๋ ํฌํจ ๊ธ์ง):"""
|
| 354 |
+
|
| 355 |
+
final_decision = call_gemini(api_key, review_prompt, temperature=0.0)
|
| 356 |
+
final_decision = correct_article_reference(final_decision.strip())
|
| 357 |
+
status_container.success("โ
๊ฒํ ์๋ฃ: ์กฐํญ ์ ํ์ฑ ๋ฐ ๊ธฐ์ฌ ๊ด๋ จ์ฑ ๊ฒ์ฆ ์๋ฃ")
|
| 358 |
+
except Exception as e:
|
| 359 |
+
status_container.warning(f"โ ๏ธ ๊ฒํ ์คํจ: {e}")
|
| 360 |
+
final_decision = decision
|
| 361 |
+
|
| 362 |
+
progress_bar.progress(100)
|
| 363 |
+
status_container.success("๐ ๋ถ์ ์๋ฃ!")
|
| 364 |
+
|
| 365 |
+
# ๊ฒฐ๊ณผ ๋ฐํ
|
| 366 |
+
return {
|
| 367 |
+
'article': article,
|
| 368 |
+
'image_desc': image_desc,
|
| 369 |
+
'similar_cases': similar_cases,
|
| 370 |
+
'violation_count': violation_count,
|
| 371 |
+
'no_violation_count': no_violation_count,
|
| 372 |
+
'final_decision': final_decision
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
# ========== ๋ฉ์ธ UI ==========
|
| 376 |
+
url_input = st.text_input(
|
| 377 |
+
"๐ ๊ธฐ์ฌ URL ์
๋ ฅ",
|
| 378 |
+
placeholder="https://news.example.com/article/12345",
|
| 379 |
+
help="๋ถ์ํ ๋ด์ค ๊ธฐ์ฌ์ URL์ ์
๋ ฅํ์ธ์"
|
| 380 |
+
)
|
| 381 |
+
|
| 382 |
+
col1, col2, col3 = st.columns([1, 1, 4])
|
| 383 |
+
with col1:
|
| 384 |
+
analyze_button = st.button("๐ ๋ถ์ ์์", type="primary", use_container_width=True)
|
| 385 |
+
with col2:
|
| 386 |
+
clear_button = st.button("๐ ์ด๊ธฐํ", use_container_width=True)
|
| 387 |
+
|
| 388 |
+
if clear_button:
|
| 389 |
+
st.rerun()
|
| 390 |
+
|
| 391 |
+
st.markdown("---")
|
| 392 |
+
|
| 393 |
+
# ========== ๋ถ์ ์คํ ==========
|
| 394 |
+
if analyze_button:
|
| 395 |
+
if not gemini_api_key:
|
| 396 |
+
st.error("โ Gemini API Key๋ฅผ ์
๋ ฅํด์ฃผ์ธ์!")
|
| 397 |
+
elif not url_input:
|
| 398 |
+
st.error("โ ๊ธฐ์ฌ URL์ ์
๋ ฅํด์ฃผ์ธ์!")
|
| 399 |
+
else:
|
| 400 |
+
# ์งํ ์ํฉ ์ปจํ
์ด๋
|
| 401 |
+
progress_container = st.container()
|
| 402 |
+
status_container = st.container()
|
| 403 |
+
|
| 404 |
+
# ๋ถ์ ์คํ
|
| 405 |
+
result = analyze_article_streamlit(url_input, gemini_api_key, progress_container, status_container)
|
| 406 |
+
|
| 407 |
+
if result:
|
| 408 |
+
st.markdown("---")
|
| 409 |
+
st.header("๐ ๋ถ์ ๊ฒฐ๊ณผ")
|
| 410 |
+
|
| 411 |
+
# ๊ฒฐ๊ณผ ํ์
|
| 412 |
+
col1, col2 = st.columns([2, 1])
|
| 413 |
+
|
| 414 |
+
with col1:
|
| 415 |
+
st.subheader("๐ฐ ๊ธฐ์ฌ ์ ๋ณด")
|
| 416 |
+
article = result['article']
|
| 417 |
+
|
| 418 |
+
st.write(f"**์ ๋ชฉ:** {article.get('title', 'N/A')}")
|
| 419 |
+
st.write(f"**์ธ๋ก ์ฌ:** {article.get('media', 'N/A')}")
|
| 420 |
+
st.write(f"**๋ ์ง:** {article.get('date', 'N/A')}")
|
| 421 |
+
st.write(f"**URL:** {article.get('url', url_input)}")
|
| 422 |
+
|
| 423 |
+
if article.get('image_url'):
|
| 424 |
+
st.write(f"**์ด๋ฏธ์ง URL:** {article.get('image_url', 'N/A')}")
|
| 425 |
+
try:
|
| 426 |
+
st.image(article['image_url'], caption="๊ธฐ์ฌ ์ด๋ฏธ์ง", use_container_width=True)
|
| 427 |
+
except:
|
| 428 |
+
st.warning("์ด๋ฏธ์ง๋ฅผ ๋ถ๋ฌ์ฌ ์ ์์ต๋๋ค.")
|
| 429 |
+
|
| 430 |
+
with st.expander("๐ ๊ธฐ์ฌ ๋ณธ๋ฌธ", expanded=False):
|
| 431 |
+
st.write(article.get('text', 'N/A')[:1000] + "..." if len(article.get('text', '')) > 1000 else article.get('text', 'N/A'))
|
| 432 |
+
|
| 433 |
+
with col2:
|
| 434 |
+
st.subheader("๐ ๋ถ์ ์ ๋ณด")
|
| 435 |
+
|
| 436 |
+
# ๋ถ์ ๊ฒฐ๊ณผ
|
| 437 |
+
if result['final_decision'].strip() == "์๋ฐ ์์":
|
| 438 |
+
st.success("โ
**๋ถ์ ๊ฒฐ๊ณผ:** ์๋ฐ ์์")
|
| 439 |
+
else:
|
| 440 |
+
st.error("โ ๏ธ **๋ถ์ ๊ฒฐ๊ณผ:** ์๋ฐ")
|
| 441 |
+
|
| 442 |
+
# ์ ์ฌ ์ฌ๋ก ํต๊ณ
|
| 443 |
+
st.metric("์๋ฐ ์ฌ๋ก", f"{result['violation_count']}/5")
|
| 444 |
+
st.metric("์๋ฐ ์์ ์ฌ๋ก", f"{result['no_violation_count']}/5")
|
| 445 |
+
|
| 446 |
+
# ์ฌ์๋ฌธ
|
| 447 |
+
st.markdown("---")
|
| 448 |
+
st.subheader("โ๏ธ ์ต์ข
์ฌ์๋ฌธ")
|
| 449 |
+
st.info(result['final_decision'])
|
| 450 |
+
|
| 451 |
+
# ์ ์ฌ ์ฌ๋ก
|
| 452 |
+
if result['similar_cases']:
|
| 453 |
+
with st.expander("๐ ์ ์ฌ ์ฌ๋ก (5๊ฐ)", expanded=False):
|
| 454 |
+
st.text(result['similar_cases'])
|
| 455 |
+
|
| 456 |
+
# ์ด๋ฏธ์ง ์ค๋ช
|
| 457 |
+
if result['image_desc']:
|
| 458 |
+
with st.expander("๐ผ๏ธ ์ด๋ฏธ์ง ์ค๋ช
", expanded=False):
|
| 459 |
+
st.write(result['image_desc'])
|
| 460 |
+
|
| 461 |
+
# ๋ค์ด๋ก๋ ๋ฒํผ
|
| 462 |
+
st.markdown("---")
|
| 463 |
+
result_text = f"""
|
| 464 |
+
# ๋ด์ค ์ฌ์๋ฌธ ๋ถ์ ๊ฒฐ๊ณผ
|
| 465 |
+
|
| 466 |
+
## ๊ธฐ์ฌ ์ ๋ณด
|
| 467 |
+
- **์ ๋ชฉ:** {article.get('title', 'N/A')}
|
| 468 |
+
- **์ธ๋ก ์ฌ:** {article.get('media', 'N/A')}
|
| 469 |
+
- **๋ ์ง:** {article.get('date', 'N/A')}
|
| 470 |
+
- **URL:** {article.get('url', url_input)}
|
| 471 |
+
- **์ด๋ฏธ์ง URL:** {article.get('image_url', 'N/A')}
|
| 472 |
+
|
| 473 |
+
## ๋ถ์ ๊ฒฐ๊ณผ
|
| 474 |
+
- **๊ฒฐ๊ณผ:** {"์๋ฐ ์์" if result['final_decision'].strip() == "์๋ฐ ์์" else "์๋ฐ"}
|
| 475 |
+
- **์ ์ฌ ์ฌ๋ก ์๋ฐ ์:** {result['violation_count']}/5
|
| 476 |
+
|
| 477 |
+
## ์ต์ข
์ฌ์๋ฌธ
|
| 478 |
+
{result['final_decision']}
|
| 479 |
+
|
| 480 |
+
## ์ ์ฌ ์ฌ๋ก
|
| 481 |
+
{result['similar_cases']}
|
| 482 |
+
"""
|
| 483 |
+
|
| 484 |
+
st.download_button(
|
| 485 |
+
label="๐พ ๊ฒฐ๊ณผ ๋ค์ด๋ก๋ (TXT)",
|
| 486 |
+
data=result_text,
|
| 487 |
+
file_name=f"์ฌ์๋ฌธ_๋ถ์๊ฒฐ๊ณผ_{time.strftime('%Y%m%d_%H%M%S')}.txt",
|
| 488 |
+
mime="text/plain"
|
| 489 |
+
)
|
| 490 |
+
|
| 491 |
+
else:
|
| 492 |
+
st.info("๐ ๊ธฐ์ฌ URL์ ์
๋ ฅํ๊ณ '๋ถ์ ์์' ๋ฒํผ์ ํด๋ฆญํ์ธ์.")
|
news_text_scraper.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
๋ด์ค ๊ธฐ์ฌ ์คํฌ๋ํ ๋๊ตฌ (์ต์ ํ ๋ฒ์ )
|
| 3 |
+
|
| 4 |
+
์ฌ์ฉ๋ฒ:
|
| 5 |
+
article = extract_article(url)
|
| 6 |
+
|
| 7 |
+
๋ฐํ ํ์ (JSON):
|
| 8 |
+
{
|
| 9 |
+
'title': '๊ธฐ์ฌ ์ ๋ชฉ',
|
| 10 |
+
'text': '๊ธฐ์ฌ ๋ณธ๋ฌธ ํ
์คํธ',
|
| 11 |
+
'image_url': '๋ํ ์ด๋ฏธ์ง URL'
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
์์กด์ฑ: pip3 install trafilatura newspaper3k playwright beautifulsoup4 requests fake-useragent extruct
|
| 15 |
+
Playwright ์ด๊ธฐ ์ค์น: playwright install chromium
|
| 16 |
+
|
| 17 |
+
์ฑ๋ฅ ์ต์ ์์:
|
| 18 |
+
1. Trafilatura (๊ฐ์ฅ ๋น ๋ฅด๊ณ ์ ํ, ์ ์ ์ฝํ
์ธ )
|
| 19 |
+
2. Newspaper3k (๋น ๋ฅด๊ณ ํ๊ตญ์ด ์ง์ ์ฐ์)
|
| 20 |
+
3. Playwright + Trafilatura (JavaScript ๋ ๋๋ง ํ์์)
|
| 21 |
+
4. Playwright + Newspaper3k (๋์ฒด ๋ฐฉ๋ฒ)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import json
|
| 25 |
+
import time
|
| 26 |
+
from typing import Optional, Dict
|
| 27 |
+
from urllib.parse import urljoin
|
| 28 |
+
|
| 29 |
+
import requests
|
| 30 |
+
import trafilatura
|
| 31 |
+
from bs4 import BeautifulSoup
|
| 32 |
+
from newspaper import Article
|
| 33 |
+
|
| 34 |
+
# Pick the User-Agent sent with every request.  fake_useragent is an optional
# dependency: when it is installed a random real-world browser string is used,
# otherwise (or when it fails for any reason) a fixed desktop-Chrome string.
try:
    from fake_useragent import UserAgent
    ua = UserAgent()
    USER_AGENT = ua.random
except Exception:
    # Deliberately broad: besides ImportError, fake_useragent can fail at
    # runtime while loading its browser data, and a cosmetic header must never
    # prevent this module from being imported.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
|
| 43 |
+
PLAYWRIGHT_AVAILABLE = True
|
| 44 |
+
except ImportError:
|
| 45 |
+
PLAYWRIGHT_AVAILABLE = False
|
| 46 |
+
print("โ ๏ธ Playwright ๋ฏธ์ค์น - JavaScript ๋ ๋๋ง ๊ธฐ๋ฅ ๋นํ์ฑํ")
|
| 47 |
+
|
| 48 |
+
try:
|
| 49 |
+
import extruct
|
| 50 |
+
EXTRUCT_AVAILABLE = True
|
| 51 |
+
except ImportError:
|
| 52 |
+
EXTRUCT_AVAILABLE = False
|
| 53 |
+
|
| 54 |
+
# Default HTTP request headers shared by every fetch in this module.  They
# imitate a regular browser request and prefer Korean content.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
|
| 64 |
+
|
| 65 |
+
def fetch_with_headers(url: str) -> str:
    """Download *url* using the module-level HEADERS and return the body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: For 4xx/5xx responses (via ``raise_for_status``).
        requests.RequestException: For connection problems or when the
            30-second timeout expires.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # When the server declares no charset, requests assumes ISO-8859-1 for
    # text/* responses, which garbles UTF-8 / EUC-KR pages.  Fall back to the
    # encoding detected from the body instead.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text
|
| 70 |
+
|
| 71 |
+
def _jsonld_image_url(value):
    """Return the first non-empty URL string inside a JSON-LD ``image`` value.

    Handles a plain string, a dict carrying a ``url`` key, or a list of
    either.  Returns None when no URL string is present.
    """
    if isinstance(value, str):
        return value or None
    if isinstance(value, dict):
        return _jsonld_image_url(value.get('url'))
    if isinstance(value, list):
        for entry in value:
            found = _jsonld_image_url(entry)
            if found:
                return found
    return None


def _looks_like_decoration(src):
    """Return True when an image URL looks like a logo, icon, avatar or advert."""
    lowered = src.lower()
    if any(hint in lowered for hint in ('logo', 'icon', 'avatar', 'profile', 'banner')):
        return True
    # "ad" must be matched as a word of its own; a plain substring test would
    # also reject ordinary paths such as "upload", "download" or "thread".
    words = ''.join(ch if ch.isalpha() else ' ' for ch in lowered).split()
    return any(word in ('ad', 'ads') or word.startswith('advert') for word in words)


def extract_images_from_html(html: str, base_url: str = "") -> Optional[str]:
    """Return the URL of the most likely lead image in *html*, or None.

    Strategies are tried in order and the first hit wins:

    1. the ``og:image`` meta tag
    2. the ``twitter:image`` meta tag
    3. the ``image`` property of a JSON-LD item (only when extruct is installed)
    4. the first ``<img>`` with a non-empty ``src`` inside an ``article``,
       ``.article`` or ``#article`` container
    5. the first other ``<img>`` whose URL does not look like a logo/icon/ad
       and whose ``width`` or ``height`` attribute is at least 200 (an
       unparsable size such as ``"100%"`` is accepted; a missing size is not)

    Args:
        html: Markup of the page.
        base_url: URL of the page, used to resolve relative image URLs.  With
            the default ``""`` URLs are returned as found.

    Returns:
        The image URL, or None when no strategy finds one.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # 1. og:image meta tag
    og_image = soup.find('meta', property='og:image')
    if og_image and og_image.get('content'):
        return urljoin(base_url, og_image.get('content'))

    # 2. twitter:image meta tag
    tw_image = soup.find('meta', attrs={'name': 'twitter:image'})
    if tw_image and tw_image.get('content'):
        return urljoin(base_url, tw_image.get('content'))

    # 3. JSON-LD structured data parsed by extruct
    if EXTRUCT_AVAILABLE:
        try:
            metadata = extruct.extract(html, base_url=base_url)
        except Exception:
            # Structured data is a best-effort source: malformed markup must
            # not stop the remaining strategies from running.
            metadata = {}
        for item in metadata.get('json-ld', []):
            if isinstance(item, dict):
                found = _jsonld_image_url(item.get('image'))
                if found:
                    return urljoin(base_url, found)

    # 4. first usable image inside an article container
    for img in soup.select('article img[src], .article img[src], #article img[src]'):
        src = img.get('src')
        if src:
            return urljoin(base_url, src)

    # 5. any remaining image that is neither decorative nor too small
    for img in soup.find_all('img', src=True):
        src = img.get('src')
        if not src or _looks_like_decoration(src):
            continue
        width = img.get('width', '0')
        height = img.get('height', '0')
        try:
            large_enough = int(width) >= 200 or int(height) >= 200
        except (TypeError, ValueError):
            # Size given in a non-integer form ("100%", "300px"): accept it.
            large_enough = True
        if large_enough:
            return urljoin(base_url, src)

    return None
|
| 125 |
+
|
| 126 |
+
def extract_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Fetch *url* over plain HTTP and extract the article with Trafilatura.

    Returns a dict with ``title``, ``text`` and ``image_url`` keys, or None
    when Trafilatura yields nothing or any step raises (the error is printed).
    The image falls back to ``extract_images_from_html`` when Trafilatura
    reports none.
    """
    try:
        page_html = fetch_with_headers(url)
        extracted_json = trafilatura.extract(
            page_html,
            output_format='json',
            url=url,
            include_images=True,
            include_links=True,
        )
        if not extracted_json:
            return None

        fields = json.loads(extracted_json)
        lead_image = fields.get('image')
        if not lead_image:
            lead_image = extract_images_from_html(page_html, url)

        return {
            'title': fields.get('title'),
            'text': fields.get('text'),
            'image_url': lead_image,
        }
    except Exception as exc:
        print(f"Trafilatura ์คํจ: {exc}")
        return None
|
| 144 |
+
|
| 145 |
+
def extract_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Fetch *url* over plain HTTP and parse it with Newspaper3k.

    The HTML is downloaded by ``fetch_with_headers`` and handed to the
    ``Article`` object, which only parses it.  Returns a dict with ``title``,
    ``text`` and ``image_url`` keys, or None when any step raises (the error
    is printed).  The image falls back to ``extract_images_from_html`` when
    Newspaper3k finds no top image.
    """
    try:
        page_html = fetch_with_headers(url)

        parsed = Article(url)
        parsed.config.browser_user_agent = HEADERS['User-Agent']
        parsed.set_html(page_html)
        parsed.parse()

        lead_image = parsed.top_image
        if not lead_image:
            lead_image = extract_images_from_html(page_html, url)

        return {
            'title': parsed.title,
            'text': parsed.text,
            'image_url': lead_image,
        }
    except Exception as exc:
        print(f"Newspaper3k ์คํจ: {exc}")
        return None
|
| 164 |
+
|
| 165 |
+
def get_rendered_html_playwright(url: str, wait: int = 2) -> Optional[str]:
    """Load *url* in headless Chromium and return the rendered page markup.

    Args:
        url: Page to open.
        wait: Seconds to sleep after the DOM has loaded, before the markup is
            read.

    Returns:
        The page content as a string, or None when any Playwright step raises
        (the error is printed).
    """
    context_options = {
        'user_agent': HEADERS['User-Agent'],
        'viewport': {'width': 1920, 'height': 1080},
    }
    try:
        with sync_playwright() as playwright:
            chromium = playwright.chromium.launch(headless=True)
            tab = chromium.new_context(**context_options).new_page()
            # Navigation is given 30 seconds and only waits for DOMContentLoaded.
            tab.goto(url, wait_until='domcontentloaded', timeout=30000)
            time.sleep(wait)
            rendered = tab.content()
            chromium.close()
            return rendered
    except Exception as exc:
        print(f"Playwright ์ค๋ฅ: {exc}")
        return None
|
| 183 |
+
|
| 184 |
+
def extract_playwright_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Render *url* with Playwright, then extract the article with Trafilatura.

    Returns a dict with ``title``, ``text`` and ``image_url`` keys, or None
    when rendering fails, Trafilatura yields nothing, or any step raises (the
    error is printed).
    """
    try:
        rendered = get_rendered_html_playwright(url)
        if not rendered:
            return None

        extracted_json = trafilatura.extract(
            rendered,
            output_format='json',
            url=url,
            include_images=True,
            include_links=True,
        )
        if not extracted_json:
            return None

        fields = json.loads(extracted_json)
        lead_image = fields.get('image')
        if not lead_image:
            lead_image = extract_images_from_html(rendered, url)

        return {
            'title': fields.get('title'),
            'text': fields.get('text'),
            'image_url': lead_image,
        }
    except Exception as exc:
        print(f"Playwright+Trafilatura ์คํจ: {exc}")
        return None
|
| 203 |
+
|
| 204 |
+
def extract_playwright_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Render *url* with Playwright, then parse the markup with Newspaper3k.

    Args:
        url: Address of the article page.

    Returns:
        A dict with ``title``, ``text`` and ``image_url`` keys, or None when
        rendering fails or any step raises (the error is printed).
    """
    try:
        html = get_rendered_html_playwright(url)
        if html:
            # Pass the real page URL (as extract_newspaper does) so the
            # Article knows which page the markup came from; an empty URL
            # leaves it without a base for relative links.
            article = Article(url)
            article.set_html(html)
            article.parse()

            image_url = article.top_image or extract_images_from_html(html, url)

            return {
                'title': article.title,
                'text': article.text,
                'image_url': image_url
            }
    except Exception as e:
        print(f"Playwright+Newspaper3k ์คํจ: {e}")
    return None
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def extract_article(url: str) -> Optional[Dict[str, str]]:
    """Extract title, body text and lead image for *url* using several extractors.

    Extractors run in a fixed order: Trafilatura, Newspaper3k and, when
    Playwright is available, the two Playwright-rendered variants.  Each one
    may only fill fields that are still empty.  The function returns as soon
    as all three fields are present.  After the last extractor it returns the
    partial result if at least a title or a text was found, otherwise None.

    Progress is reported with ``print``.

    Raises:
        requests.HTTPError: Re-raised when an extractor lets a 403 or 429
            response propagate.
    """
    print(f"๐ ์ถ์ถ ์์: {url}")

    merged = {'title': None, 'text': None, 'image_url': None}

    def summary() -> str:
        """Render the O/X overview of which fields are filled so far."""
        title_mark = 'O' if merged['title'] else 'X'
        text_mark = 'O' if merged['text'] else 'X'
        image_mark = 'O' if merged['image_url'] else 'X'
        return f"์ ๋ชฉ: {title_mark}, ๋ณธ๋ฌธ: {text_mark}, ์ด๋ฏธ์ง: {image_mark}"

    # Cheap static extractors first; browser rendering only as a fallback.
    pipeline = [
        ("Trafilatura", extract_trafilatura),
        ("Newspaper3k", extract_newspaper),
    ]
    if PLAYWRIGHT_AVAILABLE:
        pipeline += [
            ("Playwright+Trafilatura", extract_playwright_trafilatura),
            ("Playwright+Newspaper3k", extract_playwright_newspaper),
        ]

    step = 0
    for name, extractor in pipeline:
        step += 1
        print(f" {step}๏ธโฃ {name} ์๋...")
        try:
            found = extractor(url)
            if not found:
                print(f" โ {name} ์คํจ")
                continue

            # Only fields that are still empty may be filled by this extractor.
            newly_filled = [key for key in merged if not merged[key] and found.get(key)]
            for key in newly_filled:
                merged[key] = found[key]

            if newly_filled:
                print(f" โ ์ถ์ถ ์ฑ๊ณต: {', '.join(newly_filled)}")

            # Title, text and image all present: done.
            if merged['title'] and merged['text'] and merged['image_url']:
                # NOTE(review): this literal is split across two lines in the
                # source text; the break is kept as \x85 - confirm the
                # intended message.
                print(f" โ\x85{name} ์๋ฃ! (์ ๋ชฉ O, ๋ณธ๋ฌธ O, ์ด๋ฏธ์ง O)")
                return merged

            status = summary()
            if merged['title'] and merged['text']:
                print(f" โ ๏ธ ์ด๋ฏธ์ง ์์ - ๋ค์ ๋จ๊ณ ๊ณ์ ({status})")
            else:
                print(f" โ ๏ธ ๋ถ๋ถ ์ฑ๊ณต ({status})")
        except requests.HTTPError as http_error:
            blocked = http_error.response.status_code in (403, 429)
            if blocked:
                print(f" โ {name} ์ฐจ๋จ๋จ (HTTP {http_error.response.status_code})")
                raise
            print(f" โ {name} ์ค๋ฅ: {http_error}")
        except Exception as error:
            print(f" โ {name} ์ค๋ฅ: {error}")

    if not (merged['title'] or merged['text']):
        print(" โ ๋ชจ๋ ๋ฐฉ๋ฒ ์คํจ")
        return None

    # NOTE(review): this literal is split across three lines in the source
    # text; the breaks are kept as \x85 - confirm the intended message.
    print(f" โ\x85์ต์ข\x85๊ฒฐ๊ณผ - {summary()}")
    return merged
|
| 290 |
+
|
| 291 |
+
if __name__ == "__main__":
    # Manual smoke test: run the extractor against a few live news pages and
    # print a short summary for each one.  Requires network access.
    test_urls = [
        "https://www.chosun.com/national/education/2025/07/19/4OMZBICJSNDGXA567IKPRBUFKA/",
        "https://news.nate.com/view/20250521n37437",
        "https://www.hani.co.kr/arti/society/society_general/1204840.html"
    ]

    # NOTE(review): the "available" marker literal is split across two lines
    # in the source text; the break is kept as \x85 - confirm the intended
    # symbols.  Held in variables because a backslash is not allowed inside
    # an f-string expression before Python 3.12.
    mark_available = 'โ\x85'
    mark_missing = 'โ'

    print(f"์ฌ์ฉ ๊ฐ๋ฅํ ๋๊ตฌ:")
    print(f" - Playwright: {mark_available if PLAYWRIGHT_AVAILABLE else mark_missing}")
    print(f" - Extruct: {mark_available if EXTRUCT_AVAILABLE else mark_missing}")
    print(f" - Fake UserAgent: {mark_available if 'ua' in dir() else mark_missing}\n")

    for url in test_urls:
        print(f"\n{'='*60}")
        try:
            article = extract_article(url)
            if article:
                # extract_article pre-fills every key with None, so a .get()
                # default never applies; use "or" to avoid slicing or
                # measuring None when only part of the article was found.
                title = article.get('title') or 'N/A'
                text = article.get('text') or ''
                image_url = article.get('image_url')
                print(f"\n๐ ์ ๋ชฉ: {title[:100]}...")
                print(f"๐ ๋ณธ๋ฌธ: {len(text)}์")
                print(f"๐ผ๏ธ ์ด๋ฏธ์ง: {image_url[:80]}..." if image_url else "๐ผ๏ธ ์ด๋ฏธ์ง: ์์")
            else:
                print("์ถ์ถ ์คํจ")
        except Exception as e:
            print(f"์ ์ฒด ์คํจ: {e}")
        print("="*60)
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# System packages for Streamlit and dependencies
|
requirements.txt
CHANGED
|
@@ -1,3 +1,31 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core Dependencies
|
| 2 |
+
streamlit==1.51.0
|
| 3 |
+
google-generativeai==0.8.5
|
| 4 |
+
python-dotenv==1.1.1
|
| 5 |
+
|
| 6 |
+
# LangGraph and Agents
|
| 7 |
+
langgraph==0.6.8
|
| 8 |
+
langgraph-checkpoint==2.1.1
|
| 9 |
+
|
| 10 |
+
# Vector Database
|
| 11 |
+
chromadb==1.1.0
|
| 12 |
+
|
| 13 |
+
# Machine Learning & Embeddings
|
| 14 |
+
sentence-transformers==5.1.1
|
| 15 |
+
torch>=2.0.0
|
| 16 |
+
transformers>=4.30.0
|
| 17 |
+
|
| 18 |
+
# Image Processing
|
| 19 |
+
Pillow>=11.0.0
|
| 20 |
+
|
| 21 |
+
# Web Scraping
requests==2.32.5
beautifulsoup4==4.14.2
lxml==6.0.2
# Imported unconditionally by news_text_scraper.py
trafilatura
newspaper3k
# newspaper3k imports lxml.html.clean, which lxml >= 5.2 ships as a separate package
lxml_html_clean
|
| 25 |
+
|
| 26 |
+
# Data Processing
|
| 27 |
+
pandas>=2.0.0
|
| 28 |
+
numpy>=1.23.0
|
| 29 |
+
|
| 30 |
+
# Utilities
|
| 31 |
+
tqdm>=4.60.0
|