jonghhhh committed on
Commit
3c03221
·
verified ·
1 Parent(s): 914cd2c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,6 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
2
+ *.db filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  *.pt filter=lfs diff=lfs merge=lfs -text
 
 
5
  *.safetensors filter=lfs diff=lfs merge=lfs -text
6
+ chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [server]
2
+ headless = true
3
+ port = 7860
4
+
5
+ [browser]
6
+ gatherUsageStats = false
README.md CHANGED
@@ -1,19 +1,67 @@
1
  ---
2
- title: Press Ethics
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
  pinned: false
11
- short_description: Streamlit template space
12
  ---
13
 
14
- # Welcome to Streamlit!
15
 
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
 
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Press Ethics Analyzer
3
+ emoji: 📰
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: "1.51.0"
8
+ app_file: multimodal_rag_langgraph_gemini_st.py
 
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # 📰 뉴스 심의문 분석 시스템
14
 
15
+ 한국신문윤리위원회 심의 기준을 적용한 AI 기반 뉴스 기사 윤리 분석 시스템입니다.
16
 
17
+ ## ๐ŸŒŸ ์ฃผ์š” ๊ธฐ๋Šฅ
18
+
19
+ - **๊ธฐ์‚ฌ ์ž๋™ ์ถ”์ถœ**: URL๋งŒ ์ž…๋ ฅํ•˜๋ฉด ๊ธฐ์‚ฌ ๋‚ด์šฉ ์ž๋™ ์ˆ˜์ง‘
20
+ - **๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ๋ถ„์„**: ํ…์ŠคํŠธ์™€ ์ด๋ฏธ์ง€๋ฅผ ํ•จ๊ป˜ ๋ถ„์„ (Gemini 2.0 Flash)
21
+ - **์œ ์‚ฌ ์‚ฌ๋ก€ ๊ฒ€์ƒ‰**: ChromaDB ๊ธฐ๋ฐ˜ ๋ฒกํ„ฐ ๊ฒ€์ƒ‰์œผ๋กœ ๊ด€๋ จ ์‹ฌ์˜ ์‚ฌ๋ก€ ํƒ์ƒ‰
22
+ - **์‹ฌ์˜๋ฌธ ์ž๋™ ์ƒ์„ฑ**: ์‹ ๋ฌธ์œค๋ฆฌ์‹ค์ฒœ์š”๊ฐ• 16๊ฐœ ์กฐํ•ญ ๊ธฐ์ค€ ์ž๋™ ํŒ๋‹จ
23
+ - **๋‹จ๊ณ„๋ณ„ ์ง„ํ–‰ ํ‘œ์‹œ**: 5๋‹จ๊ณ„ ๋ถ„์„ ๊ณผ์ • ์‹ค์‹œ๊ฐ„ ๋ชจ๋‹ˆํ„ฐ๋ง
24
+
25
+ ## 🚀 사용 방법
26
+
27
+ 1. **API 키 입력**: 사이드바에서 Gemini API 키 입력 ([발급받기](https://makersuite.google.com/app/apikey))
28
+ 2. **URL 입력**: 분석할 뉴스 기사 URL 입력
29
+ 3. **분석 시작**: '분석 시작' 버튼 클릭
30
+ 4. **결과 확인**: 5단계 진행 과정을 거쳐 최종 심의문 확인
31
+
32
+ ## ๐Ÿ” ๋ถ„์„ ๋‹จ๊ณ„
33
+
34
+ 1. **๊ธฐ์‚ฌ ์ถ”์ถœ**: URL์—์„œ ์ œ๋ชฉ, ๋ณธ๋ฌธ, ์ด๋ฏธ์ง€ ์ถ”์ถœ
35
+ 2. **์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ**: Gemini ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ๋กœ ์ด๋ฏธ์ง€ ๋‚ด์šฉ ๋ถ„์„
36
+ 3. **์œ ์‚ฌ ์‚ฌ๋ก€ ๊ฒ€์ƒ‰**: ๋ฒกํ„ฐ DB์—์„œ ๊ด€๋ จ ์‹ฌ์˜ ์‚ฌ๋ก€ 5๊ฐœ ๊ฒ€์ƒ‰
37
+ 4. **์‹ฌ์˜๋ฌธ ์ƒ์„ฑ**: LangGraph ์›Œํฌํ”Œ๋กœ์šฐ๋กœ ์‹ฌ์˜๋ฌธ ์ดˆ์•ˆ ์ž‘์„ฑ
38
+ 5. **์ตœ์ข… ๊ฒ€ํ† **: ์กฐํ•ญ ์ •ํ™•์„ฑ ๋ฐ ๊ธฐ์‚ฌ ๊ด€๋ จ์„ฑ ๊ฒ€์ฆ
39
+
40
+ ## 📊 심의 기준
41
+
42
+ 신문윤리실천요강 16개 조항을 기준으로 기사를 분석합니다:
43
+
44
+ - 제1조: 언론의 자유, 책임, 독립
45
+ - 제2조: 취재 준칙
46
+ - 제3조: 보도 준칙
47
+ - 제7조: 범죄보도와 인권존중
48
+ - 제10조: 편집 지침
49
+ - 제12조: 사생활 보호
50
+ - 그 외 10개 조항
51
+
52
+ ## 🛠️ 기술 스택
53
+
54
+ - **AI 모델**: Google Gemini 2.0 Flash
55
+ - **프레임워크**: LangGraph, Streamlit
56
+ - **벡터 DB**: ChromaDB
57
+ - **임베딩**: Sentence Transformers (multilingual-e5-large-instruct)
58
+
59
+ ## ⚠️ 주의사항
60
+
61
+ - 본인의 Gemini API 키가 필요합니다
62
+ - 분석에는 수 분이 소요될 수 있습니다
63
+ - 이 시스템은 참고용이며, 최종 판단은 전문가의 검토가 필요합니다
64
+
65
+ ## 📝 라이선스
66
+
67
+ 이 프로젝트는 교육 및 연구 목적으로 사용됩니다.
app.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Hugging Face Spaces ์ง„์ž…์ 
4
+ """
5
+ import subprocess
6
+ import sys
7
+
8
# Launch the Streamlit app as a child process and mirror its exit status.
if __name__ == "__main__":
    completed = subprocess.run(
        [
            sys.executable, "-m", "streamlit", "run",
            "multimodal_rag_langgraph_gemini_st.py",
            "--server.port=7860",
            "--server.address=0.0.0.0",
        ],
        check=False,
    )
    # Propagate the child's exit code so that a crashed Streamlit process is
    # not reported as a successful run of this launcher.
    sys.exit(completed.returncode)
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03daa563a12b4b05ab94546c01459890854373c93f5974309dd6b1f76051252c
3
+ size 59304000
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/data_level0.bin:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34820ba6903ee9392713712f72d463ddfb43aa6c38e4f186fe0b4b97bfa330e7
3
+ size 100
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/header.bin:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daadb7ebf7d186886ffdc10b257cefb7b67e892356823e99c6df4ca12901b1ee
3
+ size 397964
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/index_metadata.pickle:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be6484ac6ebc61a1b74d796300c4aa351991b8326255ec0da95f9ef91c9e46b8
3
+ size 56000
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/length.bin:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42587e10ca5555b2a136ff8f45b9b1fc8ca165b49f4fd92a6fa4c17722017ed5
3
+ size 120260
chroma/4f0d0b85-04bb-4afc-9e18-51530c937a3b/link_lists.bin:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcf796984f09d766f212a37735343b8fd149194ce36924b9ff6a25d9caf7c6ef
3
+ size 174977024
chroma/chroma.sqlite3:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
multimodal_rag_langgraph_gemini_st.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Streamlit ๊ธฐ๋ฐ˜ ๋‰ด์Šค ์‹ฌ์˜๋ฌธ ๋ถ„์„ ์‹œ์Šคํ…œ (Gemini 2.0 Flash ๋ฒ„์ „)
4
+ """
5
+ import streamlit as st
6
+ import os
7
+ import re
8
+ from typing import TypedDict
9
+ from langgraph.graph import StateGraph, END
10
+ import google.generativeai as genai
11
+ import chromadb
12
+ from chromadb.config import Settings
13
+ from sentence_transformers import SentenceTransformer
14
+ from chromadb.utils.embedding_functions import EmbeddingFunction
15
+ from news_text_scraper import extract_article
16
+ import base64
17
+ import requests
18
+ from PIL import Image
19
+ from io import BytesIO
20
+ import time
21
+
22
+ # ========== ํŽ˜์ด์ง€ ์„ค์ • ==========
23
+ st.set_page_config(
24
+ page_title="๋‰ด์Šค ์‹ฌ์˜๋ฌธ ๋ถ„์„ ์‹œ์Šคํ…œ",
25
+ page_icon="๐Ÿ“ฐ",
26
+ layout="wide"
27
+ )
28
+
29
+ # ========== ์‚ฌ์ด๋“œ๋ฐ” ์„ค์ • ==========
30
+ st.sidebar.title("โš™๏ธ ์„ค์ •")
31
+ st.sidebar.markdown("---")
32
+
33
+ gemini_api_key = st.sidebar.text_input(
34
+ "Gemini API Key",
35
+ type="password",
36
+ help="Google AI Studio์—์„œ ๋ฐœ๊ธ‰๋ฐ›์€ Gemini API ํ‚ค๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”."
37
+ )
38
+
39
+ st.sidebar.markdown("---")
40
+ st.sidebar.markdown("""
41
+ ### ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
42
+ 1. Gemini API Key ์ž…๋ ฅ
43
+ 2. ๋ถ„์„ํ•  ๊ธฐ์‚ฌ URL ์ž…๋ ฅ
44
+ 3. '๋ถ„์„ ์‹œ์ž‘' ๋ฒ„ํŠผ ํด๋ฆญ
45
+ 4. ๋‹จ๊ณ„๋ณ„ ์ง„ํ–‰ ์ƒํ™ฉ ํ™•์ธ
46
+ 5. ์ตœ์ข… ๊ฒฐ๊ณผ ํ™•์ธ
47
+
48
+ ### ์ฃผ์˜์‚ฌํ•ญ
49
+ - API ํ‚ค๋Š” ์„ธ์…˜ ์ข…๋ฃŒ ์‹œ ์‚ญ์ œ๋ฉ๋‹ˆ๋‹ค
50
+ - ๋ถ„์„์—๋Š” ์ˆ˜ ๋ถ„์ด ์†Œ์š”๋  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค
51
+ """)
52
+
53
+ # ========== ๋ฉ”์ธ ํ™”๋ฉด ==========
54
+ st.title("๐Ÿ“ฐ ๋‰ด์Šค ์‹ฌ์˜๋ฌธ ๋ถ„์„ ์‹œ์Šคํ…œ")
55
+ st.markdown("**Gemini 2.0 Flash ๊ธฐ๋ฐ˜ - ํ•œ๊ตญ์‹ ๋ฌธ์œค๋ฆฌ์œ„์›ํšŒ ์‹ฌ์˜ ๊ธฐ์ค€ ์ ์šฉ**")
56
+ st.markdown("---")
57
+
58
+ # ========== ์„ค์ • ==========
59
+ CHROMA_PATH = "./chroma/"
60
+ COLLECTION_NAME = "press_ethics_e5_072025"
61
+
62
+ # ========== State ์ •์˜ ==========
63
class AnalysisState(TypedDict):
    """Typed dictionary describing the state of one article analysis.

    Only the field names and types are defined here.
    NOTE(review): the per-field comments below are inferred from the field
    names; confirm them against the code that actually fills this state.
    """

    url: str              # presumably the article URL being analysed
    article: dict         # presumably the extracted article (title/text/...)
    image_desc: str       # presumably the generated image description
    similar_cases: str    # presumably the retrieved similar cases as text
    decision: str         # presumably the drafted decision text
    review_result: dict   # presumably the outcome of the review step
    error: str            # presumably an error message, if any
    violation_count: int  # presumably the number of "violation" cases found
72
+
73
+ # ========== ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜ (์บ์‹ฑ) ==========
74
@st.cache_resource
def load_embedding_model():
    """Build the embedding function used for similarity search (cached).

    The result is cached with ``st.cache_resource`` so the
    SentenceTransformer model is constructed once and then reused.

    Returns:
        A ``CustomEmbedding`` instance; calling it with a list of texts
        returns their embeddings as nested Python lists.
    """

    class CustomEmbedding(EmbeddingFunction):
        """Chroma embedding function backed by a SentenceTransformer model."""

        def __init__(self):
            # The model is explicitly placed on the CPU.
            self.model = SentenceTransformer(
                "intfloat/multilingual-e5-large-instruct", device="cpu"
            )

        def __call__(self, input):
            # ``input`` is the parameter name used by the original code;
            # it is kept (although it shadows the builtin) so keyword
            # callers keep working.
            return self.model.encode(input).tolist()

    return CustomEmbedding()
85
+
86
@st.cache_resource
def _open_chroma_collection():
    """Open the persistent Chroma collection; raise if it cannot be opened.

    Only a successful open is cached: when this function raises, nothing is
    stored, so the next call tries again.
    """
    client = chromadb.PersistentClient(path=CHROMA_PATH, settings=Settings())
    return client.get_collection(name=COLLECTION_NAME)


def load_chroma_collection():
    """Return the ChromaDB collection, or ``None`` when it cannot be loaded.

    The previous version applied ``st.cache_resource`` to the whole
    try/except, so a failed load cached its ``None`` result and similar-case
    search stayed disabled until the app restarted. Caching now happens in
    ``_open_chroma_collection``, which only caches successful opens.

    Returns:
        The collection named ``COLLECTION_NAME`` under ``CHROMA_PATH``, or
        ``None`` after reporting the failure through ``st.error``.
    """
    try:
        return _open_chroma_collection()
    except Exception as e:  # best-effort: report in the UI, let caller degrade
        st.error(f"❌ ChromaDB 로드 실패: {e}")
        return None


# Keep the ``.clear()`` hook that ``st.cache_resource`` used to attach to the
# public function.
load_chroma_collection.clear = _open_chroma_collection.clear
96
+
97
+ # ========== ๊ทœ์ • ๋ฐ ํ”„๋กฌํ”„ํŠธ ==========
98
# Reviewer persona plus the 16 articles of the press-ethics practice
# guidelines: one article per line, formatted as 제N조「name」①item ②item ...
# parse_regulation_dict() relies on every article line starting with "제".
# The Article 13 line previously began with three U+FFFD replacement
# characters instead of "제", so it was silently dropped by the parser.
REGULATION = """당신은 한국신문윤리위원회 심의위원입니다.
#신문윤리실천요강:
제1조「언론의 자유, 책임, 독립」①정치권력으로부터의 자유 ②사회·경제 세력으로부터의 독립 ③사회적 책임 ④차별과 편견 금지 ⑤사회적 약자 보호
제2조「취재 준칙」①신분 사칭·위장 금지 ②자료 무단 이용 금지 ③재난 및 사고 취재 ④전화 및 디지털 기기 활용 취재 ⑤도청 및 비밀촬영 금지 ⑥부당한 금전 제공 금지
제3조「보도 준칙」①보도기사의 사실과 의견 구분 ②공정 보도 ③반론의 기회 ④미확인 보도 명시 원칙 ⑤보도자료 검증 ⑥선정 보도 금지 ⑦재난 보도의 신중 ⑧자살 보도의 주의 ⑨피의사실 보도 ⑩표준어 사용
제4조「사법 보도 준칙」①재판 부당 영향 금지 ②판결문 등의 사전보도 금지
제5조「취재원의 명시와 보호」①취재원 보호 ②취재원 명시와 익명 조건 ③제3자 비방과 익명보도 금지 ④취재원과의 비보도 약속
제6조「보도유예 시한」①보도유예 시한 연장 금지 ②보도유예 시한의 효력 상실
제7조「범죄보도와 인권존중」①피의자 및 피고인의 명예 존중 ②피의자·피고인·참고인 등 촬영 신중 ③범죄와 무관한 가족 보호 ④성범죄 등의 2차 피해 방지 ⑤미성년 피의자 신원 보호
제8조「저작물의 전재와 인용」①통신기사의 출처 명시 ②타 언론사 보도 등의 표절 금지 ③출판물 등의 표절 금지 ④사진, 영상 등의 저작권 보호
제9조「평론의 원칙」①사설의 정론성 ②평론의 자유
제10조「편집 지침」①제목의 원칙 ②편집 변경 금지 ③기고문 변경 금지 ④기사 정정 ⑤관련사진 게재 ⑥사진 및 영상 조작 금지 ⑦기사와 광고의 구분 ⑧이용자의 권리 보호 ⑨부당한 재전송 금지
제11조「명예와 신용존중」①명예·신용 훼손 금지 ②사자의 명예 존중
제12조「사생활 보호」①사생활 침해 금지 ②개인정보 무단 검색 등 금지 ③사생활 등의 촬영 및 보도 금지 ④공인의 사생활 보도
제13조「청소년과 어린이 보호」①청소년과 어린이 취재 보도 ②범죄 보도와 청소년, 어린이 보호 ③유해환경으로부터의 보호 ④유괴·납치 보도제한 협조
제14조「정보의 부당이용금지」①소유 주식 등에 관한 보도 제한 ②주식·부동산 등의 부당 거래 금지
제15조「언론인의 품위」①금품수수 및 향응, 청탁 금지 ②부당한 집단 영향력 행사 금지 ③광고·판매 등 영업행위 금지
제16조「공익의 정의」①국가 안전 등 ②공중 안녕 ③범죄의 폭로 ④공중의 오도 방지"""
116
+
117
def parse_regulation_dict(regulation_text=None):
    """Parse the regulation text into a per-article lookup table.

    Every line of the form ``제N조「name」①item ②item ...`` yields one entry;
    all other lines (persona sentence, section header) are ignored.

    Args:
        regulation_text: Text to parse. When omitted, the module-level
            ``REGULATION`` constant is used, as before.

    Returns:
        dict mapping the article number as a string (e.g. ``"3"``) to
        ``{'name': <article name>, 'items': {<circled number>: <item text>}}``.
    """
    if regulation_text is None:
        regulation_text = REGULATION

    # Compiled once per call instead of once per line.
    article_re = re.compile(r'제(\d+)조「([^」]+)」(.+)')
    item_re = re.compile(r'([①②③④⑤⑥⑦⑧⑨⑩])([^①②③④⑤⑥⑦⑧⑨⑩]+)')

    articles = {}
    for raw_line in regulation_text.splitlines():
        # Strip first so an indented article line is still recognised.
        match = article_re.match(raw_line.strip())
        if not match:
            continue
        num, name, items_text = match.groups()
        items = {
            found.group(1): found.group(2).strip()
            for found in item_re.finditer(items_text)
        }
        articles[num] = {'name': name, 'items': items}
    return articles
136
+
137
+ REGULATION_DICT = parse_regulation_dict()
138
+
139
def correct_article_reference(text, regulation_dict=None):
    """Rewrite article citations in *text* to match the regulation table.

    A citation looks like ``제N조「name」①`` optionally followed by ``항``/``호``
    and directly attached parenthesised notes. For a known article number
    the cited name is replaced by the canonical one and the parenthetical is
    replaced by the canonical item text. Citations of unknown articles are
    left untouched.

    Args:
        text: Decision text that may contain article citations.
        regulation_dict: Table in the format produced by
            ``parse_regulation_dict``. Defaults to the module-level
            ``REGULATION_DICT``.

    Returns:
        The text with every recognised citation normalised.
    """
    if regulation_dict is None:
        regulation_dict = REGULATION_DICT

    # Optional whitespace between the parts is tolerated so spaced citations
    # such as "제3조 「보도 준칙」 ②" are normalised as well.
    pattern = (
        r'제\s*(\d+)\s*조\s*「([^」]+)」\s*'
        r'([①②③④⑤⑥⑦⑧⑨⑩])(?:항|호)?(?:\([^)]*\))*'
    )

    def replace_match(match):
        article_num = match.group(1)
        item_num = match.group(3)

        entry = regulation_dict.get(article_num)
        if entry is None:
            # Unknown article number: keep the citation exactly as written.
            return match.group(0)

        correct_name = entry['name']
        items = entry['items']
        if item_num in items:
            return f'제{article_num}조「{correct_name}」{item_num}({items[item_num]})'
        # Known article but unknown item: fix the name only.
        return f'제{article_num}조「{correct_name}」{item_num}'

    return re.sub(pattern, replace_match, text)
160
+
161
+ INST_PROMPT = """#์‹ฌ์˜ ์ง€์นจ:
162
+ 1. **๋ณด์ˆ˜์  ํŒ๋‹จ ์›์น™**: ์‹ ๋ฌธ์œค๋ฆฌ์‹ค์ฒœ์š”๊ฐ•์„ ์ฒด๊ณ„์ ์œผ๋กœ ๊ฒ€ํ† ํ•˜๋˜, ๋งค์šฐ ๋ณด์ˆ˜์ ์œผ๋กœ ํŒ๋‹จ
163
+ 2. **๋ช…๋ฐฑํ•˜๊ณ  ์‹ฌ๊ฐํ•œ ์œ„๋ฐ˜๋งŒ ์ง€์ **: ์˜์‹ฌ์Šค๋Ÿฝ๊ฑฐ๋‚˜ ๊ฒฝ๋ฏธํ•˜๊ฑฐ๋‚˜ ๋ถˆ๋ถ„๋ช…ํ•œ ์‚ฌ์•ˆ์€ ๋ชจ๋‘ "์œ„๋ฐ˜ ์—†์Œ"
164
+ 3. ์œ ์‚ฌ ์‚ฌ๋ก€๋ฅผ ์ฐธ๊ณ ํ•˜๋˜, ํ•ด๋‹น ๊ธฐ์‚ฌ์˜ ๊ตฌ์ฒด์  ๋‚ด์šฉ๊ณผ ๋งฅ๋ฝ์„ ์ค‘์‹ฌ์œผ๋กœ ๋…๋ฆฝ์ ์œผ๋กœ ํŒ๋‹จ
165
+ 4. **ํŠน๋ณ„ ์ฃผ์˜์‚ฌํ•ญ**:
166
+ - ํŠน์ • ๋‹จ์ฒด/๊ธฐ์—…์˜ ํ™œ๋™์„ ์ง€๋‚˜์น˜๊ฒŒ ์นญ์ฐฌํ•˜๊ณ  ํ™๋ณดํ•˜๋Š” ๋‚ด์šฉ โ†’ ์ œ1์กฐโ‘ก(์‚ฌํšŒยท๊ฒฝ์ œ ์„ธ๋ ฅ์œผ๋กœ๋ถ€ํ„ฐ์˜ ๋…๋ฆฝ) ๋˜๋Š” ์ œ10์กฐโ‘ฆ(๊ธฐ์‚ฌ์™€ ๊ด‘๊ณ ์˜ ๊ตฌ๋ถ„) ์ ์šฉ ๊ฒ€ํ† 
167
+ - ๋‹จ์ˆœํžˆ ๋‹จ์ฒด ํ™œ๋™์„ ์†Œ๊ฐœํ•˜๋Š” ์ˆ˜์ค€์€ ์œ„๋ฐ˜ ์•„๋‹˜. ๋ช…๋ฐฑํ•œ ํ™๋ณด/๊ด‘๊ณ  ๋ชฉ์ ์ด์–ด์•ผ ํ•จ
168
+
169
+ #์ž‘์„ฑ ํ˜•์‹ (๋ฐ˜๋“œ์‹œ ์ •ํ™•ํžˆ ์ค€์ˆ˜):
170
+
171
+ **[์œ„๋ฐ˜ ์—†์Œ ์‹œ] - ์ ˆ๋Œ€ ์—„์ˆ˜:**
172
+ - ์˜ค์ง "์œ„๋ฐ˜ ์—†์Œ" ๊ธ€์ž๋งŒ ์ถœ๋ ฅ. ์–ด๋– ํ•œ ์ถ”๊ฐ€ ์„ค๋ช…, ์ด์œ , ์ฝ”๋ฉ˜ํŠธ๋„ ์ ˆ๋Œ€ ๊ธˆ์ง€
173
+
174
+ **[์œ„๋ฐ˜ ์‹œ] - ์ •ํ™•ํžˆ ์ค€์ˆ˜:**
175
+ ์•„๋ž˜ 4๋‹จ๊ณ„๋ฅผ ๋ฐ˜๋“œ์‹œ ์ˆœ์„œ๋Œ€๋กœ ๋”ฐ๋ฅด๋˜, "1๋‹จ๊ณ„", "2๋‹จ๊ณ„" ๋“ฑ์˜ ์†Œ์ œ๋ชฉ ์—†์ด ์ž์—ฐ์Šค๋Ÿฌ์šด ๋ฌธ์žฅ์œผ๋กœ ์—ฐ๊ฒฐ:
176
+ 1๋‹จ๊ณ„) ๊ธฐ์‚ฌ ์š”์•ฝ 2~3๋ฌธ์žฅ
177
+ - "์œ„ ๊ธฐ์‚ฌ๋Š” โ—‹โ—‹โ—‹์— ๋Œ€ํ•ด ๋ณด๋„ํ•˜๋ฉด์„œ..." ํ˜•์‹์œผ๋กœ ์‹œ์ž‘
178
+ 2๋‹จ๊ณ„) ๋ฌธ์ œ์  ์ง€์  1~2๋ฌธ์žฅ
179
+ - "๊ทธ๋Ÿฌ๋‚˜ ์ด ๋ณด๋„๋Š”...", "ํ•˜์ง€๋งŒ..." ๋“ฑ์œผ๋กœ ๋ฌธ์ œ์  ๋ช…ํ™•ํžˆ ์ง€์ 
180
+ 3๋‹จ๊ณ„) ๊ทœ์ • ๊ทผ๊ฑฐ 1~2๋ฌธ์žฅ
181
+ - ์‹ ๋ฌธ์œค๋ฆฌ์‹ค์ฒœ์š”๊ฐ•์„ ๋ฐ”ํƒ•์œผ๋กœ ๋ฌธ์ œ์˜ ์œ„๋ฐ˜ ์ •๋‹น์„ฑ์„ ์ œ์‹œ
182
+ 4๋‹จ๊ณ„) ๊ฒฐ๋ก  ๋ฌธ์žฅ (์ •ํ™•ํžˆ ์ด ํ˜•์‹ ์ค€์ˆ˜)
183
+ - "๋”ฐ๋ผ์„œ ์œ„ ๋ณด๋„๋Š” ์‹ ๋ฌธ์œค๋ฆฌ์‹ค์ฒœ์š”๊ฐ• ์ œโ—‹์กฐใ€Œ์กฐํ•ญ๋ช…ใ€โ—‹ํ•ญ(์„ธ๋ถ€๋‚ด์šฉ)์„ ์œ„๋ฐ˜ํ–ˆ๋‹ค๊ณ  ์ธ์ •ํ•˜์—ฌ ์ฃผ๋ฌธ๊ณผ ๊ฐ™์ด ๊ฒฐ์ •ํ•œ๋‹ค."
184
+ - ์ „์ฒด 6๋ฌธ์žฅ ์ด์ƒ
185
+ - ์œ ์‚ฌ ์‚ฌ๋ก€์˜ ์ž์—ฐ์Šค๋Ÿฌ์šด ๋ฌธ์žฅ์ฒด ์ฐธ๊ณ 
186
+ - "1)", "2)", "3)" ๋“ฑ์˜ ๋ฒˆํ˜ธ๋‚˜ ์†Œ์ œ๋ชฉ ์ ˆ๋Œ€ ์‚ฌ์šฉ ๊ธˆ์ง€"""
187
+
188
+ # ========== Gemini API ํ˜ธ์ถœ ํ•จ์ˆ˜ ==========
189
def call_gemini(api_key: str, prompt: str, image_data: "str | None" = None, temperature: float = 0.0) -> str:
    """Send a prompt (optionally with one image) to Gemini and return the text.

    Args:
        api_key: Gemini API key used to configure the client.
        prompt: Text prompt.
        image_data: Base64-encoded image bytes, or ``None`` for a text-only
            request. The image is always sent with MIME type ``image/jpeg``.
        temperature: Sampling temperature passed to the generation config.

    Returns:
        ``response.text`` of the generated response.

    NOTE(review): ``genai.configure`` sets the key on the library as a whole,
    not on this call — if several users with different keys share one
    process, confirm that requests cannot pick up another user's key.
    NOTE(review): errors from the API or from reading ``response.text`` are
    not handled here; callers are expected to catch them.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.0-flash-exp')

    generation_config = genai.GenerationConfig(
        temperature=temperature,
        max_output_tokens=8192,
    )

    # Build the request content once instead of duplicating the API call.
    if image_data:
        image_part = {
            "mime_type": "image/jpeg",
            "data": base64.b64decode(image_data),
        }
        contents = [prompt, image_part]
    else:
        contents = prompt

    response = model.generate_content(
        contents,
        generation_config=generation_config,
    )
    return response.text
215
+
216
+ # ========== ๋ถ„์„ ํ•จ์ˆ˜ ==========
217
+ def analyze_article_streamlit(url: str, api_key: str, progress_container, status_container):
218
+ """Streamlit์šฉ ๊ธฐ์‚ฌ ๋ถ„์„ ํ•จ์ˆ˜"""
219
+
220
+ # ์ง„ํ–‰ ์ƒํ™ฉ ํ‘œ์‹œ
221
+ progress_bar = progress_container.progress(0)
222
+
223
+ # 1. ๊ธฐ์‚ฌ ์ถ”์ถœ
224
+ status_container.info("๐Ÿ” 1๋‹จ๊ณ„: ๊ธฐ์‚ฌ ์ถ”์ถœ ์ค‘...")
225
+ progress_bar.progress(10)
226
+
227
+ try:
228
+ article = extract_article(url)
229
+ if not article or not article.get('text'):
230
+ status_container.error("โŒ ๊ธฐ์‚ฌ ์ถ”์ถœ ์‹คํŒจ: ์œ ํšจํ•œ ๊ธฐ์‚ฌ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
231
+ return None
232
+ status_container.success(f"โœ… ๊ธฐ์‚ฌ ์ถ”์ถœ ์™„๋ฃŒ: {article.get('title', '')[:50]}...")
233
+ except Exception as e:
234
+ status_container.error(f"โŒ ๊ธฐ์‚ฌ ์ถ”์ถœ ์˜ค๋ฅ˜: {e}")
235
+ return None
236
+
237
+ progress_bar.progress(20)
238
+
239
+ # 2. ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ
240
+ status_container.info("๐Ÿ–ผ๏ธ 2๋‹จ๊ณ„: ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์ค‘...")
241
+ image_desc = None
242
+ img_url = article.get('image_url')
243
+
244
+ if img_url:
245
+ try:
246
+ resp = requests.get(img_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
247
+ img = Image.open(BytesIO(resp.content))
248
+ if img.mode == 'RGBA':
249
+ bg = Image.new('RGB', img.size, (255, 255, 255))
250
+ bg.paste(img, mask=img.split()[-1])
251
+ img = bg
252
+ elif img.mode != 'RGB':
253
+ img = img.convert('RGB')
254
+
255
+ buffer = BytesIO()
256
+ img.save(buffer, format='JPEG', quality=85)
257
+ b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
258
+
259
+ image_desc = call_gemini(api_key, "์ด ์ด๋ฏธ์ง€๋ฅผ ํ•œ๊ตญ์–ด๋กœ ์ƒ์„ธํžˆ ์„ค๋ช…ํ•ด์ฃผ์„ธ์š”.", image_data=b64, temperature=0.3)
260
+ status_container.success("โœ… ์ด๋ฏธ์ง€ ์„ค๋ช… ์ƒ์„ฑ ์™„๋ฃŒ")
261
+ except Exception as e:
262
+ status_container.warning(f"โš ๏ธ ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
263
+ else:
264
+ status_container.info("โ„น๏ธ ์ด๋ฏธ์ง€ ์—†์Œ")
265
+
266
+ progress_bar.progress(40)
267
+
268
+ # 3. ์œ ์‚ฌ ์‚ฌ๋ก€ ๊ฒ€์ƒ‰
269
+ status_container.info("๐Ÿ”Ž 3๋‹จ๊ณ„: ์œ ์‚ฌ ์‚ฌ๋ก€ ๊ฒ€์ƒ‰ ์ค‘...")
270
+ similar_cases = ""
271
+ violation_count = 0
272
+ no_violation_count = 0
273
+
274
+ try:
275
+ ef = load_embedding_model()
276
+ collection = load_chroma_collection()
277
+
278
+ if collection:
279
+ text = f"{article.get('title', '')} {article.get('text', '')[:2000]}"
280
+ query_emb = ef([text])
281
+ results = collection.query(query_embeddings=query_emb, n_results=5)
282
+ cases = []
283
+ for i in range(len(results["documents"][0])):
284
+ reason = results['metadatas'][0][i]['reason']
285
+ cases.append(f"{i+1}. {reason}")
286
+
287
+ # ์œ„๋ฐ˜ ๊ฐœ์ˆ˜ ์นด์šดํŒ…
288
+ if '์œ„๋ฐ˜' in reason and '์œ„๋ฐ˜ ์—†์Œ' not in reason and '์œ„๋ฐ˜์—†์Œ' not in reason:
289
+ violation_count += 1
290
+ elif '์œ„๋ฐ˜ ์—†์Œ' in reason or '์œ„๋ฐ˜์—†์Œ' in reason:
291
+ no_violation_count += 1
292
+
293
+ similar_cases = "\n".join(cases)
294
+ status_container.success(f"โœ… ์œ ์‚ฌ ์‚ฌ๋ก€ {len(cases)}๊ฐœ ๊ฒ€์ƒ‰ ์™„๋ฃŒ (์œ„๋ฐ˜ {violation_count}/5, ์œ„๋ฐ˜์—†์Œ {no_violation_count}/5)")
295
+ else:
296
+ status_container.warning("โš ๏ธ ์œ ์‚ฌ ์‚ฌ๋ก€ ๊ฒ€์ƒ‰ ์‹คํŒจ: ChromaDB ๋กœ๋“œ ์˜ค๋ฅ˜")
297
+ except Exception as e:
298
+ status_container.warning(f"โš ๏ธ ์œ ์‚ฌ ์‚ฌ๋ก€ ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
299
+
300
+ progress_bar.progress(60)
301
+
302
+ # 4. ์‹ฌ์˜๋ฌธ ์ƒ์„ฑ
303
+ status_container.info("๐Ÿ“ 4๋‹จ๊ณ„: ์‹ฌ์˜๋ฌธ ์ƒ์„ฑ ์ค‘...")
304
+
305
+ try:
306
+ prompt = f"{REGULATION}\n\n{INST_PROMPT}\n\n#๊ธฐ์‚ฌ:\n{article.get('title', '')} {article.get('text', '')[:2000]}"
307
+ if image_desc:
308
+ prompt += f"\n\n#์ด๋ฏธ์ง€:\n{image_desc}"
309
+ if similar_cases:
310
+ prompt += f"\n\n#์œ ์‚ฌ์‚ฌ๋ก€:\n{similar_cases}"
311
+
312
+ if no_violation_count >= 4:
313
+ prompt += f"\n\n**์ค‘์š”**: ์œ ์‚ฌ ์‚ฌ๋ก€ 5๊ฐœ ์ค‘ {no_violation_count}๊ฐœ๊ฐ€ '์œ„๋ฐ˜ ์—†์Œ'์ž…๋‹ˆ๋‹ค. 4๊ฐœ ์ด์ƒ์ด๋ฏ€๋กœ ์ด ๊ธฐ์‚ฌ๋„ '์œ„๋ฐ˜ ์—†์Œ'์„ ๊ฐ•๋ ฅํ•˜๊ฒŒ ๊ณ ๋ คํ•˜์‹ญ์‹œ์˜ค."
314
+
315
+ decision = call_gemini(api_key, prompt, temperature=0.0)
316
+ status_container.success("โœ… ์‹ฌ์˜๋ฌธ ์ƒ์„ฑ ์™„๋ฃŒ")
317
+ except Exception as e:
318
+ status_container.error(f"โŒ ์‹ฌ์˜๋ฌธ ์ƒ์„ฑ ์‹คํŒจ: {e}")
319
+ return None
320
+
321
+ progress_bar.progress(80)
322
+
323
+ # 5. ์ตœ์ข… ๊ฒ€ํ† 
324
+ status_container.info("๐Ÿ” 5๋‹จ๊ณ„: ์ตœ์ข… ๊ฒ€ํ†  ์ค‘...")
325
+
326
+ if "์œ„๋ฐ˜ ์—†์Œ" in decision or "์œ„๋ฐ˜์—†์Œ" in decision:
327
+ final_decision = "์œ„๋ฐ˜ ์—†์Œ"
328
+ status_container.success("โœ… ๊ฒ€ํ†  ์™„๋ฃŒ: ์œ„๋ฐ˜ ์—†์Œ")
329
+ else:
330
+ try:
331
+ review_prompt = f"""๋‹น์‹ ์€ ์‹ ๋ฌธ์œค๋ฆฌ์œ„์›ํšŒ ๊ฒ€ํ†  ๋‹ด๋‹น์ž์ž…๋‹ˆ๋‹ค. ์ƒ์„ฑ๋œ ์‹ฌ์˜๋ฌธ์„ ๊ฒ€ํ† ํ•˜๊ณ  ์ˆ˜์ •ํ•˜์„ธ์š”.
332
+
333
+ #๋ถ„์„ ๋Œ€์ƒ ๊ธฐ์‚ฌ:
334
+ ์ œ๋ชฉ: {article.get('title', '')}
335
+ ๋ณธ๋ฌธ: {article.get('text', '')[:2000]}
336
+
337
+ #์ƒ์„ฑ๋œ ์‹ฌ์˜๋ฌธ:
338
+ {decision}
339
+
340
+ #์‹ ๋ฌธ์œค๋ฆฌ์‹ค์ฒœ์š”๊ฐ•:
341
+ {REGULATION}
342
+
343
+ #๊ฒ€ํ†  ์ž„๋ฌด (๋ฐ˜๋“œ์‹œ ์ค€์ˆ˜):
344
+ 1. **์กฐํ•ญ ์ •ํ™•์„ฑ**: ์ธ์šฉ๋œ ์กฐํ•ญ์ด ์‹ ๋ฌธ์œค๋ฆฌ์‹ค์ฒœ์š”๊ฐ•์— ์ •ํ™•ํžˆ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธ(์กฐํ•ญ ๋ฒˆํ˜ธ, ์กฐํ•ญ๋ช… ๋Œ€์กฐ)ํ•˜๊ณ  ํ‹€๋ฆฐ ๋ถ€๋ถ„ ์ˆ˜์ •
345
+ 2. **๊ธฐ์‚ฌ ๊ด€๋ จ์„ฑ**: ์‹ฌ์˜๋ฌธ์ด ์‹ค์ œ ๊ธฐ์‚ฌ ๋‚ด์šฉ๊ณผ ์ผ์น˜ํ•˜๋Š”์ง€ ํ™•์ธ(ํ™˜๊ฐ ๋‚ด์šฉ ์‚ญ์ œ)ํ•˜๊ณ  ํ•„์š”์‹œ ์ˆ˜์ •
346
+ 3. **ํ˜•์‹ ๊ฒ€์ฆ ๋ฐ ์ˆ˜์ •**:
347
+ - "1)", "2)", "3)" ๋“ฑ์˜ ๋ฒˆํ˜ธ๋‚˜ ์†Œ์ œ๋ชฉ์ด ์žˆ์œผ๋ฉด ๋ชจ๋‘ ์‚ญ์ œํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ๋ฌธ์žฅ์ฒด๋กœ ์ˆ˜์ •
348
+ - ๋ฐ˜๋“œ์‹œ: ๊ธฐ์‚ฌ ์š”์•ฝ(2~3๋ฌธ์žฅ) โ†’ ๋ฌธ์ œ์ (1~2๋ฌธ์žฅ) โ†’ ๊ทผ๊ฑฐ(1~2๋ฌธ์žฅ) โ†’ ๊ฒฐ๋ก ("๋”ฐ๋ผ์„œ ์œ„ ๋ณด๋„๋Š”...") ์ˆœ์„œ ์ค€์ˆ˜
349
+ 4. **๊ฒ€ํ†  ์˜๊ฒฌ ์™„์ „ ์ œ๊ฑฐ**: "์‹ฌ์˜๋ฌธ์—์„œ ์–ธ๊ธ‰๋œ...", "ํ™•์ธ๋˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค", "๊ฒ€ํ†  ๊ฒฐ๊ณผ..." ๋“ฑ์˜ ๊ฒ€ํ†  ์˜๊ฒฌ์„ ์ ˆ๋Œ€ ํฌํ•จํ•˜์ง€ ๋ง ๊ฒƒ
350
+ - ๊ฒ€ํ† ์ž์˜ ๋ฉ”ํƒ€์  ์ฝ”๋ฉ˜ํŠธ๋Š” ๋ชจ๋‘ ์‚ญ์ œ
351
+ - ์˜ค์ง ์‹ฌ์˜๋ฌธ ๋ณธ๋ฌธ๋งŒ ์ถœ๋ ฅ
352
+
353
+ ์ˆ˜์ •๋œ ์ตœ์ข… ์‹ฌ์˜๋ฌธ๋งŒ ์ถœ๋ ฅํ•˜์‹œ์˜ค (๊ฒ€ํ†  ์˜๊ฒฌ ์ ˆ๋Œ€ ํฌํ•จ ๊ธˆ์ง€):"""
354
+
355
+ final_decision = call_gemini(api_key, review_prompt, temperature=0.0)
356
+ final_decision = correct_article_reference(final_decision.strip())
357
+ status_container.success("โœ… ๊ฒ€ํ†  ์™„๋ฃŒ: ์กฐํ•ญ ์ •ํ™•์„ฑ ๋ฐ ๊ธฐ์‚ฌ ๊ด€๋ จ์„ฑ ๊ฒ€์ฆ ์™„๋ฃŒ")
358
+ except Exception as e:
359
+ status_container.warning(f"โš ๏ธ ๊ฒ€ํ†  ์‹คํŒจ: {e}")
360
+ final_decision = decision
361
+
362
+ progress_bar.progress(100)
363
+ status_container.success("๐ŸŽ‰ ๋ถ„์„ ์™„๋ฃŒ!")
364
+
365
+ # ๊ฒฐ๊ณผ ๋ฐ˜ํ™˜
366
+ return {
367
+ 'article': article,
368
+ 'image_desc': image_desc,
369
+ 'similar_cases': similar_cases,
370
+ 'violation_count': violation_count,
371
+ 'no_violation_count': no_violation_count,
372
+ 'final_decision': final_decision
373
+ }
374
+
375
+ # ========== ๋ฉ”์ธ UI ==========
376
+ url_input = st.text_input(
377
+ "๐Ÿ“Ž ๊ธฐ์‚ฌ URL ์ž…๋ ฅ",
378
+ placeholder="https://news.example.com/article/12345",
379
+ help="๋ถ„์„ํ•  ๋‰ด์Šค ๊ธฐ์‚ฌ์˜ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”"
380
+ )
381
+
382
+ col1, col2, col3 = st.columns([1, 1, 4])
383
+ with col1:
384
+ analyze_button = st.button("๐Ÿš€ ๋ถ„์„ ์‹œ์ž‘", type="primary", use_container_width=True)
385
+ with col2:
386
+ clear_button = st.button("๐Ÿ”„ ์ดˆ๊ธฐํ™”", use_container_width=True)
387
+
388
+ if clear_button:
389
+ st.rerun()
390
+
391
+ st.markdown("---")
392
+
393
+ # ========== ๋ถ„์„ ์‹คํ–‰ ==========
394
+ if analyze_button:
395
+ if not gemini_api_key:
396
+ st.error("โŒ Gemini API Key๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”!")
397
+ elif not url_input:
398
+ st.error("โŒ ๊ธฐ์‚ฌ URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”!")
399
+ else:
400
+ # ์ง„ํ–‰ ์ƒํ™ฉ ์ปจํ…Œ์ด๋„ˆ
401
+ progress_container = st.container()
402
+ status_container = st.container()
403
+
404
+ # ๋ถ„์„ ์‹คํ–‰
405
+ result = analyze_article_streamlit(url_input, gemini_api_key, progress_container, status_container)
406
+
407
+ if result:
408
+ st.markdown("---")
409
+ st.header("๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
410
+
411
+ # ๊ฒฐ๊ณผ ํ‘œ์‹œ
412
+ col1, col2 = st.columns([2, 1])
413
+
414
+ with col1:
415
+ st.subheader("๐Ÿ“ฐ ๊ธฐ์‚ฌ ์ •๋ณด")
416
+ article = result['article']
417
+
418
+ st.write(f"**์ œ๋ชฉ:** {article.get('title', 'N/A')}")
419
+ st.write(f"**์–ธ๋ก ์‚ฌ:** {article.get('media', 'N/A')}")
420
+ st.write(f"**๋‚ ์งœ:** {article.get('date', 'N/A')}")
421
+ st.write(f"**URL:** {article.get('url', url_input)}")
422
+
423
+ if article.get('image_url'):
424
+ st.write(f"**์ด๋ฏธ์ง€ URL:** {article.get('image_url', 'N/A')}")
425
+ try:
426
+ st.image(article['image_url'], caption="๊ธฐ์‚ฌ ์ด๋ฏธ์ง€", use_container_width=True)
427
+ except:
428
+ st.warning("์ด๋ฏธ์ง€๋ฅผ ๋ถˆ๋Ÿฌ์˜ฌ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
429
+
430
+ with st.expander("๐Ÿ“„ ๊ธฐ์‚ฌ ๋ณธ๋ฌธ", expanded=False):
431
+ st.write(article.get('text', 'N/A')[:1000] + "..." if len(article.get('text', '')) > 1000 else article.get('text', 'N/A'))
432
+
433
+ with col2:
434
+ st.subheader("๐Ÿ“ˆ ๋ถ„์„ ์ •๋ณด")
435
+
436
+ # ๋ถ„์„ ๊ฒฐ๊ณผ
437
+ if result['final_decision'].strip() == "์œ„๋ฐ˜ ์—†์Œ":
438
+ st.success("โœ… **๋ถ„์„ ๊ฒฐ๊ณผ:** ์œ„๋ฐ˜ ์—†์Œ")
439
+ else:
440
+ st.error("โš ๏ธ **๋ถ„์„ ๊ฒฐ๊ณผ:** ์œ„๋ฐ˜")
441
+
442
+ # ์œ ์‚ฌ ์‚ฌ๋ก€ ํ†ต๊ณ„
443
+ st.metric("์œ„๋ฐ˜ ์‚ฌ๋ก€", f"{result['violation_count']}/5")
444
+ st.metric("์œ„๋ฐ˜ ์—†์Œ ์‚ฌ๋ก€", f"{result['no_violation_count']}/5")
445
+
446
+ # ์‹ฌ์˜๋ฌธ
447
+ st.markdown("---")
448
+ st.subheader("โš–๏ธ ์ตœ์ข… ์‹ฌ์˜๋ฌธ")
449
+ st.info(result['final_decision'])
450
+
451
+ # ์œ ์‚ฌ ์‚ฌ๋ก€
452
+ if result['similar_cases']:
453
+ with st.expander("๐Ÿ“š ์œ ์‚ฌ ์‚ฌ๋ก€ (5๊ฐœ)", expanded=False):
454
+ st.text(result['similar_cases'])
455
+
456
+ # ์ด๋ฏธ์ง€ ์„ค๋ช…
457
+ if result['image_desc']:
458
+ with st.expander("๐Ÿ–ผ๏ธ ์ด๋ฏธ์ง€ ์„ค๋ช…", expanded=False):
459
+ st.write(result['image_desc'])
460
+
461
+ # ๋‹ค์šด๋กœ๋“œ ๋ฒ„ํŠผ
462
+ st.markdown("---")
463
+ result_text = f"""
464
+ # ๋‰ด์Šค ์‹ฌ์˜๋ฌธ ๋ถ„์„ ๊ฒฐ๊ณผ
465
+
466
+ ## ๊ธฐ์‚ฌ ์ •๋ณด
467
+ - **์ œ๋ชฉ:** {article.get('title', 'N/A')}
468
+ - **์–ธ๋ก ์‚ฌ:** {article.get('media', 'N/A')}
469
+ - **๋‚ ์งœ:** {article.get('date', 'N/A')}
470
+ - **URL:** {article.get('url', url_input)}
471
+ - **์ด๋ฏธ์ง€ URL:** {article.get('image_url', 'N/A')}
472
+
473
+ ## ๋ถ„์„ ๊ฒฐ๊ณผ
474
+ - **๊ฒฐ๊ณผ:** {"์œ„๋ฐ˜ ์—†์Œ" if result['final_decision'].strip() == "์œ„๋ฐ˜ ์—†์Œ" else "์œ„๋ฐ˜"}
475
+ - **์œ ์‚ฌ ์‚ฌ๋ก€ ์œ„๋ฐ˜ ์ˆ˜:** {result['violation_count']}/5
476
+
477
+ ## ์ตœ์ข… ์‹ฌ์˜๋ฌธ
478
+ {result['final_decision']}
479
+
480
+ ## ์œ ์‚ฌ ์‚ฌ๋ก€
481
+ {result['similar_cases']}
482
+ """
483
+
484
+ st.download_button(
485
+ label="๐Ÿ’พ ๊ฒฐ๊ณผ ๋‹ค์šด๋กœ๋“œ (TXT)",
486
+ data=result_text,
487
+ file_name=f"์‹ฌ์˜๋ฌธ_๋ถ„์„๊ฒฐ๊ณผ_{time.strftime('%Y%m%d_%H%M%S')}.txt",
488
+ mime="text/plain"
489
+ )
490
+
491
+ else:
492
+ st.info("๐Ÿ‘† ๊ธฐ์‚ฌ URL์„ ์ž…๋ ฅํ•˜๊ณ  '๋ถ„์„ ์‹œ์ž‘' ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์„ธ์š”.")
news_text_scraper.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ๋‰ด์Šค ๊ธฐ์‚ฌ ์Šคํฌ๋ž˜ํ•‘ ๋„๊ตฌ (์ตœ์ ํ™” ๋ฒ„์ „)
3
+
4
+ ์‚ฌ์šฉ๋ฒ•:
5
+ article = extract_article(url)
6
+
7
+ ๋ฐ˜ํ™˜ ํ˜•์‹ (JSON):
8
+ {
9
+ 'title': '๊ธฐ์‚ฌ ์ œ๋ชฉ',
10
+ 'text': '๊ธฐ์‚ฌ ๋ณธ๋ฌธ ํ…์ŠคํŠธ',
11
+ 'image_url': '๋Œ€ํ‘œ ์ด๋ฏธ์ง€ URL'
12
+ }
13
+
14
+ ์˜์กด์„ฑ: pip3 install trafilatura newspaper3k playwright beautifulsoup4 requests fake-useragent extruct
15
+ Playwright ์ดˆ๊ธฐ ์„ค์น˜: playwright install chromium
16
+
17
+ ์„ฑ๋Šฅ ์ตœ์  ์ˆœ์„œ:
18
+ 1. Trafilatura (๊ฐ€์žฅ ๋น ๋ฅด๊ณ  ์ •ํ™•, ์ •์  ์ฝ˜ํ…์ธ )
19
+ 2. Newspaper3k (๋น ๋ฅด๊ณ  ํ•œ๊ตญ์–ด ์ง€์› ์šฐ์ˆ˜)
20
+ 3. Playwright + Trafilatura (JavaScript ๋ Œ๋”๋ง ํ•„์š”์‹œ)
21
+ 4. Playwright + Newspaper3k (๋Œ€์ฒด ๋ฐฉ๋ฒ•)
22
+ """
23
+
24
+ import json
25
+ import time
26
+ from typing import Optional, Dict
27
+ from urllib.parse import urljoin
28
+
29
+ import requests
30
+ import trafilatura
31
+ from bs4 import BeautifulSoup
32
+ from newspaper import Article
33
+
34
+ try:
35
+ from fake_useragent import UserAgent
36
+ ua = UserAgent()
37
+ USER_AGENT = ua.random
38
+ except ImportError:
39
+ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
40
+
41
+ try:
42
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
43
+ PLAYWRIGHT_AVAILABLE = True
44
+ except ImportError:
45
+ PLAYWRIGHT_AVAILABLE = False
46
+ print("โš ๏ธ Playwright ๋ฏธ์„ค์น˜ - JavaScript ๋ Œ๋”๋ง ๊ธฐ๋Šฅ ๋น„ํ™œ์„ฑํ™”")
47
+
48
+ try:
49
+ import extruct
50
+ EXTRUCT_AVAILABLE = True
51
+ except ImportError:
52
+ EXTRUCT_AVAILABLE = False
53
+
54
+ # HTTP ํ—ค๋” ์„ค์ •
55
+ HEADERS = {
56
+ 'User-Agent': USER_AGENT,
57
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58
+ 'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3',
59
+ 'Accept-Encoding': 'gzip, deflate',
60
+ 'DNT': '1',
61
+ 'Connection': 'keep-alive',
62
+ 'Upgrade-Insecure-Requests': '1',
63
+ }
64
+
65
def fetch_with_headers(url: str) -> str:
    """Download ``url`` using the module-level ``HEADERS`` and return the body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # Without an explicit charset in Content-Type, requests assumes ISO-8859-1
    # for text/* bodies, which garbles UTF-8 / EUC-KR pages. Fall back to the
    # encoding detected from the content in that case.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text
70
+
71
def _jsonld_image_url(value) -> Optional[str]:
    """Return the URL carried by a JSON-LD ``image`` value (str, dict or list), if any."""
    if isinstance(value, list):
        # Only the first entry is considered, as a list is ordered by the publisher.
        value = value[0] if value else None
    if isinstance(value, str):
        return value or None
    if isinstance(value, dict):
        url = value.get('url')
        return url if isinstance(url, str) and url else None
    return None


def extract_images_from_html(html: str, base_url: str = "") -> Optional[str]:
    """Pick a representative image URL from an HTML document.

    Sources are tried in order and the first usable URL wins:

    1. ``og:image`` meta tag
    2. ``twitter:image`` meta tag
    3. ``image`` field of JSON-LD items (only when extruct is available)
    4. an ``<img>`` inside an ``article`` / ``.article`` / ``#article`` container
    5. any other ``<img>`` that does not look like a logo/icon/ad and is
       at least 200px wide or high (or has an unparsable size attribute)

    Args:
        html: Raw HTML markup.
        base_url: Page URL used to resolve relative image paths.

    Returns:
        The image URL, or ``None`` when no candidate is found.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # 1-2. Social-card meta tags; resolve against base_url in case they are relative.
    for attrs in ({'property': 'og:image'}, {'name': 'twitter:image'}):
        tag = soup.find('meta', attrs=attrs)
        content = tag.get('content') if tag else None
        if content:
            return urljoin(base_url, content)

    # 3. JSON-LD structured data parsed by extruct.
    if EXTRUCT_AVAILABLE:
        try:
            metadata = extruct.extract(html, base_url=base_url)
            for item in metadata.get('json-ld', []):
                if not isinstance(item, dict):
                    continue
                found = _jsonld_image_url(item.get('image'))
                if found:
                    return found
        except Exception:
            # Structured-data parsing is best-effort; fall through to <img> tags.
            pass

    # 4. First image with a non-empty src inside an article container.
    for img in soup.select('article img[src], .article img[src], #article img[src]'):
        src = img.get('src')
        if src:
            return urljoin(base_url, src)

    # 5. Generic <img> tags, skipping logos, icons and similar decoration.
    # NOTE(review): this is a plain substring match, so 'ad' also rejects paths
    # such as 'upload' or 'thread' - confirm that is intended.
    excluded = ['logo', 'icon', 'avatar', 'profile', 'ad', 'banner']
    for img in soup.find_all('img', src=True):
        src = img.get('src')
        if not src or any(marker in src.lower() for marker in excluded):
            continue
        try:
            large_enough = (int(img.get('width', '0')) >= 200
                            or int(img.get('height', '0')) >= 200)
        except (TypeError, ValueError):
            # Size attribute is not a plain integer (e.g. "100%"): accept the image.
            large_enough = True
        if large_enough:
            return urljoin(base_url, src)

    return None
125
+
126
def extract_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Extract an article from the directly fetched page with Trafilatura.

    Args:
        url: Article address.

    Returns:
        A dict with ``title``, ``text`` and ``image_url``, or ``None`` when the
        download or extraction fails or Trafilatura yields nothing. Errors are
        printed rather than raised.
    """
    try:
        page_source = fetch_with_headers(url)
        extracted = trafilatura.extract(
            page_source,
            output_format='json',
            url=url,
            include_images=True,
            include_links=True,
        )
        if not extracted:
            return None

        parsed = json.loads(extracted)
        # Prefer the image Trafilatura reports; otherwise scan the HTML for one.
        picture = parsed.get('image')
        if not picture:
            picture = extract_images_from_html(page_source, url)

        return {
            'title': parsed.get('title'),
            'text': parsed.get('text'),
            'image_url': picture,
        }
    except Exception as e:
        print(f"Trafilatura ์‹คํŒจ: {e}")
    return None
144
+
145
def extract_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Extract an article from the directly fetched page with Newspaper3k.

    Args:
        url: Article address.

    Returns:
        A dict with ``title``, ``text`` and ``image_url``, or ``None`` if any
        step raises. Errors are printed rather than raised.
    """
    try:
        raw_html = fetch_with_headers(url)

        doc = Article(url)
        doc.config.browser_user_agent = HEADERS['User-Agent']
        # Feed the HTML we already downloaded instead of letting Newspaper fetch it.
        doc.set_html(raw_html)
        doc.parse()

        lead_image = doc.top_image
        if not lead_image:
            lead_image = extract_images_from_html(raw_html, url)

        return dict(title=doc.title, text=doc.text, image_url=lead_image)
    except Exception as e:
        print(f"Newspaper3k ์‹คํŒจ: {e}")
    return None
164
+
165
def get_rendered_html_playwright(url: str, wait: int = 2) -> Optional[str]:
    """Load ``url`` in headless Chromium and return the rendered HTML.

    Args:
        url: Page to open.
        wait: Seconds to sleep after ``domcontentloaded`` so scripts can
            populate the page before the DOM is serialized.

    Returns:
        The page HTML, or ``None`` if Playwright raises (the error is printed).
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=HEADERS['User-Agent'],
                    viewport={'width': 1920, 'height': 1080}
                )
                page = context.new_page()
                page.goto(url, wait_until='domcontentloaded', timeout=30000)
                time.sleep(wait)
                return page.content()
            finally:
                # Close the browser even when navigation times out or fails.
                browser.close()
    except Exception as e:
        print(f"Playwright ์˜ค๋ฅ˜: {e}")
    return None
183
+
184
def extract_playwright_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Render the page with Playwright, then extract the article with Trafilatura.

    Args:
        url: Article address.

    Returns:
        A dict with ``title``, ``text`` and ``image_url``, or ``None`` when
        rendering fails, Trafilatura yields nothing, or an error is raised
        (errors are printed).
    """
    try:
        rendered = get_rendered_html_playwright(url)
        if not rendered:
            return None

        extracted = trafilatura.extract(
            rendered,
            output_format='json',
            url=url,
            include_images=True,
            include_links=True,
        )
        if not extracted:
            return None

        parsed = json.loads(extracted)
        # Prefer the image Trafilatura reports; otherwise scan the rendered HTML.
        picture = parsed.get('image')
        if not picture:
            picture = extract_images_from_html(rendered, url)

        return {
            'title': parsed.get('title'),
            'text': parsed.get('text'),
            'image_url': picture,
        }
    except Exception as e:
        print(f"Playwright+Trafilatura ์‹คํŒจ: {e}")
    return None
203
+
204
def extract_playwright_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Render the page with Playwright, then extract the article with Newspaper3k.

    Args:
        url: Article address.

    Returns:
        A dict with ``title``, ``text`` and ``image_url``, or ``None`` when
        rendering fails or an error is raised (errors are printed).
    """
    try:
        html = get_rendered_html_playwright(url)
        if not html:
            return None

        # Pass the real page URL (as extract_newspaper does) so the parser
        # knows the page the HTML came from instead of an empty URL.
        article = Article(url)
        article.set_html(html)
        article.parse()

        image_url = article.top_image or extract_images_from_html(html, url)

        return {
            'title': article.title,
            'text': article.text,
            'image_url': image_url
        }
    except Exception as e:
        print(f"Playwright+Newspaper3k ์‹คํŒจ: {e}")
    return None
223
+
224
+
225
def extract_article(url: str) -> Optional[Dict[str, str]]:
    """Extract title, text and lead image by trying several extractors in turn.

    Extractors run in this order, and each one only fills fields that are still
    empty: Trafilatura, Newspaper3k, then (when Playwright is available)
    Playwright+Trafilatura and Playwright+Newspaper3k. The loop stops as soon
    as all three fields are filled.

    Args:
        url: Article address.

    Returns:
        A dict with ``title``, ``text`` and ``image_url`` (missing fields stay
        ``None``), or ``None`` if neither a title nor a text was obtained.

    Raises:
        requests.HTTPError: Re-raised when an extractor surfaces an HTTP 403
            or 429 response.
    """
    print(f"๐Ÿ” ์ถ”์ถœ ์‹œ์ž‘: {url}")

    result = dict.fromkeys(('title', 'text', 'image_url'))

    # Cheapest extractors first; browser-based ones only when Playwright exists.
    pipeline = [
        ("Trafilatura", extract_trafilatura),
        ("Newspaper3k", extract_newspaper),
    ]
    if PLAYWRIGHT_AVAILABLE:
        pipeline += [
            ("Playwright+Trafilatura", extract_playwright_trafilatura),
            ("Playwright+Newspaper3k", extract_playwright_newspaper),
        ]

    for i, (name, run) in enumerate(pipeline, 1):
        print(f" {i}๏ธโƒฃ {name} ์‹œ๋„...")
        try:
            found = run(url)
            if not found:
                print(f" โŒ {name} ์‹คํŒจ")
                continue

            # Fill only the fields that earlier extractors left empty.
            updated = [key for key in result if not result[key] and found.get(key)]
            for key in updated:
                result[key] = found[key]

            if updated:
                print(f" โ†’ ์ถ”์ถœ ์„ฑ๊ณต: {', '.join(updated)}")

            # Everything present: no need to try the remaining extractors.
            if all(result.values()):
                print(f" โœ… {name} ์™„๋ฃŒ! (์ œ๋ชฉ O, ๋ณธ๋ฌธ O, ์ด๋ฏธ์ง€ O)")
                return result

            status = f"์ œ๋ชฉ: {'O' if result['title'] else 'X'}, ๋ณธ๋ฌธ: {'O' if result['text'] else 'X'}, ์ด๋ฏธ์ง€: {'O' if result['image_url'] else 'X'}"
            if not (result['title'] and result['text']):
                print(f" โš ๏ธ ๋ถ€๋ถ„ ์„ฑ๊ณต ({status})")
            else:
                print(f" โš ๏ธ ์ด๋ฏธ์ง€ ์—†์Œ - ๋‹ค์Œ ๋‹จ๊ณ„ ๊ณ„์† ({status})")
        except requests.HTTPError as e:
            # Being blocked or rate-limited is propagated to the caller.
            if e.response.status_code in (403, 429):
                print(f" โŒ {name} ์ฐจ๋‹จ๋จ (HTTP {e.response.status_code})")
                raise
            print(f" โŒ {name} ์˜ค๋ฅ˜: {e}")
        except Exception as e:
            print(f" โŒ {name} ์˜ค๋ฅ˜: {e}")

    if not (result['title'] or result['text']):
        print(" โŒ ๋ชจ๋“  ๋ฐฉ๋ฒ• ์‹คํŒจ")
        return None

    print(f" โœ… ์ตœ์ข… ๊ฒฐ๊ณผ - ์ œ๋ชฉ: {'O' if result['title'] else 'X'}, ๋ณธ๋ฌธ: {'O' if result['text'] else 'X'}, ์ด๋ฏธ์ง€: {'O' if result['image_url'] else 'X'}")
    return result
290
+
291
if __name__ == "__main__":
    # Manual smoke test: run the extractor chain against a few sample articles.
    test_urls = [
        "https://www.chosun.com/national/education/2025/07/19/4OMZBICJSNDGXA567IKPRBUFKA/",
        "https://news.nate.com/view/20250521n37437",
        "https://www.hani.co.kr/arti/society/society_general/1204840.html"
    ]

    print(f"์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๋„๊ตฌ:")
    print(f" - Playwright: {'โœ…' if PLAYWRIGHT_AVAILABLE else 'โŒ'}")
    print(f" - Extruct: {'โœ…' if EXTRUCT_AVAILABLE else 'โŒ'}")
    # 'ua' only exists at module level when the fake_useragent import succeeded.
    print(f" - Fake UserAgent: {'โœ…' if 'ua' in dir() else 'โŒ'}\n")

    for url in test_urls:
        print(f"\n{'='*60}")
        try:
            article = extract_article(url)
            if article:
                # extract_article always sets all three keys, possibly to None,
                # so dict.get() defaults never apply; use `or` for the fallback.
                title = article.get('title') or 'N/A'
                text = article.get('text') or ''
                image_url = article.get('image_url')
                print(f"\n๐Ÿ“„ ์ œ๋ชฉ: {title[:100]}...")
                print(f"๐Ÿ“ ๋ณธ๋ฌธ: {len(text)}์ž")
                print(f"๐Ÿ–ผ๏ธ ์ด๋ฏธ์ง€: {image_url[:80]}..." if image_url else "๐Ÿ–ผ๏ธ ์ด๋ฏธ์ง€: ์—†์Œ")
            else:
                print("์ถ”์ถœ ์‹คํŒจ")
        except Exception as e:
            print(f"์ „์ฒด ์‹คํŒจ: {e}")
        print("="*60)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ # System packages for Streamlit and dependencies
requirements.txt CHANGED
@@ -1,3 +1,31 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Dependencies
2
+ streamlit==1.51.0
3
+ google-generativeai==0.8.5
4
+ python-dotenv==1.1.1
5
+
6
+ # LangGraph and Agents
7
+ langgraph==0.6.8
8
+ langgraph-checkpoint==2.1.1
9
+
10
+ # Vector Database
11
+ chromadb==1.1.0
12
+
13
+ # Machine Learning & Embeddings
14
+ sentence-transformers==5.1.1
15
+ torch>=2.0.0
16
+ transformers>=4.30.0
17
+
18
+ # Image Processing
19
+ Pillow>=11.0.0
20
+
21
+ # Web Scraping
22
+ requests==2.32.5
23
+ beautifulsoup4==4.14.2
24
+ lxml==6.0.2
25
+
26
+ # Data Processing
27
+ pandas>=2.0.0
28
+ numpy>=1.23.0
29
+
30
+ # Utilities
31
+ tqdm>=4.60.0