Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
|
|
|
| 4 |
|
| 5 |
# 디버깅(로그)용 함수
|
| 6 |
def debug_log(message: str):
|
|
@@ -27,26 +28,60 @@ def scrape_naver_blog(url: str) -> str:
|
|
| 27 |
}
|
| 28 |
|
| 29 |
try:
|
|
|
|
| 30 |
response = requests.get(url, headers=headers)
|
| 31 |
-
debug_log("HTTP GET ์์ฒญ ์๋ฃ")
|
| 32 |
|
| 33 |
# ์๋ต ์ํ์ฝ๋ ํ์ธ
|
| 34 |
if response.status_code != 200:
|
| 35 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
| 36 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
| 37 |
|
| 38 |
-
# BeautifulSoup ํ์ฑ
|
| 39 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 40 |
-
debug_log("HTML ํ์ฑ ์๋ฃ")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# ์ ๋ชฉ ์ถ์ถ
|
| 43 |
-
title_div =
|
| 44 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 45 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
| 46 |
|
| 47 |
# ๋ณธ๋ฌธ ์ถ์ถ
|
| 48 |
-
content_div =
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
| 51 |
|
| 52 |
# ๊ฒฐ๊ณผ ํฉ์น๊ธฐ
|
|
@@ -62,16 +97,14 @@ def scrape_naver_blog(url: str) -> str:
|
|
| 62 |
|
| 63 |
# Gradio 인터페이스
|
| 64 |
def main_interface():
|
| 65 |
-
# ์
๋ ฅ: ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ
|
| 66 |
-
# ์ถ๋ ฅ: ์ ๋ชฉ + ๋ณธ๋ฌธ ๋ด์ฉ
|
| 67 |
interface = gr.Interface(
|
| 68 |
fn=scrape_naver_blog,
|
| 69 |
-
inputs=gr.
|
| 70 |
lines=1,
|
| 71 |
label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ",
|
| 72 |
placeholder="์: https://blog.naver.com/ssboost/222983068507"
|
| 73 |
),
|
| 74 |
-
outputs=gr.
|
| 75 |
title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํผ",
|
| 76 |
description="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ๋ฅผ ์
๋ ฅํ๋ฉด ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ์ถ์ถํ์ฌ ํ์ํฉ๋๋ค."
|
| 77 |
)
|
|
@@ -81,5 +114,4 @@ if __name__ == "__main__":
|
|
| 81 |
debug_log("Gradio ์ฑ ์คํ ์์")
|
| 82 |
demo = main_interface()
|
| 83 |
demo.launch()
|
| 84 |
-
debug_log("Gradio ์ฑ ์คํ ์ข
๋ฃ")
|
| 85 |
-
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
+
import urllib.parse # iframe 경로가 상대경로일 경우 절대경로로 만들기 위해 사용
|
| 5 |
|
| 6 |
# 디버깅(로그)용 함수
|
| 7 |
def debug_log(message: str):
|
|
|
|
| 28 |
}
|
| 29 |
|
| 30 |
try:
|
| 31 |
+
# 1) ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ '๋ฉ์ธ' ํ์ด์ง ์์ฒญ
|
| 32 |
response = requests.get(url, headers=headers)
|
| 33 |
+
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
| 34 |
|
| 35 |
# ์๋ต ์ํ์ฝ๋ ํ์ธ
|
| 36 |
if response.status_code != 200:
|
| 37 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
| 38 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
| 39 |
|
| 40 |
+
# BeautifulSoup ํ์ฑ (๋ฉ์ธ ํ์ด์ง)
|
| 41 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 42 |
+
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
| 43 |
+
|
| 44 |
+
# 2) iframe ํ๊ทธ ์ฐพ๊ธฐ
|
| 45 |
+
iframe = soup.select_one("iframe#mainFrame")
|
| 46 |
+
if not iframe:
|
| 47 |
+
# iframe ์์ฒด๋ฅผ ์ฐพ์ง ๋ชปํ ๊ฒฝ์ฐ
|
| 48 |
+
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 49 |
+
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 50 |
+
|
| 51 |
+
iframe_src = iframe.get("src")
|
| 52 |
+
if not iframe_src:
|
| 53 |
+
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
| 54 |
+
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 55 |
+
|
| 56 |
+
# 3) iframe src๊ฐ ์๋๊ฒฝ๋ก์ธ ๊ฒฝ์ฐ ์ ๋๊ฒฝ๋ก๋ก ๋ณด์
|
| 57 |
+
# (์: //blog.naver.com/~~~ ์ ๊ฐ์ ๊ฒฝ์ฐ๋ฅผ ์ฒ๋ฆฌ)
|
| 58 |
+
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
| 59 |
+
|
| 60 |
+
# iframe ํ์ด์ง๋ก ์ฌ์์ฒญ
|
| 61 |
+
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
| 62 |
+
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
| 63 |
+
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
| 64 |
+
|
| 65 |
+
if iframe_response.status_code != 200:
|
| 66 |
+
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
| 67 |
+
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
| 68 |
+
|
| 69 |
+
# 4) iframe ํ์ด์ง ํ์ฑ
|
| 70 |
+
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
| 71 |
+
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
| 72 |
|
| 73 |
# ์ ๋ชฉ ์ถ์ถ
|
| 74 |
+
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
| 75 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 76 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
| 77 |
|
| 78 |
# ๋ณธ๋ฌธ ์ถ์ถ
|
| 79 |
+
content_div = iframe_soup.select_one('.se-main-container')
|
| 80 |
+
if content_div:
|
| 81 |
+
# ๋ณธ๋ฌธ์ \n ๊ธฐ์ค์ผ๋ก ๊ตฌ๋ถํด์ ์ข ๋ ๊น๋ํ๊ฒ ๋ง๋ค๊ธฐ
|
| 82 |
+
content = content_div.get_text("\n", strip=True)
|
| 83 |
+
else:
|
| 84 |
+
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 85 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
| 86 |
|
| 87 |
# ๊ฒฐ๊ณผ ํฉ์น๊ธฐ
|
|
|
|
| 97 |
|
| 98 |
# Gradio 인터페이스
|
| 99 |
def main_interface():
|
|
|
|
|
|
|
| 100 |
interface = gr.Interface(
|
| 101 |
fn=scrape_naver_blog,
|
| 102 |
+
inputs=gr.Textbox(
|
| 103 |
lines=1,
|
| 104 |
label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ",
|
| 105 |
placeholder="์: https://blog.naver.com/ssboost/222983068507"
|
| 106 |
),
|
| 107 |
+
outputs=gr.Textbox(label="๊ฒฐ๊ณผ"),
|
| 108 |
title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํผ",
|
| 109 |
description="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ๋ฅผ ์
๋ ฅํ๋ฉด ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ์ถ์ถํ์ฌ ํ์ํฉ๋๋ค."
|
| 110 |
)
|
|
|
|
| 114 |
debug_log("Gradio ์ฑ ์คํ ์์")
|
| 115 |
demo = main_interface()
|
| 116 |
demo.launch()
|
| 117 |
+
debug_log("Gradio ์ฑ ์คํ ์ข
๋ฃ")
|
|
|