File size: 4,811 Bytes
1a02034 36ac952 1a02034 f8c17f1 0d8bd1c 36ac952 0d8bd1c 1a02034 36ac952 d26d216 0d8bd1c 1a02034 36ac952 0d8bd1c a241c57 d26d216 1a02034 0d8bd1c d26d216 0d8bd1c d26d216 0d8bd1c d26d216 1a02034 b18692e d26d216 c947445 d26d216 1a02034 b18692e d26d216 c947445 1a02034 d26d216 1a02034 b18692e d26d216 1a02034 0d8bd1c 1a02034 0d8bd1c 1a02034 aa70416 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | import gradio as gr
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def scrape_naver_blog(url):
try:
# ์ธ์
์์ฑ
session = requests.Session()
# HTTP ์์ฒญ ํค๋ ์ค์ (๋ค์ด๋ฒ๋ User-Agent๋ฅผ ํ์ธํ ์ ์์)
headers = {
'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/58.0.3029.110 Safari/537.3')
}
# 1. ๋ฉ์ธ ํ์ด์ง ์์ฒญ
response = session.get(url, headers=headers)
if response.status_code != 200:
return f"Error: Unable to fetch the main page. Status code: {response.status_code}", ""
# ๋๋ฒ๊น
: ๋ฉ์ธ ํ์ด์ง์ HTML ์ผ๋ถ ์ถ๋ ฅ (์ฒ์ 5000์)
soup_debug_main = BeautifulSoup(response.text, 'html.parser')
debug_info = soup_debug_main.prettify()[:5000]
# BeautifulSoup์ ์ฌ์ฉํ์ฌ iframe src ์ถ์ถ
soup_main = BeautifulSoup(response.text, 'html.parser')
iframe = soup_main.find('iframe', id='mainFrame')
if not iframe:
return "Error: iframe์ ์ฐพ์ ์ ์์ต๋๋ค.", debug_info
iframe_src = iframe.get('src')
if not iframe_src:
return "Error: iframe์ src ์์ฑ์ ์ฐพ์ ์ ์์ต๋๋ค.", debug_info
# iframe src๊ฐ ์๋ ๊ฒฝ๋ก์ผ ๊ฒฝ์ฐ ์ ๋ ๊ฒฝ๋ก๋ก ๋ณํ
iframe_url = urljoin(url, iframe_src)
# 2. iframe ํ์ด์ง ์์ฒญ
iframe_response = session.get(iframe_url, headers=headers)
if iframe_response.status_code != 200:
return f"Error: Unable to fetch the iframe page. Status code: {iframe_response.status_code}", debug_info
# ๋๋ฒ๊น
: iframe ํ์ด์ง์ HTML ์ผ๋ถ ์ถ๊ฐ ์ถ๋ ฅ (์ฒ์ 5000์)
soup_debug_iframe = BeautifulSoup(iframe_response.text, 'html.parser')
debug_info += "\n\n=== iframe HTML ์ผ๋ถ ===\n" + soup_debug_iframe.prettify()[:5000]
# 3. lxml์ ์ฌ์ฉํ์ฌ iframe HTML ํ์ฑ
tree = html.fromstring(iframe_response.content)
# ์ฌ์ฉ์ ์ ๊ณต XPath๋ฅผ ์ฌ์ฉํ์ฌ ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ์ถ์ถ
title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p'
body_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]'
# ์ ๋ชฉ ์ถ์ถ
title_elements = tree.xpath(title_xpath)
if not title_elements:
# XPath๋ก ์ ๋ชฉ์ ์ฐพ์ง ๋ชปํ ๊ฒฝ์ฐ, og:title ๋ฉํ ํ๊ทธ ์ฌ์ฉ
meta_title = tree.xpath('//meta[@property="og:title"]/@content')
if meta_title:
title = meta_title[0].strip()
else:
title = "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
else:
# ์ถ์ถ๋ ์์์ ํ
์คํธ ํฉ์น๊ธฐ
title = ''.join(title_elements[0].itertext()).strip()
# ๋ณธ๋ฌธ ์ถ์ถ
body_elements = tree.xpath(body_xpath)
if not body_elements:
# XPath๋ก ๋ณธ๋ฌธ์ ์ฐพ์ง ๋ชปํ ๊ฒฝ์ฐ, og:description ๋ฉํ ํ๊ทธ ์ฌ์ฉ
meta_description = tree.xpath('//meta[@property="og:description"]/@content')
if meta_description:
body = meta_description[0].strip()
else:
body = "๋ด์ฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
else:
body = ''.join(body_elements[0].itertext()).strip()
# ์ถ๋ ฅ ํ์
output_title = f"์ ๋ชฉ :\n{title}"
output_content = f"๋ด์ฉ :\n{body}"
# ์ต์ข
๋๋ฒ๊น
์ ๋ณด
final_debug_info = f"๋๋ฒ๊น
์ ๋ณด (๋ฉ์ธ ํ์ด์ง HTML ์ผ๋ถ):\n{debug_info}"
return f"{output_title}\n\n{output_content}", final_debug_info
except Exception as e:
# ์์ธ ๋ฐ์ ์ ์๋ฌ ๋ฉ์์ง์ ๋น ๋๋ฒ๊น
์ ๋ณด๋ฅผ ๋ฐํ
return f"An error occurred: {str(e)}", ""
# Gradio ์ธํฐํ์ด์ค ๊ตฌ์ฑ (ํจ์ ๋ฐ์ ์์น)
title = "๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํผ"
description = "๋ค์ด๋ฒ ๋ธ๋ก๊ทธ URL์ ์
๋ ฅํ๋ฉด ์ ๋ชฉ๊ณผ ๋ด์ฉ์ ์คํฌ๋ํํฉ๋๋ค."
iface = gr.Interface(
fn=scrape_naver_blog,
inputs=gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="๋ธ๋ก๊ทธ URL"),
outputs=[
gr.Textbox(label="๊ฒฐ๊ณผ"),
gr.Textbox(label="๋๋ฒ๊น
์ ๋ณด")
],
title=title,
description=description,
allow_flagging="never"
)
iface.launch()
|