# ddd/app.py — Naver blog scraper (Gradio app)
# Author: Kims12 — commit aa70416 (verified)
import gradio as gr
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def scrape_naver_blog(url):
    """Scrape the title and body text from a Naver blog post.

    Naver blog pages embed the actual post inside an iframe (id="mainFrame"),
    so this fetches the outer page, resolves the iframe URL, fetches that
    document, and extracts the title/body via hard-coded XPaths, falling back
    to the og:title / og:description meta tags when the XPaths miss.

    Args:
        url: Full URL of a Naver blog post (e.g. https://blog.naver.com/...).

    Returns:
        A 2-tuple of strings ``(result_text, debug_info)``. Errors are
        reported inside ``result_text`` rather than raised, so the Gradio
        UI always receives displayable strings.
    """
    try:
        session = requests.Session()
        # Naver inspects the User-Agent, so present a desktop-browser one.
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/58.0.3029.110 Safari/537.3')
        }

        # 1. Fetch the main page. The timeout keeps a dead connection from
        # hanging the UI indefinitely (the original had no timeout).
        response = session.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return f"Error: Unable to fetch the main page. Status code: {response.status_code}", ""

        # Parse the main page once and reuse the tree (the original parsed
        # the identical HTML twice). Debug info: first 5000 chars.
        soup_main = BeautifulSoup(response.text, 'html.parser')
        debug_info = soup_main.prettify()[:5000]

        # Locate the iframe that holds the real post content.
        iframe = soup_main.find('iframe', id='mainFrame')
        if not iframe:
            return "Error: iframe을 찾을 수 없습니다.", debug_info
        iframe_src = iframe.get('src')
        if not iframe_src:
            return "Error: iframe의 src 속성을 찾을 수 없습니다.", debug_info

        # The iframe src is typically relative; resolve it against the page URL.
        iframe_url = urljoin(url, iframe_src)

        # 2. Fetch the iframe page (the actual post document).
        iframe_response = session.get(iframe_url, headers=headers, timeout=10)
        if iframe_response.status_code != 200:
            return f"Error: Unable to fetch the iframe page. Status code: {iframe_response.status_code}", debug_info

        # Append the first 5000 chars of the iframe HTML to the debug output.
        soup_iframe = BeautifulSoup(iframe_response.text, 'html.parser')
        debug_info += "\n\n=== iframe HTML 일부 ===\n" + soup_iframe.prettify()[:5000]

        # 3. Parse the iframe HTML with lxml for XPath support.
        tree = html.fromstring(iframe_response.content)

        # Absolute XPaths captured from one specific Naver layout — fragile by
        # nature, hence the meta-tag fallbacks below.
        title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p'
        body_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]'

        # Title: XPath first, then og:title meta tag, then a fixed message.
        title_elements = tree.xpath(title_xpath)
        if title_elements:
            title = ''.join(title_elements[0].itertext()).strip()
        else:
            meta_title = tree.xpath('//meta[@property="og:title"]/@content')
            title = meta_title[0].strip() if meta_title else "제목을 찾을 수 없습니다."

        # Body: XPath first, then og:description meta tag, then a fixed message.
        body_elements = tree.xpath(body_xpath)
        if body_elements:
            body = ''.join(body_elements[0].itertext()).strip()
        else:
            meta_description = tree.xpath('//meta[@property="og:description"]/@content')
            body = meta_description[0].strip() if meta_description else "내용을 찾을 수 없습니다."

        # Assemble the user-facing result and the final debug payload.
        output_title = f"제목 :\n{title}"
        output_content = f"내용 :\n{body}"
        final_debug_info = f"디버깅 정보 (메인 페이지 HTML 일부):\n{debug_info}"
        return f"{output_title}\n\n{output_content}", final_debug_info

    except Exception as e:
        # UI boundary: surface the error as text (with empty debug info)
        # instead of crashing the Gradio app.
        return f"An error occurred: {str(e)}", ""
# Gradio UI wiring (module level, outside the scraping function).
title = "네이버 블로그 스크래퍼"
description = "네이버 블로그 URL을 입력하면 제목과 내용을 스크래핑합니다."

# Build the components up front so the Interface call stays readable.
url_input = gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="블로그 URL")
result_output = gr.Textbox(label="결과")
debug_output = gr.Textbox(label="디버깅 정보")

iface = gr.Interface(
    fn=scrape_naver_blog,
    inputs=url_input,
    outputs=[result_output, debug_output],
    title=title,
    description=description,
    allow_flagging="never",
)
iface.launch()