| | import gradio as gr |
| | import requests |
| | from lxml import html |
| | from bs4 import BeautifulSoup |
| | from urllib.parse import urljoin |
| |
|
| | def scrape_naver_blog(url): |
| | try: |
| | |
| | session = requests.Session() |
| | |
| | |
| | headers = { |
| | 'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' |
| | 'AppleWebKit/537.36 (KHTML, like Gecko) ' |
| | 'Chrome/58.0.3029.110 Safari/537.3') |
| | } |
| | |
| | |
| | response = session.get(url, headers=headers) |
| | |
| | if response.status_code != 200: |
| | return f"Error: Unable to fetch the main page. Status code: {response.status_code}", "" |
| | |
| | |
| | soup_debug_main = BeautifulSoup(response.text, 'html.parser') |
| | debug_info = soup_debug_main.prettify()[:5000] |
| | |
| | |
| | soup_main = BeautifulSoup(response.text, 'html.parser') |
| | iframe = soup_main.find('iframe', id='mainFrame') |
| | if not iframe: |
| | return "Error: iframe์ ์ฐพ์ ์ ์์ต๋๋ค.", debug_info |
| | |
| | iframe_src = iframe.get('src') |
| | if not iframe_src: |
| | return "Error: iframe์ src ์์ฑ์ ์ฐพ์ ์ ์์ต๋๋ค.", debug_info |
| | |
| | |
| | iframe_url = urljoin(url, iframe_src) |
| | |
| | |
| | iframe_response = session.get(iframe_url, headers=headers) |
| | |
| | if iframe_response.status_code != 200: |
| | return f"Error: Unable to fetch the iframe page. Status code: {iframe_response.status_code}", debug_info |
| | |
| | |
| | soup_debug_iframe = BeautifulSoup(iframe_response.text, 'html.parser') |
| | debug_info += "\n\n=== iframe HTML ์ผ๋ถ ===\n" + soup_debug_iframe.prettify()[:5000] |
| | |
| | |
| | tree = html.fromstring(iframe_response.content) |
| | |
| | |
| | title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p' |
| | body_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]' |
| | |
| | |
| | title_elements = tree.xpath(title_xpath) |
| | if not title_elements: |
| | |
| | meta_title = tree.xpath('//meta[@property="og:title"]/@content') |
| | if meta_title: |
| | title = meta_title[0].strip() |
| | else: |
| | title = "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค." |
| | else: |
| | |
| | title = ''.join(title_elements[0].itertext()).strip() |
| | |
| | |
| | body_elements = tree.xpath(body_xpath) |
| | if not body_elements: |
| | |
| | meta_description = tree.xpath('//meta[@property="og:description"]/@content') |
| | if meta_description: |
| | body = meta_description[0].strip() |
| | else: |
| | body = "๋ด์ฉ์ ์ฐพ์ ์ ์์ต๋๋ค." |
| | else: |
| | body = ''.join(body_elements[0].itertext()).strip() |
| | |
| | |
| | output_title = f"์ ๋ชฉ :\n{title}" |
| | output_content = f"๋ด์ฉ :\n{body}" |
| | |
| | |
| | final_debug_info = f"๋๋ฒ๊น
์ ๋ณด (๋ฉ์ธ ํ์ด์ง HTML ์ผ๋ถ):\n{debug_info}" |
| | |
| | return f"{output_title}\n\n{output_content}", final_debug_info |
| | |
| | except Exception as e: |
| | |
| | return f"An error occurred: {str(e)}", "" |
| |
|
| | |
| | title = "๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํผ" |
| | description = "๋ค์ด๋ฒ ๋ธ๋ก๊ทธ URL์ ์
๋ ฅํ๋ฉด ์ ๋ชฉ๊ณผ ๋ด์ฉ์ ์คํฌ๋ํํฉ๋๋ค." |
| |
|
| | iface = gr.Interface( |
| | fn=scrape_naver_blog, |
| | inputs=gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="๋ธ๋ก๊ทธ URL"), |
| | outputs=[ |
| | gr.Textbox(label="๊ฒฐ๊ณผ"), |
| | gr.Textbox(label="๋๋ฒ๊น
์ ๋ณด") |
| | ], |
| | title=title, |
| | description=description, |
| | allow_flagging="never" |
| | ) |
| |
|
| | iface.launch() |
| |
|