File size: 4,811 Bytes
1a02034
36ac952
1a02034
f8c17f1
0d8bd1c
36ac952
 
 
0d8bd1c
 
 
1a02034
36ac952
d26d216
 
 
0d8bd1c
 
 
 
1a02034
36ac952
0d8bd1c
a241c57
d26d216
 
 
1a02034
0d8bd1c
d26d216
 
0d8bd1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d26d216
0d8bd1c
 
 
d26d216
 
 
 
 
 
1a02034
b18692e
d26d216
 
c947445
 
 
 
 
 
d26d216
 
 
1a02034
b18692e
d26d216
 
c947445
 
 
 
 
 
1a02034
d26d216
1a02034
 
b18692e
d26d216
1a02034
0d8bd1c
 
1a02034
0d8bd1c
1a02034
aa70416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def _extract_text(tree, xpath, meta_property, fallback_msg):
    """Return joined text of the first node at *xpath*; fall back to a
    <meta property="..."> content attribute, then to *fallback_msg*."""
    elements = tree.xpath(xpath)
    if elements:
        # Join all descendant text fragments of the matched element.
        return ''.join(elements[0].itertext()).strip()
    meta = tree.xpath(f'//meta[@property="{meta_property}"]/@content')
    if meta:
        return meta[0].strip()
    return fallback_msg

def scrape_naver_blog(url):
    """Scrape the title and body text of a Naver blog post.

    Naver blog pages embed the actual post inside an iframe
    (id="mainFrame"), so this fetches the outer page, resolves the
    iframe URL, fetches that, and extracts title/body via the absolute
    XPaths below, falling back to og:title / og:description meta tags.

    Args:
        url: Full URL of a Naver blog post (https://blog.naver.com/...).

    Returns:
        Tuple of two strings: (formatted title+body result, debug info
        containing HTML snippets of the fetched pages). On failure the
        first element is an error message.
    """
    try:
        session = requests.Session()

        # Naver may reject requests without a browser-like User-Agent.
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/58.0.3029.110 Safari/537.3')
        }

        # 1. Fetch the outer (main) page. The timeout prevents the app
        # from hanging indefinitely on an unresponsive server.
        response = session.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return f"Error: Unable to fetch the main page. Status code: {response.status_code}", ""

        # Parse once and reuse for both the debug snippet and the iframe
        # lookup (original code parsed the same HTML twice).
        soup_main = BeautifulSoup(response.text, 'html.parser')
        debug_info = soup_main.prettify()[:5000]

        iframe = soup_main.find('iframe', id='mainFrame')
        if not iframe:
            return "Error: iframe을 찾을 수 없습니다.", debug_info

        iframe_src = iframe.get('src')
        if not iframe_src:
            return "Error: iframe의 src 속성을 찾을 수 없습니다.", debug_info

        # The iframe src is usually relative; resolve against the page URL.
        iframe_url = urljoin(url, iframe_src)

        # 2. Fetch the iframe page, which holds the actual post content.
        iframe_response = session.get(iframe_url, headers=headers, timeout=10)
        if iframe_response.status_code != 200:
            return f"Error: Unable to fetch the iframe page. Status code: {iframe_response.status_code}", debug_info

        # Append a snippet of the iframe HTML (first 5000 chars) for debugging.
        soup_debug_iframe = BeautifulSoup(iframe_response.text, 'html.parser')
        debug_info += "\n\n=== iframe HTML 일부 ===\n" + soup_debug_iframe.prettify()[:5000]

        # 3. Parse the iframe HTML with lxml for XPath support.
        tree = html.fromstring(iframe_response.content)

        # NOTE(review): these absolute XPaths are extremely brittle — any
        # markup change on Naver's side breaks them, hence the meta-tag
        # fallbacks inside _extract_text.
        title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p'
        body_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]'

        title = _extract_text(tree, title_xpath, "og:title", "제목을 찾을 수 없습니다.")
        body = _extract_text(tree, body_xpath, "og:description", "내용을 찾을 수 없습니다.")

        # Assemble the user-facing output and the final debug payload.
        output_title = f"제목 :\n{title}"
        output_content = f"내용 :\n{body}"
        final_debug_info = f"디버깅 정보 (메인 페이지 HTML 일부):\n{debug_info}"

        return f"{output_title}\n\n{output_content}", final_debug_info

    except Exception as e:
        # Broad catch is deliberate: the Gradio UI should display the
        # error message rather than crash the app.
        return f"An error occurred: {str(e)}", ""

# Gradio UI wiring (module level, outside the scraping function).
title = "네이버 블로그 스크래퍼"
description = "네이버 블로그 URL을 입력하면 제목과 내용을 스크래핑합니다."

# Build the components up front so the Interface call stays readable.
url_input = gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="블로그 URL")
result_output = gr.Textbox(label="결과")
debug_output = gr.Textbox(label="디버깅 정보")

iface = gr.Interface(
    fn=scrape_naver_blog,
    inputs=url_input,
    outputs=[result_output, debug_output],
    title=title,
    description=description,
    allow_flagging="never",
)

iface.launch()