File size: 4,811 Bytes
1a02034
36ac952
1a02034
f8c17f1
0d8bd1c
36ac952
 
 
0d8bd1c
 
 
1a02034
36ac952
d26d216
 
 
0d8bd1c
 
 
 
1a02034
36ac952
0d8bd1c
a241c57
d26d216
 
 
1a02034
0d8bd1c
d26d216
 
0d8bd1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d26d216
0d8bd1c
 
 
d26d216
 
 
 
 
 
1a02034
b18692e
d26d216
 
c947445
 
 
 
 
 
d26d216
 
 
1a02034
b18692e
d26d216
 
c947445
 
 
 
 
 
1a02034
d26d216
1a02034
 
b18692e
d26d216
1a02034
0d8bd1c
 
1a02034
0d8bd1c
1a02034
aa70416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def _extract_text(tree, xpath, meta_property, fallback_msg):
    """Return joined text of the first node at *xpath*; fall back to a
    <meta property="..."> content attribute, then to *fallback_msg*."""
    elements = tree.xpath(xpath)
    if elements:
        # Join all descendant text fragments of the matched element.
        return ''.join(elements[0].itertext()).strip()
    meta = tree.xpath(f'//meta[@property="{meta_property}"]/@content')
    if meta:
        return meta[0].strip()
    return fallback_msg

def scrape_naver_blog(url):
    """Scrape the title and body text of a Naver blog post.

    Naver blog pages embed the actual post inside an iframe
    (id="mainFrame"), so this fetches the outer page, resolves the
    iframe URL, fetches that, and extracts title/body via the absolute
    XPaths below, falling back to og:title / og:description meta tags.

    Args:
        url: Full URL of a Naver blog post (https://blog.naver.com/...).

    Returns:
        Tuple of two strings: (formatted title+body result, debug info
        containing HTML snippets of the fetched pages). On failure the
        first element is an error message.
    """
    try:
        session = requests.Session()

        # Naver may reject requests without a browser-like User-Agent.
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/58.0.3029.110 Safari/537.3')
        }

        # 1. Fetch the outer (main) page. The timeout prevents the app
        # from hanging indefinitely on an unresponsive server.
        response = session.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return f"Error: Unable to fetch the main page. Status code: {response.status_code}", ""

        # Parse once and reuse for both the debug snippet and the iframe
        # lookup (original code parsed the same HTML twice).
        soup_main = BeautifulSoup(response.text, 'html.parser')
        debug_info = soup_main.prettify()[:5000]

        iframe = soup_main.find('iframe', id='mainFrame')
        if not iframe:
            return "Error: iframe을 찾을 수 없습니다.", debug_info

        iframe_src = iframe.get('src')
        if not iframe_src:
            return "Error: iframe의 src 속성을 찾을 수 없습니다.", debug_info

        # The iframe src is usually relative; resolve against the page URL.
        iframe_url = urljoin(url, iframe_src)

        # 2. Fetch the iframe page, which holds the actual post content.
        iframe_response = session.get(iframe_url, headers=headers, timeout=10)
        if iframe_response.status_code != 200:
            return f"Error: Unable to fetch the iframe page. Status code: {iframe_response.status_code}", debug_info

        # Append a snippet of the iframe HTML (first 5000 chars) for debugging.
        soup_debug_iframe = BeautifulSoup(iframe_response.text, 'html.parser')
        debug_info += "\n\n=== iframe HTML 일부 ===\n" + soup_debug_iframe.prettify()[:5000]

        # 3. Parse the iframe HTML with lxml for XPath support.
        tree = html.fromstring(iframe_response.content)

        # NOTE(review): these absolute XPaths are extremely brittle — any
        # markup change on Naver's side breaks them, hence the meta-tag
        # fallbacks inside _extract_text.
        title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p'
        body_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]'

        title = _extract_text(tree, title_xpath, "og:title", "제목을 찾을 수 없습니다.")
        body = _extract_text(tree, body_xpath, "og:description", "내용을 찾을 수 없습니다.")

        # Assemble the user-facing output and the final debug payload.
        output_title = f"제목 :\n{title}"
        output_content = f"내용 :\n{body}"
        final_debug_info = f"디버깅 정보 (메인 페이지 HTML 일부):\n{debug_info}"

        return f"{output_title}\n\n{output_content}", final_debug_info

    except Exception as e:
        # Broad catch is deliberate: the Gradio UI should display the
        # error message rather than crash the app.
        return f"An error occurred: {str(e)}", ""

# Gradio UI wiring (module level, outside the scraping function).
title = "네이버 블로그 스크래퍼"
description = "네이버 블로그 URL을 입력하면 제목과 내용을 스크래핑합니다."

# Build the components up front so the Interface call stays readable.
url_input = gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="블로그 URL")
result_output = gr.Textbox(label="결과")
debug_output = gr.Textbox(label="디버깅 정보")

iface = gr.Interface(
    fn=scrape_naver_blog,
    inputs=url_input,
    outputs=[result_output, debug_output],
    title=title,
    description=description,
    allow_flagging="never",
)

iface.launch()