File size: 4,485 Bytes
4d71151
 
 
ed67663
4d71151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed67663
4d71151
ed67663
4d71151
 
 
 
 
 
ed67663
4d71151
ed67663
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d71151
 
ed67663
4d71151
 
 
 
ed67663
 
 
 
 
 
4d71151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed67663
4d71151
 
 
 
ed67663
4d71151
 
 
 
 
 
 
 
 
ed67663
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse  # used to turn a relative iframe path into an absolute URL

# Debugging (log) helper function
def debug_log(message: str):
    """Print a debug line to standard output.

    The line is written as ``[DEBUG] <message>`` followed by a newline.

    Args:
        message: Text to place after the ``[DEBUG]`` prefix.
    """
    line = "[DEBUG] {}".format(message)
    print(line)

def scrape_naver_blog(url: str, *, timeout: float = 10.0) -> str:
    """Fetch a Naver blog post and return its title and body as text.

    The page at ``url`` is requested first and searched for an
    ``iframe#mainFrame`` element. That iframe's ``src`` is resolved against
    ``url`` and fetched with a second request; the title and body are then
    read from the iframe document.

    Args:
        url: Address of the blog post. Surrounding whitespace is ignored.
        timeout: Seconds to wait on each of the two HTTP requests before
            giving up, so an unresponsive server cannot block the caller
            forever.

    Returns:
        On success, a string with a title section followed by a body
        section. On failure (non-200 status, missing iframe, missing
        ``src``, or any exception raised while fetching or parsing), a
        message describing the problem. Exceptions raised inside the
        fetch/parse steps are logged and reported in the returned string
        instead of propagating.
    """
    debug_log("scrape_naver_blog ํ•จ์ˆ˜ ์‹œ์ž‘")
    debug_log(f"์š”์ฒญ๋ฐ›์€ URL: {url}")

    # Browser-like User-Agent header (helps a little against crawler blocking).
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/96.0.4664.110 Safari/537.36"
        )
    }

    try:
        # A link pasted into the text box may carry stray spaces or newlines.
        # Done inside the try so a non-string input is still reported as an
        # error message rather than raised.
        url = url.strip()

        # 1) Request the blog's outer ("main") page.
        response = requests.get(url, headers=headers, timeout=timeout)
        debug_log("HTTP GET ์š”์ฒญ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # Check the response status code.
        if response.status_code != 200:
            debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
            return f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {response.status_code}"

        # Parse the outer page.
        soup = BeautifulSoup(response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # 2) Locate the iframe element that holds the post.
        iframe = soup.select_one("iframe#mainFrame")
        if iframe is None:
            # The iframe itself was not found.
            debug_log("iframe#mainFrame ํƒœ๊ทธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
            return "๋ณธ๋ฌธ iframe์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        iframe_src = iframe.get("src")
        if not iframe_src:
            debug_log("iframe src๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
            return "๋ณธ๋ฌธ iframe์˜ src๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        # 3) Resolve a relative iframe src into an absolute URL
        #    (e.g. handles values such as //blog.naver.com/~~~).
        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)

        # Second request: the iframe document itself.
        debug_log(f"iframe ํŽ˜์ด์ง€ ์š”์ฒญ URL: {parsed_iframe_url}")
        iframe_response = requests.get(
            parsed_iframe_url, headers=headers, timeout=timeout
        )
        debug_log("HTTP GET ์š”์ฒญ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        if iframe_response.status_code != 200:
            debug_log(f"iframe ์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}")
            return f"iframe์—์„œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}"

        # 4) Parse the iframe document.
        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # Extract the title; fall back to a placeholder message if absent.
        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
        if title_div is not None:
            title = title_div.get_text(strip=True)
        else:
            title = "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")

        # Extract the body; fall back to a placeholder message if absent.
        content_div = iframe_soup.select_one('.se-main-container')
        if content_div is not None:
            # Join text fragments with "\n" so the body reads more cleanly.
            content = content_div.get_text("\n", strip=True)
        else:
            content = "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")

        # Combine title and body into the final result.
        result = f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
        debug_log("์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ํ•ฉ์ณ ๋ฐ˜ํ™˜ ์ค€๋น„ ์™„๋ฃŒ")

        return result

    except Exception as e:
        # UI boundary: report the failure (including request timeouts) as
        # text instead of letting the exception propagate to the caller.
        debug_log(f"์—๋Ÿฌ ๋ฐœ์ƒ: {str(e)}")
        return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"


# Gradio interface
def main_interface():
    """Build and return the Gradio interface for the scraper.

    The interface wires a single-line text input to ``scrape_naver_blog``
    and shows the returned string in a text output.

    Returns:
        The configured ``gr.Interface`` object (not yet launched).
    """
    # Single-line input box for the blog link.
    link_box = gr.Textbox(
        lines=1,
        label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
        placeholder="์˜ˆ: https://blog.naver.com/ssboost/222983068507",
    )
    # Output box for the scraped text or the error message.
    result_box = gr.Textbox(label="๊ฒฐ๊ณผ")

    return gr.Interface(
        fn=scrape_naver_blog,
        inputs=link_box,
        outputs=result_box,
        title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํผ",
        description="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜์—ฌ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค.",
    )

# Script entry point: build the Gradio interface and launch it.
if __name__ == "__main__":
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
    # Keep the interface bound to the module-level name `demo`.
    demo = main_interface()
    # NOTE(review): launch() presumably blocks while the server is running,
    # so the log line below would only appear once it returns - confirm.
    demo.launch()
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")