Update app.py
Browse files
app.py
CHANGED
|
@@ -1,89 +1,68 @@
|
|
|
|
|
| 1 |
import requests
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
-
import gradio as gr
|
| 4 |
|
| 5 |
def scrape_naver_blog(url):
|
| 6 |
try:
|
| 7 |
-
#
|
| 8 |
-
print(f"Received URL: {url}")
|
| 9 |
-
|
| 10 |
-
# HTTP ์์ฒญ ํค๋ ์ค์ (๋ค์ด๋ฒ ๋ธ๋ก๊ทธ๋ User-Agent๊ฐ ํ์ํ ์ ์์)
|
| 11 |
headers = {
|
| 12 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
# ์น ํ์ด์ง ๊ฐ์ ธ์ค๊ธฐ
|
| 16 |
response = requests.get(url, headers=headers)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
if response.status_code != 200:
|
| 20 |
-
return f"Error: Unable to fetch the page. Status code: {response.status_code}"
|
| 21 |
-
|
| 22 |
-
# BeautifulSoup์ ์ฌ์ฉํ์ฌ HTML ํ์ฑ
|
| 23 |
-
soup = BeautifulSoup(response.content, 'html.parser')
|
| 24 |
-
|
| 25 |
-
# ์ ๋ชฉ ์ถ์ถ
|
| 26 |
-
# ์ค์ HTML ๊ตฌ์กฐ์ ๋ง๊ฒ ํด๋์ค๋ช
๊ณผ ํ๊ทธ๋ฅผ ์์ ํด์ผ ํฉ๋๋ค.
|
| 27 |
-
title = None
|
| 28 |
-
|
| 29 |
-
# ์์ 1: <h3 class="se_textarea">์ ์ ๋ชฉ์ด ์๋ ๊ฒฝ์ฐ
|
| 30 |
-
title_element = soup.find('h3', class_='se_textarea')
|
| 31 |
-
if title_element and title_element.get_text(strip=True):
|
| 32 |
-
title = title_element.get_text(strip=True)
|
| 33 |
-
print(f"์ถ์ถ๋ ์ ๋ชฉ (h3.se_textarea): {title}")
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
title = "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 45 |
-
|
| 46 |
-
# ๋ด์ฉ ํ
์คํธ ์ถ์ถ
|
| 47 |
-
# ์ค์ HTML ๊ตฌ์กฐ์ ๋ง๊ฒ ํด๋์ค๋ช
๊ณผ ํ๊ทธ๋ฅผ ์์ ํด์ผ ํฉ๋๋ค.
|
| 48 |
-
content = None
|
| 49 |
-
|
| 50 |
-
# ์์ 1: <div class="se-main-container"> ๋ด์ ๋ชจ๋ ํ
์คํธ ์ถ์ถ
|
| 51 |
-
content_container = soup.find('div', class_='se-main-container')
|
| 52 |
-
if content_container:
|
| 53 |
-
content = content_container.get_text(separator='\n', strip=True)
|
| 54 |
-
print(f"์ถ์ถ๋ ๋ด์ฉ (div.se-main-container): {content[:100]}...") # ์ผ๋ถ๋ง ์ถ๋ ฅ
|
| 55 |
else:
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
-
|
| 71 |
-
print(f"์์ธ ๋ฐ์: {e}")
|
| 72 |
-
return f"An error occurred: {e}"
|
| 73 |
|
| 74 |
-
# Gradio ์ธํฐํ์ด์ค
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
# ์ ํ๋ฆฌ์ผ์ด์
์คํ
|
| 89 |
iface.launch()
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
import requests
|
| 3 |
+
from lxml import html
|
| 4 |
from bs4 import BeautifulSoup
|
|
|
|
| 5 |
|
| 6 |
def scrape_naver_blog(url):
    """Scrape the title and body text from a Naver blog post.

    Args:
        url: Full URL of the Naver blog post page to fetch.

    Returns:
        A 2-tuple of strings ``(result, debug_output)``:
        - ``result`` is the formatted title and content (or an error message),
        - ``debug_output`` is the first part of the fetched HTML for debugging
          (empty string when the request itself failed).
    """
    try:
        # Browser-like User-Agent: Naver may reject requests without one.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/58.0.3029.110 Safari/537.3'}
        # Fix: a timeout is required — without one a stalled connection
        # would hang this Gradio handler indefinitely.
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return f"Error: Unable to fetch the page. Status code: {response.status_code}", ""

        # Debugging aid: keep a prettified snippet of the response HTML.
        soup_debug = BeautifulSoup(response.text, 'html.parser')
        debug_info = soup_debug.prettify()[:1000]  # first 1000 chars only

        # Parse the HTML with lxml so we can query it via XPath.
        tree = html.fromstring(response.content)

        # Title extraction.
        # NOTE(review): this absolute XPath is extremely brittle — it breaks
        # whenever Naver changes the page layout. Confirm against the live
        # page structure.
        title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p/span'
        title_elements = tree.xpath(title_xpath)
        if not title_elements:
            title = "제목을 찾을 수 없습니다."
        else:
            # itertext() gathers text from the node and all descendants.
            title = ''.join(title_elements[0].itertext()).strip()

        # Content extraction.
        # NOTE(review): same brittleness caveat as the title XPath above.
        content_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]/span/b'
        content_elements = tree.xpath(content_xpath)
        if not content_elements:
            content = "내용을 찾을 수 없습니다."
        else:
            content = ''.join(content_elements[0].itertext()).strip()

        # Output formatting.
        output_title = f"제목 :\n{title}"
        output_content = f"내용 :\n{content}"

        # Attach the debugging snippet as the second output.
        debug_output = f"디버깅 정보 (HTML 일부):\n{debug_info}"

        return f"{output_title}\n\n{output_content}", debug_output

    except Exception as e:
        # Top-level boundary: surface the error in the UI instead of crashing.
        return f"An error occurred: {str(e)}", ""
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
# Gradio interface configuration.
title = "네이버 블로그 스크래퍼"
description = "네이버 블로그 URL을 입력하면 제목과 내용을 스크랩합니다."

iface = gr.Interface(
    fn=scrape_naver_blog,
    # Fix: gr.inputs.Textbox / gr.outputs.Textbox were deprecated in
    # Gradio 3.0 and removed in later releases; use the top-level
    # gr.Textbox component for both inputs and outputs.
    inputs=gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="블로그 URL"),
    outputs=[
        gr.Textbox(label="결과"),
        gr.Textbox(label="디버깅 정보"),
    ],
    title=title,
    description=description,
    allow_flagging="never",
)

# Launch the application.
iface.launch()
|