# ddd/app.py — Naver blog scraper (Gradio app)
# Author: Kims12 — commit aa70416 (verified)
import gradio as gr
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def scrape_naver_blog(url):
    """Scrape the title and body text from a Naver blog post.

    Naver blog pages embed the actual post inside an iframe (id="mainFrame"),
    so this fetches the outer page, resolves the iframe URL, fetches that
    document, and extracts the title/body via hard-coded XPaths, falling back
    to the og:title / og:description meta tags when the XPaths miss.

    Args:
        url: Full URL of a Naver blog post (e.g. https://blog.naver.com/...).

    Returns:
        A 2-tuple of strings ``(result_text, debug_info)``. Errors are
        reported inside ``result_text`` rather than raised, so the Gradio
        UI always receives displayable strings.
    """
    try:
        session = requests.Session()
        # Naver inspects the User-Agent, so present a desktop-browser one.
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/58.0.3029.110 Safari/537.3')
        }

        # 1. Fetch the main page. The timeout keeps a dead connection from
        # hanging the UI indefinitely (the original had no timeout).
        response = session.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return f"Error: Unable to fetch the main page. Status code: {response.status_code}", ""

        # Parse the main page once and reuse the tree (the original parsed
        # the identical HTML twice). Debug info: first 5000 chars.
        soup_main = BeautifulSoup(response.text, 'html.parser')
        debug_info = soup_main.prettify()[:5000]

        # Locate the iframe that holds the real post content.
        iframe = soup_main.find('iframe', id='mainFrame')
        if not iframe:
            return "Error: iframe을 찾을 수 없습니다.", debug_info
        iframe_src = iframe.get('src')
        if not iframe_src:
            return "Error: iframe의 src 속성을 찾을 수 없습니다.", debug_info

        # The iframe src is typically relative; resolve it against the page URL.
        iframe_url = urljoin(url, iframe_src)

        # 2. Fetch the iframe page (the actual post document).
        iframe_response = session.get(iframe_url, headers=headers, timeout=10)
        if iframe_response.status_code != 200:
            return f"Error: Unable to fetch the iframe page. Status code: {iframe_response.status_code}", debug_info

        # Append the first 5000 chars of the iframe HTML to the debug output.
        soup_iframe = BeautifulSoup(iframe_response.text, 'html.parser')
        debug_info += "\n\n=== iframe HTML 일부 ===\n" + soup_iframe.prettify()[:5000]

        # 3. Parse the iframe HTML with lxml for XPath support.
        tree = html.fromstring(iframe_response.content)

        # Absolute XPaths captured from one specific Naver layout — fragile by
        # nature, hence the meta-tag fallbacks below.
        title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p'
        body_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]'

        # Title: XPath first, then og:title meta tag, then a fixed message.
        title_elements = tree.xpath(title_xpath)
        if title_elements:
            title = ''.join(title_elements[0].itertext()).strip()
        else:
            meta_title = tree.xpath('//meta[@property="og:title"]/@content')
            title = meta_title[0].strip() if meta_title else "제목을 찾을 수 없습니다."

        # Body: XPath first, then og:description meta tag, then a fixed message.
        body_elements = tree.xpath(body_xpath)
        if body_elements:
            body = ''.join(body_elements[0].itertext()).strip()
        else:
            meta_description = tree.xpath('//meta[@property="og:description"]/@content')
            body = meta_description[0].strip() if meta_description else "내용을 찾을 수 없습니다."

        # Assemble the user-facing result and the final debug payload.
        output_title = f"제목 :\n{title}"
        output_content = f"내용 :\n{body}"
        final_debug_info = f"디버깅 정보 (메인 페이지 HTML 일부):\n{debug_info}"
        return f"{output_title}\n\n{output_content}", final_debug_info

    except Exception as e:
        # UI boundary: surface the error as text (with empty debug info)
        # instead of crashing the Gradio app.
        return f"An error occurred: {str(e)}", ""
# Gradio UI wiring (module level, outside the scraping function).
title = "네이버 블로그 스크래퍼"
description = "네이버 블로그 URL을 입력하면 제목과 내용을 스크래핑합니다."

# Build the components up front so the Interface call stays readable.
url_input = gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="블로그 URL")
result_output = gr.Textbox(label="결과")
debug_output = gr.Textbox(label="디버깅 정보")

iface = gr.Interface(
    fn=scrape_naver_blog,
    inputs=url_input,
    outputs=[result_output, debug_output],
    title=title,
    description=description,
    allow_flagging="never",
)
iface.launch()