Update app.py
Browse files
app.py
CHANGED
|
@@ -1,89 +1,68 @@
|
|
|
|
|
| 1 |
import requests
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
-
import gradio as gr
|
| 4 |
|
| 5 |
def scrape_naver_blog(url):
|
| 6 |
try:
|
| 7 |
-
#
|
| 8 |
-
print(f"Received URL: {url}")
|
| 9 |
-
|
| 10 |
-
# HTTP ์์ฒญ ํค๋ ์ค์ (๋ค์ด๋ฒ ๋ธ๋ก๊ทธ๋ User-Agent๊ฐ ํ์ํ ์ ์์)
|
| 11 |
headers = {
|
| 12 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
# ์น ํ์ด์ง ๊ฐ์ ธ์ค๊ธฐ
|
| 16 |
response = requests.get(url, headers=headers)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
if response.status_code != 200:
|
| 20 |
-
return f"Error: Unable to fetch the page. Status code: {response.status_code}"
|
| 21 |
-
|
| 22 |
-
# BeautifulSoup์ ์ฌ์ฉํ์ฌ HTML ํ์ฑ
|
| 23 |
-
soup = BeautifulSoup(response.content, 'html.parser')
|
| 24 |
-
|
| 25 |
-
# ์ ๋ชฉ ์ถ์ถ
|
| 26 |
-
# ์ค์ HTML ๊ตฌ์กฐ์ ๋ง๊ฒ ํด๋์ค๋ช
๊ณผ ํ๊ทธ๋ฅผ ์์ ํด์ผ ํฉ๋๋ค.
|
| 27 |
-
title = None
|
| 28 |
-
|
| 29 |
-
# ์์ 1: <h3 class="se_textarea">์ ์ ๋ชฉ์ด ์๋ ๊ฒฝ์ฐ
|
| 30 |
-
title_element = soup.find('h3', class_='se_textarea')
|
| 31 |
-
if title_element and title_element.get_text(strip=True):
|
| 32 |
-
title = title_element.get_text(strip=True)
|
| 33 |
-
print(f"์ถ์ถ๋ ์ ๋ชฉ (h3.se_textarea): {title}")
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
title = "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 45 |
-
|
| 46 |
-
# ๋ด์ฉ ํ
์คํธ ์ถ์ถ
|
| 47 |
-
# ์ค์ HTML ๊ตฌ์กฐ์ ๋ง๊ฒ ํด๋์ค๋ช
๊ณผ ํ๊ทธ๋ฅผ ์์ ํด์ผ ํฉ๋๋ค.
|
| 48 |
-
content = None
|
| 49 |
-
|
| 50 |
-
# ์์ 1: <div class="se-main-container"> ๋ด์ ๋ชจ๋ ํ
์คํธ ์ถ์ถ
|
| 51 |
-
content_container = soup.find('div', class_='se-main-container')
|
| 52 |
-
if content_container:
|
| 53 |
-
content = content_container.get_text(separator='\n', strip=True)
|
| 54 |
-
print(f"์ถ์ถ๋ ๋ด์ฉ (div.se-main-container): {content[:100]}...") # ์ผ๋ถ๋ง ์ถ๋ ฅ
|
| 55 |
else:
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
-
|
| 71 |
-
print(f"์์ธ ๋ฐ์: {e}")
|
| 72 |
-
return f"An error occurred: {e}"
|
| 73 |
|
| 74 |
-
# Gradio ์ธํฐํ์ด์ค
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
# ์ ํ๋ฆฌ์ผ์ด์
์คํ
|
| 89 |
iface.launch()
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
import requests
|
| 3 |
+
from lxml import html
|
| 4 |
from bs4 import BeautifulSoup
|
|
|
|
| 5 |
|
| 6 |
def scrape_naver_blog(url):
    """Scrape the title and body text from a Naver blog post.

    Args:
        url: Full URL of the Naver blog post page to fetch.

    Returns:
        A 2-tuple of strings ``(result, debug_output)``:
        - ``result`` is the formatted title and content (or an error message),
        - ``debug_output`` is the first part of the fetched HTML for debugging
          (empty string when the request itself failed).
    """
    try:
        # Browser-like User-Agent: Naver may reject requests without one.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/58.0.3029.110 Safari/537.3'}
        # Fix: a timeout is required — without one a stalled connection
        # would hang this Gradio handler indefinitely.
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return f"Error: Unable to fetch the page. Status code: {response.status_code}", ""

        # Debugging aid: keep a prettified snippet of the response HTML.
        soup_debug = BeautifulSoup(response.text, 'html.parser')
        debug_info = soup_debug.prettify()[:1000]  # first 1000 chars only

        # Parse the HTML with lxml so we can query it via XPath.
        tree = html.fromstring(response.content)

        # Title extraction.
        # NOTE(review): this absolute XPath is extremely brittle — it breaks
        # whenever Naver changes the page layout. Confirm against the live
        # page structure.
        title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p/span'
        title_elements = tree.xpath(title_xpath)
        if not title_elements:
            title = "제목을 찾을 수 없습니다."
        else:
            # itertext() gathers text from the node and all descendants.
            title = ''.join(title_elements[0].itertext()).strip()

        # Content extraction.
        # NOTE(review): same brittleness caveat as the title XPath above.
        content_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]/span/b'
        content_elements = tree.xpath(content_xpath)
        if not content_elements:
            content = "내용을 찾을 수 없습니다."
        else:
            content = ''.join(content_elements[0].itertext()).strip()

        # Output formatting.
        output_title = f"제목 :\n{title}"
        output_content = f"내용 :\n{content}"

        # Attach the debugging snippet as the second output.
        debug_output = f"디버깅 정보 (HTML 일부):\n{debug_info}"

        return f"{output_title}\n\n{output_content}", debug_output

    except Exception as e:
        # Top-level boundary: surface the error in the UI instead of crashing.
        return f"An error occurred: {str(e)}", ""
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
# Gradio interface configuration.
title = "네이버 블로그 스크래퍼"
description = "네이버 블로그 URL을 입력하면 제목과 내용을 스크랩합니다."

iface = gr.Interface(
    fn=scrape_naver_blog,
    # Fix: gr.inputs.Textbox / gr.outputs.Textbox were deprecated in
    # Gradio 3.0 and removed in later releases; use the top-level
    # gr.Textbox component for both inputs and outputs.
    inputs=gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="블로그 URL"),
    outputs=[
        gr.Textbox(label="결과"),
        gr.Textbox(label="디버깅 정보"),
    ],
    title=title,
    description=description,
    allow_flagging="never",
)

# Launch the application.
iface.launch()
|