Spaces:

Kims12
/

N_NEWS

Sleeping

App Files Community

Kims12 committed on Jan 21, 2025

Commit

c131937

verified ·

1 Parent(s): d90cf0f

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -23

app.py CHANGED Viewed

@@ -17,8 +17,10 @@ def scrap_naver_news(keyword):
     4) HTML 표로 정리
     5) BytesIO -> 임시파일로 저장 -> 다운로드 가능하도록 반환
     6) 신문사 "언론사 선정" 문구 제거
-    7) 엑셀 데이터 data URI 링크를 추가로 생성하여 HTML에 삽입
     """
     debug_msgs = []
     base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
@@ -32,30 +34,51 @@ def scrap_naver_news(keyword):
     soup = BeautifulSoup(response.text, "html.parser")
-    # div.news_area 내부에 기사 정보가 존재(요청 예시 구조 참고)
     news_list = soup.select("div.news_area")
     debug_msgs.append(f"[디버그] news_area 추출 개수: {len(news_list)}")
     results = []
     for idx, news in enumerate(news_list):
-        # 신문사
         try:
-            press = news.select_one(".info.press").get_text(strip=True)
             # "언론사 선정" 문구 제거
-            press = press.replace("언론사 선정", "").strip()
         except:
-            press = "확인불가"
-        # 날짜/발행일
         try:
             info_group = news.select_one(".info_group")
             info_all = info_group.select(".info")
-            date = info_all[-1].get_text(strip=True) if info_all else "확인불가"
         except:
-            date = "확인불가"
-        # 제목 & 링크
         try:
             title_elem = news.select_one(".news_tit")
             title = title_elem.get("title", "").strip()
@@ -64,18 +87,21 @@ def scrap_naver_news(keyword):
             title = "제목확인불가"
             link = ""
-        # 뉴스 간략정보
         try:
             desc_elem = news.select_one(".news_dsc .api_txt_lines")
             desc = desc_elem.get_text(strip=True) if desc_elem else "내용확인불가"
         except:
             desc = "내용확인불가"
-        debug_msgs.append(f"[디버그] {idx+1}번째 기사 파싱결과 -> 신문사: {press}, 발행일: {date}, 제목: {title}, 링크: {link}")
         results.append({
-            "신문사": press,
-            "발행일": date,
             "제목": title,
             "뉴스간략정보": desc,
             "링크": link
@@ -104,7 +130,7 @@ def scrap_naver_news(keyword):
         table_html += f"<td style='padding: 5px;'>{row['발행일']}</td>"
         table_html += f"<td style='padding: 5px;'>{row['제목']}</td>"
         table_html += f"<td style='padding: 5px;'>{row['뉴스간략정보']}</td>"
-        # 링크는 클릭 가능하도록 a 태그 삽입
         table_html += f"<td style='padding: 5px;'><a href='{row['링크']}' target='_blank'>바로가기</a></td>"
         table_html += "</tr>"
@@ -114,9 +140,27 @@ def scrap_naver_news(keyword):
     """
     # ------------------------------
-    # 엑셀(Excel) 생성
     # ------------------------------
-    df = pd.DataFrame(results)
     output_io = BytesIO()
     with pd.ExcelWriter(output_io, engine="openpyxl") as writer:
         df.to_excel(writer, index=False, sheet_name="네이버뉴스")
@@ -130,12 +174,10 @@ def scrap_naver_news(keyword):
     debug_msgs.append(f"[디버그] 엑셀 임시파일 생성 완료 -> {tmp_path}")
     # ------------------------------
-    # 엑셀 다운로드용 data URI 생성
     # ------------------------------
-    # 다시 output_io 포인터를 처음으로 되돌려 base64 생성
     output_io.seek(0)
     excel_data_base64 = base64.b64encode(output_io.getvalue()).decode()
-    # 클릭 시 다운로드(또는 Excel로 열기) 링크
     excel_download_html = f"""
     <br>
     <a
@@ -148,7 +190,7 @@ def scrap_naver_news(keyword):
     </a>
     """
-    # table_html 뒤에 엑셀 다운로드 링크를 추가
     table_html += excel_download_html
     return table_html, tmp_path, debug_msgs
@@ -165,7 +207,7 @@ def run_search(keyword):
     if file_path is None:
         return table_html, None, "\n".join(debug_info)
-    # Gradio의 File 출력
     return table_html, file_path, "\n".join(debug_info)
 def launch_app():
@@ -184,7 +226,7 @@ def launch_app():
         # 엑셀 다운로드 (Gradio File 컴포넌트)
         download_file = gr.File(label="엑셀 다운로드")
-        # 디버그 메세지
         debug_box = gr.Textbox(label="디버그 로그", lines=10)
         # 버튼 동작 정의

     4) HTML 표로 정리
     5) BytesIO -> 임시파일로 저장 -> 다운로드 가능하도록 반환
     6) 신문사 "언론사 선정" 문구 제거
+    7) 발행일이 '네이버뉴스'로 잘못 표기되는 경우 개선 (필요한 텍스트만 가져오기)
+    8) 엑셀 파일 내 링크를 클릭 시 바로 이동 가능하도록 HYPERLINK 공식 적용
     """
     debug_msgs = []
     base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
     soup = BeautifulSoup(response.text, "html.parser")
+    # div.news_area 내부에 기사 정보가 존재
     news_list = soup.select("div.news_area")
     debug_msgs.append(f"[디버그] news_area 추출 개수: {len(news_list)}")
     results = []
     for idx, news in enumerate(news_list):
+        # ------------------------------------------------------------------
+        # [1] 신문사
+        # ------------------------------------------------------------------
         try:
+            press_text = news.select_one(".info.press").get_text(strip=True)
             # "언론사 선정" 문구 제거
+            press_text = press_text.replace("언론사 선정", "").strip()
         except:
+            press_text = "확인불가"
+        # ------------------------------------------------------------------
+        # [2] 발행일 (네이버뉴스 등 불필요한 항목 제외)
+        # ------------------------------------------------------------------
+        date_text = "확인불가"
         try:
             info_group = news.select_one(".info_group")
             info_all = info_group.select(".info")
+            # 왼쪽부터 순서대로 확인하며 '언론사 선정', '네이버뉴스', a 태그(링크) 등은 건너뛰고
+            # 최초로 만나는 나머지 텍스트를 발행일로 간주
+            for info_item in info_all:
+                t = info_item.get_text(strip=True)
+                # 건너뛸 조건
+                if "언론사 선정" in t:
+                    continue
+                if "네이버뉴스" in t:
+                    continue
+                # a 태그는 press 링크, 네이버뉴스 링크 등인 경우가 많으므로 건너뜀
+                if info_item.name == "a":
+                    continue
+                # 여기까지 왔다면 날짜일 가능성이 높음
+                date_text = t
+                break
         except:
+            pass
+        # ------------------------------------------------------------------
+        # [3] 제목 & 기사 링크
+        # ------------------------------------------------------------------
         try:
             title_elem = news.select_one(".news_tit")
             title = title_elem.get("title", "").strip()
             title = "제목확인불가"
             link = ""
+        # ------------------------------------------------------------------
+        # [4] 뉴스 간략정보
+        # ------------------------------------------------------------------
         try:
             desc_elem = news.select_one(".news_dsc .api_txt_lines")
             desc = desc_elem.get_text(strip=True) if desc_elem else "내용확인불가"
         except:
             desc = "내용확인불가"
+        debug_msgs.append(f"[디버그] {idx+1}번째 기사 파싱결과 -> 신문사: {press_text}, 발행일: {date_text}, 제목: {title}, 링크: {link}")
+        # 결과 리스트에 담기
         results.append({
+            "신문사": press_text,
+            "발행일": date_text,
             "제목": title,
             "뉴스간략정보": desc,
             "링크": link
         table_html += f"<td style='padding: 5px;'>{row['발행일']}</td>"
         table_html += f"<td style='padding: 5px;'>{row['제목']}</td>"
         table_html += f"<td style='padding: 5px;'>{row['뉴스간략정보']}</td>"
+        # HTML에서 링크 바로가기
         table_html += f"<td style='padding: 5px;'><a href='{row['링크']}' target='_blank'>바로가기</a></td>"
         table_html += "</tr>"
     """
     # ------------------------------
+    # [엑셀(Excel) 생성 및 링크 클릭 가능 처리]
+    # ------------------------------
+    # 1) DataFrame 생성
+    # 2) '링크' 열에 엑셀 하이퍼링크 공식 삽입
+    #    예) =HYPERLINK("http://기사링크", "바로가기")
     # ------------------------------
+    df_list = []
+    for row in results:
+        # 엑셀용 링크(클릭 시 바로 이동)
+        excel_link_formula = f'=HYPERLINK("{row["링크"]}", "바로가기")'
+        df_list.append({
+            "신문사": row["신문사"],
+            "발행일": row["발행일"],
+            "제목": row["제목"],
+            "뉴스간략정보": row["뉴스간략정보"],
+            "링크": excel_link_formula  # Excel 공식
+        })
+    df = pd.DataFrame(df_list)
+    # BytesIO에 저장
     output_io = BytesIO()
     with pd.ExcelWriter(output_io, engine="openpyxl") as writer:
         df.to_excel(writer, index=False, sheet_name="네이버뉴스")
     debug_msgs.append(f"[디버그] 엑셀 임시파일 생성 완료 -> {tmp_path}")
     # ------------------------------
+    # Gradio UI에서 직접 다운로드 가능한 data URI(선택사항)
     # ------------------------------
     output_io.seek(0)
     excel_data_base64 = base64.b64encode(output_io.getvalue()).decode()
     excel_download_html = f"""
     <br>
     <a
     </a>
     """
+    # 테이블 HTML 뒤에 다운로드 링크 추가
     table_html += excel_download_html
     return table_html, tmp_path, debug_msgs
     if file_path is None:
         return table_html, None, "\n".join(debug_info)
+    # Gradio의 File 컴포넌트 출력
     return table_html, file_path, "\n".join(debug_info)
 def launch_app():
         # 엑셀 다운로드 (Gradio File 컴포넌트)
         download_file = gr.File(label="엑셀 다운로드")
+        # 디버그 메시지
         debug_box = gr.Textbox(label="디버그 로그", lines=10)
         # 버튼 동작 정의