taeyeol committed on
Commit
8a0aec6
·
verified ·
1 Parent(s): 642e75c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -0
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import requests
3
+ import pandas as pd
4
+ from bs4 import BeautifulSoup
5
+ import gradio as gr
6
+ from io import BytesIO
7
+ import tempfile
8
+ import os
9
+
10
# Column labels shared by the result rows, the HTML table header and the
# Excel sheet. The order here is the column order of both outputs.
_NEWS_COLUMNS = ("์‹ ๋ฌธ์‚ฌ", "๋ฐœํ–‰์ผ", "์ œ๋ชฉ", "๋‰ด์Šค๊ฐ„๋žต์ •๋ณด", "๋งํฌ")


def _parse_news_item(news):
    """Return ``(press, date, title, desc, link)`` for one ``div.news_area``.

    Every field falls back to a placeholder string when the expected
    element is missing, so a layout change never aborts the whole scrape.
    """
    # Press / publisher name.
    press_elem = news.select_one(".info.press")
    if press_elem is not None:
        press = press_elem.get_text(strip=True)
    else:
        press = "ํ™•์ธ๋ถˆ๊ฐ€"

    # Publication date: the last ``.info`` inside ``.info_group`` is used.
    # That element may hold a non-date label instead; no date-shape
    # validation is done here.
    info_group = news.select_one(".info_group")
    info_all = info_group.select(".info") if info_group is not None else []
    date = info_all[-1].get_text(strip=True) if info_all else "ํ™•์ธ๋ถˆ๊ฐ€"

    # Title and link both come from the ``.news_tit`` anchor's attributes.
    title_elem = news.select_one(".news_tit")
    if title_elem is not None:
        title = title_elem.get("title", "").strip()
        link = title_elem.get("href", "").strip()
    else:
        title = "์ œ๋ชฉํ™•์ธ๋ถˆ๊ฐ€"
        link = ""

    # Short summary text of the article.
    desc_elem = news.select_one(".news_dsc .api_txt_lines")
    if desc_elem is not None:
        desc = desc_elem.get_text(strip=True)
    else:
        desc = "๋‚ด์šฉํ™•์ธ๋ถˆ๊ฐ€"

    return press, date, title, desc, link


def _render_results_table(results):
    """Render the result rows as an HTML ``<table>`` string.

    All values originate from a scraped third-party page, so they are
    HTML-escaped before being placed in cell text or in the ``href``
    attribute.
    """
    import html  # function-scope import keeps this block self-contained

    header_cells = "".join(
        f'<th style="padding: 5px;">{label}</th>' for label in _NEWS_COLUMNS
    )
    parts = [
        '<table border="1" style="border-collapse: collapse; width: 100%;">',
        "<thead>",
        f"<tr>{header_cells}</tr>",
        "</thead>",
        "<tbody>",
    ]

    text_columns = _NEWS_COLUMNS[:-1]  # every column except the link
    for row in results:
        cells = "".join(
            f"<td style='padding: 5px;'>{html.escape(str(row[col]))}</td>"
            for col in text_columns
        )
        # The link column is rendered as a clickable anchor; quote=True
        # escapes both quote characters so the value cannot break out of
        # the single-quoted attribute.
        href = html.escape(str(row["๋งํฌ"]), quote=True)
        link_cell = (
            f"<td style='padding: 5px;'><a href='{href}' target='_blank'>๋ฐ”๋กœ๊ฐ€๊ธฐ</a></td>"
        )
        parts.append(f"<tr>{cells}{link_cell}</tr>")

    parts.extend(["</tbody>", "</table>"])
    return "\n".join(parts)


def scrap_naver_news(keyword):
    """Scrape Naver news search results for ``keyword``.

    Steps:
      1. Build the Naver news search URL (the keyword is URL-encoded).
      2. Fetch the HTML with ``requests`` (with a timeout).
      3. Parse each ``div.news_area`` element with BeautifulSoup.
      4. Render the collected rows as an HTML table.
      5. Write the rows to a temporary ``.xlsx`` file so it can be downloaded.

    Args:
        keyword: Search term entered by the user.

    Returns:
        A ``(table_html, xlsx_path, debug_msgs)`` tuple. When the request
        fails, ``table_html`` is an error label, ``xlsx_path`` is ``None``
        and ``debug_msgs`` describes the failure.
    """
    from urllib.parse import quote_plus  # function-scope import, see above

    debug_msgs = []
    timeout_seconds = 10  # avoid hanging the UI forever on a stalled request

    base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
    # Encode the keyword so characters such as '&', '#' or spaces cannot
    # truncate or alter the query string.
    target_url = base_url + quote_plus(keyword or "")
    debug_msgs.append(f"[๋””๋ฒ„๊ทธ] ์š”์ฒญ URL: {target_url}")

    try:
        response = requests.get(target_url, timeout=timeout_seconds)
    except requests.RequestException as exc:
        # Report network failures the same way as a non-200 response.
        debug_msgs.append(f"[ERROR] request failed: {exc}")
        return "์˜ค๋ฅ˜ ๋ฐœ์ƒ", None, debug_msgs

    if response.status_code != 200:
        debug_msgs.append(f"[์˜ค๋ฅ˜] ๋„ค์ด๋ฒ„ ๋‰ด์Šค ๊ฒ€์ƒ‰ ์‹คํŒจ (์‘๋‹ต ์ฝ”๋“œ: {response.status_code})")
        return "์˜ค๋ฅ˜ ๋ฐœ์ƒ", None, debug_msgs

    soup = BeautifulSoup(response.text, "html.parser")

    # Article entries live inside ``div.news_area`` elements.
    news_list = soup.select("div.news_area")
    debug_msgs.append(f"[๋””๋ฒ„๊ทธ] news_area ์ถ”์ถœ ๊ฐœ์ˆ˜: {len(news_list)}")

    results = []
    for idx, news in enumerate(news_list):
        press, date, title, desc, link = _parse_news_item(news)
        debug_msgs.append(f"[๋””๋ฒ„๊ทธ] {idx+1}๋ฒˆ์งธ ๊ธฐ์‚ฌ ํŒŒ์‹ฑ๊ฒฐ๊ณผ -> ์‹ ๋ฌธ์‚ฌ: {press}, ๋ฐœํ–‰์ผ: {date}, ์ œ๋ชฉ: {title}, ๋งํฌ: {link}")
        results.append({
            "์‹ ๋ฌธ์‚ฌ": press,
            "๋ฐœํ–‰์ผ": date,
            "์ œ๋ชฉ": title,
            "๋‰ด์Šค๊ฐ„๋žต์ •๋ณด": desc,
            "๋งํฌ": link,
        })

    table_html = _render_results_table(results)

    # Build the Excel workbook in memory. Passing explicit columns keeps the
    # header row even when no articles were found.
    df = pd.DataFrame(results, columns=list(_NEWS_COLUMNS))
    output_io = BytesIO()
    with pd.ExcelWriter(output_io, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="๋„ค์ด๋ฒ„๋‰ด์Šค")

    # Persist to a temp file that outlives this call (delete=False) so the
    # caller can hand the path to the download component.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp.write(output_io.getvalue())
        tmp_path = tmp.name

    debug_msgs.append(f"[๋””๋ฒ„๊ทธ] ์—‘์…€ ์ž„์‹œํŒŒ์ผ ์ƒ์„ฑ ์™„๋ฃŒ -> {tmp_path}")

    return table_html, tmp_path, debug_msgs
135
+
136
def run_search(keyword):
    """Gradio click handler: scrape and shape the three UI outputs.

    Returns a tuple of:
      - the HTML to display,
      - the path of the downloadable file (``None`` when scraping failed,
        which is passed through unchanged),
      - the debug messages joined into one newline-separated string.
    """
    html_out, xlsx_path, log_lines = scrap_naver_news(keyword)
    log_text = "\n".join(log_lines)
    # The file component receives the path as-is; a None path needs no
    # special handling.
    return html_out, xlsx_path, log_text
150
+
151
def launch_app():
    """Build the Gradio Blocks UI, wire the search button, and launch it."""
    with gr.Blocks() as app:
        gr.Markdown("## ๋„ค์ด๋ฒ„ ๋‰ด์Šค ์Šคํฌ๋ž˜ํ•‘ ์˜ˆ์‹œ")

        # Keyword entry row.
        with gr.Row():
            query_box = gr.Textbox(label="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ", placeholder="์˜ˆ: ์˜ค์ง•์–ด")

        # Trigger button row.
        with gr.Row():
            start_btn = gr.Button("์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘")

        # Output widgets, in display order: result table, Excel download,
        # debug log.
        table_view = gr.HTML(label="์Šคํฌ๋ž˜ํ•‘ ๊ฒฐ๊ณผ (ํ‘œ ํ˜•์‹)")
        xlsx_download = gr.File(label="์—‘์…€ ๋‹ค์šด๋กœ๋“œ")
        log_view = gr.Textbox(label="๋””๋ฒ„๊ทธ ๋กœ๊ทธ", lines=10)

        # run_search returns one value per output widget, in this order.
        output_widgets = [table_view, xlsx_download, log_view]
        start_btn.click(fn=run_search, inputs=[query_box], outputs=output_widgets)

    app.launch()
178
+
179
# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    launch_app()