Kims12 commited on
Commit
d5fb63f
·
verified ·
1 Parent(s): 17c4358

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import hashlib
import hmac
import logging
import os
import re
import tempfile
import time
import urllib.parse  # used to resolve the blog iframe's relative src
from collections import Counter

import gradio as gr
import mecab  # python-mecab-ko
import pandas as pd
import requests
from bs4 import BeautifulSoup
15
+
16
# Lightweight stdout logger used throughout this module.
def debug_log(message: str):
    """Write *message* to stdout with a [DEBUG] prefix."""
    print("[DEBUG] " + message)
19
+
20
# =============================================================================
# [Base code]: extract the title and body text from a Naver blog post
# =============================================================================
def scrape_naver_blog(url: str) -> str:
    """Scrape a Naver blog post and return its title and body as one string.

    Naver blog pages embed the actual post inside an ``iframe#mainFrame``,
    so the outer page is fetched first, the iframe URL is resolved against
    it, and the inner document is parsed for the title and main content.

    Args:
        url: Full URL of a Naver blog post.

    Returns:
        A "[title]\\n...\\n[body]\\n..." formatted string on success, or a
        human-readable (Korean) error message on any failure — callers
        display the return value directly, so errors are never raised.
    """
    debug_log("scrape_naver_blog ν•¨μˆ˜ μ‹œμž‘")
    debug_log(f"μš”μ²­λ°›μ€ URL: {url}")

    # Browser-like headers so the request is not rejected as a crawler.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/96.0.4664.110 Safari/537.36"
        )
    }

    try:
        # 1) Fetch the outer (main) blog page.
        # FIX: timeout added — without it a stalled connection hangs the app.
        response = requests.get(url, headers=headers, timeout=10)
        debug_log("HTTP GET μš”μ²­(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")

        if response.status_code != 200:
            debug_log(f"μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {response.status_code}")
            return f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {response.status_code}"

        soup = BeautifulSoup(response.text, "html.parser")
        debug_log("HTML νŒŒμ‹±(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")

        # 2) Locate the iframe that holds the real post content.
        iframe = soup.select_one("iframe#mainFrame")
        if not iframe:
            debug_log("iframe#mainFrame νƒœκ·Έλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
            return "λ³Έλ¬Έ iframe을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."

        iframe_src = iframe.get("src")
        if not iframe_src:
            debug_log("iframe srcκ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")
            return "λ³Έλ¬Έ iframe의 srcλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."

        # 3) The iframe src may be relative; resolve it to an absolute URL.
        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
        debug_log(f"iframe νŽ˜μ΄μ§€ μš”μ²­ URL: {parsed_iframe_url}")

        # 4) Fetch the iframe document (same timeout rationale as above).
        iframe_response = requests.get(parsed_iframe_url, headers=headers, timeout=10)
        debug_log("HTTP GET μš”μ²­(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")

        if iframe_response.status_code != 200:
            debug_log(f"iframe μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {iframe_response.status_code}")
            return f"iframeμ—μ„œ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {iframe_response.status_code}"

        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
        debug_log("HTML νŒŒμ‹±(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")

        # Title extraction (SmartEditor title module).
        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
        title = title_div.get_text(strip=True) if title_div else "제λͺ©μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
        debug_log(f"μΆ”μΆœλœ 제λͺ©: {title}")

        # Body extraction (SmartEditor main container).
        content_div = iframe_soup.select_one('.se-main-container')
        if content_div:
            content = content_div.get_text("\n", strip=True)
        else:
            content = "본문을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
        debug_log("λ³Έλ¬Έ μΆ”μΆœ μ™„λ£Œ")

        # Combine title and body into the single string callers expect.
        result = f"[제λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
        debug_log("제λͺ©κ³Ό 본문을 합쳐 λ°˜ν™˜ μ€€λΉ„ μ™„λ£Œ")
        return result

    except Exception as e:
        # Broad catch is deliberate: this is a UI-facing boundary and must
        # return a displayable message rather than crash the Gradio app.
        debug_log(f"μ—λŸ¬ λ°œμƒ: {str(e)}")
        return f"μŠ€ν¬λž˜ν•‘ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
95
+
96
# =============================================================================
# [Reference code 1]: morphological analysis (via Mecab)
# =============================================================================
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

def analyze_text(text: str):
    """Count noun frequencies in Korean *text* via Mecab POS tagging.

    Pipeline: strip everything except Hangul syllables, POS-tag the result
    with Mecab, keep tokens whose tag starts with "NN" (nouns), and
    tabulate their frequencies.

    Returns:
        (DataFrame, str): a two-column frequency table sorted by descending
        count, plus the path of a temporary .xlsx dump of the same table.
        When no Korean text remains after filtering, returns an empty
        DataFrame and an empty string instead.
    """
    logger.debug("원본 ν…μŠ€νŠΈ: %s", text)

    # 1. Keep Korean characters only (spaces, Latin, digits, symbols dropped).
    filtered_text = re.sub(r'[^κ°€-힣]', '', text)
    logger.debug("ν•„ν„°λ§λœ ν…μŠ€νŠΈ (ν•œκ΅­μ–΄λ§Œ, 곡백 제거): %s", filtered_text)

    if not filtered_text:
        logger.debug("μœ νš¨ν•œ ν•œκ΅­μ–΄ ν…μŠ€νŠΈκ°€ μ—†μŒ.")
        return pd.DataFrame(columns=["단어", "λΉˆλ„μˆ˜"]), ""

    # 2. Mecab POS tagging; only nouns ("NN*" tags) are counted.
    mecab_instance = mecab.MeCab()
    tokens = mecab_instance.pos(filtered_text)
    logger.debug("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό: %s", tokens)

    # FIX (idiom): Counter replaces the hand-rolled dict-based counting.
    freq = Counter()
    for word, pos in tokens:
        if word and word.strip() and pos.startswith("NN"):
            freq[word] += 1
            logger.debug("단어: %s, ν’ˆμ‚¬: %s, ν˜„μž¬ λΉˆλ„: %d", word, pos, freq[word])

    # 3. Descending-frequency order; most_common() matches the original
    #    stable sort (count desc, first-seen order for ties).
    sorted_freq = freq.most_common()
    logger.debug("λ‚΄λ¦Όμ°¨μˆœ μ •λ ¬λœ 단어 λΉˆλ„: %s", sorted_freq)

    # 4. Build the result DataFrame.
    df = pd.DataFrame(sorted_freq, columns=["단어", "λΉˆλ„μˆ˜"])
    logger.debug("κ²°κ³Ό DataFrame 생성됨, shape: %s", df.shape)

    # 5. Dump to a temporary Excel file (delete=False keeps it on disk for
    #    the Gradio download widget).
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    df.to_excel(temp_file.name, index=False, engine='openpyxl')
    temp_file.close()
    logger.debug("Excel 파일 생성됨: %s", temp_file.name)

    return df, temp_file.name
140
+
141
# =============================================================================
# [Reference code 2]: helpers for the Naver search-ad API authentication
# =============================================================================
def generate_signature(timestamp, method, uri, secret_key):
    """Return the base64-encoded HMAC-SHA256 signature the Naver ad API expects."""
    raw = ".".join([timestamp, method, uri])
    mac = hmac.new(secret_key.encode("utf-8"), raw.encode("utf-8"), hashlib.sha256)
    return base64.b64encode(mac.digest()).decode()

def get_header(method, uri, api_key, secret_key, customer_id):
    """Assemble the signed request headers for a Naver search-ad API call."""
    timestamp = str(round(time.time() * 1000))
    return {
        "Content-Type": "application/json; charset=UTF-8",
        "X-Timestamp": timestamp,
        "X-API-KEY": api_key,
        "X-Customer": str(customer_id),
        "X-Signature": generate_signature(timestamp, method, uri, secret_key),
    }
159
+
160
def fetch_related_keywords(keyword):
    """Query the Naver search-ad keyword tool for *keyword* statistics.

    Credentials are read from the NAVER_API_KEY / NAVER_SECRET_KEY /
    NAVER_CUSTOMER_ID environment variables (KeyError when unset, same as
    before).

    Returns:
        DataFrame with the keyword column plus PC / mobile / total monthly
        search volumes, capped at 100 rows; an empty DataFrame when the
        API response has no "keywordList".
    """
    API_KEY = os.environ["NAVER_API_KEY"]
    SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
    CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]

    BASE_URL = "https://api.naver.com"
    uri = "/keywordstool"
    method = "GET"
    headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
    params = {
        "hintKeywords": [keyword],
        "showDetail": "1"
    }
    # FIX: timeout added — a stalled API call must not hang the pipeline.
    response = requests.get(BASE_URL + uri, params=params, headers=headers, timeout=10)
    data = response.json()
    if "keywordList" not in data:
        return pd.DataFrame()
    df = pd.DataFrame(data["keywordList"])
    if len(df) > 100:
        df = df.head(100)

    def parse_count(x):
        # Counts may arrive as strings with thousands separators; anything
        # non-numeric (presumably values like "< 10" — TODO confirm against
        # the API) counts as 0.
        try:
            return int(str(x).replace(",", ""))
        except (ValueError, TypeError):  # FIX: was a bare except:
            return 0

    df["PCμ›”κ²€μƒ‰λŸ‰"] = df["monthlyPcQcCnt"].apply(parse_count)
    df["λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰"] = df["monthlyMobileQcCnt"].apply(parse_count)
    df["ν† νƒˆμ›”κ²€μƒ‰λŸ‰"] = df["PCμ›”κ²€μƒ‰λŸ‰"] + df["λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰"]
    df.rename(columns={"relKeyword": "μ •λ³΄ν‚€μ›Œλ“œ"}, inplace=True)
    result_df = df[["μ •λ³΄ν‚€μ›Œλ“œ", "PCμ›”κ²€μƒ‰λŸ‰", "λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", "ν† νƒˆμ›”κ²€μƒ‰λŸ‰"]]
    return result_df
193
+
194
def fetch_blog_count(keyword):
    """Return the total count of Naver blog posts matching *keyword*.

    Uses the Naver Open API blog-search endpoint; credentials come from the
    NAVER_SEARCH_CLIENT_ID / NAVER_SEARCH_CLIENT_SECRET environment
    variables. Returns 0 on any non-200 response (best-effort, as before).
    """
    client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
    client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
    url = "https://openapi.naver.com/v1/search/blog.json"
    headers = {
        "X-Naver-Client-Id": client_id,
        "X-Naver-Client-Secret": client_secret
    }
    params = {"query": keyword, "display": 1}
    # FIX: timeout added — this is called once per keyword row via
    # DataFrame.apply, so a single hung request would stall the whole table.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    if response.status_code == 200:
        data = response.json()
        return data.get("total", 0)
    else:
        return 0
209
+
210
def create_excel_file(df):
    """Write *df* to a temporary .xlsx file and return the file's path."""
    # delete=False: the file must outlive this function so Gradio can
    # serve it as a download.
    tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
    excel_path = tmp.name
    tmp.close()
    df.to_excel(excel_path, index=False)
    return excel_path
215
+
216
def process_keyword(keywords: str, include_related: bool):
    """Look up search volumes and blog-post counts for newline-separated keywords.

    Each line of *keywords* is queried against the Naver ad API; for the
    first keyword only, related keywords are appended when
    *include_related* is true. Every resulting keyword row then gets a
    blog-document count, and the table is sorted by total monthly volume.

    Returns:
        (DataFrame, str): the result table and the path of its Excel dump.
    """
    requested = [line.strip() for line in keywords.splitlines() if line.strip()]
    collected = []

    for position, kw in enumerate(requested):
        df_kw = fetch_related_keywords(kw)
        if df_kw.empty:
            continue
        # Prefer the exact-match row; fall back to the API's top row.
        exact = df_kw[df_kw["μ •λ³΄ν‚€μ›Œλ“œ"] == kw]
        collected.append(exact if not exact.empty else df_kw.head(1))
        # Related keywords are appended only for the very first input.
        if include_related and position == 0:
            related = df_kw[df_kw["μ •λ³΄ν‚€μ›Œλ“œ"] != kw]
            if not related.empty:
                collected.append(related)

    if collected:
        result_df = pd.concat(collected, ignore_index=True)
        result_df.drop_duplicates(subset=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)
    else:
        result_df = pd.DataFrame(columns=["μ •λ³΄ν‚€μ›Œλ“œ", "PCμ›”κ²€μƒ‰λŸ‰", "λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", "ν† νƒˆμ›”κ²€μƒ‰λŸ‰"])

    result_df["λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜"] = result_df["μ •λ³΄ν‚€μ›Œλ“œ"].apply(fetch_blog_count)
    result_df.sort_values(by="ν† νƒˆμ›”κ²€μƒ‰λŸ‰", ascending=False, inplace=True)

    return result_df, create_excel_file(result_df)
250
+
251
# =============================================================================
# Combined pipeline: run morphological analysis on the blog text, then attach
# each extracted keyword's search volume and blog-document count.
# =============================================================================
def process_blog_content(text: str):
    """Analyze *text* and return (merged DataFrame, Excel file path)."""
    debug_log("process_blog_content ν•¨μˆ˜ μ‹œμž‘")

    # Step 1: noun-frequency analysis (the per-step Excel dump is unused here).
    df_morph, _ = analyze_text(text)
    debug_log("ν˜•νƒœμ†Œ 뢄석 μ™„λ£Œ")

    if df_morph.empty:
        debug_log("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Όκ°€ λΉ„μ–΄μžˆμŒ")
        return df_morph, ""

    # Step 2: feed every extracted word (newline-separated) to the keyword lookup.
    word_lines = "\n".join(df_morph["단어"].tolist())
    debug_log(f"μΆ”μΆœλœ 단어 λͺ©λ‘: {word_lines}")

    # Step 3: search volumes + blog-document counts per word.
    df_keyword, _ = process_keyword(word_lines, include_related=False)
    debug_log("ν‚€μ›Œλ“œ 검색 정보 쑰회 μ™„λ£Œ")

    # Step 4: left-join keyword stats onto the frequency table by word.
    df_merged = df_morph.merge(df_keyword, left_on="단어", right_on="μ •λ³΄ν‚€μ›Œλ“œ", how="left")
    debug_log("데이터 병합 μ™„λ£Œ")
    df_merged.drop(columns=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)

    # Step 5: dump the merged table to Excel for the download widget.
    merged_excel = create_excel_file(df_merged)
    debug_log(f"병합 κ²°κ³Ό Excel 파일 생성됨: {merged_excel}")

    return df_merged, merged_excel
283
+
284
# =============================================================================
# Gradio interface (Hugging Face Spaces environment)
# NOTE(review): block nesting reconstructed from whitespace-mangled source —
# confirm widget placement (Row vs Tab level) matches the deployed layout.
# =============================================================================
with gr.Blocks() as demo:
    gr.Markdown("# λΈ”λ‘œκ·Έ κΈ€ ν˜•νƒœμ†Œ 뢄석 및 ν‚€μ›Œλ“œ 정보 쑰회")

    # Tab 1: fetch (or paste) the blog content to analyze.
    with gr.Tab("λΈ”λ‘œκ·Έ λ‚΄μš© μž…λ ₯ 및 μŠ€ν¬λž˜ν•‘"):
        with gr.Row():
            blog_url = gr.Textbox(label="넀이버 λΈ”λ‘œκ·Έ 링크", placeholder="예: https://blog.naver.com/ssboost/222983068507")
            fetch_button = gr.Button("λΈ”λ‘œκ·Έλ‚΄μš©κ°€μ Έμ˜€κΈ°")
        blog_content = gr.Textbox(label="λΈ”λ‘œκ·Έ λ‚΄μš© (제λͺ© 및 λ³Έλ¬Έ)", lines=10, placeholder="λΈ”λ‘œκ·Έ λ‚΄μš©μ„ κ°€μ Έμ˜€κ±°λ‚˜ 직접 μž…λ ₯ν•˜μ„Έμš”.")
        # Clicking the fetch button scrapes the URL and fills blog_content.
        fetch_button.click(fn=scrape_naver_blog, inputs=blog_url, outputs=blog_content)

    # Tab 2: run the analysis pipeline and expose the results.
    with gr.Tab("ν˜•νƒœμ†Œ 뢄석 μ‹€ν–‰"):
        with gr.Row():
            analysis_button = gr.Button("ν˜•νƒœμ†ŒλΆ„μ„")
        # interactive=True lets the user edit the result table in place.
        output_table = gr.Dataframe(label="뢄석 κ²°κ³Ό (ν˜•νƒœμ†Œ 및 ν‚€μ›Œλ“œ 정보)", interactive=True)
        output_file = gr.File(label="Excel λ‹€μš΄λ‘œλ“œ")
        # Run process_blog_content on click; fills both table and download.
        analysis_button.click(fn=process_blog_content, inputs=blog_content, outputs=[output_table, output_file])
306
+
307
# Script entry point: launch the Gradio server (blocks until it stops).
if __name__ == "__main__":
    debug_log("Gradio μ•± μ‹€ν–‰ μ‹œμž‘")
    demo.launch()
    debug_log("Gradio μ•± μ‹€ν–‰ μ’…λ£Œ")