Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,14 +17,11 @@ import base64
|
|
| 17 |
def debug_log(message: str) -> None:
    """Print *message* to stdout prefixed with ``[DEBUG]``.

    Args:
        message: Text to emit; it is interpolated verbatim after the prefix.
    """
    print(f"[DEBUG] {message}")
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
# [๊ธฐ๋ณธ์ฝ๋]: ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ์์ ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ์ถ์ถํ๋ ํจ์
|
| 22 |
-
# =============================================================================
|
| 23 |
def scrape_naver_blog(url: str) -> str:
|
| 24 |
debug_log("scrape_naver_blog ํจ์ ์์")
|
| 25 |
debug_log(f"์์ฒญ๋ฐ์ URL: {url}")
|
| 26 |
|
| 27 |
-
# ํค๋ ์ธํ
(ํฌ๋กค๋ง ์ฐจ๋จ ๋ฐฉ์ง)
|
| 28 |
headers = {
|
| 29 |
"User-Agent": (
|
| 30 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
@@ -34,57 +31,51 @@ def scrape_naver_blog(url: str) -> str:
|
|
| 34 |
}
|
| 35 |
|
| 36 |
try:
|
| 37 |
-
# 1) ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋ฉ์ธ ํ์ด์ง ์์ฒญ
|
| 38 |
response = requests.get(url, headers=headers)
|
| 39 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
| 40 |
-
|
| 41 |
if response.status_code != 200:
|
| 42 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
| 43 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
| 44 |
-
|
|
|
|
| 45 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 46 |
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
| 47 |
-
|
| 48 |
-
#
|
| 49 |
iframe = soup.select_one("iframe#mainFrame")
|
| 50 |
if not iframe:
|
| 51 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 52 |
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 53 |
-
|
| 54 |
iframe_src = iframe.get("src")
|
| 55 |
if not iframe_src:
|
| 56 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
| 57 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 58 |
-
|
| 59 |
-
#
|
| 60 |
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
| 61 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
| 62 |
-
|
| 63 |
-
#
|
| 64 |
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
| 65 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
| 66 |
-
|
| 67 |
if iframe_response.status_code != 200:
|
| 68 |
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
| 69 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
| 70 |
-
|
| 71 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
| 72 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
| 73 |
-
|
| 74 |
-
#
|
| 75 |
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
| 76 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 77 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
| 78 |
-
|
| 79 |
-
# ๋ณธ๋ฌธ ์ถ์ถ
|
| 80 |
content_div = iframe_soup.select_one('.se-main-container')
|
| 81 |
if content_div:
|
| 82 |
content = content_div.get_text("\n", strip=True)
|
| 83 |
else:
|
| 84 |
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 85 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
| 86 |
-
|
| 87 |
-
# ๊ฒฐ๊ณผ ํฉ์น๊ธฐ
|
| 88 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
| 89 |
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ํฉ์ณ ๋ฐํ ์ค๋น ์๋ฃ")
|
| 90 |
return result
|
|
@@ -93,13 +84,10 @@ def scrape_naver_blog(url: str) -> str:
|
|
| 93 |
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
| 94 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
| 95 |
|
| 96 |
-
#
|
| 97 |
-
# [์ฐธ์กฐ์ฝ๋-1]: ํํ์ ๋ถ์ ํจ์ (Mecab ์ด์ฉ)
|
| 98 |
-
# =============================================================================
|
| 99 |
-
logging.basicConfig(level=logging.DEBUG)
|
| 100 |
-
logger = logging.getLogger(__name__)
|
| 101 |
-
|
| 102 |
def analyze_text(text: str):
|
|
|
|
|
|
|
| 103 |
logger.debug("์๋ณธ ํ
์คํธ: %s", text)
|
| 104 |
|
| 105 |
# 1. ํ๊ตญ์ด๋ง ๋จ๊ธฐ๊ธฐ (๊ณต๋ฐฑ, ์์ด, ๊ธฐํธ ๋ฑ ์ ๊ฑฐ)
|
|
@@ -111,7 +99,7 @@ def analyze_text(text: str):
|
|
| 111 |
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
| 112 |
|
| 113 |
# 2. Mecab์ ์ด์ฉํ ํํ์ ๋ถ์ (๋ช
์ฌ์ ๋ณตํฉ๋ช
์ฌ๋ง ์ถ์ถ)
|
| 114 |
-
mecab_instance = mecab.MeCab()
|
| 115 |
tokens = mecab_instance.pos(filtered_text)
|
| 116 |
logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)
|
| 117 |
|
|
@@ -130,7 +118,7 @@ def analyze_text(text: str):
|
|
| 130 |
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
| 131 |
logger.debug("๊ฒฐ๊ณผ DataFrame ์์ฑ๋จ, shape: %s", df.shape)
|
| 132 |
|
| 133 |
-
# 5. Excel ํ์ผ ์์ฑ (์์ ํ์ผ
|
| 134 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
| 135 |
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
| 136 |
temp_file.close()
|
|
@@ -138,9 +126,7 @@ def analyze_text(text: str):
|
|
| 138 |
|
| 139 |
return df, temp_file.name
|
| 140 |
|
| 141 |
-
#
|
| 142 |
-
# [์ฐธ์กฐ์ฝ๋-2]: ํค์๋ ๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ ๋ฌธ์์ ์กฐํ ๊ด๋ จ ํจ์
|
| 143 |
-
# =============================================================================
|
| 144 |
def generate_signature(timestamp, method, uri, secret_key):
|
| 145 |
message = f"{timestamp}.{method}.{uri}"
|
| 146 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
@@ -158,6 +144,7 @@ def get_header(method, uri, api_key, secret_key, customer_id):
|
|
| 158 |
}
|
| 159 |
|
| 160 |
def fetch_related_keywords(keyword):
|
|
|
|
| 161 |
API_KEY = os.environ["NAVER_API_KEY"]
|
| 162 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
| 163 |
CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
|
|
@@ -189,9 +176,11 @@ def fetch_related_keywords(keyword):
|
|
| 189 |
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
| 190 |
df.rename(columns={"relKeyword": "์ ๋ณดํค์๋"}, inplace=True)
|
| 191 |
result_df = df[["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"]]
|
|
|
|
| 192 |
return result_df
|
| 193 |
|
| 194 |
def fetch_blog_count(keyword):
|
|
|
|
| 195 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
| 196 |
client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
|
| 197 |
url = "https://openapi.naver.com/v1/search/blog.json"
|
|
@@ -203,23 +192,21 @@ def fetch_blog_count(keyword):
|
|
| 203 |
response = requests.get(url, headers=headers, params=params)
|
| 204 |
if response.status_code == 200:
|
| 205 |
data = response.json()
|
|
|
|
| 206 |
return data.get("total", 0)
|
| 207 |
else:
|
|
|
|
| 208 |
return 0
|
| 209 |
|
| 210 |
def create_excel_file(df):
    """Write *df* to a new temporary ``.xlsx`` file and return its path.

    Args:
        df: Object exposing ``to_excel(path, index=False)`` (a pandas
            DataFrame in this application).

    Returns:
        Filesystem path of the created file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # Reserve a unique path, then close the handle before writing so the
    # Excel writer can reopen the file on every platform.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
    df.to_excel(excel_path, index=False)
    return excel_path
|
| 215 |
|
| 216 |
def process_keyword(keywords: str, include_related: bool):
|
| 217 |
-
""
|
| 218 |
-
์ฌ๋ฌ ํค์๋๋ฅผ ์ํฐ๋ก ๊ตฌ๋ถํ์ฌ ๋ฆฌ์คํธ๋ก ๋ง๋ค๊ณ ,
|
| 219 |
-
๊ฐ ํค์๋์ ๋ํด ๋ค์ด๋ฒ ๊ด๊ณ API๋ก ๊ฒ์๋ ์ ๋ณด๋ฅผ ์กฐํํ๋ฉฐ,
|
| 220 |
-
์ฒซ ๋ฒ์งธ ํค์๋์ ๊ฒฝ์ฐ ์ต์
์ ๋ฐ๋ผ ์ฐ๊ด๊ฒ์์ด๋ ์ถ๊ฐํ ํ,
|
| 221 |
-
๊ฐ ์ ๋ณดํค์๋์ ๋ํด ๋ธ๋ก๊ทธ ๋ฌธ์์๋ฅผ ์กฐํํ์ฌ DataFrame๊ณผ Excel ํ์ผ์ ๋ฐํํฉ๋๋ค.
|
| 222 |
-
"""
|
| 223 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
| 224 |
result_dfs = []
|
| 225 |
|
|
@@ -245,64 +232,62 @@ def process_keyword(keywords: str, include_related: bool):
|
|
| 245 |
|
| 246 |
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].apply(fetch_blog_count)
|
| 247 |
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
| 248 |
-
|
| 249 |
return result_df, create_excel_file(result_df)
|
| 250 |
|
| 251 |
-
#
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
# 2. ํํ์ ๋ถ์๋ ๋จ์ด ๋ชฉ๋ก ์ถ์ถ (ํค์๋ ์กฐํ์ฉ)
|
| 266 |
-
keywords = "\n".join(df_morph["๋จ์ด"].tolist())
|
| 267 |
-
debug_log(f"์ถ์ถ๋ ๋จ์ด ๋ชฉ๋ก: {keywords}")
|
| 268 |
|
| 269 |
-
#
|
| 270 |
-
|
| 271 |
-
debug_log("
|
| 272 |
|
| 273 |
-
#
|
| 274 |
-
|
| 275 |
-
debug_log("
|
| 276 |
-
df_merged.drop(columns=["์ ๋ณดํค์๋"], inplace=True)
|
| 277 |
|
| 278 |
-
#
|
| 279 |
-
|
| 280 |
-
|
| 281 |
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
-
#
|
| 285 |
-
|
| 286 |
-
#
|
| 287 |
-
with gr.Blocks() as demo:
|
| 288 |
-
gr.Markdown("# ๋ธ๋ก๊ทธ ๊ธ ํํ์ ๋ถ์ ๋ฐ ํค์๋ ์ ๋ณด ์กฐํ")
|
| 289 |
|
| 290 |
-
with gr.Tab("๋ธ๋ก๊ทธ ๋ด์ฉ
|
| 291 |
with gr.Row():
|
| 292 |
-
|
| 293 |
fetch_button = gr.Button("๋ธ๋ก๊ทธ๋ด์ฉ๊ฐ์ ธ์ค๊ธฐ")
|
| 294 |
-
blog_content = gr.Textbox(label="๋ธ๋ก๊ทธ ๋ด์ฉ
|
| 295 |
-
|
| 296 |
-
fetch_button.click(fn=scrape_naver_blog, inputs=blog_url, outputs=blog_content)
|
| 297 |
|
| 298 |
-
with gr.Tab("ํํ์ ๋ถ์
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
with gr.Row():
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
output_table = gr.Dataframe(label="๋ถ์ ๊ฒฐ๊ณผ (ํํ์ ๋ฐ ํค์๋ ์ ๋ณด)", interactive=True)
|
| 303 |
-
output_file = gr.File(label="Excel ๋ค์ด๋ก๋")
|
| 304 |
-
# 'ํํ์๋ถ์' ๋ฒํผ ํด๋ฆญ ์ process_blog_content ํจ์ ์คํ
|
| 305 |
-
analysis_button.click(fn=process_blog_content, inputs=blog_content, outputs=[output_table, output_file])
|
| 306 |
|
| 307 |
if __name__ == "__main__":
|
| 308 |
debug_log("Gradio ์ฑ ์คํ ์์")
|
|
|
|
| 17 |
def debug_log(message: str) -> None:
    """Print *message* to stdout prefixed with ``[DEBUG]``.

    Args:
        message: Text to emit; it is interpolated verbatim after the prefix.
    """
    print(f"[DEBUG] {message}")
|
| 19 |
|
| 20 |
+
# [기본코드] - 네이버 블로그 스크래핑 기능 (base code: Naver blog scraping)
|
|
|
|
|
|
|
| 21 |
def scrape_naver_blog(url: str) -> str:
|
| 22 |
debug_log("scrape_naver_blog ํจ์ ์์")
|
| 23 |
debug_log(f"์์ฒญ๋ฐ์ URL: {url}")
|
| 24 |
|
|
|
|
| 25 |
headers = {
|
| 26 |
"User-Agent": (
|
| 27 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
|
|
| 31 |
}
|
| 32 |
|
| 33 |
try:
|
| 34 |
+
# 1) ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ '๋ฉ์ธ' ํ์ด์ง ์์ฒญ
|
| 35 |
response = requests.get(url, headers=headers)
|
| 36 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
|
|
|
| 37 |
if response.status_code != 200:
|
| 38 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
| 39 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
| 40 |
+
|
| 41 |
+
# 2) ๋ฉ์ธ ํ์ด์ง ํ์ฑ
|
| 42 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 43 |
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
| 44 |
+
|
| 45 |
+
# 3) iframe ํ๊ทธ ์ฐพ๊ธฐ
|
| 46 |
iframe = soup.select_one("iframe#mainFrame")
|
| 47 |
if not iframe:
|
| 48 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 49 |
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."
|
|
|
|
| 50 |
iframe_src = iframe.get("src")
|
| 51 |
if not iframe_src:
|
| 52 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
| 53 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 54 |
+
|
| 55 |
+
# 4) iframe src ๋ณด์ (์ ๋๊ฒฝ๋ก ์ฒ๋ฆฌ)
|
| 56 |
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
| 57 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
| 58 |
+
|
| 59 |
+
# 5) iframe ํ์ด์ง ์์ฒญ ๋ฐ ํ์ฑ
|
| 60 |
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
| 61 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
|
|
|
| 62 |
if iframe_response.status_code != 200:
|
| 63 |
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
| 64 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
|
|
|
| 65 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
| 66 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
| 67 |
+
|
| 68 |
+
# 6) ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ์ถ์ถ
|
| 69 |
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
| 70 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 71 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
|
|
|
|
|
|
| 72 |
content_div = iframe_soup.select_one('.se-main-container')
|
| 73 |
if content_div:
|
| 74 |
content = content_div.get_text("\n", strip=True)
|
| 75 |
else:
|
| 76 |
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
| 77 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
| 78 |
+
|
|
|
|
| 79 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
| 80 |
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ํฉ์ณ ๋ฐํ ์ค๋น ์๋ฃ")
|
| 81 |
return result
|
|
|
|
| 84 |
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
| 85 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
| 86 |
|
| 87 |
+
# [참조코드-1] 형태소 분석 기능 (reference code 1: morphological analysis)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
def analyze_text(text: str):
|
| 89 |
+
logging.basicConfig(level=logging.DEBUG)
|
| 90 |
+
logger = logging.getLogger(__name__)
|
| 91 |
logger.debug("์๋ณธ ํ
์คํธ: %s", text)
|
| 92 |
|
| 93 |
# 1. ํ๊ตญ์ด๋ง ๋จ๊ธฐ๊ธฐ (๊ณต๋ฐฑ, ์์ด, ๊ธฐํธ ๋ฑ ์ ๊ฑฐ)
|
|
|
|
| 99 |
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
| 100 |
|
| 101 |
# 2. Mecab์ ์ด์ฉํ ํํ์ ๋ถ์ (๋ช
์ฌ์ ๋ณตํฉ๋ช
์ฌ๋ง ์ถ์ถ)
|
| 102 |
+
mecab_instance = mecab.MeCab()
|
| 103 |
tokens = mecab_instance.pos(filtered_text)
|
| 104 |
logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)
|
| 105 |
|
|
|
|
| 118 |
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
| 119 |
logger.debug("๊ฒฐ๊ณผ DataFrame ์์ฑ๋จ, shape: %s", df.shape)
|
| 120 |
|
| 121 |
+
# 5. Excel ํ์ผ ์์ฑ (์์ ํ์ผ)
|
| 122 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
| 123 |
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
| 124 |
temp_file.close()
|
|
|
|
| 126 |
|
| 127 |
return df, temp_file.name
|
| 128 |
|
| 129 |
+
# [참조코드-2] 네이버 광고 API 및 검색량/블로그문서수 조회 기능 (reference code 2: Naver Ads API, search volume and blog document count lookup)
|
|
|
|
|
|
|
| 130 |
def generate_signature(timestamp, method, uri, secret_key):
|
| 131 |
message = f"{timestamp}.{method}.{uri}"
|
| 132 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
|
|
| 144 |
}
|
| 145 |
|
| 146 |
def fetch_related_keywords(keyword):
|
| 147 |
+
debug_log(f"fetch_related_keywords ํธ์ถ, ํค์๋: {keyword}")
|
| 148 |
API_KEY = os.environ["NAVER_API_KEY"]
|
| 149 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
| 150 |
CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
|
|
|
|
| 176 |
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
| 177 |
df.rename(columns={"relKeyword": "์ ๋ณดํค์๋"}, inplace=True)
|
| 178 |
result_df = df[["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"]]
|
| 179 |
+
debug_log("fetch_related_keywords ์๋ฃ")
|
| 180 |
return result_df
|
| 181 |
|
| 182 |
def fetch_blog_count(keyword):
|
| 183 |
+
debug_log(f"fetch_blog_count ํธ์ถ, ํค์๋: {keyword}")
|
| 184 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
| 185 |
client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
|
| 186 |
url = "https://openapi.naver.com/v1/search/blog.json"
|
|
|
|
| 192 |
response = requests.get(url, headers=headers, params=params)
|
| 193 |
if response.status_code == 200:
|
| 194 |
data = response.json()
|
| 195 |
+
debug_log(f"fetch_blog_count ๊ฒฐ๊ณผ: {data.get('total', 0)}")
|
| 196 |
return data.get("total", 0)
|
| 197 |
else:
|
| 198 |
+
debug_log(f"fetch_blog_count ์ค๋ฅ, ์ํ์ฝ๋: {response.status_code}")
|
| 199 |
return 0
|
| 200 |
|
| 201 |
def create_excel_file(df):
    """Write *df* to a new temporary ``.xlsx`` file and return its path.

    Args:
        df: Object exposing ``to_excel(path, index=False)`` (a pandas
            DataFrame in this application).

    Returns:
        Filesystem path of the created file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # Reserve a unique path, then close the handle before writing so the
    # Excel writer can reopen the file on every platform.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
    df.to_excel(excel_path, index=False)
    debug_log(f"Excel 파일 생성됨: {excel_path}")
    return excel_path
|
| 207 |
|
| 208 |
def process_keyword(keywords: str, include_related: bool):
|
| 209 |
+
debug_log(f"process_keyword ํธ์ถ, ํค์๋๋ค: {keywords}, ์ฐ๊ด๊ฒ์์ด ํฌํจ: {include_related}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
| 211 |
result_dfs = []
|
| 212 |
|
|
|
|
| 232 |
|
| 233 |
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].apply(fetch_blog_count)
|
| 234 |
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
| 235 |
+
debug_log("process_keyword ์๋ฃ")
|
| 236 |
return result_df, create_excel_file(result_df)
|
| 237 |
|
| 238 |
+
# 새로운 기능: '블로그내용가져오기' 실행 시 블로그 링크로부터 제목/본문 스크래핑 (new feature: scrape title/body from the blog link when the fetch button runs)
|
| 239 |
+
def fetch_blog_content(url: str):
    """Fetch the title and body text of a Naver blog post.

    Thin wrapper around ``scrape_naver_blog`` that logs entry and exit; it is
    the callback bound to the fetch button in the UI.

    Args:
        url: Blog post URL entered by the user.

    Returns:
        Whatever ``scrape_naver_blog(url)`` returns (the scraped text, or an
        error message string).
    """
    debug_log("fetch_blog_content 함수 시작")
    content = scrape_naver_blog(url)
    debug_log("fetch_blog_content 함수 완료")
    return content
|
| 244 |
+
|
| 245 |
+
# 새로운 기능: 형태소 분석 및 검색량, 블로그문서수 추가 (new feature: morphological analysis enriched with search volume and blog document count)
|
| 246 |
+
def morphological_analysis_and_enrich(text: str):
    """Run morphological analysis on *text* and attach keyword statistics.

    The word-frequency table from ``analyze_text`` is left-joined with the
    table from ``process_keyword`` (called without related keywords), and the
    merged table is exported to Excel.

    Args:
        text: Text to analyse.

    Returns:
        A ``(DataFrame, str)`` tuple: the merged table and the path of the
        generated Excel file. When the analysis yields no words, the empty
        frequency table and ``""`` are returned instead.
    """
    debug_log("morphological_analysis_and_enrich 함수 시작")
    df_freq, _ = analyze_text(text)
    if df_freq.empty:
        debug_log("형태소 분석 결과가 빈 데이터프레임입니다.")
        return df_freq, ""

    # process_keyword expects one keyword per line.
    keywords = "\n".join(df_freq["단어"].tolist())
    debug_log(f"분석된 키워드: {keywords}")

    # Look up search volume and blog document count for each word.
    df_keyword_info, _ = process_keyword(keywords, include_related=False)
    debug_log("검색량 및 블로그문서수 조회 완료")

    # Left join keeps every analysed word even when no keyword data exists.
    merged_df = pd.merge(
        df_freq,
        df_keyword_info,
        left_on="단어",
        right_on="정보키워드",
        how="left",
    )
    # The join key from the right-hand table duplicates the word column.
    merged_df.drop(columns=["정보키워드"], inplace=True)

    merged_excel_path = create_excel_file(merged_df)
    debug_log("morphological_analysis_and_enrich 함수 완료")
    return merged_df, merged_excel_path
|
| 269 |
|
| 270 |
+
# Gradio 인터페이스 구성 (Hugging Face Spaces 환경에 적합) (Gradio interface layout, suited to Hugging Face Spaces)
|
| 271 |
+
# Gradio UI with two tabs: one fetches a blog post, the other analyses text.
# NOTE(review): the Row/Tab nesting below is reconstructed from a diff view
# that carried no indentation -- confirm the layout against the running app.
with gr.Blocks(
    title="블로그글 형태소 분석 스페이스",
    css=".gradio-container { max-width: 960px; margin: auto; }",
) as demo:
    gr.Markdown("# 블로그글 형태소 분석 스페이스")

    # Tab 1: scrape a Naver blog post into an editable text box.
    with gr.Tab("블로그 내용 가져오기"):
        with gr.Row():
            blog_url_input = gr.Textbox(
                label="네이버 블로그 링크",
                placeholder="예: https://blog.naver.com/ssboost/222983068507",
                lines=1,
            )
            fetch_button = gr.Button("블로그내용가져오기")
        blog_content = gr.Textbox(
            label="블로그 내용",
            lines=10,
            placeholder="블로그 내용을 가져오거나 직접 입력하세요.",
        )
        fetch_button.click(
            fn=fetch_blog_content,
            inputs=blog_url_input,
            outputs=blog_content,
        )

    # Tab 2: analyse text and show the enriched table plus an Excel download.
    with gr.Tab("형태소 분석"):
        with gr.Row():
            analysis_input = gr.Textbox(
                label="분석할 텍스트",
                lines=10,
                placeholder=(
                    "분석할 텍스트를 입력하거나 "
                    "'블로그 내용 가져오기'에서 가져온 내용을 수정하세요."
                ),
            )
        with gr.Row():
            analyze_button = gr.Button("형태소분석")
        with gr.Row():
            analysis_result = gr.Dataframe(
                label="분석 결과 (단어, 빈도수, 검색량, 블로그문서수 등)"
            )
        with gr.Row():
            analysis_excel = gr.File(label="Excel 다운로드")
        analyze_button.click(
            fn=morphological_analysis_and_enrich,
            inputs=analysis_input,
            outputs=[analysis_result, analysis_excel],
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
if __name__ == "__main__":
|
| 293 |
debug_log("Gradio ์ฑ ์คํ ์์")
|