Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -126,7 +126,7 @@ def analyze_text(text: str):
|
|
| 126 |
|
| 127 |
return df, temp_file.name
|
| 128 |
|
| 129 |
-
# [μ°Έμ‘°μ½λ-2] λ€μ΄λ² κ΄κ³ API
|
| 130 |
def generate_signature(timestamp, method, uri, secret_key):
|
| 131 |
message = f"{timestamp}.{method}.{uri}"
|
| 132 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
@@ -143,7 +143,6 @@ def get_header(method, uri, api_key, secret_key, customer_id):
|
|
| 143 |
"X-Signature": signature
|
| 144 |
}
|
| 145 |
|
| 146 |
-
# κΈ°μ‘΄ λ¨μΌ ν€μλμ© ν¨μ (μ°Έκ³ μ©)
|
| 147 |
def fetch_related_keywords(keyword):
|
| 148 |
debug_log(f"fetch_related_keywords νΈμΆ, ν€μλ: {keyword}")
|
| 149 |
API_KEY = os.environ["NAVER_API_KEY"]
|
|
@@ -180,83 +179,6 @@ def fetch_related_keywords(keyword):
|
|
| 180 |
debug_log("fetch_related_keywords μλ£")
|
| 181 |
return result_df
|
| 182 |
|
| 183 |
-
# μ κ· μΆκ°: ν€μλ 10κ°μ© κ·Έλ£ΉμΌλ‘ λ¬Άμ΄ ν λ²μ API νΈμΆμ νλ ν¨μ
|
| 184 |
-
# (λ¨, κ° κ·Έλ£Ήμ μμ°¨μ μΌλ‘ νΈμΆλ¨)
|
| 185 |
-
def fetch_related_keywords_batch(keywords: list):
    """Fetch search-volume stats for up to 10 hint keywords in one Naver Ads API call.

    Args:
        keywords: list of hint keywords (the API accepts at most 10 per call;
            the caller is expected to pre-chunk — TODO confirm against caller).

    Returns:
        DataFrame with columns ["μ 보ν€μλ", "PCμκ²μλ", "λͺ¨λ°μΌμκ²μλ", "ν νμκ²μλ"],
        capped at 100 rows; empty DataFrame when the API response has no
        "keywordList" key.
    """
    debug_log(f"fetch_related_keywords_batch νΈμΆ, ν€μλ κ·Έλ£Ή: {keywords}")
    # Credentials come from the environment; KeyError here means the Space
    # secrets are not configured.
    API_KEY = os.environ["NAVER_API_KEY"]
    SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
    CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]

    BASE_URL = "https://api.naver.com"
    uri = "/keywordstool"
    method = "GET"
    headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
    params = {
        "hintKeywords": keywords,  # passed through as a list (max 10)
        "showDetail": "1"
    }
    response = requests.get(BASE_URL + uri, params=params, headers=headers)
    data = response.json()
    if "keywordList" not in data:
        return pd.DataFrame()
    # head(100) is already a no-op on shorter frames; no length guard needed.
    df = pd.DataFrame(data["keywordList"]).head(100)

    def parse_count(x):
        """Parse a count that may arrive as '1,234' or as the string '< 10'."""
        try:
            return int(str(x).replace(",", ""))
        except (ValueError, TypeError):
            # FIX: was a bare `except:` that also swallowed KeyboardInterrupt etc.
            return 0

    df["PCμκ²μλ"] = df["monthlyPcQcCnt"].apply(parse_count)
    df["λͺ¨λ°μΌμκ²μλ"] = df["monthlyMobileQcCnt"].apply(parse_count)
    df["ν νμκ²μλ"] = df["PCμκ²μλ"] + df["λͺ¨λ°μΌμκ²μλ"]
    df.rename(columns={"relKeyword": "μ 보ν€μλ"}, inplace=True)
    result_df = df[["μ 보ν€μλ", "PCμκ²μλ", "λͺ¨λ°μΌμκ²μλ", "ν νμκ²μλ"]]
    debug_log("fetch_related_keywords_batch μλ£")
    return result_df
|
| 220 |
-
|
| 221 |
-
# process_keyword ν¨μλ₯Ό κ·Έλ£Ήλ³λ‘(κ° κ·Έλ£Ήμ μμ°¨μ μΌλ‘) μ²λ¦¬νλλ‘ κ°μ
|
| 222 |
-
def process_keyword(keywords: str, include_related: bool):
    """Resolve search volume and blog-post counts for newline-separated keywords.

    Keywords are chunked into batches of 10 and each batch is sent to the Ads
    API sequentially (never concurrently). Related keywords are appended only
    for the very first batch, excluding its first keyword.

    Returns:
        (result DataFrame sorted by total search volume desc, path to an Excel export)
    """
    debug_log(f"process_keyword νΈμΆ, ν€μλλ€: {keywords}, μ°κ΄κ²μμ΄ ν¬ν¨: {include_related}")
    input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
    groups = [input_keywords[i:i + 10] for i in range(0, len(input_keywords), 10)]
    collected = []

    # Process each batch one after another (no parallel API calls).
    for idx, group in enumerate(groups):
        debug_log(f"κ·Έλ£Ή {idx+1} μ²λ¦¬ μμ: {group}")
        df_batch = fetch_related_keywords_batch(group)
        if df_batch.empty:
            continue
        # Pull out one row per requested keyword; fall back to the batch's
        # first row when the API did not echo the keyword back.
        for kw in group:
            match = df_batch[df_batch["μ 보ν€μλ"] == kw]
            collected.append(match if not match.empty else df_batch.head(1))
        # Related keywords: first batch only, first keyword excluded.
        if include_related and idx == 0:
            related = df_batch[df_batch["μ 보ν€μλ"] != group[0]]
            if not related.empty:
                collected.append(related)
        debug_log(f"κ·Έλ£Ή {idx+1} μ²λ¦¬ μλ£")

    if not collected:
        result_df = pd.DataFrame(columns=["μ 보ν€μλ", "PCμκ²μλ", "λͺ¨λ°μΌμκ²μλ", "ν νμκ²μλ"])
    else:
        result_df = pd.concat(collected, ignore_index=True)
        result_df.drop_duplicates(subset=["μ 보ν€μλ"], inplace=True)

    result_df["λΈλ‘κ·Έλ¬Έμμ"] = result_df["μ 보ν€μλ"].apply(fetch_blog_count)
    result_df.sort_values(by="ν νμκ²μλ", ascending=False, inplace=True)
    debug_log("process_keyword μλ£")
    return result_df, create_excel_file(result_df)
|
| 259 |
-
|
| 260 |
def fetch_blog_count(keyword):
|
| 261 |
debug_log(f"fetch_blog_count νΈμΆ, ν€μλ: {keyword}")
|
| 262 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
|
@@ -283,6 +205,36 @@ def create_excel_file(df):
|
|
| 283 |
debug_log(f"Excel νμΌ μμ±λ¨: {excel_path}")
|
| 284 |
return excel_path
|
| 285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
# [μ°Έμ‘°μ½λ-1] λ° [μ°Έμ‘°μ½λ-2]λ₯Ό νμ©ν ννμ λΆμ λ° κ²μλ, λΈλ‘κ·Έλ¬Έμμ μΆκ° (λΉλμ1 μ κ±° μ΅μ
ν¬ν¨)
|
| 287 |
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
| 288 |
debug_log("morphological_analysis_and_enrich ν¨μ μμ")
|
|
@@ -313,7 +265,32 @@ def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
|
| 313 |
debug_log("morphological_analysis_and_enrich ν¨μ μλ£")
|
| 314 |
return merged_df, merged_excel_path
|
| 315 |
|
| 316 |
-
# μλ‘κ² μΆκ°λ κΈ°λ₯: μ
λ ₯ν
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
def fetch_blog_content(url: str):
|
| 318 |
debug_log("fetch_blog_content ν¨μ μμ")
|
| 319 |
content = scrape_naver_blog(url)
|
|
@@ -325,23 +302,29 @@ with gr.Blocks(title="λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μ€νμ΄μ€", css=".
|
|
| 325 |
gr.Markdown("# λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μ€νμ΄μ€")
|
| 326 |
with gr.Row():
|
| 327 |
blog_url_input = gr.Textbox(label="λ€μ΄λ² λΈλ‘κ·Έ λ§ν¬", placeholder="μ: https://blog.naver.com/ssboost/222983068507", lines=1)
|
| 328 |
-
with gr.Row():
|
| 329 |
scrape_button = gr.Button("μ€ν¬λν μ€ν")
|
| 330 |
with gr.Row():
|
| 331 |
blog_content_box = gr.Textbox(label="λΈλ‘κ·Έ λ΄μ© (μμ κ°λ₯)", lines=10, placeholder="μ€ν¬λνλ λΈλ‘κ·Έ λ΄μ©μ΄ μ¬κΈ°μ νμλ©λλ€.")
|
| 332 |
with gr.Row():
|
| 333 |
remove_freq_checkbox = gr.Checkbox(label="λΉλμ1 μ κ±°", value=False)
|
|
|
|
|
|
|
| 334 |
with gr.Row():
|
| 335 |
analyze_button = gr.Button("λΆμ μ€ν")
|
|
|
|
| 336 |
with gr.Row():
|
| 337 |
-
|
|
|
|
|
|
|
| 338 |
with gr.Row():
|
| 339 |
-
|
|
|
|
| 340 |
|
| 341 |
-
# μ€ν¬λν μ€ν
|
| 342 |
scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
|
| 343 |
-
# λΆμ μ€ν
|
| 344 |
-
analyze_button.click(fn=
|
|
|
|
| 345 |
|
| 346 |
if __name__ == "__main__":
|
| 347 |
debug_log("Gradio μ± μ€ν μμ")
|
|
|
|
| 126 |
|
| 127 |
return df, temp_file.name
|
| 128 |
|
| 129 |
+
# [μ°Έμ‘°μ½λ-2] λ€μ΄λ² κ΄κ³ API λ° κ²μλ/λΈλ‘κ·Έλ¬Έμμ μ‘°ν κΈ°λ₯
|
| 130 |
def generate_signature(timestamp, method, uri, secret_key):
|
| 131 |
message = f"{timestamp}.{method}.{uri}"
|
| 132 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
|
|
| 143 |
"X-Signature": signature
|
| 144 |
}
|
| 145 |
|
|
|
|
| 146 |
def fetch_related_keywords(keyword):
|
| 147 |
debug_log(f"fetch_related_keywords νΈμΆ, ν€μλ: {keyword}")
|
| 148 |
API_KEY = os.environ["NAVER_API_KEY"]
|
|
|
|
| 179 |
debug_log("fetch_related_keywords μλ£")
|
| 180 |
return result_df
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
def fetch_blog_count(keyword):
|
| 183 |
debug_log(f"fetch_blog_count νΈμΆ, ν€μλ: {keyword}")
|
| 184 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
|
|
|
| 205 |
debug_log(f"Excel νμΌ μμ±λ¨: {excel_path}")
|
| 206 |
return excel_path
|
| 207 |
|
| 208 |
+
def process_keyword(keywords: str, include_related: bool):
    """Look up search volume and blog-post counts for newline-separated keywords.

    Each keyword is queried one at a time via fetch_related_keywords. Related
    keywords are merged in only for the first input keyword.

    Returns:
        (result DataFrame sorted by total search volume desc, path to an Excel export)
    """
    debug_log(f"process_keyword νΈμΆ, ν€μλλ€: {keywords}, μ°κ΄κ²μμ΄ ν¬ν¨: {include_related}")
    input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
    frames = []

    for idx, kw in enumerate(input_keywords):
        df_kw = fetch_related_keywords(kw)
        if df_kw.empty:
            continue
        # Keep the exact-match row; if the API did not echo the keyword,
        # fall back to the response's first row.
        exact = df_kw[df_kw["μ 보ν€μλ"] == kw]
        frames.append(exact if not exact.empty else df_kw.head(1))
        # Related keywords are attached for the first keyword only.
        if include_related and idx == 0:
            related = df_kw[df_kw["μ 보ν€μλ"] != kw]
            if not related.empty:
                frames.append(related)

    if not frames:
        result_df = pd.DataFrame(columns=["μ 보ν€μλ", "PCμκ²μλ", "λͺ¨λ°μΌμκ²μλ", "ν νμκ²μλ"])
    else:
        result_df = pd.concat(frames, ignore_index=True)
        result_df.drop_duplicates(subset=["μ 보ν€μλ"], inplace=True)

    result_df["λΈλ‘κ·Έλ¬Έμμ"] = result_df["μ 보ν€μλ"].apply(fetch_blog_count)
    result_df.sort_values(by="ν νμκ²μλ", ascending=False, inplace=True)
    debug_log("process_keyword μλ£")
    return result_df, create_excel_file(result_df)
|
| 237 |
+
|
| 238 |
# [μ°Έμ‘°μ½λ-1] λ° [μ°Έμ‘°μ½λ-2]λ₯Ό νμ©ν ννμ λΆμ λ° κ²μλ, λΈλ‘κ·Έλ¬Έμμ μΆκ° (λΉλμ1 μ κ±° μ΅μ
ν¬ν¨)
|
| 239 |
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
| 240 |
debug_log("morphological_analysis_and_enrich ν¨μ μμ")
|
|
|
|
| 265 |
debug_log("morphological_analysis_and_enrich ν¨μ μλ£")
|
| 266 |
return merged_df, merged_excel_path
|
| 267 |
|
| 268 |
+
# μλ‘κ² μΆκ°λ κΈ°λ₯ 1,2,3: μ§μ μ
λ ₯ν ν€μλ(μν° λλ ','λ‘ κ΅¬λΆλ λ€μμ ν€μλ)κ° λΈλ‘κ·Έ λ³Έλ¬Έ λ΄ λ±μ₯ λΉλμλ₯Ό 체ν¬
|
| 269 |
+
def direct_keyword_analysis(text: str, keyword_input: str):
    """Count occurrences of user-supplied keywords inside the blog body.

    Args:
        text: blog body to search.
        keyword_input: keywords separated by newlines and/or commas.

    Returns:
        (DataFrame with columns ["ν€μλ", "λΉλμ"], path to an Excel export)
    """
    debug_log("direct_keyword_analysis ν¨μ μμ")
    # Split on newlines or commas, dropping blanks and surrounding whitespace.
    tokens = (t.strip() for t in re.split(r'[\n,]+', keyword_input))
    keywords = [t for t in tokens if t]
    debug_log(f"μλ ₯λ ν€μλ λͺ©λ‘: {keywords}")
    rows = []
    for kw in keywords:
        # str.count is non-overlapping substring counting, same as before.
        freq = text.count(kw)
        rows.append((kw, freq))
        debug_log(f"ν€μλ '{kw}'μ λΉλμ: {freq}")
    df = pd.DataFrame(rows, columns=["ν€μλ", "λΉλμ"])
    excel_path = create_excel_file(df)
    debug_log("direct_keyword_analysis ν¨μ μλ£")
    return df, excel_path
|
| 284 |
+
|
| 285 |
+
# λΆμ μ€ν λ²νΌ ν΄λ¦ μ, μμ κ°λ₯ν λΈλ‘κ·Έ λ³Έλ¬Έμ λμμΌλ‘ ννμ λΆμκ³Ό μ§μ ν€μλ λΆμμ ν¨κ» μ§ν
|
| 286 |
+
def analyze_combined(blog_text: str, remove_freq1: bool, keyword_input: str):
    """Run morphological analysis and direct keyword counting on the same text.

    Returns:
        (morph DataFrame, morph Excel path, direct DataFrame, direct Excel path)
    """
    debug_log("analyze_combined ν¨μ μμ")
    morph = morphological_analysis_and_enrich(blog_text, remove_freq1)
    direct = direct_keyword_analysis(blog_text, keyword_input)
    debug_log("analyze_combined ν¨μ μλ£")
    return morph[0], morph[1], direct[0], direct[1]
|
| 292 |
+
|
| 293 |
+
# μ€ν¬λν μ€ν: λΈλ‘κ·Έ λ§ν¬λ₯Ό ν΅ν΄ λ΄μ©μ κ°μ Έμ μμ κ°λ₯ν ν
μ€νΈ λ°μ€μ μΆλ ₯
|
| 294 |
def fetch_blog_content(url: str):
|
| 295 |
debug_log("fetch_blog_content ν¨μ μμ")
|
| 296 |
content = scrape_naver_blog(url)
|
|
|
|
| 302 |
gr.Markdown("# λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μ€νμ΄μ€")
|
| 303 |
with gr.Row():
|
| 304 |
blog_url_input = gr.Textbox(label="λ€μ΄λ² λΈλ‘κ·Έ λ§ν¬", placeholder="μ: https://blog.naver.com/ssboost/222983068507", lines=1)
|
|
|
|
| 305 |
scrape_button = gr.Button("μ€ν¬λν μ€ν")
|
| 306 |
with gr.Row():
|
| 307 |
blog_content_box = gr.Textbox(label="λΈλ‘κ·Έ λ΄μ© (μμ κ°λ₯)", lines=10, placeholder="μ€ν¬λνλ λΈλ‘κ·Έ λ΄μ©μ΄ μ¬κΈ°μ νμλ©λλ€.")
|
| 308 |
with gr.Row():
|
| 309 |
remove_freq_checkbox = gr.Checkbox(label="λΉλμ1 μ κ±°", value=False)
|
| 310 |
+
with gr.Row():
|
| 311 |
+
keyword_input_box = gr.Textbox(label="μ§μ ν€μλ μ
λ ₯ (μν° λλ ','λ‘ κ΅¬λΆ)", lines=2, placeholder="μ: ν€μλ1, ν€μλ2\nν€μλ3")
|
| 312 |
with gr.Row():
|
| 313 |
analyze_button = gr.Button("λΆμ μ€ν")
|
| 314 |
+
with gr.Markdown("### ννμ λΆμ κ²°κ³Ό")
|
| 315 |
with gr.Row():
|
| 316 |
+
morph_result_df = gr.Dataframe(label="ννμ λΆμ κ²°κ³Ό (λ¨μ΄, λΉλμ, κ²μλ, λΈλ‘κ·Έλ¬Έμμ λ±)")
|
| 317 |
+
morph_excel_file = gr.File(label="ννμ λΆμ Excel λ€μ΄λ‘λ")
|
| 318 |
+
with gr.Markdown("### μ§μ ν€μλ λΆμ κ²°κ³Ό")
|
| 319 |
with gr.Row():
|
| 320 |
+
direct_result_df = gr.Dataframe(label="μ§μ ν€μλ λΆμ κ²°κ³Ό (ν€μλ, λΉλμ)")
|
| 321 |
+
direct_excel_file = gr.File(label="μ§μ ν€μλ λΆμ Excel λ€μ΄λ‘λ")
|
| 322 |
|
| 323 |
+
# μ€ν¬λν μ€ν: URLμ μ
λ ₯νλ©΄ λΈλ‘κ·Έ λ΄μ©μ μμ κ°λ₯ν ν
μ€νΈ λ°μ€μ μ±μμ€
|
| 324 |
scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
|
| 325 |
+
# λΆμ μ€ν: μμ λ λΈλ‘κ·Έ λ΄μ©κ³Ό λΉλμ1 μ κ±° μ΅μ
, μ§μ μ
λ ₯ ν€μλλ₯Ό λμμΌλ‘ λ λΆμμ ν¨κ» μ§ν
|
| 326 |
+
analyze_button.click(fn=analyze_combined, inputs=[blog_content_box, remove_freq_checkbox, keyword_input_box],
|
| 327 |
+
outputs=[morph_result_df, morph_excel_file, direct_result_df, direct_excel_file])
|
| 328 |
|
| 329 |
if __name__ == "__main__":
|
| 330 |
debug_log("Gradio μ± μ€ν μμ")
|