Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
| 3 |
-
import aiohttp
|
| 4 |
-
import asyncio
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import urllib.parse # iframe κ²½λ‘ λ³΄μ μ μν λͺ¨λ
|
| 7 |
import re
|
|
@@ -19,8 +17,8 @@ import base64
|
|
| 19 |
def debug_log(message: str):
|
| 20 |
print(f"[DEBUG] {message}")
|
| 21 |
|
| 22 |
-
# --- λ€μ΄λ² λΈλ‘κ·Έ μ€ν¬λν
|
| 23 |
-
|
| 24 |
debug_log("scrape_naver_blog ν¨μ μμ")
|
| 25 |
debug_log(f"μμ²λ°μ URL: {url}")
|
| 26 |
headers = {
|
|
@@ -31,51 +29,75 @@ async def scrape_naver_blog(url: str) -> str:
|
|
| 31 |
)
|
| 32 |
}
|
| 33 |
try:
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
debug_log("λ³Έλ¬Έ μΆμΆ μλ£")
|
| 71 |
-
result = f"[μ λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
|
| 72 |
-
debug_log("μ λͺ©κ³Ό λ³Έλ¬Έ ν©μΉ¨ μλ£")
|
| 73 |
-
return result
|
| 74 |
except Exception as e:
|
| 75 |
debug_log(f"μλ¬ λ°μ: {str(e)}")
|
| 76 |
return f"μ€ν¬λν μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}"
|
| 77 |
|
| 78 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
def generate_signature(timestamp, method, uri, secret_key):
|
| 80 |
message = f"{timestamp}.{method}.{uri}"
|
| 81 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
@@ -92,8 +114,7 @@ def get_header(method, uri, api_key, secret_key, customer_id):
|
|
| 92 |
"X-Signature": signature
|
| 93 |
}
|
| 94 |
|
| 95 |
-
|
| 96 |
-
async def fetch_related_keywords(keyword):
|
| 97 |
debug_log(f"fetch_related_keywords νΈμΆ, ν€μλ: {keyword}")
|
| 98 |
API_KEY = os.environ["NAVER_API_KEY"]
|
| 99 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
|
@@ -106,9 +127,8 @@ async def fetch_related_keywords(keyword):
|
|
| 106 |
"hintKeywords": [keyword],
|
| 107 |
"showDetail": "1"
|
| 108 |
}
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
data = await response.json()
|
| 112 |
if "keywordList" not in data:
|
| 113 |
return pd.DataFrame()
|
| 114 |
df = pd.DataFrame(data["keywordList"])
|
|
@@ -127,8 +147,7 @@ async def fetch_related_keywords(keyword):
|
|
| 127 |
debug_log("fetch_related_keywords μλ£")
|
| 128 |
return result_df
|
| 129 |
|
| 130 |
-
|
| 131 |
-
async def fetch_blog_count(keyword):
|
| 132 |
debug_log(f"fetch_blog_count νΈμΆ, ν€μλ: {keyword}")
|
| 133 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
| 134 |
client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
|
|
@@ -138,30 +157,28 @@ async def fetch_blog_count(keyword):
|
|
| 138 |
"X-Naver-Client-Secret": client_secret
|
| 139 |
}
|
| 140 |
params = {"query": keyword, "display": 1}
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
return 0
|
| 150 |
|
| 151 |
def create_excel_file(df):
|
| 152 |
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
|
| 153 |
excel_path = tmp.name
|
| 154 |
-
df.to_excel(excel_path, index=False
|
| 155 |
debug_log(f"Excel νμΌ μμ±λ¨: {excel_path}")
|
| 156 |
return excel_path
|
| 157 |
|
| 158 |
-
|
| 159 |
-
async def process_keyword(keywords: str, include_related: bool):
|
| 160 |
debug_log(f"process_keyword νΈμΆ, ν€μλλ€: {keywords}, μ°κ΄κ²μμ΄ ν¬ν¨: {include_related}")
|
| 161 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
| 162 |
result_dfs = []
|
| 163 |
for idx, kw in enumerate(input_keywords):
|
| 164 |
-
df_kw =
|
| 165 |
if df_kw.empty:
|
| 166 |
continue
|
| 167 |
row_kw = df_kw[df_kw["μ 보ν€μλ"] == kw]
|
|
@@ -178,44 +195,13 @@ async def process_keyword(keywords: str, include_related: bool):
|
|
| 178 |
result_df.drop_duplicates(subset=["μ 보ν€μλ"], inplace=True)
|
| 179 |
else:
|
| 180 |
result_df = pd.DataFrame(columns=["μ 보ν€μλ", "PCμκ²μλ", "λͺ¨λ°μΌμκ²μλ", "ν νμκ²μλ"])
|
| 181 |
-
|
| 182 |
-
tasks = [fetch_blog_count(kw) for kw in result_df["μ 보ν€μλ"]]
|
| 183 |
-
counts = await asyncio.gather(*tasks)
|
| 184 |
-
result_df["λΈλ‘κ·Έλ¬Έμμ"] = counts
|
| 185 |
result_df.sort_values(by="ν νμκ²μλ", ascending=False, inplace=True)
|
| 186 |
debug_log("process_keyword μλ£")
|
| 187 |
return result_df, create_excel_file(result_df)
|
| 188 |
|
| 189 |
-
# --- ννμ λΆμ
|
| 190 |
-
def
|
| 191 |
-
logging.basicConfig(level=logging.DEBUG)
|
| 192 |
-
logger = logging.getLogger(__name__)
|
| 193 |
-
logger.debug("μλ³Έ ν
μ€νΈ: %s", text)
|
| 194 |
-
filtered_text = re.sub(r'[^κ°-ν£]', '', text)
|
| 195 |
-
logger.debug("νν°λ§λ ν
μ€νΈ: %s", filtered_text)
|
| 196 |
-
if not filtered_text:
|
| 197 |
-
logger.debug("μ ν¨ν νκ΅μ΄ ν
μ€νΈκ° μμ.")
|
| 198 |
-
return pd.DataFrame(columns=["λ¨μ΄", "λΉλμ"]), ""
|
| 199 |
-
mecab_instance = mecab.MeCab()
|
| 200 |
-
tokens = mecab_instance.pos(filtered_text)
|
| 201 |
-
logger.debug("ννμ λΆμ κ²°κ³Ό: %s", tokens)
|
| 202 |
-
freq = {}
|
| 203 |
-
for word, pos in tokens:
|
| 204 |
-
if word and word.strip() and pos.startswith("NN"):
|
| 205 |
-
freq[word] = freq.get(word, 0) + 1
|
| 206 |
-
logger.debug("λ¨μ΄: %s, νμ¬: %s, λΉλ: %d", word, pos, freq[word])
|
| 207 |
-
sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
|
| 208 |
-
logger.debug("μ λ ¬λ λ¨μ΄ λΉλ: %s", sorted_freq)
|
| 209 |
-
df = pd.DataFrame(sorted_freq, columns=["λ¨μ΄", "λΉλμ"])
|
| 210 |
-
logger.debug("ννμ λΆμ DataFrame μμ±λ¨, shape: %s", df.shape)
|
| 211 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
| 212 |
-
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
| 213 |
-
temp_file.close()
|
| 214 |
-
logger.debug("Excel νμΌ μμ±λ¨: %s", temp_file.name)
|
| 215 |
-
return df, temp_file.name
|
| 216 |
-
|
| 217 |
-
# --- ννμ λΆμκ³Ό κ²μλ/λΈλ‘κ·Έλ¬Έμμ λ³ν© (λΉλκΈ°) ---
|
| 218 |
-
async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
| 219 |
debug_log("morphological_analysis_and_enrich ν¨μ μμ")
|
| 220 |
df_freq, _ = analyze_text(text)
|
| 221 |
if df_freq.empty:
|
|
@@ -227,7 +213,7 @@ async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
|
| 227 |
debug_log(f"λΉλμ 1 μ κ±° μ μ©λ¨. {before_shape} -> {df_freq.shape}")
|
| 228 |
keywords = "\n".join(df_freq["λ¨μ΄"].tolist())
|
| 229 |
debug_log(f"λΆμλ ν€μλ: {keywords}")
|
| 230 |
-
df_keyword_info, _ =
|
| 231 |
debug_log("κ²μλ λ° λΈλ‘κ·Έλ¬Έμμ μ‘°ν μλ£")
|
| 232 |
merged_df = pd.merge(df_freq, df_keyword_info, left_on="λ¨μ΄", right_on="μ 보ν€μλ", how="left")
|
| 233 |
merged_df.drop(columns=["μ 보ν€μλ"], inplace=True)
|
|
@@ -235,8 +221,8 @@ async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
|
| 235 |
debug_log("morphological_analysis_and_enrich ν¨μ μλ£")
|
| 236 |
return merged_df, merged_excel_path
|
| 237 |
|
| 238 |
-
# --- μ§μ ν€μλ λΆμ (λ¨λ
λΆμ
|
| 239 |
-
|
| 240 |
debug_log("direct_keyword_analysis ν¨μ μμ")
|
| 241 |
keywords = re.split(r'[\n,]+', keyword_input)
|
| 242 |
keywords = [kw.strip() for kw in keywords if kw.strip()]
|
|
@@ -246,28 +232,15 @@ async def direct_keyword_analysis(text: str, keyword_input: str):
|
|
| 246 |
count = text.count(kw)
|
| 247 |
results.append((kw, count))
|
| 248 |
debug_log(f"ν€μλ '{kw}'μ λΉλμ: {count}")
|
| 249 |
-
# μ§μ μ
λ ₯ ν€μλκ° λ³Έλ¬Έμ μμΌλ©΄ μΆκ° μ‘°ν
|
| 250 |
-
if kw not in text:
|
| 251 |
-
df_direct, _ = await process_keyword(kw, include_related=False)
|
| 252 |
-
if (not df_direct.empty) and (kw in df_direct["μ 보ν€μλ"].values):
|
| 253 |
-
row = df_direct[df_direct["μ 보ν€μλ"] == kw].iloc[0]
|
| 254 |
-
pc = row.get("PCμκ²μλ", None)
|
| 255 |
-
mobile = row.get("λͺ¨λ°μΌμκ²μλ", None)
|
| 256 |
-
total = row.get("ν νμκ²μλ", None)
|
| 257 |
-
blog_count = row.get("λΈλ‘κ·Έλ¬Έμμ", None)
|
| 258 |
-
else:
|
| 259 |
-
pc = mobile = total = blog_count = None
|
| 260 |
-
# κ²°κ³Όμ μ ν μΆκ°
|
| 261 |
-
results.append((kw, count))
|
| 262 |
df = pd.DataFrame(results, columns=["ν€μλ", "λΉλμ"])
|
| 263 |
excel_path = create_excel_file(df)
|
| 264 |
debug_log("direct_keyword_analysis ν¨μ μλ£")
|
| 265 |
return df, excel_path
|
| 266 |
|
| 267 |
-
# --- ν΅ν© λΆμ (ννμ λΆμ + μ§μ ν€μλ λΆμ
|
| 268 |
-
|
| 269 |
debug_log("combined_analysis ν¨μ μμ")
|
| 270 |
-
merged_df, _ =
|
| 271 |
if "μ§μ μ
λ ₯" not in merged_df.columns:
|
| 272 |
merged_df["μ§μ μ
λ ₯"] = ""
|
| 273 |
direct_keywords = re.split(r'[\n,]+', direct_keyword_input)
|
|
@@ -278,7 +251,7 @@ async def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_i
|
|
| 278 |
merged_df.loc[merged_df["λ¨μ΄"] == dk, "μ§μ μ
λ ₯"] = "μ§μ μ
λ ₯"
|
| 279 |
else:
|
| 280 |
freq = blog_text.count(dk)
|
| 281 |
-
df_direct, _ =
|
| 282 |
if (not df_direct.empty) and (dk in df_direct["μ 보ν€μλ"].values):
|
| 283 |
row = df_direct[df_direct["μ 보ν€μλ"] == dk].iloc[0]
|
| 284 |
pc = row.get("PCμκ²μλ", None)
|
|
@@ -302,18 +275,20 @@ async def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_i
|
|
| 302 |
debug_log("combined_analysis ν¨μ μλ£")
|
| 303 |
return merged_df, combined_excel
|
| 304 |
|
| 305 |
-
# --- λΆμ νΈλ€λ¬
|
| 306 |
-
|
| 307 |
debug_log("analysis_handler ν¨μ μμ")
|
| 308 |
if direct_keyword_only:
|
| 309 |
-
|
|
|
|
| 310 |
else:
|
| 311 |
-
|
|
|
|
| 312 |
|
| 313 |
-
# --- μ€ν¬λν μ€ν
|
| 314 |
-
|
| 315 |
debug_log("fetch_blog_content ν¨μ μμ")
|
| 316 |
-
content =
|
| 317 |
debug_log("fetch_blog_content ν¨μ μλ£")
|
| 318 |
return content
|
| 319 |
|
|
@@ -399,6 +374,7 @@ custom_css = """
|
|
| 399 |
# --- Gradio μΈν°νμ΄μ€ κ΅¬μ± ---
|
| 400 |
with gr.Blocks(title="λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μλΉμ€", css=custom_css) as demo:
|
| 401 |
gr.HTML("<div class='custom-header'>λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μλΉμ€ π</div>")
|
|
|
|
| 402 |
with gr.Group(elem_classes="custom-group"):
|
| 403 |
with gr.Row():
|
| 404 |
blog_url_input = gr.Textbox(label="λ€μ΄λ² λΈλ‘κ·Έ λ§ν¬", placeholder="μ: https://blog.naver.com/ssboost/222983068507", lines=1)
|
|
@@ -420,6 +396,7 @@ with gr.Blocks(title="λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μλΉμ€", css=custo
|
|
| 420 |
result_df = gr.Dataframe(label="ν΅ν© λΆμ κ²°κ³Ό (λ¨μ΄, λΉλμ, κ²μλ, λΈλ‘κ·Έλ¬Έμμ, μ§μ μ
λ ₯)", interactive=True)
|
| 421 |
with gr.Group(elem_classes="custom-group"):
|
| 422 |
excel_file = gr.File(label="Excel λ€μ΄λ‘λ")
|
|
|
|
| 423 |
with gr.Group(elem_classes="custom-group"):
|
| 424 |
usage_html = gr.HTML("""
|
| 425 |
<div class="usage-instructions">
|
|
@@ -441,7 +418,7 @@ with gr.Blocks(title="λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μλΉμ€", css=custo
|
|
| 441 |
<p><strong>Tip:</strong> λΆμ κ²°κ³Όλ μ€μκ°μΌλ‘ μ
λ°μ΄νΈλλ©°, νμμ μμ ν λ€μ λΆμν μ μμ΅λλ€. μ¦κ±°μ΄ λΆμ λμΈμ! π</p>
|
| 442 |
</div>
|
| 443 |
""")
|
| 444 |
-
# μ΄λ²€νΈ μ°κ²°
|
| 445 |
scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
|
| 446 |
analyze_button.click(fn=analysis_handler,
|
| 447 |
inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
|
|
@@ -450,4 +427,4 @@ with gr.Blocks(title="λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μλΉμ€", css=custo
|
|
| 450 |
if __name__ == "__main__":
|
| 451 |
debug_log("Gradio μ± μ€ν μμ")
|
| 452 |
demo.launch()
|
| 453 |
-
debug_log("Gradio μ± μ€ν μ’
λ£")
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
|
|
|
|
|
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
import urllib.parse # iframe κ²½λ‘ λ³΄μ μ μν λͺ¨λ
|
| 5 |
import re
|
|
|
|
| 17 |
def debug_log(message: str):
|
| 18 |
print(f"[DEBUG] {message}")
|
| 19 |
|
| 20 |
+
# --- λ€μ΄λ² λΈλ‘κ·Έ μ€ν¬λν ---
|
| 21 |
+
def scrape_naver_blog(url: str) -> str:
|
| 22 |
debug_log("scrape_naver_blog ν¨μ μμ")
|
| 23 |
debug_log(f"μμ²λ°μ URL: {url}")
|
| 24 |
headers = {
|
|
|
|
| 29 |
)
|
| 30 |
}
|
| 31 |
try:
|
| 32 |
+
response = requests.get(url, headers=headers)
|
| 33 |
+
debug_log("HTTP GET μμ²(λ©μΈ νμ΄μ§) μλ£")
|
| 34 |
+
if response.status_code != 200:
|
| 35 |
+
debug_log(f"μμ² μ€ν¨, μνμ½λ: {response.status_code}")
|
| 36 |
+
return f"μ€λ₯κ° λ°μνμ΅λλ€. μνμ½λ: {response.status_code}"
|
| 37 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
| 38 |
+
debug_log("HTML νμ±(λ©μΈ νμ΄μ§) μλ£")
|
| 39 |
+
iframe = soup.select_one("iframe#mainFrame")
|
| 40 |
+
if not iframe:
|
| 41 |
+
debug_log("iframe#mainFrame νκ·Έλ₯Ό μ°Ύμ μ μμ΅λλ€.")
|
| 42 |
+
return "λ³Έλ¬Έ iframeμ μ°Ύμ μ μμ΅λλ€."
|
| 43 |
+
iframe_src = iframe.get("src")
|
| 44 |
+
if not iframe_src:
|
| 45 |
+
debug_log("iframe srcκ° μ‘΄μ¬νμ§ μμ΅λλ€.")
|
| 46 |
+
return "λ³Έλ¬Έ iframeμ srcλ₯Ό μ°Ύμ μ μμ΅λλ€."
|
| 47 |
+
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
| 48 |
+
debug_log(f"iframe νμ΄μ§ μμ² URL: {parsed_iframe_url}")
|
| 49 |
+
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
| 50 |
+
debug_log("HTTP GET μμ²(iframe νμ΄μ§) μλ£")
|
| 51 |
+
if iframe_response.status_code != 200:
|
| 52 |
+
debug_log(f"iframe μμ² μ€ν¨, μνμ½λ: {iframe_response.status_code}")
|
| 53 |
+
return f"iframeμμ μ€λ₯κ° λ°μνμ΅λλ€. μνμ½λ: {iframe_response.status_code}"
|
| 54 |
+
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
| 55 |
+
debug_log("HTML νμ±(iframe νμ΄μ§) μλ£")
|
| 56 |
+
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
| 57 |
+
title = title_div.get_text(strip=True) if title_div else "οΏ½οΏ½λͺ©μ μ°Ύμ μ μμ΅λλ€."
|
| 58 |
+
debug_log(f"μΆμΆλ μ λͺ©: {title}")
|
| 59 |
+
content_div = iframe_soup.select_one('.se-main-container')
|
| 60 |
+
if content_div:
|
| 61 |
+
content = content_div.get_text("\n", strip=True)
|
| 62 |
+
else:
|
| 63 |
+
content = "λ³Έλ¬Έμ μ°Ύμ μ μμ΅λλ€."
|
| 64 |
+
debug_log("λ³Έλ¬Έ μΆμΆ μλ£")
|
| 65 |
+
result = f"[μ λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
|
| 66 |
+
debug_log("μ λͺ©κ³Ό λ³Έλ¬Έ ν©μΉ¨ μλ£")
|
| 67 |
+
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
except Exception as e:
|
| 69 |
debug_log(f"μλ¬ λ°μ: {str(e)}")
|
| 70 |
return f"μ€ν¬λν μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}"
|
| 71 |
|
| 72 |
+
# --- ννμ λΆμ (μ°Έμ‘°μ½λ-1) ---
|
| 73 |
+
def analyze_text(text: str):
|
| 74 |
+
logging.basicConfig(level=logging.DEBUG)
|
| 75 |
+
logger = logging.getLogger(__name__)
|
| 76 |
+
logger.debug("μλ³Έ ν
μ€νΈ: %s", text)
|
| 77 |
+
filtered_text = re.sub(r'[^κ°-ν£]', '', text)
|
| 78 |
+
logger.debug("νν°λ§λ ν
μ€νΈ: %s", filtered_text)
|
| 79 |
+
if not filtered_text:
|
| 80 |
+
logger.debug("μ ν¨ν νκ΅μ΄ ν
μ€νΈκ° μμ.")
|
| 81 |
+
return pd.DataFrame(columns=["λ¨μ΄", "λΉλμ"]), ""
|
| 82 |
+
mecab_instance = mecab.MeCab()
|
| 83 |
+
tokens = mecab_instance.pos(filtered_text)
|
| 84 |
+
logger.debug("ννμ λΆμ κ²°κ³Ό: %s", tokens)
|
| 85 |
+
freq = {}
|
| 86 |
+
for word, pos in tokens:
|
| 87 |
+
if word and word.strip() and pos.startswith("NN"):
|
| 88 |
+
freq[word] = freq.get(word, 0) + 1
|
| 89 |
+
logger.debug("λ¨μ΄: %s, νμ¬: %s, λΉλ: %d", word, pos, freq[word])
|
| 90 |
+
sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
|
| 91 |
+
logger.debug("μ λ ¬λ λ¨μ΄ λΉλ: %s", sorted_freq)
|
| 92 |
+
df = pd.DataFrame(sorted_freq, columns=["λ¨μ΄", "λΉλμ"])
|
| 93 |
+
logger.debug("ννμ λΆμ DataFrame μμ±λ¨, shape: %s", df.shape)
|
| 94 |
+
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
| 95 |
+
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
| 96 |
+
temp_file.close()
|
| 97 |
+
logger.debug("Excel νμΌ μμ±λ¨: %s", temp_file.name)
|
| 98 |
+
return df, temp_file.name
|
| 99 |
+
|
| 100 |
+
# --- λ€μ΄λ² κ²μ λ° κ΄κ³ API κ΄λ ¨ (μ°Έμ‘°μ½λ-2) ---
|
| 101 |
def generate_signature(timestamp, method, uri, secret_key):
|
| 102 |
message = f"{timestamp}.{method}.{uri}"
|
| 103 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
|
|
| 114 |
"X-Signature": signature
|
| 115 |
}
|
| 116 |
|
| 117 |
+
def fetch_related_keywords(keyword):
|
|
|
|
| 118 |
debug_log(f"fetch_related_keywords νΈμΆ, ν€μλ: {keyword}")
|
| 119 |
API_KEY = os.environ["NAVER_API_KEY"]
|
| 120 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
|
|
|
| 127 |
"hintKeywords": [keyword],
|
| 128 |
"showDetail": "1"
|
| 129 |
}
|
| 130 |
+
response = requests.get(BASE_URL + uri, params=params, headers=headers)
|
| 131 |
+
data = response.json()
|
|
|
|
| 132 |
if "keywordList" not in data:
|
| 133 |
return pd.DataFrame()
|
| 134 |
df = pd.DataFrame(data["keywordList"])
|
|
|
|
| 147 |
debug_log("fetch_related_keywords μλ£")
|
| 148 |
return result_df
|
| 149 |
|
| 150 |
+
def fetch_blog_count(keyword):
|
|
|
|
| 151 |
debug_log(f"fetch_blog_count νΈμΆ, ν€μλ: {keyword}")
|
| 152 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
| 153 |
client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
|
|
|
|
| 157 |
"X-Naver-Client-Secret": client_secret
|
| 158 |
}
|
| 159 |
params = {"query": keyword, "display": 1}
|
| 160 |
+
response = requests.get(url, headers=headers, params=params)
|
| 161 |
+
if response.status_code == 200:
|
| 162 |
+
data = response.json()
|
| 163 |
+
debug_log(f"fetch_blog_count κ²°κ³Ό: {data.get('total', 0)}")
|
| 164 |
+
return data.get("total", 0)
|
| 165 |
+
else:
|
| 166 |
+
debug_log(f"fetch_blog_count μ€λ₯, μνμ½λ: {response.status_code}")
|
| 167 |
+
return 0
|
|
|
|
| 168 |
|
| 169 |
def create_excel_file(df):
|
| 170 |
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
|
| 171 |
excel_path = tmp.name
|
| 172 |
+
df.to_excel(excel_path, index=False)
|
| 173 |
debug_log(f"Excel νμΌ μμ±λ¨: {excel_path}")
|
| 174 |
return excel_path
|
| 175 |
|
| 176 |
+
def process_keyword(keywords: str, include_related: bool):
|
|
|
|
| 177 |
debug_log(f"process_keyword νΈμΆ, ν€μλλ€: {keywords}, μ°κ΄κ²μμ΄ ν¬ν¨: {include_related}")
|
| 178 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
| 179 |
result_dfs = []
|
| 180 |
for idx, kw in enumerate(input_keywords):
|
| 181 |
+
df_kw = fetch_related_keywords(kw)
|
| 182 |
if df_kw.empty:
|
| 183 |
continue
|
| 184 |
row_kw = df_kw[df_kw["μ 보ν€μλ"] == kw]
|
|
|
|
| 195 |
result_df.drop_duplicates(subset=["μ 보ν€μλ"], inplace=True)
|
| 196 |
else:
|
| 197 |
result_df = pd.DataFrame(columns=["μ 보ν€μλ", "PCμκ²μλ", "λͺ¨λ°μΌμκ²μλ", "ν νμκ²μλ"])
|
| 198 |
+
result_df["λΈλ‘κ·Έλ¬Έμμ"] = result_df["μ 보ν€μλ"].apply(fetch_blog_count)
|
|
|
|
|
|
|
|
|
|
| 199 |
result_df.sort_values(by="ν νμκ²μλ", ascending=False, inplace=True)
|
| 200 |
debug_log("process_keyword μλ£")
|
| 201 |
return result_df, create_excel_file(result_df)
|
| 202 |
|
| 203 |
+
# --- ννμ λΆμκ³Ό κ²μλ/λΈλ‘κ·Έλ¬Έμμ λ³ν© ---
|
| 204 |
+
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
debug_log("morphological_analysis_and_enrich ν¨μ μμ")
|
| 206 |
df_freq, _ = analyze_text(text)
|
| 207 |
if df_freq.empty:
|
|
|
|
| 213 |
debug_log(f"λΉλμ 1 μ κ±° μ μ©λ¨. {before_shape} -> {df_freq.shape}")
|
| 214 |
keywords = "\n".join(df_freq["λ¨μ΄"].tolist())
|
| 215 |
debug_log(f"λΆμλ ν€μλ: {keywords}")
|
| 216 |
+
df_keyword_info, _ = process_keyword(keywords, include_related=False)
|
| 217 |
debug_log("κ²μλ λ° λΈλ‘κ·Έλ¬Έμμ μ‘°ν μλ£")
|
| 218 |
merged_df = pd.merge(df_freq, df_keyword_info, left_on="λ¨μ΄", right_on="μ 보ν€μλ", how="left")
|
| 219 |
merged_df.drop(columns=["μ 보ν€μλ"], inplace=True)
|
|
|
|
| 221 |
debug_log("morphological_analysis_and_enrich ν¨μ μλ£")
|
| 222 |
return merged_df, merged_excel_path
|
| 223 |
|
| 224 |
+
# --- μ§μ ν€μλ λΆμ (λ¨λ
λΆμ) ---
|
| 225 |
+
def direct_keyword_analysis(text: str, keyword_input: str):
|
| 226 |
debug_log("direct_keyword_analysis ν¨μ μμ")
|
| 227 |
keywords = re.split(r'[\n,]+', keyword_input)
|
| 228 |
keywords = [kw.strip() for kw in keywords if kw.strip()]
|
|
|
|
| 232 |
count = text.count(kw)
|
| 233 |
results.append((kw, count))
|
| 234 |
debug_log(f"ν€μλ '{kw}'μ λΉλμ: {count}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
df = pd.DataFrame(results, columns=["ν€μλ", "λΉλμ"])
|
| 236 |
excel_path = create_excel_file(df)
|
| 237 |
debug_log("direct_keyword_analysis ν¨μ μλ£")
|
| 238 |
return df, excel_path
|
| 239 |
|
| 240 |
+
# --- ν΅ν© λΆμ (ννμ λΆμ + μ§μ ν€μλ λΆμ) ---
|
| 241 |
+
def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
|
| 242 |
debug_log("combined_analysis ν¨μ μμ")
|
| 243 |
+
merged_df, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
|
| 244 |
if "μ§μ μ
λ ₯" not in merged_df.columns:
|
| 245 |
merged_df["μ§μ μ
λ ₯"] = ""
|
| 246 |
direct_keywords = re.split(r'[\n,]+', direct_keyword_input)
|
|
|
|
| 251 |
merged_df.loc[merged_df["λ¨μ΄"] == dk, "μ§μ μ
λ ₯"] = "μ§μ μ
λ ₯"
|
| 252 |
else:
|
| 253 |
freq = blog_text.count(dk)
|
| 254 |
+
df_direct, _ = process_keyword(dk, include_related=False)
|
| 255 |
if (not df_direct.empty) and (dk in df_direct["μ 보ν€μλ"].values):
|
| 256 |
row = df_direct[df_direct["μ 보ν€μλ"] == dk].iloc[0]
|
| 257 |
pc = row.get("PCμκ²μλ", None)
|
|
|
|
| 275 |
debug_log("combined_analysis ν¨μ μλ£")
|
| 276 |
return merged_df, combined_excel
|
| 277 |
|
| 278 |
+
# --- λΆμ νΈλ€λ¬ ---
|
| 279 |
+
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
|
| 280 |
debug_log("analysis_handler ν¨μ μμ")
|
| 281 |
if direct_keyword_only:
|
| 282 |
+
# "μ§μ ν€μλ μ
λ ₯λ§ λΆμ" μ ν μ λ¨λ
λΆμ μν
|
| 283 |
+
return direct_keyword_analysis(blog_text, direct_keyword_input)
|
| 284 |
else:
|
| 285 |
+
# κΈ°λ³Έ ν΅ν© λΆμ μν
|
| 286 |
+
return combined_analysis(blog_text, remove_freq1, direct_keyword_input)
|
| 287 |
|
| 288 |
+
# --- μ€ν¬λν μ€ν ---
|
| 289 |
+
def fetch_blog_content(url: str):
|
| 290 |
debug_log("fetch_blog_content ν¨μ μμ")
|
| 291 |
+
content = scrape_naver_blog(url)
|
| 292 |
debug_log("fetch_blog_content ν¨μ μλ£")
|
| 293 |
return content
|
| 294 |
|
|
|
|
| 374 |
# --- Gradio μΈν°νμ΄μ€ κ΅¬μ± ---
|
| 375 |
with gr.Blocks(title="λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μλΉμ€", css=custom_css) as demo:
|
| 376 |
gr.HTML("<div class='custom-header'>λ€μ΄λ² λΈλ‘κ·Έ ννμ λΆμ μλΉμ€ π</div>")
|
| 377 |
+
# λΈλ‘κ·Έ λ§ν¬μ μ€ν¬λν μ€ν λ²νΌμ ν κ·Έλ£Ή λ΄μ λ°°μΉ (λ²νΌμ κ°μ΄λ° μ λ ¬)
|
| 378 |
with gr.Group(elem_classes="custom-group"):
|
| 379 |
with gr.Row():
|
| 380 |
blog_url_input = gr.Textbox(label="λ€μ΄λ² λΈλ‘κ·Έ λ§ν¬", placeholder="μ: https://blog.naver.com/ssboost/222983068507", lines=1)
|
|
|
|
| 396 |
result_df = gr.Dataframe(label="ν΅ν© λΆμ κ²°κ³Ό (λ¨μ΄, λΉλμ, κ²μλ, λΈλ‘κ·Έλ¬Έμμ, μ§μ μ
λ ₯)", interactive=True)
|
| 397 |
with gr.Group(elem_classes="custom-group"):
|
| 398 |
excel_file = gr.File(label="Excel λ€μ΄λ‘λ")
|
| 399 |
+
# μ¬μ©μ€λͺ
HTML λΈλ‘ (μλμ λ°°μΉ)
|
| 400 |
with gr.Group(elem_classes="custom-group"):
|
| 401 |
usage_html = gr.HTML("""
|
| 402 |
<div class="usage-instructions">
|
|
|
|
| 418 |
<p><strong>Tip:</strong> λΆμ κ²°κ³Όλ μ€μκ°μΌλ‘ μ
λ°μ΄νΈλλ©°, νμμ μμ ν λ€μ λΆμν μ μμ΅λλ€. μ¦κ±°μ΄ λΆμ λμΈμ! π</p>
|
| 419 |
</div>
|
| 420 |
""")
|
| 421 |
+
# μ΄λ²€νΈ μ°κ²°
|
| 422 |
scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
|
| 423 |
analyze_button.click(fn=analysis_handler,
|
| 424 |
inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
|
|
|
|
| 427 |
if __name__ == "__main__":
|
| 428 |
debug_log("Gradio μ± μ€ν μμ")
|
| 429 |
demo.launch()
|
| 430 |
+
debug_log("Gradio μ± μ€ν μ’
λ£")
|