# mecab / app.py
# Kims12's picture
# Create app.py
# 4ccb643 verified
import re
import logging
import tempfile
import pandas as pd
import gradio as gr
import mecab  # uses the python-mecab-ko library
# ๋””๋ฒ„๊น…์„ ์œ„ํ•œ ๋กœ๊น… ์„ค์ •
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
def analyze_text(text: str):
    """Count how often each noun occurs in the Korean part of *text*.

    Every character that is not a Hangul syllable (whitespace, Latin letters,
    digits, punctuation, ...) is removed, the remainder is tagged with MeCab,
    and tokens whose part-of-speech tag starts with ``"NN"`` are tallied.

    Args:
        text: Raw user input. ``None`` is treated like an empty string.

    Returns:
        A ``(DataFrame, path)`` tuple. The frame has two columns (word and
        frequency) sorted by frequency in descending order. ``path`` is the
        name of a temporary ``.xlsx`` file holding the same table, or ``""``
        when the input contained no Hangul syllables at all.

    NOTE(review): the log messages and column labels in this function look
    like UTF-8 Korean that was decoded with the wrong code page. They are
    left untouched here; confirm against the original file.
    """
    text = text or ""
    logger.debug("์›๋ณธ ํ…์ŠคํŠธ: %s", text)

    # 1. Keep Korean only (drops whitespace, English, symbols, ...).
    # The range is spelled with escapes (U+AC00..U+D7A3, the Hangul syllable
    # block) so the pattern cannot be corrupted by a file-encoding mix-up;
    # the previous literal had degraded into an invalid character range.
    filtered_text = re.sub(r'[^\uAC00-\uD7A3]', '', text)
    logger.debug("ํ•„ํ„ฐ๋ง๋œ ํ…์ŠคํŠธ (ํ•œ๊ตญ์–ด๋งŒ, ๊ณต๋ฐฑ ์ œ๊ฑฐ): %s", filtered_text)

    if not filtered_text:
        logger.debug("์œ ํšจํ•œ ํ•œ๊ตญ์–ด ํ…์ŠคํŠธ๊ฐ€ ์—†์Œ.")
        # Empty DataFrame plus an empty string in place of the Excel path.
        # NOTE(review): "" is kept for backward compatibility; verify that the
        # consuming file component accepts it as "no file".
        return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"]), ""

    # 2. Morphological analysis with MeCab (only nouns are counted).
    mecab_instance = mecab.MeCab()  # instantiate the tagger directly
    tokens = mecab_instance.pos(filtered_text)
    logger.debug("ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ: %s", tokens)

    freq = {}
    for word, pos in tokens:
        # Skip empty / whitespace-only surface forms.
        if not word or not word.strip():
            continue
        # Noun tags start with "NN" (e.g. NNG, NNP).
        if not pos.startswith("NN"):
            continue
        freq[word] = freq.get(word, 0) + 1
        logger.debug("๋‹จ์–ด: %s, ํ’ˆ์‚ฌ: %s, ํ˜„์žฌ ๋นˆ๋„: %d", word, pos, freq[word])

    # 3. Sort by frequency, highest first (ties keep first-seen order).
    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    logger.debug("๋‚ด๋ฆผ์ฐจ์ˆœ ์ •๋ ฌ๋œ ๋‹จ์–ด ๋นˆ๋„: %s", sorted_freq)

    # 4. Build the result table.
    df = pd.DataFrame(sorted_freq, columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"])
    logger.debug("๊ฒฐ๊ณผ DataFrame ์ƒ์„ฑ๋จ, shape: %s", df.shape)

    # 5. Export to a temporary Excel file offered for download.
    # Only the unique name is reserved here; the handle is closed before
    # pandas reopens the path, because writing to a file that is still open
    # fails on Windows. delete=False keeps the file after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False, engine='openpyxl')
    logger.debug("Excel ํŒŒ์ผ ์ƒ์„ฑ๋จ: %s", excel_path)

    return df, excel_path
# Gradio interface: one row each for the text input, the trigger button,
# the result table, and the Excel download slot.
with gr.Blocks() as demo:
    gr.Markdown("# ํ˜•ํƒœ์†Œ ๋ถ„์„ ์ŠคํŽ˜์ด์Šค")

    with gr.Row():
        text_input = gr.Textbox(
            label="ํ…์ŠคํŠธ ์ž…๋ ฅ",
            lines=5,
            placeholder="๋ถ„์„ํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”.",
        )

    with gr.Row():
        analyze_button = gr.Button("๋ถ„์„ ์‹คํ–‰")

    with gr.Row():
        output_table = gr.Dataframe(label="๋ถ„์„ ๊ฒฐ๊ณผ (๋‹จ์–ด ๋ฐ ๋นˆ๋„์ˆ˜)")

    with gr.Row():
        output_file = gr.File(label="Excel ๋‹ค์šด๋กœ๋“œ")

    # A click passes the text box content to analyze_text and routes its two
    # return values to the table and the file component, in that order.
    analyze_button.click(
        fn=analyze_text,
        inputs=text_input,
        outputs=[output_table, output_file],
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()