Spaces:
Sleeping
Sleeping
import gradio as gr
import pandas as pd
import tempfile
import re
import logging
from mecab import MeCab

# Configure the root logger at DEBUG level so the logging.debug() calls in
# this module are emitted.
logging.basicConfig(level=logging.DEBUG)
##############################
# 1) Common helper functions
##############################
def preprocess_text(text: str) -> str:
    """Return ``text`` with every non-Hangul character removed.

    Commas, periods, whitespace, digits, Latin letters and every other
    character outside the precomposed Hangul syllable block are dropped, so
    the result is one contiguous run of Hangul syllables.

    Args:
        text: Raw input text. ``None`` or an empty string yields ``""``.

    Returns:
        The Hangul-only text (possibly empty).
    """
    if not text:
        return ""
    # The range is spelled with escapes (U+AC00 .. U+D7A3, the first and last
    # precomposed Hangul syllables) so the pattern cannot be corrupted when
    # the source file is decoded or re-saved with the wrong encoding.
    return re.sub(r'[^\uAC00-\uD7A3]', '', text)
def expand_columns_if_needed(df, needed_index: int):
    """Append empty columns to ``df`` until position ``needed_index`` exists.

    The frame is modified in place. After the call ``df.shape[1]`` is at
    least ``needed_index + 1``; for example ``needed_index=13`` guarantees
    that the 14th column (spreadsheet column N) can be addressed with
    ``df.iloc[:, 13]``. Frames that are already wide enough are left
    untouched.

    Args:
        df: DataFrame to widen.
        needed_index: Zero-based column position that must be addressable.

    Returns:
        None.
    """
    next_label = df.shape[1]
    while df.shape[1] <= needed_index:
        # Assigning to a label that already exists would overwrite that
        # column instead of adding one, so the width would never grow and
        # the loop would never end. Skip to the next unused integer label.
        while next_label in df.columns:
            next_label += 1
        df[next_label] = None
        next_label += 1
##############################
# 2) Keyword count function
##############################
def _render_markdown_table(rows):
    """Render ``(keyword, count)`` pairs as a two-column Markdown table."""
    lines = ["| ๋ช ์ฌ | ๋น๋์ |", "|---|---|"]
    lines.extend(f"| {k} | {c} |" for k, c in rows)
    return "\n".join(lines)


def _save_csv(frame):
    """Write ``frame`` to a new temporary ``.csv`` file and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    # Written after the handle is closed: reopening a still-open temporary
    # file by name is not possible on every platform.
    frame.to_csv(tmp_path, index=False, encoding='utf-8-sig')
    return tmp_path


def count_keywords(main_text, excel_file, direct_input):
    """Count how often each keyword occurs in the cleaned form of ``main_text``.

    Keyword source:
        * ``direct_input`` (one keyword per line) is used when it is
          non-blank.
        * Otherwise the uploaded Excel file is read with ``header=None`` (so
          its first row is kept as data) and keywords are taken from the
          first column, sheet rows 5-10000.

    Counting is a plain substring count (``str.count``) of each keyword in
    the text returned by ``preprocess_text(main_text)``.

    Output file:
        * Direct input: a two-column CSV (keyword, count) with one row per
          keyword, including keywords with a zero count.
        * Excel: the whole sheet written back as CSV, with each count stored
          in column position 13 (spreadsheet column N) on the same row the
          keyword was read from.

    Args:
        main_text: Body text to search. ``None`` is treated as empty.
        excel_file: Uploaded file; either an object exposing ``.name`` or a
            plain path string. May be ``None`` when ``direct_input`` is used.
        direct_input: Newline-separated keywords. ``None`` is treated as
            empty.

    Returns:
        A ``(text, csv_path)`` tuple. ``text`` is a Markdown table listing
        only keywords found at least once, or a message when nothing was
        found or no keywords were supplied. ``csv_path`` is ``None`` when no
        keywords were available.
    """
    # NOTE(review): the user-facing strings and CSV column names in this
    # function look mis-encoded; they are kept exactly as found and should be
    # restored from the original source file.
    # NOTE(review): keywords are matched against the preprocess_text() output,
    # so a keyword containing characters that preprocess_text() strips can
    # never match - confirm whether keywords should be cleaned the same way.
    logging.debug("main_text: %s", main_text)
    logging.debug("excel_file: %s", excel_file)
    logging.debug("direct_input: %s", direct_input)

    first_row = 4      # zero-based position of sheet row 5
    row_limit = 10000  # keywords are read up to sheet row 10000
    count_col = 13     # zero-based position of spreadsheet column N

    # Empty UI fields may arrive as None rather than "".
    cleaned_text = preprocess_text(main_text or "")
    direct_input = (direct_input or "").strip()

    if direct_input:
        # ===== Keywords typed directly into the text box =====
        # A non-blank input always yields at least one keyword here.
        keywords = [kw.strip() for kw in direct_input.split('\n') if kw.strip()]
        counts = [cleaned_text.count(k) for k in keywords]
        result_df = pd.DataFrame({"๋ช ์ฌ": keywords, "๋น๋์": counts})
        no_hit_msg = "๋ณธ๋ฌธ์ ํด๋น ํค์๋๊ฐ ์ ํ ๋ฑ์ฅํ์ง ์์์ต๋๋ค."
    else:
        # ===== Keywords read from the uploaded Excel file =====
        if not excel_file:
            return ("์์ ํ์ผ์ ์ ๋ก๋ํ๊ฑฐ๋ ํค์๋๋ฅผ ์ง์ ์ ๋ ฅํ์ธ์.", None)

        no_keywords_msg = "A5~A10000 ๋ฒ์์ ํค์๋๊ฐ ์์ต๋๋ค."
        # Accept both a file-like object exposing .name and a plain path.
        excel_path = getattr(excel_file, "name", excel_file)
        df = pd.read_excel(excel_path, header=None)
        if df.shape[1] == 0:
            return (no_keywords_msg, None)

        last_row = min(df.shape[0], row_limit)
        cells = df.iloc[first_row:last_row, 0].tolist()

        # Remember the sheet row of every keyword: blank cells are skipped,
        # so the n-th keyword is not necessarily on row first_row + n.
        # Whitespace-only cells are skipped too; an empty keyword would
        # otherwise be "found" len(text) + 1 times by str.count.
        entries = []
        for offset, cell in enumerate(cells):
            if pd.isna(cell):
                continue
            keyword = str(cell).strip()
            if keyword:
                entries.append((first_row + offset, keyword))
        if not entries:
            return (no_keywords_msg, None)

        keywords = [keyword for _, keyword in entries]
        counts = [cleaned_text.count(k) for k in keywords]

        # Record every count (zeros included) next to its own keyword.
        expand_columns_if_needed(df, count_col)
        for (row_pos, _), cnt in zip(entries, counts):
            df.iloc[row_pos, count_col] = cnt

        # NOTE(review): to_csv() also writes the integer column labels as a
        # header line, so sheet row 5 ends up on CSV line 6 - confirm that
        # this is intended.
        result_df = df
        no_hit_msg = "๋ณธ๋ฌธ์ ํด๋น ํค์๋๊ฐ ์ ํ ๋ฑ์ฅํ์ง ์์์ต๋๋ค(0ํ)."

    csv_path = _save_csv(result_df)
    hits = [(k, c) for k, c in zip(keywords, counts) if c > 0]
    if not hits:
        # Nothing matched, but the CSV with the zero counts is still returned.
        return (no_hit_msg, csv_path)
    return (_render_markdown_table(hits), csv_path)
##############################
# 3) Morphological-analysis-based keyword count function
##############################
def morph_analysis_and_count(text: str):
    """Extract nouns from ``text`` with MeCab and count them in the text.

    Steps:
        1. Clean the text with ``preprocess_text``.
        2. Tag the cleaned text with ``MeCab().pos``.
        3. Keep tokens tagged ``NNG``, ``NNP``, ``NP`` or ``NNB``.
        4. For each distinct noun, count its occurrences in the cleaned text
           with ``str.count`` (a substring count, not a token count).

    Args:
        text: Body text to analyse. ``None`` is treated as empty.

    Returns:
        A ``(text, csv_path)`` tuple. ``text`` is a Markdown table of nouns
        sorted by descending count, or a message when no noun was found or
        when ``DataFrame.to_markdown`` raises ``ImportError``. ``csv_path``
        is the path of a temporary CSV holding the same table, or ``None``
        whenever a message is returned.
    """
    # NOTE(review): the user-facing strings and column names below look
    # mis-encoded; they are kept exactly as found and should be restored from
    # the original source file.
    no_noun_msg = "์ถ์ถ๋ ๋ช ์ฌ๊ฐ ์์ต๋๋ค."

    # 1) Pre-processing; an empty UI field may arrive as None.
    cleaned = preprocess_text(text or "")
    if not cleaned:
        # Nothing to tag, so skip building the tagger.
        return no_noun_msg, None

    # 2) MeCab analysis
    tagger = MeCab()
    parsed = tagger.pos(cleaned)

    # 3) Keep only tokens carrying one of the noun tags.
    noun_tags = {'NNG', 'NNP', 'NP', 'NNB'}
    nouns = [word for word, pos in parsed if pos in noun_tags]

    # De-duplicate while keeping first-occurrence order. A set would make the
    # order of equal-count rows vary from run to run.
    unique_nouns = list(dict.fromkeys(nouns))

    # 4) Count each noun as a substring of the cleaned text.
    freq_dict = {noun: cleaned.count(noun) for noun in unique_nouns}
    filtered_freq = {k: v for k, v in freq_dict.items() if v > 0}
    if not filtered_freq:
        return no_noun_msg, None

    freq_df = pd.DataFrame(list(filtered_freq.items()), columns=['๋ช ์ฌ', '๋น๋์'])
    # A stable sort keeps ties in first-occurrence order.
    freq_df = freq_df.sort_values(
        by='๋น๋์', ascending=False, kind='mergesort'
    ).reset_index(drop=True)

    try:
        md_table = freq_df.to_markdown(index=False)
    except ImportError:
        md_table = "Markdown ๋ณํ์ ์ํด 'tabulate' ๋ผ์ด๋ธ๋ฌ๋ฆฌ๊ฐ ํ์ํฉ๋๋ค."
        return md_table, None

    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    # Written after the handle is closed: reopening a still-open temporary
    # file by name is not possible on every platform.
    freq_df.to_csv(tmp_path, index=False, encoding='utf-8-sig')
    return md_table, tmp_path
########################
# 4) Gradio interface  #
########################
# Custom CSS passed to gr.Blocks: sets the background and text color of the
# two buttons whose elem_id values are run_analysis_button and
# morph_analysis_button.
css = """
/* '๋ถ์ํ๊ธฐ' ๋ฒํผ ์์ ๋ฐ ๊ธ์์ ๋ณ๊ฒฝ */
#run_analysis_button > button,
#morph_analysis_button > button {
background-color: #EA580C !important; /* ์งํ ์ฃผํฉ์ */
color: #FFFFFF !important; /* ํฐ์ ๊ธ์ */
}
"""
# Two-tab Gradio UI. Each tab has an input column on the left and an output
# column (Markdown result + downloadable file) on the right.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been flattened - confirm it against the deployed layout.
with gr.Blocks(
    theme=gr.themes.Soft(
        # Custom primary palette, c50 through c950.
        primary_hue=gr.themes.Color(
            c50="#FFF7ED",
            c100="#FFEDD5",
            c200="#FED7AA",
            c300="#FDBA74",
            c400="#FB923C",
            c500="#F97316",
            c600="#EA580C",
            c700="#C2410C",
            c800="#9A3412",
            c900="#7C2D12",
            c950="#431407",
        ),
        secondary_hue="zinc",
        neutral_hue="zinc",
        font=("Pretendard", "sans-serif")
    ),
    css=css
) as demo:
    # Tab 1: count user-supplied keywords (typed in or read from Excel).
    with gr.Tab("ํค์๋ ์นด์ดํธ"):
        with gr.Row():
            # Left column: inputs
            with gr.Column():
                main_textbox = gr.Textbox(
                    label="๋ณธ๋ฌธ ํ ์คํธ",
                    lines=16,
                    placeholder="์ฌ๊ธฐ์ ๊ธด ๋ณธ๋ฌธ์ ๋ถ์ฌ๋ฃ์ผ์ธ์."
                )
                keyword_input = gr.Textbox(
                    label="(์ ํ) ์ง์ ์ ๋ ฅ ํค์๋ - ์ํฐ๋ก ๊ตฌ๋ถ",
                    lines=6,
                    placeholder="์)\n์ด์ํ๊ฐ์ต๊ธฐ\n๊ฐ์ต๊ธฐ\n..."
                )
                excel_input = gr.File(
                    label="(์ ํ) ์์ ์ ๋ก๋"
                )
                # elem_id matches the #run_analysis_button selector in `css`.
                run_button = gr.Button("๋ถ์ํ๊ธฐ", elem_id="run_analysis_button")
            # Right column: outputs
            with gr.Column():
                output_md = gr.Markdown(label="๊ฒฐ๊ณผ ํ")
                output_file = gr.File(label="๊ฒฐ๊ณผ ๋ค์ด๋ก๋")
        # Input order matches count_keywords(main_text, excel_file, direct_input).
        run_button.click(
            fn=count_keywords,
            inputs=[main_textbox, excel_input, keyword_input],
            outputs=[output_md, output_file]
        )
    # Tab 2: extract nouns with MeCab and count them.
    with gr.Tab("ํํ์ ๋ถ์ ๊ธฐ๋ฐ ์นด์ดํธ"):
        with gr.Row():
            # Left column: inputs
            with gr.Column():
                morph_text_input = gr.Textbox(
                    label="๋ณธ๋ฌธ ํ ์คํธ",
                    lines=16,
                    placeholder="์ฌ๊ธฐ์ ๊ธด ๋ณธ๋ฌธ์ ๋ถ์ฌ๋ฃ์ผ์ธ์."
                )
                # elem_id matches the #morph_analysis_button selector in `css`.
                morph_run_button = gr.Button("๋ถ์ํ๊ธฐ", elem_id="morph_analysis_button")
            # Right column: outputs
            with gr.Column():
                morph_result_display = gr.Markdown(label="๋ถ์ ๊ฒฐ๊ณผ")
                morph_download_button = gr.File(label="๊ฒฐ๊ณผ ๋ค์ด๋ก๋")
        morph_run_button.click(
            fn=morph_analysis_and_count,
            inputs=morph_text_input,
            outputs=[morph_result_display, morph_download_button]
        )
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()