| | import gradio as gr |
| | import pandas as pd |
| | import tempfile |
| | import re |
| | from mecab import MeCab |
| |
|
| | |
| | |
| | |
| |
|
# Matches every character that is NOT a precomposed Hangul syllable
# (U+AC00 '가' .. U+D7A3 '힣'). Written with escapes so the pattern cannot be
# damaged by a wrong file encoding.
_NON_HANGUL_RE = re.compile(r"[^\uAC00-\uD7A3]")


def preprocess_text(text: str) -> str:
    """Strip everything except Hangul syllables from *text*.

    Commas, periods, whitespace, digits, Latin letters and any other
    character outside the range U+AC00..U+D7A3 are removed, so the result is
    one uninterrupted run of Hangul syllables.

    Args:
        text: Raw input text. ``None`` or an empty string is accepted.

    Returns:
        The Hangul-only string; ``""`` when *text* is empty or ``None``.
    """
    if not text:
        return ""
    return _NON_HANGUL_RE.sub("", text)
| |
|
def expand_columns_if_needed(df, needed_index: int):
    """Grow *df* in place until positional column ``needed_index`` exists.

    Empty (all-``None``) columns are appended until ``df.shape[1]`` is at
    least ``needed_index + 1``. For example ``needed_index=13`` guarantees
    that spreadsheet column N (the 14th column) can be written to.

    Args:
        df: DataFrame to extend; it is modified in place.
        needed_index: Zero-based column position that must exist afterwards.

    Returns:
        None. The frame is mutated in place.
    """
    next_label = df.shape[1]
    while df.shape[1] <= needed_index:
        # Assigning to a label that already exists would overwrite that
        # column instead of adding one, so the width would never grow.
        # Skip labels that are taken.
        while next_label in df.columns:
            next_label += 1
        df[next_label] = None
        next_label += 1
| |
|
| | |
| | |
| | |
| |
|
# Layout of the uploaded workbook (zero-based positions).
_KEYWORD_COL = 0        # column A holds the keywords
_COUNT_COL = 13         # column N receives the counts
_FIRST_KEYWORD_ROW = 4  # row 5 is the first keyword row
_LAST_KEYWORD_ROW = 10000  # rows after row 10000 are ignored


def _keyword_markdown_table(rows):
    """Render (keyword, count) pairs as a two-column Markdown table."""
    lines = ["| 키워드 | 등장 횟수 |", "|---|---|"]
    lines.extend(f"| {keyword} | {count} |" for keyword, count in rows)
    return "\n".join(lines)


def _save_as_temp_xlsx(frame):
    """Write *frame* (no index, no header) to a new temp .xlsx; return its path."""
    # Only the unique path is needed; pandas opens the file itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp_path = tmp.name
    frame.to_excel(tmp_path, index=False, header=False)
    return tmp_path


def count_keywords(main_text, excel_file, direct_input):
    """Count how often each keyword occurs in the Hangul-only body text.

    Keyword source, in priority order:

    * ``direct_input`` (one keyword per line), if it is not blank. The
      result workbook then has keywords in column A and counts in column B.
    * Otherwise the uploaded workbook, read with ``header=None`` so row 1 is
      kept as data. Keywords are taken from A5..A10000 and each count is
      written to column N of the same row.

    The body text is passed through ``preprocess_text`` and occurrences are
    counted with ``str.count`` (non-overlapping substring matches). Keywords
    are matched verbatim, so a keyword containing characters that
    ``preprocess_text`` removes cannot match.

    Args:
        main_text: Body text to search. ``None`` is treated as empty.
        excel_file: Uploaded file; either an object with a ``.name`` path
            attribute or a plain path string. May be ``None``.
        direct_input: Newline-separated keywords. ``None`` is treated as
            empty.

    Returns:
        A ``(message, path)`` tuple. ``message`` is a Markdown table listing
        only keywords that occur at least once, or a plain notice when none
        do or when no keywords were supplied. ``path`` is the temp .xlsx
        with all counts, or ``None`` when no keywords were available.
    """
    cleaned_text = preprocess_text(main_text or "")
    direct_input = (direct_input or "").strip()

    if direct_input:
        # A non-blank input always yields at least one non-blank line, so no
        # emptiness check is needed here.
        keywords = [kw.strip() for kw in direct_input.split("\n") if kw.strip()]
        counts = [cleaned_text.count(kw) for kw in keywords]
        hits = [(kw, cnt) for kw, cnt in zip(keywords, counts) if cnt > 0]

        if hits:
            message = _keyword_markdown_table(hits)
        else:
            message = "본문에 해당 키워드가 전혀 등장하지 않았습니다."
        result_df = pd.DataFrame({"A": keywords, "B": counts})
        return (message, _save_as_temp_xlsx(result_df))

    if not excel_file:
        return ("엑셀 파일을 업로드하거나 키워드를 직접 입력하세요.", None)

    # The upload may arrive as a file-like wrapper (.name) or as a path string.
    excel_path = getattr(excel_file, "name", excel_file)
    df = pd.read_excel(excel_path, header=None)

    # Keep each keyword together with its row position: blank cells in
    # column A must not shift the counts of the rows below them.
    entries = []
    if df.shape[1] > _KEYWORD_COL:
        last_row = min(df.shape[0], _LAST_KEYWORD_ROW)
        for row_pos in range(_FIRST_KEYWORD_ROW, last_row):
            cell = df.iat[row_pos, _KEYWORD_COL]
            if pd.isna(cell):
                continue
            keyword = str(cell).strip()
            # An empty keyword would make str.count return len(text) + 1.
            if keyword:
                entries.append((row_pos, keyword))

    if not entries:
        return ("A5~A10000 범위에 키워드가 없습니다.", None)

    counts = [cleaned_text.count(keyword) for _, keyword in entries]

    expand_columns_if_needed(df, _COUNT_COL)
    for (row_pos, _), cnt in zip(entries, counts):
        df.iat[row_pos, _COUNT_COL] = cnt

    hits = [(keyword, cnt) for (_, keyword), cnt in zip(entries, counts) if cnt > 0]
    if hits:
        message = _keyword_markdown_table(hits)
    else:
        message = "본문에 해당 키워드가 전혀 등장하지 않았습니다(0회)."
    return (message, _save_as_temp_xlsx(df))
| |
|
| | |
| | |
| | |
| |
|
def _frequency_markdown(freq_df):
    """Render *freq_df* as a Markdown table, with or without 'tabulate'."""
    try:
        return freq_df.to_markdown(index=False)
    except ImportError:
        # DataFrame.to_markdown needs the optional 'tabulate' package.
        # Build a plain pipe table by hand rather than dropping the result.
        header = "| " + " | ".join(str(col) for col in freq_df.columns) + " |"
        divider = "|" + "---|" * len(freq_df.columns)
        body = [
            "| " + " | ".join(str(value) for value in row) + " |"
            for row in freq_df.itertuples(index=False)
        ]
        return "\n".join([header, divider, *body])


def morph_analysis_and_count(text: str):
    """Extract nouns with MeCab and count their occurrences in the text.

    Steps:
        1. Reduce the input to Hangul only via ``preprocess_text``.
        2. Run MeCab part-of-speech tagging on the cleaned text.
        3. Keep tokens tagged NNG, NNP, NP or NNB.
        4. Count each distinct token in the cleaned text with ``str.count``.
           This is a substring count, so a noun that also appears inside a
           longer word is counted there too.

    Args:
        text: Raw input text. ``None`` is treated as empty.

    Returns:
        A ``(markdown, path)`` tuple: the frequency table sorted by count
        (descending; ties keep first-appearance order) and the path of a
        temp UTF-8-with-BOM CSV holding the same table. When no noun is
        found, a notice string and ``None``.
    """
    cleaned = preprocess_text(text or "")
    if not cleaned:
        return "추출된 명사가 없습니다.", None

    tagger = MeCab()
    noun_tags = {"NNG", "NNP", "NP", "NNB"}

    # dict.fromkeys de-duplicates while keeping first-appearance order, which
    # keeps the output deterministic (a set would not).
    distinct_nouns = dict.fromkeys(
        word for word, pos in tagger.pos(cleaned) if pos in noun_tags
    )

    frequencies = {noun: cleaned.count(noun) for noun in distinct_nouns}
    frequencies = {noun: cnt for noun, cnt in frequencies.items() if cnt > 0}
    if not frequencies:
        return "추출된 명사가 없습니다.", None

    freq_df = pd.DataFrame(list(frequencies.items()), columns=["명사", "빈도"])
    # mergesort is stable, so nouns with equal counts keep their order.
    freq_df = freq_df.sort_values(
        by="빈도", ascending=False, kind="mergesort"
    ).reset_index(drop=True)

    md_table = _frequency_markdown(freq_df)

    # Only the unique path is needed; pandas opens the file itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    # utf-8-sig writes a BOM at the start of the file.
    freq_df.to_csv(tmp_path, index=False, encoding="utf-8-sig")

    return md_table, tmp_path
| |
|
| | |
| | |
| | |
| |
|
# Gradio UI with two tabs: keyword counting and morpheme-based noun counting.
# NOTE(review): the original indentation was lost; the nesting below (each tab
# holds one Row with an input Column and an output Column, and the click
# handler is wired inside its tab) is reconstructed - confirm against the
# intended layout.
with gr.Blocks() as demo:
    with gr.Tab("키워드 카운트"):
        with gr.Row():
            # Inputs: body text, optional typed keywords, optional workbook.
            with gr.Column():
                main_textbox = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요.",
                )
                keyword_input = gr.Textbox(
                    label="(선택) 직접 입력 키워드 - 엔터로 구분",
                    lines=6,
                    placeholder="예)\n초음파가습기\n가습기\n...",
                )
                excel_input = gr.File(
                    label="(선택) 엑셀 업로드",
                    file_types=[".xlsx"],
                )
                run_button = gr.Button("분석하기")

            # Outputs: Markdown result table and the downloadable workbook.
            with gr.Column():
                output_md = gr.Markdown(label="결과 표")
                output_file = gr.File(label="결과 다운로드")

        # Input order must match count_keywords(main_text, excel_file, direct_input).
        run_button.click(
            fn=count_keywords,
            inputs=[main_textbox, excel_input, keyword_input],
            outputs=[output_md, output_file],
        )

    with gr.Tab("형태소 분석 기반 카운트"):
        with gr.Row():
            # Input: body text only.
            with gr.Column():
                morph_text_input = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요.",
                )
                morph_run_button = gr.Button("분석하기")

            # Outputs: Markdown frequency table and the downloadable CSV.
            with gr.Column():
                morph_result_display = gr.Markdown(label="분석 결과")
                morph_download_button = gr.File(label="결과 다운로드")

        morph_run_button.click(
            fn=morph_analysis_and_count,
            inputs=morph_text_input,
            outputs=[morph_result_display, morph_download_button],
        )


if __name__ == "__main__":
    demo.launch()