"""
keyword_classifier.py

Multi-label keyword extraction module.

Functions:
- load_patterns: convert the predefined keyword lists into a dictionary of patterns
- classify_keywords_df: add a 0/1 label column to a DataFrame for each keyword pattern

CLI:
--input    : input CSV path (contains ID, divided_comment, sentiment_pred)
--output   : output CSV path
--text-col : text column name (default: 'divided_comment')
--preview  : when given, display 20 sample rows

Usage example:
python keyword_classifier.py \
    --input data/step3_sentiment.csv \
    --output data/step4_keywords.csv \
    --preview
"""


import argparse
import re
from typing import Optional

import pandas as pd
from ace_tools_open import display_dataframe_to_user


def load_patterns() -> dict:
    """Compile the predefined keyword lists into regex patterns.

    Returns:
        A dict mapping each label name to one compiled, case-insensitive
        regex that matches any keyword of that label. Keywords are escaped,
        so they are matched literally.
    """
    # NOTE(review): the label names and keywords below look like UTF-8 text
    # that was decoded with the wrong codec. They are reproduced exactly as
    # they appear in the file; confirm them against the original source before
    # relying on any match.
    # NOTE(review): three literals were interrupted by a line break in the
    # middle of the string, which is a syntax error. The break is kept as an
    # explicit "\x85" escape (assumed to be a NEL control character) so each
    # literal stays on one line - TODO confirm.
    keyword_dict = {
        '๋ง': [
            '๋ง์', '๋ฌ๋ฌ', '๋จ๋ง', '์ง ๋ง', '๋ง์', '๊ฐ์น ๋ง', '๋งค์', '์ง ', '์คํ',
            '๋จ์ง ', '๋ฐ๋ฐ', '๋งค์ฝค', '์ํผ', '๋น๋ฆฟ', '์ธ๊ณต์ ', '๋น์ถฉ์ ', '๋๋ผ',
        ],
        '์๊ฐ': [
            '์๊ฐ', '์ซ๋', '์ซ๋ํจ', '๋ฐ์ญ', 'ํฝํฝ', '๋ถ๋๋ฌ์', '์ซ์ซํจ', '์ด์ด',
            '์ง๊ฒจ', '์น์ธ๋ฆ', '๋น ์ญ', '๊ฒ๋ฐ์์ด', '๊พธ๋', '๋ฏธ๋', '์์ซ', '๋ป๋ป', '์ฌ๋ฅด๋ฅด',
        ],
        '๊ธฐํ': [
            'ํฌ์ฅ', '๋์์ธ', '์คํ์ผ', 'ํธ์์ ', '์ฌ์ง', '์ธ์คํ', '๋ธ๋๋', '์ปฌ๋ฌ',
            '๋น์ฃผ', '๋น์ฃผ์ผ', '์ ๋ฌผ', '๋ฆฌ๋ด์ผ', 'CU', 'GS', '์ธ๋ธ', '์ธ๋ธ์ผ๋ ๋ธ',
            '์ง์์ค', '์จ์ ', '์ด๋งํธ', '๋\x85ธ๋ธ๋๋', '์ด๋งํธ24', '๋ฐฐ๋ฏผ', 'B๋งํธ',
            '๋น๋งํธ', 'ํ์\x85', '๋ฏธ๋์คํฑ',
        ],
        '๊ฐ๊ฒฉ': [
            '๊ฐ๊ฒฉ', '๊ฐ์ฑ', 'ํ ์ธ', '๊ฐ์ฑ๋น', '๋ถ๋ด', '๋๋น', '์ธ๊ตฌ๋ ค', '๊ฐ๊ฒฉ์๋นํด',
            '๋น์ธ์ฌ', '๋๋น์ธ', '์ด๋ด๊ฒ', '๊ฐ์ฌ๋น', '์ด๊ฐ๊ฒฉ', 'ํฉ๋ฆฌ์ ',
        ],
        '์ฃผ๊ด์ ํ๊ฐ': [
            '๊ฐ๋', 'ํ๋ณต', '๋ง์กฑ', '๋๋ฐ', '์ต๊ณ ', '์ค๋ง', '์์ฝ', '์์ฌ์ด', '์ถ์ฒ',
            '์ง์ฌ', '๊ฐํ', '์กด๋ง', '๋น์ถ', '๋๋', 'ํ๋ฆฌํฐ', '์ฌ๊ตฌ๋งค', '๊ฐ์ถ',
            '๊ตฟ๊ตฟ', '์ค๋\x85์ฑ', '๊ฐ์กด๋ง',
        ],
    }

    # One alternation per label; re.escape keeps every keyword literal, and
    # IGNORECASE lets the Latin keywords ('CU', 'GS', ...) match any casing.
    return {
        label: re.compile('|'.join(map(re.escape, kw_list)), flags=re.IGNORECASE)
        for label, kw_list in keyword_dict.items()
    }


def classify_keywords_df(
    df: pd.DataFrame,
    text_col: str = 'divided_comment',
    patterns: Optional[dict] = None,
) -> pd.DataFrame:
    """Add one 0/1 column per keyword label and drop rows matching no label.

    Steps:
        1) Search ``text_col`` with every label pattern and store the result
           as a 0/1 column named after the label.
        2) Remove rows where every label column is 0.

    Args:
        df: Input frame. It is left untouched; the work happens on a copy.
        text_col: Name of the column holding the text to search.
        patterns: Optional mapping of label -> regex (compiled or string).
            When omitted, the patterns returned by ``load_patterns()`` are used.

    Returns:
        A new DataFrame containing the original columns plus one integer
        column per label, restricted to rows with at least one match and
        re-indexed from 0.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    if text_col not in df.columns:
        raise KeyError(f"text column {text_col!r} not found in DataFrame")

    if patterns is None:
        patterns = load_patterns()
    label_cols = list(patterns)

    # Work on a copy so the caller's frame does not silently gain columns.
    out = df.copy()
    text = out[text_col]
    for label, pat in patterns.items():
        # na=False: missing text counts as "no match"; without it the result
        # holds NaN and the integer cast below raises.
        out[label] = text.str.contains(pat, na=False).astype(int)

    # Keep only rows that matched at least one label.
    matched = out[label_cols].sum(axis=1) > 0
    return out[matched].reset_index(drop=True)


if __name__ == '__main__':
    # Command-line entry point: read a CSV, add the keyword label columns,
    # optionally preview the result, then write the filtered CSV.
    # NOTE(review): the user-facing strings below look like UTF-8 text decoded
    # with the wrong codec. They are reproduced as they appear in the file; the
    # two help strings that were split by a line break keep that break as an
    # explicit "\x85" escape (assumed NEL) - TODO confirm against the original.
    parser = argparse.ArgumentParser(description='ํค์๋ ๊ธฐ๋ฐ ๋ฉํฐ๋ ์ด๋ธ ๋ถ๋ฅ ๋ชจ๋')
    parser.add_argument('--input', '-i', required=True, help='์\x85๋ ฅ CSV ํ์ผ ๊ฒฝ๋ก')
    parser.add_argument('--output', '-o', required=True, help='์ถ๋ ฅ CSV ํ์ผ ๊ฒฝ๋ก')
    parser.add_argument('--text-col', default='divided_comment', help='ํ\x85์คํธ ์ปฌ๋ผ๋ช\x85')
    parser.add_argument('--preview', action='store_true', help='์ํ 20๊ฐ ์ถ๋ ฅ')
    args = parser.parse_args()

    df = pd.read_csv(args.input, encoding='utf-8-sig')
    df_out = classify_keywords_df(df, text_col=args.text_col)

    # Resolve the label names once instead of recompiling the patterns for
    # every use below.
    label_names = list(load_patterns())

    if args.preview:
        # The sentiment column may be called 'sentiment' or 'sentiment_pred'
        # depending on the input, so show whichever identifying columns exist
        # rather than failing with KeyError on a missing one.
        candidates = ['ID', args.text_col, 'sentiment', 'sentiment_pred'] + label_names
        cols = [c for c in dict.fromkeys(candidates) if c in df_out.columns]
        display_dataframe_to_user('ํค์๋ ๋ถ๋ฅ ์์', df_out[cols].head(20))

    df_out.to_csv(args.output, index=False, encoding='utf-8-sig')
    print(f"[KEYWORD] ์ ์ฅ ์๋ฃ: {args.output} | ๋ ์ด๋ธ: {', '.join(label_names)}")