"""
keyword_classifier.py

Multi-label keyword extraction module.

Functions:
- load_patterns: convert the predefined keyword lists into a dictionary of patterns
- classify_keywords_df: add a 0/1 label column to a DataFrame for each keyword pattern

CLI:
--input    : input CSV path (containing ID, divided_comment, sentiment_pred)
--output   : output CSV path
--text-col : text column name (default: 'divided_comment')
--preview  : when set, display a 20-row sample

Usage example:
python keyword_classifier.py \
--input data/step3_sentiment.csv \
--output data/step4_keywords.csv \
--preview
"""
| |
|
| | import argparse |
| | import pandas as pd |
| | import re |
| | from ace_tools_open import display_dataframe_to_user |
| |
|
| |
|
| | def load_patterns() -> dict: |
| | """ |
| | Compile the predefined keyword lists into regex patterns. |
| | |
| | Returns a dict that maps each label (a key of keyword_dict) to one |
| | compiled, case-insensitive regex matching any keyword in that label's list. |
| | |
| | NOTE(review): the keyword literals below look mis-encoded in this copy and |
| | some are split across two lines; confirm against the original UTF-8 source. |
| | """ |
| | keyword_dict = { |
| | '๋ง': [ |
| | '๋ง์','๋ฌ๋ฌ','๋จ๋ง','์ง ๋ง','๋ง์','๊ฐ์น ๋ง','๋งค์','์ง ','์คํ', |
| | '๋จ์ง ','๋ฐ๋ฐ','๋งค์ฝค','์ํผ','๋น๋ฆฟ','์ธ๊ณต์ ','๋น์ถฉ์ ', '๋๋ผ' |
| | ], |
| | '์๊ฐ': [ |
| | '์๊ฐ','์ซ๋','์ซ๋ํจ','๋ฐ์ญ','ํฝํฝ','๋ถ๋๋ฌ์','์ซ์ซํจ','์ด์ด', |
| | '์ง๊ฒจ','์น์ธ๋ฆ','๋น ์ญ','๊ฒ๋ฐ์์ด','๊พธ๋','๋ฏธ๋','์์ซ','๋ป๋ป','์ฌ๋ฅด๋ฅด' |
| | ], |
| | '๊ธฐํ': [ |
| | 'ํฌ์ฅ','๋์์ธ','์คํ์ผ','ํธ์์ ','์ฌ์ง','์ธ์คํ','๋ธ๋๋','์ปฌ๋ฌ', |
| | '๋น์ฃผ','๋น์ฃผ์ผ','์ ๋ฌผ','๋ฆฌ๋ด์ผ','CU','GS','์ธ๋ธ','์ธ๋ธ์ผ๋ ๋ธ', |
| | '์ง์์ค','์จ์ ','์ด๋งํธ','๋
ธ๋ธ๋๋','์ด๋งํธ24','๋ฐฐ๋ฏผ','B๋งํธ', |
| | '๋น๋งํธ','ํ์
','๋ฏธ๋์คํฑ' |
| | ], |
| | '๊ฐ๊ฒฉ': [ |
| | '๊ฐ๊ฒฉ','๊ฐ์ฑ','ํ ์ธ','๊ฐ์ฑ๋น','๋ถ๋ด','๋๋น','์ธ๊ตฌ๋ ค','๊ฐ๊ฒฉ์๋นํด', |
| | '๋น์ธ์ฌ','๋๋น์ธ','์ด๋ด๊ฒ','๊ฐ์ฌ๋น','์ด๊ฐ๊ฒฉ','ํฉ๋ฆฌ์ ' |
| | ], |
| | '์ฃผ๊ด์ ํ๊ฐ': [ |
| | '๊ฐ๋','ํ๋ณต','๋ง์กฑ','๋๋ฐ','์ต๊ณ ','์ค๋ง','์์ฝ','์์ฌ์ด','์ถ์ฒ', |
| | '์ง์ฌ','๊ฐํ','์กด๋ง','๋น์ถ','๋๋','ํ๋ฆฌํฐ','์ฌ๊ตฌ๋งค','๊ฐ์ถ', |
| | '๊ตฟ๊ตฟ','์ค๋
์ฑ','๊ฐ์กด๋ง' |
| | ] |
| | } |
| | |
| | patterns = {} |
| | for label, kw_list in keyword_dict.items(): |
| | # Escape every keyword so it matches literally, then OR them into one pattern. |
| | regex = re.compile('|'.join(map(re.escape, kw_list)), flags=re.IGNORECASE) |
| | patterns[label] = regex |
| | return patterns |
| |
|
| |
|
def classify_keywords_df(
    df: pd.DataFrame,
    text_col: str = 'divided_comment',
    patterns: "dict | None" = None,
) -> pd.DataFrame:
    """Add one 0/1 column per keyword label and keep only the matching rows.

    Steps:
      1) For every label, search ``df[text_col]`` with that label's regex and
         store 1 (match) or 0 (no match) in a column named after the label.
      2) Drop rows where every label column is 0, then reset the index.

    Args:
        df: Input frame; it is not modified (the work is done on a copy).
        text_col: Name of the column holding the text to search.
        patterns: Optional mapping of label -> regex (string or compiled).
            When omitted, the patterns from ``load_patterns()`` are used.

    Returns:
        A new DataFrame with the label columns added, containing only rows
        that matched at least one label, indexed from 0.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    if patterns is None:
        patterns = load_patterns()
    label_cols = list(patterns)

    # Work on a copy so the caller's DataFrame does not silently gain columns.
    result = df.copy()
    text = result[text_col]

    for label, pat in patterns.items():
        # na=False: a missing or non-string cell counts as "no match". Without
        # it str.contains returns NaN there and astype(int) raises.
        result[label] = text.str.contains(pat, na=False).astype(int)

    # Keep rows that hit at least one label.
    mask = result[label_cols].sum(axis=1) > 0
    return result[mask].reset_index(drop=True)
| |
|
| |
|
if __name__ == '__main__':
    # CLI entry point: read a CSV, label it by keyword, optionally preview,
    # then write the labelled rows to a new CSV.
    parser = argparse.ArgumentParser(description='키워드 기반 멀티레이블 분류 모듈')
    parser.add_argument('--input', '-i', required=True, help='입력 CSV 파일 경로')
    parser.add_argument('--output', '-o', required=True, help='출력 CSV 파일 경로')
    parser.add_argument('--text-col', default='divided_comment', help='텍스트 컬럼명')
    parser.add_argument('--preview', action='store_true', help='샘플 20개 출력')
    args = parser.parse_args()

    # Compile the patterns once; only the label names are needed here.
    label_cols = list(load_patterns())

    df = pd.read_csv(args.input, encoding='utf-8-sig')
    df_out = classify_keywords_df(df, text_col=args.text_col)

    if args.preview:
        # Show only columns that actually exist, so a missing optional column
        # (e.g. 'sentiment' vs. 'sentiment_pred') cannot abort the run with a
        # KeyError before the output file is written.
        wanted = ['ID', args.text_col, 'sentiment', 'sentiment_pred'] + label_cols
        cols = [c for c in dict.fromkeys(wanted) if c in df_out.columns]
        display_dataframe_to_user('키워드 분류 예시', df_out[cols].head(20))

    df_out.to_csv(args.output, index=False, encoding='utf-8-sig')
    print(f"[KEYWORD] 저장 완료: {args.output} | 레이블: {', '.join(label_cols)}")