Create source/keyword_classifier
Browse files- source/keyword_classifier +105 -0
source/keyword_classifier
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
keyword_classifier.py
|
| 3 |
+
|
| 4 |
+
Multi-label keyword extraction module
|
| 5 |
+
|
| 6 |
+
Functions:
|
| 7 |
+
- load_patterns: convert the predefined keyword lists into a dictionary of patterns
|
| 8 |
+
- classify_keywords_df: add 0/1 label columns to a DataFrame, one per keyword pattern
|
| 9 |
+
|
| 10 |
+
CLI:
|
| 11 |
+
--input : input CSV path (contains ID, divided_comment, sentiment_pred)
|
| 12 |
+
--output : output CSV path
|
| 13 |
+
--text-col : text column name (default: 'divided_comment')
|
| 14 |
+
--preview : when set, display 20 sample rows
|
| 15 |
+
|
| 16 |
+
Usage example:
|
| 17 |
+
python keyword_classifier.py \
|
| 18 |
+
--input data/step3_sentiment.csv \
|
| 19 |
+
--output data/step4_keywords.csv \
|
| 20 |
+
--preview
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import argparse
|
| 24 |
+
import pandas as pd
|
| 25 |
+
import re
|
| 26 |
+
from ace_tools_open import display_dataframe_to_user
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def load_patterns() -> dict:
|
| 30 |
+
"""
|
| 31 |
+
Compile the predefined keyword lists into regex patterns and return them.
Returns a dict mapping each label (a key of keyword_dict) to one compiled,
case-insensitive pattern that matches any of that label's keywords literally.
|
| 32 |
+
"""
|
| 33 |
+
# NOTE(review): the Korean literals below look mis-encoded (mojibake) in this copy - verify against the original UTF-8 file.
keyword_dict = {
|
| 34 |
+
'๋ง': [
|
| 35 |
+
'๋ง์','๋ฌ๋ฌ','๋จ๋ง','์ง ๋ง','๋ง์','๊ฐ์น ๋ง','๋งค์','์ง ','์คํ',
|
| 36 |
+
'๋จ์ง ','๋ฐ๋ฐ','๋งค์ฝค','์ํผ','๋น๋ฆฟ','์ธ๊ณต์ ','๋น์ถฉ์ ', '๋๋ผ'
|
| 37 |
+
],
|
| 38 |
+
'์๊ฐ': [
|
| 39 |
+
'์๊ฐ','์ซ๋','์ซ๋ํจ','๋ฐ์ญ','ํฝํฝ','๋ถ๋๋ฌ์','์ซ์ซํจ','์ด์ด',
|
| 40 |
+
'์ง๊ฒจ','์น์ธ๋ฆ','๋น ์ญ','๊ฒ๋ฐ์์ด','๊พธ๋','๋ฏธ๋','์์ซ','๋ป๋ป','์ฌ๋ฅด๋ฅด'
|
| 41 |
+
],
|
| 42 |
+
'๊ธฐํ': [
|
| 43 |
+
'ํฌ์ฅ','๋์์ธ','์คํ์ผ','ํธ์์ ','์ฌ์ง','์ธ์คํ','๋ธ๋๋','์ปฌ๋ฌ',
|
| 44 |
+
'๋น์ฃผ','๋น์ฃผ์ผ','์ ๋ฌผ','๋ฆฌ๋ด์ผ','CU','GS','์ธ๋ธ','์ธ๋ธ์ผ๋ ๋ธ',
|
| 45 |
+
'์ง์์ค','์จ์ ','์ด๋งํธ','๋
ธ๋ธ๋๋','์ด๋งํธ24','๋ฐฐ๋ฏผ','B๋งํธ',
|
| 46 |
+
'๋น๋งํธ','ํ์
','๋ฏธ๋์คํฑ'
|
| 47 |
+
],
|
| 48 |
+
'๊ฐ๊ฒฉ': [
|
| 49 |
+
'๊ฐ๊ฒฉ','๊ฐ์ฑ','ํ ์ธ','๊ฐ์ฑ๋น','๋ถ๋ด','๋๋น','์ธ๊ตฌ๋ ค','๊ฐ๊ฒฉ์๋นํด',
|
| 50 |
+
'๋น์ธ์ฌ','๋๋น์ธ','์ด๋ด๊ฒ','๊ฐ์ฌ๋น','์ด๊ฐ๊ฒฉ','ํฉ๋ฆฌ์ '
|
| 51 |
+
],
|
| 52 |
+
'์ฃผ๊ด์ ํ๊ฐ': [
|
| 53 |
+
'๊ฐ๋','ํ๋ณต','๋ง์กฑ','๋๋ฐ','์ต๊ณ ','์ค๋ง','์์ฝ','์์ฌ์ด','์ถ์ฒ',
|
| 54 |
+
'์ง์ฌ','๊ฐํ','์กด๋ง','๋น์ถ','๋๋','ํ๋ฆฌํฐ','์ฌ๊ตฌ๋งค','๊ฐ์ถ',
|
| 55 |
+
'๊ตฟ๊ตฟ','์ค๋
์ฑ','๊ฐ์กด๋ง'
|
| 56 |
+
]
|
| 57 |
+
}
|
| 58 |
+
# Compile one regex per label
|
| 59 |
+
patterns = {}
|
| 60 |
+
for label, kw_list in keyword_dict.items():
|
| 61 |
+
# Escape each keyword, join them with | (alternation), and match case-insensitively
|
| 62 |
+
regex = re.compile('|'.join(map(re.escape, kw_list)), flags=re.IGNORECASE)
|
| 63 |
+
patterns[label] = regex
|
| 64 |
+
return patterns
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def classify_keywords_df(
    df: pd.DataFrame,
    text_col: str = 'divided_comment',
    patterns: 'dict | None' = None,
) -> pd.DataFrame:
    """Add one 0/1 column per keyword label and drop rows that match nothing.

    Args:
        df: Input frame. It is left untouched; labelling happens on a copy.
        text_col: Name of the column holding the text to search.
        patterns: Optional mapping of label -> regex (pattern string or
            compiled pattern). When omitted, ``load_patterns()`` is used.

    Returns:
        A new DataFrame with the original columns plus one int column per
        label (1 if the label's pattern is found in the row's text, else 0),
        keeping only rows where at least one label is 1, re-indexed from 0.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    if patterns is None:
        patterns = load_patterns()
    label_cols = list(patterns)

    # Copy first so the caller's DataFrame does not silently gain columns.
    labelled = df.copy()
    text = labelled[text_col]

    # 1) One match-flag column per pattern.
    for label, pat in patterns.items():
        # na=False: missing text counts as "no match". Without it the result
        # contains NaN and the cast to int raises.
        labelled[label] = text.str.contains(pat, na=False).astype(int)

    # 2) Keep only rows where at least one label column is 1.
    matched_any = labelled[label_cols].sum(axis=1) > 0
    return labelled[matched_any].reset_index(drop=True)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
if __name__ == '__main__':
    # CLI entry point: read the input CSV, label and filter it by keyword,
    # optionally preview the result, then write the output CSV.
    parser = argparse.ArgumentParser(description='키워드 기반 멀티레이블 분류 모듈')
    parser.add_argument('--input', '-i', required=True, help='입력 CSV 파일 경로')
    parser.add_argument('--output', '-o', required=True, help='출력 CSV 파일 경로')
    parser.add_argument('--text-col', default='divided_comment', help='텍스트 컬럼명')
    parser.add_argument('--preview', action='store_true', help='샘플 20개 출력')
    args = parser.parse_args()

    # Resolve the label names once instead of recompiling the patterns for
    # every use below.
    label_cols = list(load_patterns())

    df = pd.read_csv(args.input, encoding='utf-8-sig')
    df_out = classify_keywords_df(df, text_col=args.text_col)

    if args.preview:
        # Only preview columns that are actually present. The module docstring
        # documents the input column as `sentiment_pred`, so hard-coding
        # 'sentiment' would raise KeyError on such a file. dict.fromkeys drops
        # duplicates (e.g. --text-col ID) while keeping order.
        candidates = ['ID', args.text_col, 'sentiment', 'sentiment_pred'] + label_cols
        preview_cols = [c for c in dict.fromkeys(candidates) if c in df_out.columns]
        display_dataframe_to_user('키워드 분류 예시', df_out[preview_cols].head(20))

    df_out.to_csv(args.output, index=False, encoding='utf-8-sig')
    print(f"[KEYWORD] 저장 완료: {args.output} | 레이블: {', '.join(label_cols)}")
|