# kcelectra-base-DC / source /keyword_classifier
# alsxxxz's picture
# Create source/keyword_classifier
# dbb6259 verified
"""
keyword_classifier.py
Multi-label keyword extraction module.
Functions:
- load_patterns: convert the predefined keyword lists into a dictionary of patterns
- classify_keywords_df: add a 0/1 label column to a DataFrame for each keyword pattern
CLI:
--input : input CSV path (contains ID, divided_comment, sentiment_pred)
--output : output CSV path
--text-col : text column name (default: 'divided_comment')
--preview : when set, display a sample of 20 rows
Usage example:
python keyword_classifier.py \
--input data/step3_sentiment.csv \
--output data/step4_keywords.csv \
--preview
"""
import argparse
import pandas as pd
import re
from ace_tools_open import display_dataframe_to_user
def load_patterns() -> dict:
    """
    Build one compiled regex per label from the built-in keyword lists.

    Every keyword is escaped, so it is matched literally, and the keywords
    of a label are joined with ``|`` into a single case-insensitive pattern.

    Returns:
        dict mapping each label name to its compiled pattern, in the order
        the labels are declared below.
    """
    # NOTE(review): the label names and keyword literals below look
    # mis-encoded (multi-byte text decoded with the wrong code page) --
    # confirm against the original file before relying on the matches.
    label_keywords = {
        '๋ง›': [
            '๋ง›์žˆ','๋‹ฌ๋‹ฌ','๋‹จ๋ง›','์ง ๋ง›','๋ง›์ž„','๊ฐ์น ๋ง›','๋งค์›Œ','์ง ','์„คํƒ•',
            '๋‹จ์ง ','๋ฐ๋ฐ','๋งค์ฝค','์ƒํผ','๋น„๋ฆฟ','์ธ๊ณต์ ','๋‹น์ถฉ์ „', '๋А๋ผ'
        ],
        '์‹๊ฐ': [
            '์‹๊ฐ','์ซ€๋“','์ซ€๋“ํ•จ','๋ฐ”์‚ญ','ํฝํฝ','๋ถ€๋“œ๋Ÿฌ์›€','์ซ€์ซ€ํ•จ','์ด‰์ด‰',
            '์งˆ๊ฒจ','์”น์‹ธ๋ฆ„','๋น ์‚ญ','๊ฒ‰๋ฐ”์†์ด‰','๊พธ๋•','๋ฏธ๋Œ','์†์ซ€','๋ป‘๋ป‘','์‚ฌ๋ฅด๋ฅด'
        ],
        '๊ธฐํƒ€': [
            'ํฌ์žฅ','๋””์ž์ธ','์Šคํƒ€์ผ','ํŽธ์˜์ ','์‚ฌ์ง„','์ธ์Šคํƒ€','๋ธŒ๋žœ๋“œ','์ปฌ๋Ÿฌ',
            '๋น„์ฃผ','๋น„์ฃผ์–ผ','์„ ๋ฌผ','๋ฆฌ๋‰ด์–ผ','CU','GS','์„ธ๋ธ','์„ธ๋ธ์ผ๋ ˆ๋ธ',
            '์ง€์—์Šค','์”จ์œ ','์ด๋งˆํŠธ','๋…ธ๋ธŒ๋žœ๋“œ','์ด๋งˆํŠธ24','๋ฐฐ๋ฏผ','B๋งˆํŠธ',
            '๋น„๋งˆํŠธ','ํŒ์—…','๋ฏธ๋‹ˆ์Šคํ†ฑ'
        ],
        '๊ฐ€๊ฒฉ': [
            '๊ฐ€๊ฒฉ','๊ฐ€์„ฑ','ํ• ์ธ','๊ฐ€์„ฑ๋น„','๋ถ€๋‹ด','๋Œ€๋น„','์‹ธ๊ตฌ๋ ค','๊ฐ€๊ฒฉ์—๋น„ํ•ด',
            '๋น„์‹ธ์—ฌ','๋„˜๋น„์‹ธ','์ด๋”ด๊ฒŒ','๊ฐ€์‹ฌ๋น„','์ด๊ฐ€๊ฒฉ','ํ•ฉ๋ฆฌ์ '
        ],
        '์ฃผ๊ด€์ ํ‰๊ฐ€': [
            '๊ฐ๋™','ํ–‰๋ณต','๋งŒ์กฑ','๋Œ€๋ฐ•','์ตœ๊ณ ','์‹ค๋ง','์•„์‰ฝ','์•„์‰ฌ์šด','์ถ”์ฒœ',
            '์ง„์‹ฌ','๊ฐํƒ„','์กด๋ง›','๋น„์ถ”','๋А๋‚Œ','ํ€„๋ฆฌํ‹ฐ','์žฌ๊ตฌ๋งค','๊ฐ•์ถ”',
            '๊ตฟ๊ตฟ','์ค‘๋…์„ฑ','๊ฐœ์กด๋ง›'
        ]
    }
    # One alternation per label: escaped keywords OR-ed together, ignoring case.
    return {
        name: re.compile(
            '|'.join(re.escape(word) for word in words),
            flags=re.IGNORECASE,
        )
        for name, words in label_keywords.items()
    }
def classify_keywords_df(
    df: pd.DataFrame,
    text_col: str = 'divided_comment',
    patterns: "dict | None" = None,
) -> pd.DataFrame:
    """
    Flag which keyword labels each row's text matches and drop unmatched rows.

    1) For every label pattern, add a 0/1 column telling whether the text in
       ``text_col`` contains a match.
    2) Remove rows whose label columns are all 0 and renumber the index.

    Args:
        df: Input frame. It is not modified; the work is done on a copy.
        text_col: Name of the column holding the text to search.
        patterns: Optional mapping of label name -> regex (compiled or
            string). When omitted, the patterns from ``load_patterns()``
            are used.

    Returns:
        A new DataFrame containing the original columns plus one integer
        0/1 column per label, restricted to rows that matched at least one
        label, with a fresh 0..n-1 index.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df``.
    """
    if patterns is None:
        patterns = load_patterns()
    label_cols = list(patterns)

    # Work on a copy so the caller's DataFrame does not silently gain columns.
    result = df.copy()
    text = result[text_col]

    for label, pat in patterns.items():
        # na=False: a missing text value counts as "no match"; without it
        # str.contains yields NaN for such rows and astype(int) raises.
        result[label] = text.str.contains(pat, na=False).astype(int)

    # Keep only rows that matched at least one label.
    matched = result[label_cols].sum(axis=1) > 0
    return result[matched].reset_index(drop=True)
def main() -> None:
    """
    Command-line entry point.

    Reads the input CSV, runs ``classify_keywords_df`` on it, optionally
    shows the first 20 rows of the result, writes the result to the output
    CSV, and prints a one-line summary listing the label names.
    """
    parser = argparse.ArgumentParser(description='ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๋ฉ€ํ‹ฐ๋ ˆ์ด๋ธ” ๋ถ„๋ฅ˜ ๋ชจ๋“ˆ')
    parser.add_argument('--input', '-i', required=True, help='์ž…๋ ฅ CSV ํŒŒ์ผ ๊ฒฝ๋กœ')
    parser.add_argument('--output', '-o', required=True, help='์ถœ๋ ฅ CSV ํŒŒ์ผ ๊ฒฝ๋กœ')
    parser.add_argument('--text-col', default='divided_comment', help='ํ…์ŠคํŠธ ์ปฌ๋Ÿผ๋ช…')
    parser.add_argument('--preview', action='store_true', help='์ƒ˜ํ”Œ 20๊ฐœ ์ถœ๋ ฅ')
    args = parser.parse_args()

    df = pd.read_csv(args.input, encoding='utf-8-sig')
    df_out = classify_keywords_df(df, text_col=args.text_col)

    # Compile the patterns once and reuse the label names below.
    labels = list(load_patterns())

    if args.preview:
        # The module docstring names the sentiment column 'sentiment_pred'
        # while this preview originally asked for 'sentiment'; accept either
        # and show only columns that exist, so --preview cannot fail with a
        # KeyError on an absent column.
        wanted = ['ID', args.text_col, 'sentiment', 'sentiment_pred'] + labels
        cols = [c for c in dict.fromkeys(wanted) if c in df_out.columns]
        display_dataframe_to_user('ํ‚ค์›Œ๋“œ ๋ถ„๋ฅ˜ ์˜ˆ์‹œ', df_out[cols].head(20))

    df_out.to_csv(args.output, index=False, encoding='utf-8-sig')
    print(f"[KEYWORD] ์ €์žฅ ์™„๋ฃŒ: {args.output} | ๋ ˆ์ด๋ธ”: {', '.join(labels)}")


if __name__ == '__main__':
    main()