alsxxxz committed on
Commit
dbb6259
·
verified ·
1 Parent(s): 2f44741

Create source/keyword_classifier

Browse files
Files changed (1) hide show
  1. source/keyword_classifier +105 -0
source/keyword_classifier ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ keyword_classifier.py
3
+
4
+ Multi-label keyword extraction module
5
+
6
+ Functions:
7
+ - load_patterns: convert the predefined keyword lists into a dictionary of patterns
8
+ - classify_keywords_df: add a 0/1 label column to a DataFrame for each keyword pattern
9
+
10
+ CLI:
11
+ --input : input CSV path (contains ID, divided_comment, sentiment_pred)
12
+ --output : output CSV path
13
+ --text-col : name of the text column (default: 'divided_comment')
14
+ --preview : when set, display 20 sample rows
15
+
16
+ Usage example:
17
+ python keyword_classifier.py \
18
+ --input data/step3_sentiment.csv \
19
+ --output data/step4_keywords.csv \
20
+ --preview
21
+ """
22
+
23
+ import argparse
24
+ import pandas as pd
25
+ import re
26
+ from ace_tools_open import display_dataframe_to_user
27
+
28
+
29
def load_patterns() -> dict:
    """
    Build the label -> compiled regex mapping from the predefined keyword lists.

    Each label's keywords are escaped literally and joined into a single
    case-insensitive alternation, so a pattern matches when any one of the
    label's keywords occurs in the searched text.

    Returns:
        dict mapping each label name to its compiled ``re.Pattern``, in the
        order the labels are declared below.
    """
    # NOTE(review): the keyword and label literals below look mis-encoded
    # (multi-byte text decoded with the wrong code page) -- confirm them
    # against the original file before relying on the matches.
    keywords_by_label = {
        '๋ง›': [
            '๋ง›์žˆ','๋‹ฌ๋‹ฌ','๋‹จ๋ง›','์ง ๋ง›','๋ง›์ž„','๊ฐ์น ๋ง›','๋งค์›Œ','์ง ','์„คํƒ•',
            '๋‹จ์ง ','๋ฐ๋ฐ','๋งค์ฝค','์ƒํผ','๋น„๋ฆฟ','์ธ๊ณต์ ','๋‹น์ถฉ์ „', '๋А๋ผ'
        ],
        '์‹๊ฐ': [
            '์‹๊ฐ','์ซ€๋“','์ซ€๋“ํ•จ','๋ฐ”์‚ญ','ํฝํฝ','๋ถ€๋“œ๋Ÿฌ์›€','์ซ€์ซ€ํ•จ','์ด‰์ด‰',
            '์งˆ๊ฒจ','์”น์‹ธ๋ฆ„','๋น ์‚ญ','๊ฒ‰๋ฐ”์†์ด‰','๊พธ๋•','๋ฏธ๋Œ','์†์ซ€','๋ป‘๋ป‘','์‚ฌ๋ฅด๋ฅด'
        ],
        '๊ธฐํƒ€': [
            'ํฌ์žฅ','๋””์ž์ธ','์Šคํƒ€์ผ','ํŽธ์˜์ ','์‚ฌ์ง„','์ธ์Šคํƒ€','๋ธŒ๋žœ๋“œ','์ปฌ๋Ÿฌ',
            '๋น„์ฃผ','๋น„์ฃผ์–ผ','์„ ๋ฌผ','๋ฆฌ๋‰ด์–ผ','CU','GS','์„ธ๋ธ','์„ธ๋ธ์ผ๋ ˆ๋ธ',
            '์ง€์—์Šค','์”จ์œ ','์ด๋งˆํŠธ','๋…ธ๋ธŒ๋žœ๋“œ','์ด๋งˆํŠธ24','๋ฐฐ๋ฏผ','B๋งˆํŠธ',
            '๋น„๋งˆํŠธ','ํŒ์—…','๋ฏธ๋‹ˆ์Šคํ†ฑ'
        ],
        '๊ฐ€๊ฒฉ': [
            '๊ฐ€๊ฒฉ','๊ฐ€์„ฑ','ํ• ์ธ','๊ฐ€์„ฑ๋น„','๋ถ€๋‹ด','๋Œ€๋น„','์‹ธ๊ตฌ๋ ค','๊ฐ€๊ฒฉ์—๋น„ํ•ด',
            '๋น„์‹ธ์—ฌ','๋„˜๋น„์‹ธ','์ด๋”ด๊ฒŒ','๊ฐ€์‹ฌ๋น„','์ด๊ฐ€๊ฒฉ','ํ•ฉ๋ฆฌ์ '
        ],
        '์ฃผ๊ด€์ ํ‰๊ฐ€': [
            '๊ฐ๋™','ํ–‰๋ณต','๋งŒ์กฑ','๋Œ€๋ฐ•','์ตœ๊ณ ','์‹ค๋ง','์•„์‰ฝ','์•„์‰ฌ์šด','์ถ”์ฒœ',
            '์ง„์‹ฌ','๊ฐํƒ„','์กด๋ง›','๋น„์ถ”','๋А๋‚Œ','ํ€„๋ฆฌํ‹ฐ','์žฌ๊ตฌ๋งค','๊ฐ•์ถ”',
            '๊ตฟ๊ตฟ','์ค‘๋…์„ฑ','๊ฐœ์กด๋ง›'
        ]
    }
    # One alternation per label: escaped keywords OR-ed together, ignoring case.
    return {
        name: re.compile(
            '|'.join(re.escape(word) for word in words),
            flags=re.IGNORECASE,
        )
        for name, words in keywords_by_label.items()
    }
65
+
66
+
67
def classify_keywords_df(
    df: pd.DataFrame,
    text_col: str = 'divided_comment',
    patterns=None,
) -> pd.DataFrame:
    """
    Add one 0/1 column per keyword label and drop rows that match no label.

    1) For every label pattern, a column named after the label is added that
       holds 1 when the pattern is found in ``text_col`` and 0 otherwise.
    2) Rows whose label columns are all 0 are removed and the index is reset.

    Args:
        df: input frame; it is not modified (the labels are added to a copy).
        text_col: name of the column holding the text to search.
        patterns: optional mapping of label name -> regex (compiled pattern
            or pattern string). When omitted, ``load_patterns()`` is used.

    Returns:
        A new DataFrame with the label columns added, containing only rows
        that matched at least one label.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df``.
    """
    if text_col not in df.columns:
        raise KeyError(f"text column {text_col!r} not found in DataFrame")

    if patterns is None:
        patterns = load_patterns()
    label_cols = list(patterns)

    # Work on a copy so the caller's frame does not silently gain columns.
    labeled = df.copy()
    text = labeled[text_col]

    # 1) One match-indicator column per label.
    for label, pat in patterns.items():
        # na=False: missing text counts as "no match". Without it the result
        # contains NaN and the integer cast below raises.
        labeled[label] = text.str.contains(pat, na=False).astype(int)

    # 2) Keep only rows where at least one label fired.
    matched_any = labeled[label_cols].sum(axis=1) > 0
    return labeled[matched_any].reset_index(drop=True)
87
+
88
+
89
if __name__ == '__main__':
    # Command-line entry point: read the input CSV, label it with the keyword
    # patterns, optionally preview the first rows, then write the result.
    parser = argparse.ArgumentParser(description='ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๋ฉ€ํ‹ฐ๋ ˆ์ด๋ธ” ๋ถ„๋ฅ˜ ๋ชจ๋“ˆ')
    parser.add_argument('--input', '-i', required=True, help='์ž…๋ ฅ CSV ํŒŒ์ผ ๊ฒฝ๋กœ')
    parser.add_argument('--output', '-o', required=True, help='์ถœ๋ ฅ CSV ํŒŒ์ผ ๊ฒฝ๋กœ')
    parser.add_argument('--text-col', default='divided_comment', help='ํ…์ŠคํŠธ ์ปฌ๋Ÿผ๋ช…')
    parser.add_argument('--preview', action='store_true', help='์ƒ˜ํ”Œ 20๊ฐœ ์ถœ๋ ฅ')
    args = parser.parse_args()

    df = pd.read_csv(args.input, encoding='utf-8-sig')
    df_out = classify_keywords_df(df, text_col=args.text_col)

    # Build the label list once; it is needed for the preview and the summary.
    label_names = list(load_patterns().keys())

    if args.preview:
        # The module docstring names the sentiment column 'sentiment_pred'
        # while the preview originally asked for 'sentiment'; show whichever
        # of the expected columns exist instead of failing with KeyError.
        wanted = ['ID', args.text_col, 'sentiment', 'sentiment_pred'] + label_names
        cols = [c for c in dict.fromkeys(wanted) if c in df_out.columns]
        display_dataframe_to_user('ํ‚ค์›Œ๋“œ ๋ถ„๋ฅ˜ ์˜ˆ์‹œ', df_out[cols].head(20))

    df_out.to_csv(args.output, index=False, encoding='utf-8-sig')
    print(f"[KEYWORD] ์ €์žฅ ์™„๋ฃŒ: {args.output} | ๋ ˆ์ด๋ธ”: {', '.join(label_names)}")