File size: 6,086 Bytes
26c3195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# =============================================================================
# Sentiment analysis module (Korean + English)
# Method 1: KNU Korean sentiment lexicon (built-in) — offline
# Method 2: HuggingFace transformers — online / GPU recommended
# =============================================================================
import pandas as pd
import numpy as np
import re


# ── KNU Korean sentiment lexicon (abridged built-in version) ─────────────────
# Source: https://github.com/park1200656/KnuSentiLex
# Lexicon used by the dictionary method: maps a word (or word stem) to an
# integer polarity weight. Positive entries carry +1 or +2, negative entries
# carry -1 or -2. `_knu_score` tests every key as a *substring* of each token,
# so a key that is contained in another key can match the same token as well.
# NOTE(review): the key strings in this file look mis-encoded (mojibake); if
# so they will not match correctly encoded Korean input — confirm the file's
# encoding against the upstream KnuSentiLex data.
_KNU_DICT = {
    # Positive entries
    "μ’‹λ‹€":2,"ν›Œλ₯­ν•˜λ‹€":2,"λ›°μ–΄λ‚˜λ‹€":2,"만쑱":2,"감사":2,"행볡":2,"κΈ°μ˜λ‹€":2,
    "즐겁닀":2,"νŽΈλ¦¬ν•˜λ‹€":2,"νƒμ›”ν•˜λ‹€":2,"μš°μˆ˜ν•˜λ‹€":2,"친절":2,"λΉ λ₯΄λ‹€":1,
    "κΉ¨λ—ν•˜λ‹€":1,"합리적":1,"μ €λ ΄ν•˜λ‹€":1,"μ‹ λ’°":2,"μ•ˆμ „ν•˜λ‹€":1,"νŽΈν•˜λ‹€":1,
    "졜고":2,"ν›Œλ₯­":2,"μ‚¬λž‘":2,"μ’‹μ•„":2,"μ›ƒμŒ":1,"밝닀":1,"성곡":2,"λ°œμ „":1,
    "유읡":1,"효과적":1,"ν˜μ‹ ":1,"창의":1,"희망":2,"긍정":2,"μ™„λ²½":2,"탁월":2,
    "μΆ”μ²œ":1,"μΉ­μ°¬":2,"흑쑱":2,"감동":2,"μœ μš©ν•˜λ‹€":1,"도움":1,"쉽닀":1,
    # Negative entries
    "λ‚˜μ˜λ‹€":-2,"뢈만":-2,"μ‹«λ‹€":-2,"싀망":-2,"μ΅œμ•…":-2,"뢈편":-1,"λŠλ¦¬λ‹€":-1,
    "λΉ„μ‹Έλ‹€":-1,"λΆˆμ•ˆ":-1,"μœ„ν—˜":-1,"문제":-1,"λΆˆλŸ‰":-2,"κ³ μž₯":-2,"였λ₯˜":-2,
    "짜증":-2,"ν™”λ‚˜λ‹€":-2,"μŠ¬ν”„λ‹€":-2,"우울":-2,"λΆ€μ‘±":-1,"μ–΄λ ΅λ‹€":-1,
    "νž˜λ“€λ‹€":-1,"λ³΅μž‘ν•˜λ‹€":-1,"뢈친절":-2,"λ¬΄μ‹œ":-2,"κ±°μ§“":-2,"사기":-2,
    "λΆˆμ‹ ":-2,"두렡닀":-1,"ν”Όκ³€":-1,"μ§€λ£¨ν•˜λ‹€":-1,"λΆ€μ •":-2,"λ‚­λΉ„":-1,
    "손해":-2,"ν›„νšŒ":-2,"κ±±μ •":-1,"아쉽닀":-1,"뢀정적":-2,"μ–΅μšΈ":-2,
}

# Negation-word pattern. `_knu_score` runs `.search()` with it on the token
# immediately preceding a lexicon hit and flips the hit's sign on a match.
# Because `.search()` is used, any alternative occurring anywhere inside the
# preceding token counts as a negation, not only a stand-alone negation word.
_NEG_PATTERNS = re.compile(r"(μ•ˆ|λͺ»|μ—†|μ•Š|μ•ˆλ¨|뢈|λΉ„|무|λ―Έ)")


def _knu_score(text: str, lexicon=None, neg_pattern=None) -> float:
    """Return the mean lexicon polarity of *text*.

    The text is split into runs of word characters (tokens). Every lexicon
    entry that occurs as a substring of a token contributes its weight; the
    sign of the contribution is flipped when the token immediately before it
    matches the negation pattern. The result is the sum of all contributions
    divided by their number, rounded to 4 decimals.

    An entry that is a proper substring of a longer entry matched in the same
    token is ignored. Without this, a negative word that embeds a positive
    word (or vice versa) would be counted with both signs and cancel out, and
    a stem listed next to its inflected form would be counted twice.

    Parameters
    ----------
    text        : text to score; a non-string or blank value scores 0.0
    lexicon     : mapping of word -> numeric weight; defaults to ``_KNU_DICT``
    neg_pattern : compiled regex used to detect a negating previous token;
                  defaults to ``_NEG_PATTERNS``

    Returns
    -------
    float
        Mean weight of the matched entries, or 0.0 when nothing matched.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    if lexicon is None:
        lexicon = _KNU_DICT
    if neg_pattern is None:
        neg_pattern = _NEG_PATTERNS

    tokens = re.findall(r'\w+', text)
    total = 0.0
    hits = 0
    for i, tok in enumerate(tokens):
        matched = [word for word in lexicon if word in tok]
        if not matched:
            continue
        # Flip polarity when the preceding token contains a negation word.
        # This depends only on the token position, so evaluate it once.
        negated = i > 0 and neg_pattern.search(tokens[i - 1]) is not None
        sign = -1 if negated else 1
        for word in matched:
            # Keys are unique, so "differs and is contained" means `word` is a
            # proper substring of a longer match and must not be counted.
            if any(word != other and word in other for other in matched):
                continue
            total += sign * lexicon[word]
            hits += 1
    return round(total / max(hits, 1), 4)


def _label(score: float, pos_thr: float = 0.3, neg_thr: float = -0.3) -> str:
    """Map a polarity score to one of three label strings.

    A score at or above ``pos_thr`` yields the positive label, otherwise a
    score at or below ``neg_thr`` yields the negative label, and anything in
    between yields the third (presumably "neutral") label. The positive test
    is evaluated first, so it wins if the two thresholds overlap.
    """
    if score >= pos_thr:
        verdict = "긍정"
    elif score <= neg_thr:
        verdict = "λΆ€μ •"
    else:
        verdict = "쀑립"
    return verdict


# ── Method 2: HuggingFace (optional) ─────────────────────────────────────────
def _hf_sentiment(texts: list, model_name: str = "snunlp/KR-FinBert-SC"):
    """Classify *texts* with a HuggingFace ``text-classification`` pipeline.

    The pipeline is built from ``model_name`` (used for both model and
    tokenizer) and fed 16 texts at a time; each item is converted with
    ``str()`` and cut to its first 512 characters before being passed in.

    Returns the pipeline outputs concatenated in input order. If anything in
    the process raises (including a missing ``transformers`` package), the
    return value is instead a list of ``len(texts)`` references to a single
    ``{"error": <message>}`` dict.
    """
    try:
        from transformers import pipeline
        classifier = pipeline("text-classification", model=model_name,
                              tokenizer=model_name, truncation=True, max_length=512)
        batch_size = 16
        collected = []
        start = 0
        while start < len(texts):
            window = texts[start:start + batch_size]
            collected += classifier([str(item)[:512] for item in window])
            start += batch_size
        return collected
    except Exception as exc:
        return [{"error": str(exc)}] * len(texts)


# ── Main function ────────────────────────────────────────────────────────────
def run_sentiment(df: pd.DataFrame, text_col: str,
                  method: str = "dictionary",   # "dictionary" | "transformer"
                  model_name: str = "snunlp/KR-FinBert-SC",
                  pos_threshold: float = 0.3,
                  neg_threshold: float = -0.3):
    """Label the sentiment of every row of ``df[text_col]`` and summarise it.

    Parameters
    ----------
    df            : input frame; it is not modified (results go to a copy)
    text_col      : name of the text column; missing values are treated as ""
    method        : 'transformer' uses a HuggingFace model via
                    ``_hf_sentiment``; any other value uses the built-in
                    lexicon via ``_knu_score`` / ``_label``
    model_name    : model passed to ``_hf_sentiment`` (transformer method only)
    pos_threshold : score at/above which a text is labelled positive
                    (dictionary method only)
    neg_threshold : score at/below which a text is labelled negative
                    (dictionary method only)

    Returns
    -------
    tuple
        ``(result, None)`` on success, ``(None, message)`` when the
        transformer method is unavailable or reports an error. ``result`` is a
        dict with four entries: the per-row result frame (text, label, and
        either the lexicon score or the model confidence), a summary frame
        with label / count / percentage, and up to five positive and up to
        five negative example rows. For the dictionary method the examples are
        the rows with the highest / lowest scores; for the transformer method
        they are the first rows carrying each label.

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    # Column names and label values are runtime strings used in several
    # places; bind each once so writes and later reads cannot drift apart.
    label_col = "κ°μ •λ ˆμ΄λΈ”"
    score_col = "κ°μ •μ μˆ˜"
    positive = "긍정"
    negative = "λΆ€μ •"
    neutral = "쀑립"

    texts = df[text_col].fillna("").tolist()

    if method == "transformer":
        try:
            import transformers  # noqa
        except ImportError:
            return None, "transformers νŒ¨ν‚€μ§€κ°€ μ„€μΉ˜λ˜μ–΄ μžˆμ§€ μ•ŠμŠ΅λ‹ˆλ‹€. dictionary 방법을 μ‚¬μš©ν•˜μ„Έμš”."
        # With no rows there is nothing to classify: skip loading the model,
        # and never index into an empty result list.
        raw = _hf_sentiment(texts, model_name) if texts else []
        if raw and "error" in raw[0]:
            return None, f"Transformer 였λ₯˜: {raw[0]['error']}"

        def norm_label(name):
            # Normalise model-specific label names to the three label strings.
            if "pos" in name or positive in name:
                return positive
            if "neg" in name or negative in name:
                return negative
            return neutral

        df_out = df[[text_col]].copy()
        df_out[label_col] = [norm_label(r.get("label", "").lower()) for r in raw]
        df_out["신뒰도"] = [round(r.get("score", 0.0), 4) for r in raw]
    else:
        scores = [_knu_score(t) for t in texts]
        df_out = df[[text_col]].copy()
        df_out[score_col] = scores
        df_out[label_col] = [_label(s, pos_threshold, neg_threshold) for s in scores]

    # Summary statistics: one row per label with its count and share.
    label_counts = df_out[label_col].value_counts()
    total = len(df_out)
    summary = pd.DataFrame({
        "감정":    label_counts.index,
        "λΉˆλ„":    label_counts.values,
        "λΉ„μœ¨(%)": (label_counts.values / total * 100).round(1)
    })

    # Example rows for each polarity.
    if score_col in df_out.columns:
        shown = [text_col, score_col, label_col]
        pos_top = df_out.nlargest(5, score_col)[shown]
        neg_top = df_out.nsmallest(5, score_col)[shown]
    else:
        pos_top = df_out[df_out[label_col] == positive].head(5)
        neg_top = df_out[df_out[label_col] == negative].head(5)

    return {
        "전체결과": df_out,
        "μš”μ•½":    summary,
        "κΈμ •μƒμœ„": pos_top.reset_index(drop=True),
        "λΆ€μ •μƒμœ„": neg_top.reset_index(drop=True)
    }, None