File size: 6,086 Bytes
26c3195 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | # =============================================================================
# Sentiment analysis module (Korean + English)
# Method 1: KNU Korean sentiment lexicon (built-in) - offline
# Method 2: HuggingFace transformers - online / GPU recommended
# =============================================================================
import pandas as pd
import numpy as np
import re
# ── KNU Korean sentiment lexicon (abridged built-in version) ──────────────────
# Upstream source: https://github.com/park1200656/KnuSentiLex
_KNU_DICT = {
# Positive
"μ’λ€":2,"νλ₯νλ€":2,"λ°μ΄λλ€":2,"λ§μ‘±":2,"κ°μ¬":2,"ν볡":2,"κΈ°μλ€":2,
"μ¦κ²λ€":2,"νΈλ¦¬νλ€":2,"νμνλ€":2,"μ°μνλ€":2,"μΉμ ":2,"λΉ λ₯΄λ€":1,
"κΉ¨λνλ€":1,"ν©λ¦¬μ ":1,"μ λ ΄νλ€":1,"μ λ’°":2,"μμ νλ€":1,"νΈνλ€":1,
"μ΅κ³ ":2,"νλ₯":2,"μ¬λ":2,"μ’μ":2,"μμ":1,"λ°λ€":1,"μ±κ³΅":2,"λ°μ ":1,
"μ μ΅":1,"ν¨κ³Όμ ":1,"νμ ":1,"μ°½μ":1,"ν¬λ§":2,"κΈμ ":2,"μλ²½":2,"νμ":2,
"μΆμ²":1,"μΉμ°¬":2,"ν‘μ‘±":2,"κ°λ":2,"μ μ©νλ€":1,"λμ":1,"μ½λ€":1,
# Negative
"λμλ€":-2,"λΆλ§":-2,"μ«λ€":-2,"μ€λ§":-2,"μ΅μ
":-2,"λΆνΈ":-1,"λ리λ€":-1,
"λΉμΈλ€":-1,"λΆμ":-1,"μν":-1,"λ¬Έμ ":-1,"λΆλ":-2,"κ³ μ₯":-2,"μ€λ₯":-2,
"μ§μ¦":-2,"νλλ€":-2,"μ¬νλ€":-2,"μ°μΈ":-2,"λΆμ‘±":-1,"μ΄λ ΅λ€":-1,
"νλ€λ€":-1,"볡μ‘νλ€":-1,"λΆμΉμ ":-2,"무μ":-2,"κ±°μ§":-2,"μ¬κΈ°":-2,
"λΆμ ":-2,"λλ ΅λ€":-1,"νΌκ³€":-1,"μ§λ£¨νλ€":-1,"λΆμ ":-2,"λλΉ":-1,
"μν΄":-2,"νν":-2,"κ±±μ ":-1,"μμ½λ€":-1,"λΆμ μ ":-2,"μ΅μΈ":-2,
}
# Negation-word pattern
_NEG_PATTERNS = re.compile(r"(μ|λͺ»|μ|μ|μλ¨|λΆ|λΉ|무|λ―Έ)")
def _knu_score(text: str) -> float:
if not isinstance(text, str) or not text.strip():
return 0.0
tokens = re.findall(r'\w+', text)
score = 0.0
count = 0
for i, tok in enumerate(tokens):
for word, val in _KNU_DICT.items():
if word in tok:
# μ ν ν°μ λΆμ μ΄κ° μμΌλ©΄ κ·Ήμ± λ°μ
neg = (i > 0 and _NEG_PATTERNS.search(tokens[i-1]))
score += -val if neg else val
count += 1
return round(score / max(count, 1), 4)
def _label(score: float, pos_thr: float = 0.3, neg_thr: float = -0.3) -> str:
if score >= pos_thr: return "κΈμ "
if score <= neg_thr: return "λΆμ "
return "μ€λ¦½"
# ── Method 2: HuggingFace (optional) ──────────────────────────────────────────
def _hf_sentiment(texts: list, model_name: str = "snunlp/KR-FinBert-SC"):
try:
from transformers import pipeline
pipe = pipeline("text-classification", model=model_name,
tokenizer=model_name, truncation=True, max_length=512)
results = []
batch = 16
for i in range(0, len(texts), batch):
chunk = [str(t)[:512] for t in texts[i:i+batch]]
out = pipe(chunk)
results.extend(out)
return results
except Exception as e:
return [{"error": str(e)}] * len(texts)
# ── Main function ─────────────────────────────────────────────────────────────
def run_sentiment(df: pd.DataFrame, text_col: str,
                  method: str = "dictionary",  # "dictionary" | "transformer"
                  model_name: str = "snunlp/KR-FinBert-SC",
                  pos_threshold: float = 0.3,
                  neg_threshold: float = -0.3):
    """Run sentiment analysis over one text column of a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified (results go into a copy).
    text_col : str
        Name of the text column. Missing values are treated as empty strings.
    method : str
        ``'transformer'`` uses the HuggingFace model (deep learning, accurate);
        any other value uses the built-in KNU lexicon (fast).
    model_name : str
        Model identifier for the transformer method.
    pos_threshold, neg_threshold : float
        Score cut-offs for the positive / negative label (lexicon method only).

    Returns
    -------
    tuple
        ``(result, None)`` on success, where ``result`` is a dict holding, in
        order: the per-row results, a label frequency summary, up to five
        positive samples and up to five negative samples.
        ``(None, message)`` when transformers is not installed or the
        transformer pipeline reported an error.
    """
    # NOTE(review): the non-ASCII literals below look like mis-decoded UTF-8
    # Korean in this copy of the file; they are kept exactly as found.
    label_col = "κ°μ  λ μ΄λΈ"
    score_col = "κ°μ  μ μ"

    texts = df[text_col].fillna("").tolist()
    df_out = df[[text_col]].copy()

    if method == "transformer":
        try:
            import transformers  # noqa
        except ImportError:
            return None, "transformers ν¨ν€μ§κ° μ€μΉλμ΄ μμ§ μμ΅λλ€. dictionary λ°©λ²μ μ¬μ©νμΈμ."
        raw = _hf_sentiment(texts, model_name)
        # raw is empty when the column has no rows; indexing raw[0]
        # unconditionally raised IndexError in that case.
        if raw and "error" in raw[0]:
            return None, f"Transformer μ€λ₯: {raw[0]['error']}"
        labels = [r.get("label", "").lower() for r in raw]
        scores = [r.get("score", 0.0) for r in raw]

        # Normalise model-specific label names to the three output labels.
        def norm_label(name):
            if "pos" in name or "κΈμ " in name:
                return "κΈμ "
            if "neg" in name or "λΆμ " in name:
                return "λΆμ "
            return "μ€λ¦½"

        df_out[label_col] = [norm_label(name) for name in labels]
        df_out["μ λ’°λ"] = [round(s, 4) for s in scores]
    else:
        scores = [_knu_score(t) for t in texts]
        labels = [_label(s, pos_threshold, neg_threshold) for s in scores]
        df_out[score_col] = scores
        df_out[label_col] = labels

    # Summary statistics: frequency and percentage per label.
    label_counts = df_out[label_col].value_counts()
    total = len(df_out)
    summary = pd.DataFrame({
        "κ°μ ": label_counts.index,
        "λΉλ": label_counts.values,
        "λΉμ¨(%)": (label_counts.values / total * 100).round(1)
    })

    # Top positive / negative text samples. The score column only exists for
    # the lexicon method; otherwise fall back to the first rows per label.
    if score_col in df_out.columns:
        pos_top = df_out.nlargest(5, score_col)[[text_col, score_col, label_col]]
        neg_top = df_out.nsmallest(5, score_col)[[text_col, score_col, label_col]]
    else:
        pos_top = df_out[df_out[label_col] == "κΈμ "].head(5)
        neg_top = df_out[df_out[label_col] == "λΆμ "].head(5)

    return {
        "μ 체결과": df_out,
        "μμ½": summary,
        "κΈμ  μμ": pos_top.reset_index(drop=True),
        "λΆμ  μμ": neg_top.reset_index(drop=True)
    }, None
|