Sentimen-Analysis / services /advanced_nlp.py
noranisa's picture
Create services/advanced_nlp.py
2e80122 verified
"""
services/advanced_nlp.py
Kumpulan fitur NLP lanjutan:
1. Stance Detection — favor / against / neutral terhadap isu
2. Emotion Detection — 6 emosi dasar (Ekman)
3. Keyword Extraction — frasa kunci multi-kata (YAKE-style)
4. Summarization — ringkasan otomatis per platform
"""
import re
import math
from collections import Counter, defaultdict
from typing import Optional
# ═══════════════════════════════════════════════════════════════
# 1. STANCE DETECTION
# ═══════════════════════════════════════════════════════════════
"""
Stance = posisi penutur terhadap isu/target tertentu.
Berbeda dari sentimen:
- Sentimen: "saya senang" (tentang perasaan penulis)
- Stance: "saya mendukung kebijakan X" (tentang posisi terhadap isu)
"""
# Lexicon of words/phrases that signal support for the target.
_FAVOR_KW = {
    # Explicit support
    'setuju', 'mendukung', 'dukung', 'pro', 'favor', 'support', 'approve',
    'sepakat', 'sependapat', 'membenarkan', 'memihak', 'memilih',
    'cocok', 'tepat', 'benar', 'bagus', 'bagus sekali', 'mantap', 'keren',
    'lanjutkan', 'teruskan', 'pertahankan', 'jaga', 'jaga terus',
    'saya pilih', 'kami pilih', 'kita pilih', 'pilih',
    'setia', 'percaya', 'yakin', 'optimis', 'harapan', 'semangat',
    'seharusnya', 'mestinya', 'perlu dilanjutkan', 'harus dilanjutkan',
    'berhasil', 'sukses', 'terbukti', 'efektif', 'efisien',
}

# Lexicon of words/phrases that signal opposition to the target.
_AGAINST_KW = {
    # Explicit rejection
    'menolak', 'tolak', 'against', 'oppose', 'tidak setuju', 'tidak sepakat',
    'kontra', 'anti', 'lawan', 'melawan', 'protes', 'demo', 'kritik',
    'mengkritik',
    'menyalahkan', 'menyerang', 'menghujat', 'mencaci', 'memaki',
    'tidak benar', 'salah', 'keliru', 'bohong', 'tipu', 'menipu', 'palsu',
    'tidak berhasil', 'gagal', 'kacau', 'rusak', 'hancur', 'bubar',
    'cabut', 'copot', 'mundur', 'turun', 'lengser', 'ganti', 'ganti presiden',
    'percuma', 'sia-sia', 'tidak ada gunanya', 'tidak efektif', 'lambat',
    'mengecewakan', 'kecewa', 'gagal janji', 'ingkar', 'bohong janji',
    'korupsi', 'korup', 'nepotisme', 'kronisme', 'pungli', 'mark up',
}

# Lexicon of hedging words/phrases that signal an undecided position.
_STANCE_NEUTRAL_KW = {
    'mungkin', 'bisa jadi', 'entah', 'tidak tahu', 'belum tahu',
    'perlu dikaji', 'perlu dilihat', 'tergantung', 'kondisional',
    'sebagian', 'beberapa', 'ada yang', 'ada juga',
}

# One pattern for all three lexicons. Alternatives are ordered longest first
# so that a negated phrase such as "tidak setuju" is consumed as a whole and
# the embedded "setuju" is not counted as support. The lookarounds restrict
# matches to whole words, so "pro" no longer fires inside "program".
_STANCE_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(
        re.escape(kw)
        for kw in sorted(
            _FAVOR_KW | _AGAINST_KW | _STANCE_NEUTRAL_KW,
            key=lambda kw: (-len(kw), kw),
        )
    )
    + r')(?!\w)'
)


def detect_stance(text: str, target: Optional[str] = None) -> dict:
    """Classify the stance expressed by a single text.

    Lexicon entries are matched case-insensitively as whole words/phrases,
    preferring the longest phrase at each position. Each distinct entry
    counts at most once, no matter how often it occurs.

    Args:
        text: The text to classify.
        target: Optional topic/issue. Accepted for API symmetry with
            ``analyze_stance``; it does not influence the scoring.

    Returns:
        A dict with keys ``stance`` ("Favor", "Against" or "Neutral"),
        ``confidence`` (float, capped at 0.95), ``favor_score``,
        ``against_score`` and ``neutral_score`` (distinct lexicon hits).
    """
    # Collapse whitespace so multi-word phrases match across irregular spacing.
    normalized = ' '.join(text.lower().split())
    matched = set(_STANCE_PATTERN.findall(normalized))

    favor_score = len(matched & _FAVOR_KW)
    against_score = len(matched & _AGAINST_KW)
    neutral_score = len(matched & _STANCE_NEUTRAL_KW)

    # The 0.1 term keeps the denominator non-zero when nothing matched.
    total = favor_score + against_score + neutral_score + 0.1

    if favor_score > against_score and favor_score > neutral_score:
        stance = "Favor"
        confidence = round(favor_score / total, 3)
    elif against_score > favor_score and against_score > neutral_score:
        stance = "Against"
        confidence = round(against_score / total, 3)
    else:
        # Ties and texts without any lexicon hit fall back to Neutral; the
        # 0.3 floor gives that fallback a non-zero confidence.
        stance = "Neutral"
        confidence = round(max(neutral_score, 0.3) / total, 3)

    # With no hits at all the ratio exceeds 1, so clip it.
    confidence = min(confidence, 0.95)

    return {
        'stance': stance,
        'confidence': confidence,
        'favor_score': favor_score,
        'against_score': against_score,
        'neutral_score': neutral_score,
    }
def analyze_stance(texts: list[str], target: Optional[str] = None) -> dict:
    """Aggregate stance classifications over a list of texts.

    Only the first 100 texts are classified, and only the first 20
    per-text results are returned.

    Args:
        texts: Texts to classify.
        target: Optional topic/issue, forwarded to ``detect_stance`` and
            echoed back in the result ("general" when falsy).

    Returns:
        A dict with ``per_text`` (truncated text plus its stance fields),
        ``counts`` per stance label, ``favor_pct`` / ``against_pct`` /
        ``neutral_pct`` over the classified texts, ``dominant`` (most
        frequent label, earliest label on ties) and ``target``.
    """
    tally = dict.fromkeys(('Favor', 'Against', 'Neutral'), 0)
    rows = []
    for item in texts[:100]:
        verdict = detect_stance(item, target)
        rows.append({'text': item[:80], **verdict})
        tally[verdict['stance']] += 1

    # Avoid division by zero when nothing was classified.
    denominator = len(rows) if rows else 1

    def share(label: str) -> float:
        return round(tally[label] / denominator * 100, 1)

    return {
        'per_text': rows[:20],
        'counts': tally,
        'favor_pct': share('Favor'),
        'against_pct': share('Against'),
        'neutral_pct': share('Neutral'),
        'dominant': max(tally, key=tally.get),
        'target': target or 'general',
    }
# ═══════════════════════════════════════════════════════════════
# 2. EMOTION DETECTION (6 Basic Emotions - Ekman)
# ═══════════════════════════════════════════════════════════════
# Keyword lexicon for the six emotion labels. Some entries are shared between
# labels (e.g. 'muak', 'benci', 'kaget') and then count for each of them.
_EMOTION_LEXICON = {
    'joy': {
        'senang', 'bahagia', 'gembira', 'sukacita', 'riang', 'ceria', 'suka',
        'bangga', 'puas', 'lega', 'syukur', 'terima kasih', 'happy', 'joy',
        'excited', 'wonderful', 'amazing', 'love', 'great', 'glad', 'cheerful',
        'mantap', 'keren', 'bagus', 'asyik', 'seru', 'menyenangkan', 'enjoy',
    },
    'anger': {
        'marah', 'murka', 'berang', 'gusar', 'geram', 'sebal', 'kesal',
        'jengkel',
        'benci', 'muak', 'dongkol', 'naik darah', 'emosi', 'emosional',
        'angry', 'furious', 'rage', 'hate', 'disgust', 'annoyed', 'frustrated',
        'bodoh', 'tolol', 'goblok', 'kampungan', 'bajingan', 'brengsek',
    },
    'sadness': {
        'sedih', 'duka', 'susah', 'pilu', 'sendu', 'murung', 'galau',
        'nestapa',
        'menangis', 'nangis', 'air mata', 'hati hancur', 'patah hati',
        'sad', 'cry', 'crying', 'tears', 'heartbreak', 'depressed', 'grief',
        'kehilangan', 'ditinggal', 'pergi', 'wafat', 'meninggal', 'almarhum',
    },
    'fear': {
        'takut', 'khawatir', 'cemas', 'was-was', 'gelisah', 'ngeri', 'horor',
        'panik', 'syok', 'terkejut', 'kaget', 'tercengang',
        'afraid', 'fear', 'scary', 'horror', 'panic', 'worried', 'anxious',
        'bahaya', 'berbahaya', 'ancaman', 'mengancam', 'waspada', 'hati-hati',
    },
    'surprise': {
        'terkejut', 'kaget', 'tercengang', 'heran', 'kagum', 'takjub', 'wow',
        'tidak menyangka', 'tidak menduga', 'tiba-tiba', 'mendadak',
        'surprised', 'shocked', 'amazed', 'astonished', 'unexpected',
        'luar biasa', 'tidak terduga', 'spontan',
    },
    'disgust': {
        'jijik', 'muak', 'mual', 'eneg', 'benci', 'tidak suka', 'antipati',
        'disgusting', 'gross', 'horrible', 'nasty', 'revolting', 'awful',
        'kotor', 'najis', 'busuk', 'bau', 'tidak pantas', 'menjijikkan',
        'korup', 'munafik', 'hipokrit', 'pembohong',
    },
}

# One whole-word pattern over every lexicon entry, longest alternatives first,
# so "tidak suka" is consumed as a unit (no stray 'suka' hit for joy) and
# short entries such as 'sad' no longer fire inside words like "sadar".
_EMOTION_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(
        re.escape(kw)
        for kw in sorted(
            set().union(*_EMOTION_LEXICON.values()),
            key=lambda kw: (-len(kw), kw),
        )
    )
    + r')(?!\w)'
)


def detect_emotion(text: str) -> dict:
    """Detect the dominant emotion of a single text.

    Lexicon entries are matched case-insensitively as whole words/phrases,
    preferring the longest phrase at each position. Each distinct entry
    counts at most once per emotion.

    Args:
        text: The text to analyse.

    Returns:
        A dict with ``dominant_emotion`` (a lexicon label, or "neutral" when
        nothing matched), ``scores`` (per-emotion share of all hits),
        ``raw_scores`` (per-emotion hit counts), ``confidence`` (share of the
        dominant emotion) and ``is_emotional``. The same keys are present in
        both the neutral and the non-neutral result.
    """
    # Collapse whitespace so multi-word phrases match across irregular spacing.
    normalized = ' '.join(text.lower().split())
    matched = set(_EMOTION_PATTERN.findall(normalized))

    raw_scores = {
        emotion: len(matched & keywords)
        for emotion, keywords in _EMOTION_LEXICON.items()
    }
    total = sum(raw_scores.values())

    if total == 0:
        return {
            'dominant_emotion': 'neutral',
            'scores': {e: 0 for e in _EMOTION_LEXICON},
            'raw_scores': raw_scores,
            'confidence': 0.0,
            'is_emotional': False,
        }

    # On ties, max() keeps the label that comes first in the lexicon.
    dominant = max(raw_scores, key=raw_scores.get)

    return {
        'dominant_emotion': dominant,
        # Normalised to proportions of all hits.
        'scores': {e: round(s / total, 3) for e, s in raw_scores.items()},
        'raw_scores': raw_scores,
        'confidence': round(raw_scores[dominant] / total, 3),
        'is_emotional': True,
    }
def analyze_emotions(texts: list[str]) -> dict:
    """Aggregate the emotion distribution over a list of texts.

    Only the first 100 texts are classified. All percentages are computed
    over the number of texts actually classified, so they stay consistent
    when more than 100 texts are passed in.

    Args:
        texts: Texts to analyse.

    Returns:
        A dict with ``per_text`` (first 15 results, each the truncated text
        plus the fields returned by ``detect_emotion``), ``distribution``
        (``{label: {'count', 'pct'}}`` for every emotion plus "neutral"),
        ``dominant`` (most frequent label, earliest label on ties) and
        ``emotional_pct`` (share of classified texts that are not neutral).
    """
    emotion_counts = {e: 0 for e in _EMOTION_LEXICON}
    emotion_counts['neutral'] = 0
    per_text = []

    for text in texts[:100]:
        result = detect_emotion(text)
        per_text.append({'text': text[:80], **result})
        label = result['dominant_emotion']
        # Unknown labels are folded into "neutral" rather than raising.
        emotion_counts[label if label in emotion_counts else 'neutral'] += 1

    # Denominator is the number of classified texts, not len(texts); the
    # input is capped at 100 above. Falls back to 1 to avoid dividing by zero.
    total = len(per_text) or 1
    dominant = max(emotion_counts, key=emotion_counts.get)

    distribution = {
        emotion: {'count': count, 'pct': round(count / total * 100, 1)}
        for emotion, count in emotion_counts.items()
    }
    emotional = sum(
        count for emotion, count in emotion_counts.items()
        if emotion != 'neutral'
    )

    return {
        'per_text': per_text[:15],
        'distribution': distribution,
        'dominant': dominant,
        'emotional_pct': round(emotional / total * 100, 1),
    }
# ═══════════════════════════════════════════════════════════════
# 3. KEYWORD/PHRASE EXTRACTION (YAKE-inspired)
# ═══════════════════════════════════════════════════════════════
_STOPWORDS_KW = {
'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah',
'ada','pada','juga','tidak','bisa','sudah','saya','kamu','mereka',
'kita','ya','jadi','kalau','tapi','atau','karena','sangat','banget',
'the','is','in','of','a','an','and','it','for','that','this',
'was','are','be','has','have','to','do','we','i','you','he','she',
}
def _tokenize_sentences(text: str) -> list[list[str]]:
"""Split ke kalimat lalu tokenisasi."""
sentences = re.split(r'[.!?;]', text)
result = []
for sent in sentences:
tokens = [
t.lower() for t in re.sub(r'[^\w\s]', ' ', sent).split()
if len(t) > 2 and t.lower() not in _STOPWORDS_KW
]
if tokens:
result.append(tokens)
return result
def extract_keywords(texts: list[str], top_n: int = 20) -> list[dict]:
"""
Ekstrak kata kunci dan frasa kunci menggunakan pendekatan TF-IDF
yang dimodifikasi dengan co-occurrence scoring.
Return: list of {phrase, score, frequency, type}
"""
# Kumpulkan semua teks
combined = ' '.join(texts[:100])
# Unigram frequency
all_tokens = [
t.lower() for t in re.sub(r'[^\w\s]', ' ', combined).split()
if len(t) > 2 and t.lower() not in _STOPWORDS_KW
]
tf = Counter(all_tokens)
total_tokens = len(all_tokens) + 1
# Bigram extraction
bigrams = []
for i in range(len(all_tokens) - 1):
bg = f"{all_tokens[i]} {all_tokens[i+1]}"
bigrams.append(bg)
tf_bigrams = Counter(bigrams)
# Trigram extraction
trigrams = []
for i in range(len(all_tokens) - 2):
tg = f"{all_tokens[i]} {all_tokens[i+1]} {all_tokens[i+2]}"
trigrams.append(tg)
tf_trigrams = Counter(trigrams)
# Score = freq * log(1 + freq) / total (normalized TF)
keywords = []
# Unigrams
for word, freq in tf.most_common(30):
if freq >= 2:
score = (freq / total_tokens) * math.log(1 + freq)
keywords.append({
'phrase': word,
'score': round(score, 5),
'frequency': freq,
'type': 'word',
})
# Bigrams (higher score multiplier karena lebih informatif)
for phrase, freq in tf_bigrams.most_common(20):
if freq >= 2:
score = (freq / total_tokens) * math.log(1 + freq) * 1.5
keywords.append({
'phrase': phrase,
'score': round(score, 5),
'frequency': freq,
'type': 'phrase',
})
# Trigrams
for phrase, freq in tf_trigrams.most_common(10):
if freq >= 2:
score = (freq / total_tokens) * math.log(1 + freq) * 2.0
keywords.append({
'phrase': phrase,
'score': round(score, 5),
'frequency': freq,
'type': 'multi-phrase',
})
# Sort by score, deduplikasi
keywords.sort(key=lambda x: x['score'], reverse=True)
# Hapus yang redundan (kata yang sudah ada di phrase lebih panjang)
seen_words = set()
filtered = []
for kw in keywords:
words_in_phrase = set(kw['phrase'].split())
if not any(w in seen_words for w in words_in_phrase):
filtered.append(kw)
seen_words.update(words_in_phrase)
if len(filtered) >= top_n:
break
return filtered
# ═══════════════════════════════════════════════════════════════
# 4. SUMMARIZATION
# ═══════════════════════════════════════════════════════════════
def _sentence_score(sentence: str, word_freq: Counter, total_words: int) -> float:
"""Score kalimat berdasarkan TF dari kata-kata penting."""
tokens = [
t.lower() for t in re.sub(r'[^\w\s]', ' ', sentence).split()
if len(t) > 2 and t.lower() not in _STOPWORDS_KW
]
if not tokens:
return 0.0
return sum(word_freq.get(t, 0) for t in tokens) / len(tokens)
def summarize_texts(texts: list[str], max_sentences: int = 3) -> str:
    """Build an extractive summary from a list of texts.

    Uses frequency-based sentence scoring: each candidate sentence is rated
    by the mean corpus frequency of its content words (``_sentence_score``)
    and the best ``max_sentences`` are returned in their original order.

    Sentences are split per text, so the end of one text is always a
    sentence boundary. Short posts without final punctuation are therefore
    ranked individually instead of being fused into one long "sentence".

    Args:
        texts: Input texts; only the first 80 are used.
        max_sentences: Number of sentences to keep.

    Returns:
        The summary, truncated to 600 characters (with a trailing ellipsis
        when truncated). For empty input a fixed placeholder message is
        returned. When fewer than two candidate sentences (longer than 20
        characters) exist, the first 300 characters of the joined texts are
        returned instead.
    """
    if not texts:
        return "Tidak ada data untuk diringkas."

    sample = texts[:80]
    combined = ' '.join(sample)

    # Split each text on its own so unrelated texts never merge into one
    # sentence; very short fragments are ignored as summary candidates.
    sentences = []
    for text in sample:
        for piece in re.split(r'(?<=[.!?])\s+', text):
            piece = piece.strip()
            if len(piece) > 20:
                sentences.append(piece)

    if len(sentences) < 2:
        return combined[:300] + ('…' if len(combined) > 300 else '')

    # Corpus-wide word frequencies used to score the sentences.
    all_words = [
        t.lower() for t in re.sub(r'[^\w\s]', ' ', combined).split()
        if len(t) > 2 and t.lower() not in _STOPWORDS_KW
    ]
    word_freq = Counter(all_words)
    total_words = len(all_words) + 1

    scores = [
        _sentence_score(sent, word_freq, total_words) for sent in sentences
    ]

    # Pick the top-scoring sentences, then restore their original order.
    top_indices = sorted(
        range(len(sentences)),
        key=scores.__getitem__,
        reverse=True,
    )[:max_sentences]
    top_indices.sort()

    summary = ' '.join(sentences[i] for i in top_indices)
    return summary[:600] + ('…' if len(summary) > 600 else '')
def summarize_by_platform(result_data: list, max_sentences: int = 2) -> dict:
    """Summarize texts per source platform, plus one overall summary.

    Args:
        result_data: Records that are read via ``.get('text')`` and
            ``.get('source')``; records with an empty or missing text are
            skipped and a missing source is grouped under "unknown".
        max_sentences: Sentences per platform summary; the overall summary
            uses one more.

    Returns:
        ``{platform: {'summary', 'text_count'}}`` for every platform that
        has at least one text, plus the same structure under the key
        ``'_overall'`` covering all texts.
    """
    grouped = {}
    for record in result_data:
        origin = record.get('source', 'unknown')
        body = record.get('text', '')
        if not body:
            continue
        grouped.setdefault(origin, []).append(body)

    report = {
        origin: {
            'summary': summarize_texts(bodies, max_sentences),
            'text_count': len(bodies),
        }
        for origin, bodies in grouped.items()
    }

    # Overall summary across every platform.
    everything = [
        record.get('text', '')
        for record in result_data
        if record.get('text')
    ]
    report['_overall'] = {
        'summary': summarize_texts(everything, max_sentences + 1),
        'text_count': len(everything),
    }
    return report