"""
services/advanced_nlp.py

Advanced NLP features:
1. Stance Detection   — favor / against / neutral toward an issue
2. Emotion Detection  — 6 basic emotions (Ekman)
3. Keyword Extraction — multi-word key phrases (YAKE-style)
4. Summarization      — automatic extractive summaries per platform
"""
import re
import math
from collections import Counter, defaultdict
from typing import Optional

# ═══════════════════════════════════════════════════════════════
# 1. STANCE DETECTION
# ═══════════════════════════════════════════════════════════════
# Stance = the speaker's position toward a specific issue/target.
# This differs from sentiment:
# - Sentiment: "I am happy"         (about the writer's feelings)
# - Stance:    "I support policy X" (about a position on an issue)
#
# Detection is keyword/substring based: each lexicon below is matched
# as a substring of the lowercased text, so e.g. 'tolak' also fires
# inside 'menolak'. That is intentional (cheap stemming).

_FAVOR_KW = {
    # Explicit support
    'setuju', 'mendukung', 'dukung', 'pro', 'favor', 'support', 'approve',
    'sepakat', 'sependapat', 'membenarkan', 'memihak', 'memilih',
    'cocok', 'tepat', 'benar', 'bagus', 'bagus sekali', 'mantap', 'keren',
    'lanjutkan', 'teruskan', 'pertahankan', 'jaga', 'jaga terus',
    'saya pilih', 'kami pilih', 'kita pilih', 'pilih',
    'setia', 'percaya', 'yakin', 'optimis', 'harapan', 'semangat',
    'seharusnya', 'mestinya', 'perlu dilanjutkan', 'harus dilanjutkan',
    'berhasil', 'sukses', 'terbukti', 'efektif', 'efisien',
}

_AGAINST_KW = {
    # Explicit rejection
    'menolak', 'tolak', 'against', 'oppose', 'tidak setuju', 'tidak sepakat',
    'kontra', 'anti', 'lawan', 'melawan', 'protes', 'demo', 'kritik', 'mengkritik',
    'menyalahkan', 'menyerang', 'menghujat', 'mencaci', 'memaki',
    'tidak benar', 'salah', 'keliru', 'bohong', 'tipu', 'menipu', 'palsu',
    'tidak berhasil', 'gagal', 'kacau', 'rusak', 'hancur', 'bubar',
    'cabut', 'copot', 'mundur', 'turun', 'lengser', 'ganti', 'ganti presiden',
    'percuma', 'sia-sia', 'tidak ada gunanya', 'tidak efektif', 'lambat',
    'mengecewakan', 'kecewa', 'gagal janji', 'ingkar', 'bohong janji',
    'korupsi', 'korup', 'nepotisme', 'kronisme', 'pungli', 'mark up',
}

_STANCE_NEUTRAL_KW = {
    'mungkin', 'bisa jadi', 'entah', 'tidak tahu', 'belum tahu',
    'perlu dikaji', 'perlu dilihat', 'tergantung', 'kondisional',
    'sebagian', 'beberapa', 'ada yang', 'ada juga',
}


def detect_stance(text: str, target: Optional[str] = None) -> dict:
    """
    Detect the stance expressed by a single text.

    Args:
        text: input text (any language; lexicons cover Indonesian/English).
        target: topic/issue the stance is about. Currently unused —
            reserved so callers can already pass context.

    Returns:
        dict with keys: 'stance' ("Favor"/"Against"/"Neutral"),
        'confidence' (0..0.95), 'favor_score', 'against_score',
        'neutral_score' (raw keyword-hit counts).
    """
    lower = text.lower()

    # Count lexicon hits via substring matching.
    favor_score = sum(1 for k in _FAVOR_KW if k in lower)
    against_score = sum(1 for k in _AGAINST_KW if k in lower)
    neutral_score = sum(1 for k in _STANCE_NEUTRAL_KW if k in lower)

    # +0.1 avoids division by zero when no keyword matched at all.
    total = favor_score + against_score + neutral_score + 0.1

    if favor_score > against_score and favor_score > neutral_score:
        stance = "Favor"
        confidence = round(favor_score / total, 3)
    elif against_score > favor_score and against_score > neutral_score:
        stance = "Against"
        confidence = round(against_score / total, 3)
    else:
        stance = "Neutral"
        # Floor of 0.3 keeps neutral confidence from collapsing to ~0
        # when there were no neutral-keyword hits.
        confidence = round(max(neutral_score, 0.3) / total, 3)

    # Never claim full certainty from a keyword heuristic.
    confidence = min(confidence, 0.95)

    return {
        'stance': stance,
        'confidence': confidence,
        'favor_score': favor_score,
        'against_score': against_score,
        'neutral_score': neutral_score,
    }


def analyze_stance(texts: list[str], target: Optional[str] = None) -> dict:
    """
    Aggregate stance over a list of texts (only the first 100 are analyzed).

    Args:
        texts: input texts.
        target: optional topic label, echoed back in the result.

    Returns:
        dict with 'per_text' (first 20 detailed results, text truncated to
        80 chars), 'counts', percentage breakdown, 'dominant' stance, and
        'target' ('general' when none was given).
    """
    results = []
    counts = {'Favor': 0, 'Against': 0, 'Neutral': 0}

    for text in texts[:100]:
        r = detect_stance(text, target)
        results.append({'text': text[:80], **r})
        counts[r['stance']] += 1

    # `or 1` guards the percentage division for empty input.
    total = len(results) or 1
    dominant = max(counts, key=counts.get)

    return {
        'per_text': results[:20],
        'counts': counts,
        'favor_pct': round(counts['Favor'] / total * 100, 1),
        'against_pct': round(counts['Against'] / total * 100, 1),
        'neutral_pct': round(counts['Neutral'] / total * 100, 1),
        'dominant': dominant,
        'target': target or 'general',
    }
# ═══════════════════════════════════════════════════════════════
# 2. EMOTION DETECTION (6 Basic Emotions - Ekman)
# ═══════════════════════════════════════════════════════════════
# Substring-based lexicon matching, same approach as stance detection:
# each keyword is tested with `in` against the lowercased text.

_EMOTION_LEXICON = {
    'joy': {
        'senang', 'bahagia', 'gembira', 'sukacita', 'riang', 'ceria', 'suka',
        'bangga', 'puas', 'lega', 'syukur', 'terima kasih', 'happy', 'joy',
        'excited', 'wonderful', 'amazing', 'love', 'great', 'glad', 'cheerful',
        'mantap', 'keren', 'bagus', 'asyik', 'seru', 'menyenangkan', 'enjoy',
    },
    'anger': {
        'marah', 'murka', 'berang', 'gusar', 'geram', 'sebal', 'kesal', 'jengkel',
        'benci', 'muak', 'dongkol', 'naik darah', 'emosi', 'emosional',
        'angry', 'furious', 'rage', 'hate', 'disgust', 'annoyed', 'frustrated',
        'bodoh', 'tolol', 'goblok', 'kampungan', 'bajingan', 'brengsek',
    },
    'sadness': {
        'sedih', 'duka', 'susah', 'pilu', 'sendu', 'murung', 'galau', 'nestapa',
        'menangis', 'nangis', 'air mata', 'hati hancur', 'patah hati',
        'sad', 'cry', 'crying', 'tears', 'heartbreak', 'depressed', 'grief',
        'kehilangan', 'ditinggal', 'pergi', 'wafat', 'meninggal', 'almarhum',
    },
    'fear': {
        'takut', 'khawatir', 'cemas', 'was-was', 'gelisah', 'ngeri', 'horor',
        'panik', 'syok', 'terkejut', 'kaget', 'tercengang',
        'afraid', 'fear', 'scary', 'horror', 'panic', 'worried', 'anxious',
        'bahaya', 'berbahaya', 'ancaman', 'mengancam', 'waspada', 'hati-hati',
    },
    'surprise': {
        'terkejut', 'kaget', 'tercengang', 'heran', 'kagum', 'takjub', 'wow',
        'tidak menyangka', 'tidak menduga', 'tiba-tiba', 'mendadak',
        'surprised', 'shocked', 'amazed', 'astonished', 'unexpected',
        'luar biasa', 'tidak terduga', 'spontan',
    },
    'disgust': {
        'jijik', 'muak', 'mual', 'eneg', 'benci', 'tidak suka', 'antipati',
        'disgusting', 'gross', 'horrible', 'nasty', 'revolting', 'awful',
        'kotor', 'najis', 'busuk', 'bau', 'tidak pantas', 'menjijikkan',
        'korup', 'munafik', 'hipokrit', 'pembohong',
    },
}


def detect_emotion(text: str) -> dict:
    """
    Detect the emotion expressed by a single text.

    Args:
        text: input text.

    Returns:
        dict with 'dominant_emotion' (one of the lexicon keys, or
        'neutral' when nothing matched), 'scores' (per-emotion proportions
        summing to ~1.0, or all zeros for neutral), 'raw_scores' (hit
        counts; absent in the neutral case), 'confidence' (dominant
        proportion), and 'is_emotional' (whether any keyword matched).
    """
    lower = text.lower()
    scores = {}
    for emotion, keywords in _EMOTION_LEXICON.items():
        scores[emotion] = sum(1 for kw in keywords if kw in lower)

    total = sum(scores.values())
    if total == 0:
        # No lexicon hit at all — report neutral with zeroed scores.
        return {
            'dominant_emotion': 'neutral',
            'scores': {e: 0 for e in _EMOTION_LEXICON},
            'confidence': 0.0,
            'is_emotional': False,
        }

    dominant = max(scores, key=scores.get)
    confidence = round(scores[dominant] / total, 3)
    # Normalize hit counts to proportions of all hits.
    norm_scores = {e: round(s / total, 3) for e, s in scores.items()}

    return {
        'dominant_emotion': dominant,
        'scores': norm_scores,
        'raw_scores': scores,
        'confidence': confidence,
        'is_emotional': True,  # total > 0 is guaranteed past the early return
    }


def analyze_emotions(texts: list[str]) -> dict:
    """
    Aggregate the emotion distribution over a list of texts.

    Only the first 100 texts are classified; percentages are computed over
    the number actually processed (previously they were divided by the full
    input length, which skewed every percentage for inputs > 100 texts).

    Returns:
        dict with 'per_text' (first 15 detailed results, text truncated to
        80 chars), 'distribution' ({emotion: {count, pct}} including
        'neutral'), 'dominant' emotion, and 'emotional_pct' (share of texts
        with any emotion detected).
    """
    emotion_counts = {e: 0 for e in _EMOTION_LEXICON}
    emotion_counts['neutral'] = 0
    per_text = []

    processed = texts[:100]
    for text in processed:
        r = detect_emotion(text)
        per_text.append({'text': text[:80], **r})
        if r['dominant_emotion'] in emotion_counts:
            emotion_counts[r['dominant_emotion']] += 1
        else:
            emotion_counts['neutral'] += 1

    # Divide by the number of texts actually classified; `or 1` guards
    # the empty-input case.
    total = len(processed) or 1
    dominant = max(emotion_counts, key=emotion_counts.get)

    distribution = {
        e: {
            'count': c,
            'pct': round(c / total * 100, 1),
        }
        for e, c in emotion_counts.items()
    }

    return {
        'per_text': per_text[:15],
        'distribution': distribution,
        'dominant': dominant,
        'emotional_pct': round(
            sum(c for e, c in emotion_counts.items() if e != 'neutral') / total * 100, 1
        ),
    }
KEYWORD/PHRASE EXTRACTION (YAKE-inspired) # ═══════════════════════════════════════════════════════════════ _STOPWORDS_KW = { 'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah', 'ada','pada','juga','tidak','bisa','sudah','saya','kamu','mereka', 'kita','ya','jadi','kalau','tapi','atau','karena','sangat','banget', 'the','is','in','of','a','an','and','it','for','that','this', 'was','are','be','has','have','to','do','we','i','you','he','she', } def _tokenize_sentences(text: str) -> list[list[str]]: """Split ke kalimat lalu tokenisasi.""" sentences = re.split(r'[.!?;]', text) result = [] for sent in sentences: tokens = [ t.lower() for t in re.sub(r'[^\w\s]', ' ', sent).split() if len(t) > 2 and t.lower() not in _STOPWORDS_KW ] if tokens: result.append(tokens) return result def extract_keywords(texts: list[str], top_n: int = 20) -> list[dict]: """ Ekstrak kata kunci dan frasa kunci menggunakan pendekatan TF-IDF yang dimodifikasi dengan co-occurrence scoring. Return: list of {phrase, score, frequency, type} """ # Kumpulkan semua teks combined = ' '.join(texts[:100]) # Unigram frequency all_tokens = [ t.lower() for t in re.sub(r'[^\w\s]', ' ', combined).split() if len(t) > 2 and t.lower() not in _STOPWORDS_KW ] tf = Counter(all_tokens) total_tokens = len(all_tokens) + 1 # Bigram extraction bigrams = [] for i in range(len(all_tokens) - 1): bg = f"{all_tokens[i]} {all_tokens[i+1]}" bigrams.append(bg) tf_bigrams = Counter(bigrams) # Trigram extraction trigrams = [] for i in range(len(all_tokens) - 2): tg = f"{all_tokens[i]} {all_tokens[i+1]} {all_tokens[i+2]}" trigrams.append(tg) tf_trigrams = Counter(trigrams) # Score = freq * log(1 + freq) / total (normalized TF) keywords = [] # Unigrams for word, freq in tf.most_common(30): if freq >= 2: score = (freq / total_tokens) * math.log(1 + freq) keywords.append({ 'phrase': word, 'score': round(score, 5), 'frequency': freq, 'type': 'word', }) # Bigrams (higher score multiplier karena lebih informatif) for phrase, 
freq in tf_bigrams.most_common(20): if freq >= 2: score = (freq / total_tokens) * math.log(1 + freq) * 1.5 keywords.append({ 'phrase': phrase, 'score': round(score, 5), 'frequency': freq, 'type': 'phrase', }) # Trigrams for phrase, freq in tf_trigrams.most_common(10): if freq >= 2: score = (freq / total_tokens) * math.log(1 + freq) * 2.0 keywords.append({ 'phrase': phrase, 'score': round(score, 5), 'frequency': freq, 'type': 'multi-phrase', }) # Sort by score, deduplikasi keywords.sort(key=lambda x: x['score'], reverse=True) # Hapus yang redundan (kata yang sudah ada di phrase lebih panjang) seen_words = set() filtered = [] for kw in keywords: words_in_phrase = set(kw['phrase'].split()) if not any(w in seen_words for w in words_in_phrase): filtered.append(kw) seen_words.update(words_in_phrase) if len(filtered) >= top_n: break return filtered # ═══════════════════════════════════════════════════════════════ # 4. SUMMARIZATION # ═══════════════════════════════════════════════════════════════ def _sentence_score(sentence: str, word_freq: Counter, total_words: int) -> float: """Score kalimat berdasarkan TF dari kata-kata penting.""" tokens = [ t.lower() for t in re.sub(r'[^\w\s]', ' ', sentence).split() if len(t) > 2 and t.lower() not in _STOPWORDS_KW ] if not tokens: return 0.0 return sum(word_freq.get(t, 0) for t in tokens) / len(tokens) def summarize_texts(texts: list[str], max_sentences: int = 3) -> str: """ Buat ringkasan ekstraktif dari list teks. Menggunakan TextRank-inspired extractive summarization. Return: string ringkasan (2-3 kalimat terbaik) """ if not texts: return "Tidak ada data untuk diringkas." 
# Gabung semua teks combined = ' '.join(texts[:80]) # Tokenisasi kalimat sentences = re.split(r'(?<=[.!?])\s+', combined) sentences = [s.strip() for s in sentences if len(s.strip()) > 20] if len(sentences) < 2: return combined[:300] + ('…' if len(combined) > 300 else '') # Word frequency untuk scoring all_words = [ t.lower() for t in re.sub(r'[^\w\s]', ' ', combined).split() if len(t) > 2 and t.lower() not in _STOPWORDS_KW ] word_freq = Counter(all_words) total_words = len(all_words) + 1 # Score tiap kalimat scored = [ (sent, _sentence_score(sent, word_freq, total_words)) for sent in sentences ] # Ambil top-N kalimat, pertahankan urutan asli top_indices = sorted( range(len(scored)), key=lambda i: scored[i][1], reverse=True )[:max_sentences] top_indices.sort() # kembalikan ke urutan asli summary = ' '.join(scored[i][0] for i in top_indices) return summary[:600] + ('…' if len(summary) > 600 else '') def summarize_by_platform(result_data: list, max_sentences: int = 2) -> dict: """ Buat ringkasan per platform. result_data: list of {text, sentiment, source} """ by_platform = defaultdict(list) for r in result_data: src = r.get('source', 'unknown') text = r.get('text', '') if text: by_platform[src].append(text) summaries = {} for platform, texts in by_platform.items(): summaries[platform] = { 'summary': summarize_texts(texts, max_sentences), 'text_count': len(texts), } # Summary keseluruhan all_texts = [r.get('text','') for r in result_data if r.get('text')] summaries['_overall'] = { 'summary': summarize_texts(all_texts, max_sentences + 1), 'text_count': len(all_texts), } return summaries