Spaces:
Sleeping
Sleeping
| """ | |
services/advanced_nlp.py
Kumpulan fitur NLP lanjutan:
1. Stance Detection — favor / against / neutral terhadap isu
2. Emotion Detection — 6 emosi dasar (Ekman)
3. Keyword Extraction — frasa kunci multi-kata (YAKE-style)
4. Summarization — ringkasan otomatis per platform
| """ | |
| import re | |
| import math | |
| from collections import Counter, defaultdict | |
| from typing import Optional | |
# ───────────────────────────────────────────────────────────────
# 1. STANCE DETECTION
# ───────────────────────────────────────────────────────────────
| """ | |
| Stance = posisi penutur terhadap isu/target tertentu. | |
| Berbeda dari sentimen: | |
| - Sentimen: "saya senang" (tentang perasaan penulis) | |
| - Stance: "saya mendukung kebijakan X" (tentang posisi terhadap isu) | |
| """ | |
# Keyword lexicons for stance scoring. Every entry is lowercase because the
# input is lowercased before matching, and every entry is matched as a plain
# substring (so 'dukung' also hits 'didukung' and 'mendukungnya').
_FAVOR_KW = {
    # Explicit support
    'setuju', 'mendukung', 'dukung', 'pro', 'favor', 'support', 'approve',
    'sepakat', 'sependapat', 'membenarkan', 'memihak', 'memilih',
    'cocok', 'tepat', 'benar', 'bagus', 'bagus sekali', 'mantap', 'keren',
    'lanjutkan', 'teruskan', 'pertahankan', 'jaga', 'jaga terus',
    'saya pilih', 'kami pilih', 'kita pilih', 'pilih',
    'setia', 'percaya', 'yakin', 'optimis', 'harapan', 'semangat',
    'seharusnya', 'mestinya', 'perlu dilanjutkan', 'harus dilanjutkan',
    'berhasil', 'sukses', 'terbukti', 'efektif', 'efisien',
}
_AGAINST_KW = {
    # Explicit rejection
    'menolak', 'tolak', 'against', 'oppose', 'tidak setuju', 'tidak sepakat',
    'kontra', 'anti', 'lawan', 'melawan', 'protes', 'demo', 'kritik', 'mengkritik',
    'menyalahkan', 'menyerang', 'menghujat', 'mencaci', 'memaki',
    'tidak benar', 'salah', 'keliru', 'bohong', 'tipu', 'menipu', 'palsu',
    'tidak berhasil', 'gagal', 'kacau', 'rusak', 'hancur', 'bubar',
    'cabut', 'copot', 'mundur', 'turun', 'lengser', 'ganti', 'ganti presiden',
    'percuma', 'sia-sia', 'tidak ada gunanya', 'tidak efektif', 'lambat',
    'mengecewakan', 'kecewa', 'gagal janji', 'ingkar', 'bohong janji',
    'korupsi', 'korup', 'nepotisme', 'kronisme', 'pungli', 'mark up',
}
_STANCE_NEUTRAL_KW = {
    'mungkin', 'bisa jadi', 'entah', 'tidak tahu', 'belum tahu',
    'perlu dikaji', 'perlu dilihat', 'tergantung', 'kondisional',
    'sebagian', 'beberapa', 'ada yang', 'ada juga',
}

# Matches any against-keyword. Alternatives are ordered longest first so that
# a phrase such as 'tidak setuju' is consumed whole instead of leaving a
# favor keyword ('setuju') behind.
_AGAINST_MASK_RE = re.compile(
    '|'.join(
        re.escape(kw)
        for kw in sorted(_AGAINST_KW, key=lambda kw: (-len(kw), kw))
    )
)


def detect_stance(text: str, target: Optional[str] = None) -> dict:
    """Classify the stance of a single text as Favor, Against or Neutral.

    Each score is the number of distinct lexicon keywords found in the
    lowercased text by substring matching. Against matches are blanked out
    before favor keywords are counted, so a negated phrase ("tidak setuju")
    or an against word that merely contains a favor keyword ("protes"
    contains "pro") no longer adds to the favor score.

    Args:
        text: The text to classify.
        target: Topic/issue the stance refers to. Accepted for interface
            compatibility; it does not influence the scoring.

    Returns:
        A dict with keys ``stance``, ``confidence``, ``favor_score``,
        ``against_score`` and ``neutral_score``.
    """
    lower = text.lower()

    against_score = sum(1 for kw in _AGAINST_KW if kw in lower)
    neutral_score = sum(1 for kw in _STANCE_NEUTRAL_KW if kw in lower)

    # Count favor keywords only in the parts of the text that were not
    # already claimed by an against keyword/phrase.
    favor_text = _AGAINST_MASK_RE.sub(' ', lower)
    favor_score = sum(1 for kw in _FAVOR_KW if kw in favor_text)

    # The 0.1 offset keeps the denominator non-zero when nothing matched.
    total = favor_score + against_score + neutral_score + 0.1

    if favor_score > against_score and favor_score > neutral_score:
        stance = "Favor"
        confidence = round(favor_score / total, 3)
    elif against_score > favor_score and against_score > neutral_score:
        stance = "Against"
        confidence = round(against_score / total, 3)
    else:
        # Ties and no-match texts fall through to Neutral. With no matches
        # at all this evaluates to 0.3 / 0.1 and is clamped to 0.95 below.
        stance = "Neutral"
        confidence = round(max(neutral_score, 0.3) / total, 3)

    confidence = min(confidence, 0.95)

    return {
        'stance': stance,
        'confidence': confidence,
        'favor_score': favor_score,
        'against_score': against_score,
        'neutral_score': neutral_score,
    }
def analyze_stance(
    texts: list[str],
    target: Optional[str] = None,
    *,
    max_texts: int = 100,
    max_results: int = 20,
) -> dict:
    """Aggregate stance classification over a list of texts.

    Args:
        texts: Texts to classify; only the first ``max_texts`` are processed.
        target: Topic/issue label, passed through to ``detect_stance`` and
            echoed in the result ('general' when not given).
        max_texts: Maximum number of texts to classify.
        max_results: Maximum number of per-text results included in the
            returned ``per_text`` list.

    Returns:
        A dict with ``per_text`` (each entry holds the text truncated to 80
        characters plus the ``detect_stance`` result), ``counts``, the three
        percentage fields, ``dominant`` and ``target``. Percentages are
        relative to the number of texts actually classified. For empty input
        ``dominant`` is 'Neutral'; on a tie the first label in the order
        Favor, Against, Neutral wins.
    """
    results = []
    counts = {'Favor': 0, 'Against': 0, 'Neutral': 0}

    for text in texts[:max_texts]:
        verdict = detect_stance(text, target)
        results.append({'text': text[:80], **verdict})
        counts[verdict['stance']] += 1

    # Avoid dividing by zero when nothing was classified.
    total = len(results) or 1

    # With no data every count is zero and max() would return the first key
    # ('Favor'); report 'Neutral' instead of a stance nobody expressed.
    dominant = max(counts, key=counts.get) if results else 'Neutral'

    return {
        'per_text': results[:max_results],
        'counts': counts,
        'favor_pct': round(counts['Favor'] / total * 100, 1),
        'against_pct': round(counts['Against'] / total * 100, 1),
        'neutral_pct': round(counts['Neutral'] / total * 100, 1),
        'dominant': dominant,
        'target': target or 'general',
    }
# ───────────────────────────────────────────────────────────────
# 2. EMOTION DETECTION (6 Basic Emotions - Ekman)
# ───────────────────────────────────────────────────────────────
# Emotion -> lowercase keywords/phrases, matched as substrings of the
# lowercased text. Dict order matters: on a tie, the emotion listed first wins.
_EMOTION_LEXICON = {
    'joy': {
        'senang', 'bahagia', 'gembira', 'sukacita', 'riang', 'ceria', 'suka',
        'bangga', 'puas', 'lega', 'syukur', 'terima kasih', 'happy', 'joy',
        'excited', 'wonderful', 'amazing', 'love', 'great', 'glad', 'cheerful',
        'mantap', 'keren', 'bagus', 'asyik', 'seru', 'menyenangkan', 'enjoy',
    },
    'anger': {
        'marah', 'murka', 'berang', 'gusar', 'geram', 'sebal', 'kesal', 'jengkel',
        'benci', 'muak', 'dongkol', 'naik darah', 'emosi', 'emosional',
        'angry', 'furious', 'rage', 'hate', 'disgust', 'annoyed', 'frustrated',
        'bodoh', 'tolol', 'goblok', 'kampungan', 'bajingan', 'brengsek',
    },
    'sadness': {
        'sedih', 'duka', 'susah', 'pilu', 'sendu', 'murung', 'galau', 'nestapa',
        'menangis', 'nangis', 'air mata', 'hati hancur', 'patah hati',
        'sad', 'cry', 'crying', 'tears', 'heartbreak', 'depressed', 'grief',
        'kehilangan', 'ditinggal', 'pergi', 'wafat', 'meninggal', 'almarhum',
    },
    'fear': {
        'takut', 'khawatir', 'cemas', 'was-was', 'gelisah', 'ngeri', 'horor',
        'panik', 'syok', 'terkejut', 'kaget', 'tercengang',
        'afraid', 'fear', 'scary', 'horror', 'panic', 'worried', 'anxious',
        'bahaya', 'berbahaya', 'ancaman', 'mengancam', 'waspada', 'hati-hati',
    },
    'surprise': {
        'terkejut', 'kaget', 'tercengang', 'heran', 'kagum', 'takjub', 'wow',
        'tidak menyangka', 'tidak menduga', 'tiba-tiba', 'mendadak',
        'surprised', 'shocked', 'amazed', 'astonished', 'unexpected',
        'luar biasa', 'tidak terduga', 'spontan',
    },
    'disgust': {
        'jijik', 'muak', 'mual', 'eneg', 'benci', 'tidak suka', 'antipati',
        'disgusting', 'gross', 'horrible', 'nasty', 'revolting', 'awful',
        'kotor', 'najis', 'busuk', 'bau', 'tidak pantas', 'menjijikkan',
        'korup', 'munafik', 'hipokrit', 'pembohong',
    },
}


def _build_shadow_patterns(lexicon: dict) -> dict:
    """Map (emotion, keyword) to a regex of longer phrases that shadow it.

    A keyword is shadowed when a *different* emotion lists a longer phrase
    containing it (joy 'suka' inside disgust 'tidak suka'). Occurrences of
    the longer phrase must not be credited to the shorter keyword's emotion.
    """
    patterns = {}
    for emotion, keywords in lexicon.items():
        for kw in keywords:
            longer = {
                phrase
                for other, phrases in lexicon.items()
                if other != emotion
                for phrase in phrases
                if kw in phrase and phrase != kw
            }
            if longer:
                ordered = sorted(longer, key=lambda p: (-len(p), p))
                patterns[(emotion, kw)] = re.compile(
                    '|'.join(re.escape(p) for p in ordered)
                )
    return patterns


_EMOTION_SHADOWS = _build_shadow_patterns(_EMOTION_LEXICON)


def detect_emotion(text: str) -> dict:
    """Detect the dominant emotion of a single text.

    Each emotion's raw score is the number of its lexicon keywords found in
    the lowercased text by substring matching. A keyword is not counted where
    it only occurs inside a longer phrase belonging to another emotion, so
    "tidak suka" counts for disgust but not for joy.

    Args:
        text: The text to analyse.

    Returns:
        A dict with ``dominant_emotion`` ('neutral' when nothing matched),
        ``scores`` (each emotion's share of all hits), ``raw_scores`` (hit
        counts), ``confidence`` (share of the dominant emotion) and
        ``is_emotional``. The same keys are present in both the matched and
        the no-match result.
    """
    lower = text.lower()

    raw_scores = {}
    for emotion, keywords in _EMOTION_LEXICON.items():
        hits = 0
        for kw in keywords:
            shadow = _EMOTION_SHADOWS.get((emotion, kw))
            # Blank out longer other-emotion phrases before looking for kw.
            haystack = shadow.sub(' ', lower) if shadow else lower
            if kw in haystack:
                hits += 1
        raw_scores[emotion] = hits

    total = sum(raw_scores.values())
    if total == 0:
        return {
            'dominant_emotion': 'neutral',
            'scores': {e: 0 for e in _EMOTION_LEXICON},
            'raw_scores': raw_scores,
            'confidence': 0.0,
            'is_emotional': False,
        }

    # max() returns the first emotion in lexicon order when scores tie.
    dominant = max(raw_scores, key=raw_scores.get)

    return {
        'dominant_emotion': dominant,
        'scores': {e: round(s / total, 3) for e, s in raw_scores.items()},
        'raw_scores': raw_scores,
        'confidence': round(raw_scores[dominant] / total, 3),
        'is_emotional': True,
    }
def analyze_emotions(
    texts: list[str],
    *,
    max_texts: int = 100,
    max_results: int = 15,
) -> dict:
    """Aggregate the emotion distribution over a list of texts.

    Args:
        texts: Texts to analyse; only the first ``max_texts`` are processed.
        max_texts: Maximum number of texts to classify.
        max_results: Maximum number of per-text results included in the
            returned ``per_text`` list.

    Returns:
        A dict with ``per_text`` (each entry holds the text truncated to 80
        characters plus the ``detect_emotion`` result), ``distribution``
        (``{emotion: {'count', 'pct'}}`` including 'neutral'), ``dominant``
        and ``emotional_pct`` (share of non-neutral texts). Percentages are
        relative to the number of texts actually classified. For empty input
        ``dominant`` is 'neutral'.
    """
    emotion_counts = {e: 0 for e in _EMOTION_LEXICON}
    emotion_counts['neutral'] = 0

    per_text = []
    for text in texts[:max_texts]:
        result = detect_emotion(text)
        per_text.append({'text': text[:80], **result})
        label = result['dominant_emotion']
        # Any label outside the known set is bucketed as neutral.
        emotion_counts[label if label in emotion_counts else 'neutral'] += 1

    # Use the number of classified texts as the denominator: counts never
    # exceed max_texts, so dividing by len(texts) would understate every
    # percentage for longer inputs.
    total = len(per_text) or 1

    # With no data every count is zero and max() would return the first key
    # ('joy'); report 'neutral' instead.
    dominant = max(emotion_counts, key=emotion_counts.get) if per_text else 'neutral'

    distribution = {
        emotion: {'count': count, 'pct': round(count / total * 100, 1)}
        for emotion, count in emotion_counts.items()
    }
    emotional = sum(
        count for emotion, count in emotion_counts.items() if emotion != 'neutral'
    )

    return {
        'per_text': per_text[:max_results],
        'distribution': distribution,
        'dominant': dominant,
        'emotional_pct': round(emotional / total * 100, 1),
    }
# ───────────────────────────────────────────────────────────────
# 3. KEYWORD/PHRASE EXTRACTION (YAKE-inspired)
# ───────────────────────────────────────────────────────────────
| _STOPWORDS_KW = { | |
| 'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah', | |
| 'ada','pada','juga','tidak','bisa','sudah','saya','kamu','mereka', | |
| 'kita','ya','jadi','kalau','tapi','atau','karena','sangat','banget', | |
| 'the','is','in','of','a','an','and','it','for','that','this', | |
| 'was','are','be','has','have','to','do','we','i','you','he','she', | |
| } | |
| def _tokenize_sentences(text: str) -> list[list[str]]: | |
| """Split ke kalimat lalu tokenisasi.""" | |
| sentences = re.split(r'[.!?;]', text) | |
| result = [] | |
| for sent in sentences: | |
| tokens = [ | |
| t.lower() for t in re.sub(r'[^\w\s]', ' ', sent).split() | |
| if len(t) > 2 and t.lower() not in _STOPWORDS_KW | |
| ] | |
| if tokens: | |
| result.append(tokens) | |
| return result | |
| def extract_keywords(texts: list[str], top_n: int = 20) -> list[dict]: | |
| """ | |
| Ekstrak kata kunci dan frasa kunci menggunakan pendekatan TF-IDF | |
| yang dimodifikasi dengan co-occurrence scoring. | |
| Return: list of {phrase, score, frequency, type} | |
| """ | |
| # Kumpulkan semua teks | |
| combined = ' '.join(texts[:100]) | |
| # Unigram frequency | |
| all_tokens = [ | |
| t.lower() for t in re.sub(r'[^\w\s]', ' ', combined).split() | |
| if len(t) > 2 and t.lower() not in _STOPWORDS_KW | |
| ] | |
| tf = Counter(all_tokens) | |
| total_tokens = len(all_tokens) + 1 | |
| # Bigram extraction | |
| bigrams = [] | |
| for i in range(len(all_tokens) - 1): | |
| bg = f"{all_tokens[i]} {all_tokens[i+1]}" | |
| bigrams.append(bg) | |
| tf_bigrams = Counter(bigrams) | |
| # Trigram extraction | |
| trigrams = [] | |
| for i in range(len(all_tokens) - 2): | |
| tg = f"{all_tokens[i]} {all_tokens[i+1]} {all_tokens[i+2]}" | |
| trigrams.append(tg) | |
| tf_trigrams = Counter(trigrams) | |
| # Score = freq * log(1 + freq) / total (normalized TF) | |
| keywords = [] | |
| # Unigrams | |
| for word, freq in tf.most_common(30): | |
| if freq >= 2: | |
| score = (freq / total_tokens) * math.log(1 + freq) | |
| keywords.append({ | |
| 'phrase': word, | |
| 'score': round(score, 5), | |
| 'frequency': freq, | |
| 'type': 'word', | |
| }) | |
| # Bigrams (higher score multiplier karena lebih informatif) | |
| for phrase, freq in tf_bigrams.most_common(20): | |
| if freq >= 2: | |
| score = (freq / total_tokens) * math.log(1 + freq) * 1.5 | |
| keywords.append({ | |
| 'phrase': phrase, | |
| 'score': round(score, 5), | |
| 'frequency': freq, | |
| 'type': 'phrase', | |
| }) | |
| # Trigrams | |
| for phrase, freq in tf_trigrams.most_common(10): | |
| if freq >= 2: | |
| score = (freq / total_tokens) * math.log(1 + freq) * 2.0 | |
| keywords.append({ | |
| 'phrase': phrase, | |
| 'score': round(score, 5), | |
| 'frequency': freq, | |
| 'type': 'multi-phrase', | |
| }) | |
| # Sort by score, deduplikasi | |
| keywords.sort(key=lambda x: x['score'], reverse=True) | |
| # Hapus yang redundan (kata yang sudah ada di phrase lebih panjang) | |
| seen_words = set() | |
| filtered = [] | |
| for kw in keywords: | |
| words_in_phrase = set(kw['phrase'].split()) | |
| if not any(w in seen_words for w in words_in_phrase): | |
| filtered.append(kw) | |
| seen_words.update(words_in_phrase) | |
| if len(filtered) >= top_n: | |
| break | |
| return filtered | |
# ───────────────────────────────────────────────────────────────
# 4. SUMMARIZATION
# ───────────────────────────────────────────────────────────────
def _sentence_score(sentence: str, word_freq: Counter, total_words: int) -> float:
    """Return the mean corpus frequency of a sentence's content words.

    Punctuation is replaced by spaces; words of two characters or fewer and
    stopwords are ignored. A sentence without any content word scores 0.0.
    ``total_words`` is accepted but not used in the computation.
    """
    cleaned = re.sub(r'[^\w\s]', ' ', sentence)

    weights = []
    for raw in cleaned.split():
        word = raw.lower()
        if len(raw) <= 2 or word in _STOPWORDS_KW:
            continue
        weights.append(word_freq.get(word, 0))

    if not weights:
        return 0.0
    return sum(weights) / len(weights)
def summarize_texts(texts: list[str], max_sentences: int = 3) -> str:
    """Build an extractive summary from a list of texts.

    The first 80 texts are split into sentences, each text on its own so that
    texts without closing punctuation are not fused with the next one.
    Sentences of 20 characters or fewer are discarded. Every remaining
    sentence is scored by the mean frequency of its content words (see
    ``_sentence_score``); the highest-scoring sentences are returned in their
    original order.

    Args:
        texts: Input texts.
        max_sentences: Number of sentences to keep; negative values are
            treated as 0.

    Returns:
        The summary, truncated to 600 characters plus an ellipsis if longer.
        If fewer than two usable sentences exist, the first 300 characters of
        the joined texts are returned instead (plus an ellipsis if truncated).
        For empty input a fixed placeholder message is returned.
    """
    if not texts:
        return "Tidak ada data untuk diringkas."

    ellipsis = '\u2026'
    selected = texts[:80]
    combined = ' '.join(selected)

    # Split after sentence-ending punctuation, one text at a time.
    sentences = []
    for text in selected:
        for piece in re.split(r'(?<=[.!?])\s+', text):
            piece = piece.strip()
            if len(piece) > 20:
                sentences.append(piece)

    if len(sentences) < 2:
        return combined[:300] + (ellipsis if len(combined) > 300 else '')

    # Corpus-wide content-word frequencies used for scoring.
    all_words = [
        raw.lower()
        for raw in re.sub(r'[^\w\s]', ' ', combined).split()
        if len(raw) > 2 and raw.lower() not in _STOPWORDS_KW
    ]
    word_freq = Counter(all_words)
    total_words = len(all_words) + 1

    scores = [_sentence_score(s, word_freq, total_words) for s in sentences]

    # A negative limit would slice from the end; clamp it to zero.
    limit = max(max_sentences, 0)
    top_indices = sorted(
        range(len(sentences)),
        key=lambda i: scores[i],
        reverse=True,
    )[:limit]
    top_indices.sort()  # restore original sentence order

    summary = ' '.join(sentences[i] for i in top_indices)
    return summary[:600] + (ellipsis if len(summary) > 600 else '')
def summarize_by_platform(result_data: list, max_sentences: int = 2) -> dict:
    """Build one summary per platform plus an overall summary.

    Args:
        result_data: Records read via ``.get('text')`` and ``.get('source')``.
            Records without text are skipped. A missing, None or empty source
            is grouped under 'unknown'.
        max_sentences: Sentences per platform summary; the overall summary
            uses one more.

    Returns:
        ``{platform: {'summary', 'text_count'}}`` with an additional
        '_overall' entry covering all texts.
    """
    by_platform = defaultdict(list)
    all_texts = []

    for record in result_data:
        text = record.get('text', '')
        if not text:
            continue
        # `or` also covers a source that is present but None/empty, which
        # would otherwise become its own None/'' platform key.
        platform = record.get('source') or 'unknown'
        by_platform[platform].append(text)
        all_texts.append(text)

    summaries = {
        platform: {
            'summary': summarize_texts(texts, max_sentences),
            'text_count': len(texts),
        }
        for platform, texts in by_platform.items()
    }

    # Overall summary across every platform.
    summaries['_overall'] = {
        'summary': summarize_texts(all_texts, max_sentences + 1),
        'text_count': len(all_texts),
    }
    return summaries