noranisa committed on
Commit
23ac9a8
Β·
verified Β·
1 Parent(s): 4dab816

Create services/absa.py

Browse files
Files changed (1) hide show
  1. services/absa.py +275 -0
services/absa.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ services/absa.py
3
+ Aspect-Based Sentiment Analysis (ABSA) untuk Bahasa Indonesia.
4
+
5
+ Pendekatan:
6
+ 1. Ekstrak aspek dari teks menggunakan lexicon + dependency pattern
7
+ 2. Tentukan sentimen per aspek menggunakan window context
8
+ 3. Agregasi hasil per kategori aspek
9
+
10
+ Kategori aspek yang didukung (domain-agnostic):
11
+ - harga/biaya : harga, mahal, murah, biaya, tarif, ongkos
12
+ - kualitas/produk : kualitas, bagus, jelek, rusak, bagus, produk
13
+ - pelayanan/service : pelayanan, layanan, respon, lambat, cepat, ramah
14
+ - lokasi/tempat : lokasi, tempat, jarak, strategis, jauh, dekat
15
+ - kebijakan : kebijakan, aturan, regulasi, keputusan, program
16
+ - pemimpin/tokoh : pemimpin, presiden, gubernur, menteri, pejabat
17
+ - ekonomi : ekonomi, inflasi, harga, pendapatan, gaji, subsidi
18
+ - pendidikan : pendidikan, sekolah, kampus, belajar, kurikulum
19
+ - kesehatan : kesehatan, rumah sakit, dokter, obat, vaksin
20
+ - infrastruktur : jalan, infrastruktur, gedung, fasilitas, listrik
21
+ """
22
+
23
+ import re
24
+ from collections import defaultdict
25
+ from typing import Optional
26
+
27
+ # ─────────────────────────────────────────────
28
+ # ASPECT LEXICON
29
+ # ─────────────────────────────────────────────
30
+ ASPECT_LEXICON = {
31
+ 'harga': [
32
+ 'harga','mahal','murah','biaya','tarif','ongkos','harganya',
33
+ 'cost','price','bayar','bayaran','budget','anggaran','tagihan',
34
+ 'cicilan','kredit','diskon','promo','gratis','terjangkau'
35
+ ],
36
+ 'kualitas': [
37
+ 'kualitas','bagus','jelek','buruk','rusak','cacat','produk',
38
+ 'barang','mutu','kualiti','quality','performa','fitur','spesifikasi',
39
+ 'durable','tahan lama','awet','rapuh','boros'
40
+ ],
41
+ 'pelayanan': [
42
+ 'pelayanan','layanan','servis','service','respon','respons','lambat',
43
+ 'cepat','ramah','kasar','profesional','sopan','membantu','helpful',
44
+ 'cs','customer service','admin','operator','staff','petugas'
45
+ ],
46
+ 'lokasi': [
47
+ 'lokasi','tempat','jarak','strategis','jauh','dekat','akses',
48
+ 'parkir','alamat','wilayah','daerah','kawasan','lingkungan'
49
+ ],
50
+ 'kebijakan': [
51
+ 'kebijakan','aturan','regulasi','keputusan','program','peraturan',
52
+ 'undang','hukum','sanksi','denda','izin','prosedur','birokrasi',
53
+ 'pemerintah','pemerintahan','politik','implementasi'
54
+ ],
55
+ 'pemimpin': [
56
+ 'pemimpin','presiden','gubernur','menteri','pejabat','bupati',
57
+ 'walikota','anggota','dewan','partai','calon','kandidat','tokoh',
58
+ 'figur','kepala','direktur','ceo','pimpinan'
59
+ ],
60
+ 'ekonomi': [
61
+ 'ekonomi','inflasi','deflasi','pendapatan','gaji','upah','subsidi',
62
+ 'pajak','ekspor','impor','investasi','pertumbuhan','resesi','utang',
63
+ 'pinjaman','modal','bisnis','usaha','umkm'
64
+ ],
65
+ 'pendidikan': [
66
+ 'pendidikan','sekolah','kampus','belajar','kurikulum','guru','dosen',
67
+ 'mahasiswa','siswa','nilai','ujian','beasiswa','biaya sekolah',
68
+ 'spp','kuliah','universitas','sd','smp','sma'
69
+ ],
70
+ 'kesehatan': [
71
+ 'kesehatan','rumah sakit','dokter','obat','vaksin','rs','puskesmas',
72
+ 'bpjs','asuransi','rawat','operasi','penyakit','covid','virus',
73
+ 'faskes','apotek','tenaga medis','perawat'
74
+ ],
75
+ 'infrastruktur': [
76
+ 'jalan','infrastruktur','gedung','fasilitas','listrik','air','banjir',
77
+ 'macet','transportasi','tol','jembatan','bandar udara','pelabuhan',
78
+ 'internet','sinyal','jaringan','konstruksi'
79
+ ],
80
+ }
81
+
82
+ # ─────────────────────────────────────────────
83
+ # SENTIMENT LEXICON PER ASPECT
84
+ # ─────────────────────────────────────────────
85
+ SENTIMENT_POS = {
86
+ 'bagus','baik','bagus','mantap','keren','hebat','suka','senang','puas',
87
+ 'meningkat','naik','maju','berkembang','berhasil','sukses','bagus',
88
+ 'terjangkau','murah','gratis','ramah','cepat','tepat','profesional',
89
+ 'strategis','dekat','mudah','lancar','aman','nyaman','bersih',
90
+ 'good','great','nice','excellent','best','amazing','happy','love',
91
+ 'wonderful','perfect','outstanding','satisfied','recommended',
92
+ 'mendukung','setuju','approve','pro','positif','memuji','bangga',
93
+ }
94
+
95
+ SENTIMENT_NEG = {
96
+ 'buruk','jelek','rusak','parah','kecewa','mahal','lambat','lama',
97
+ 'susah','sulit','ribet','boros','kasar','curang','korup','gagal',
98
+ 'turun','menurun','anjlok','jatuh','krisis','masalah','bermasalah',
99
+ 'berbahaya','bahaya','mengecewakan','tidak puas','kapok',
100
+ 'bad','worst','terrible','awful','poor','horrible','hate','dislike',
101
+ 'expensive','slow','failed','disappointed','useless','waste',
102
+ 'menolak','menentang','against','kontra','negatif','mencela','kritik',
103
+ 'bohong','tipu','menipu','korupsi','tidak setuju',
104
+ }
105
+
106
+ NEGATION_WORDS = {
107
+ 'tidak','bukan','belum','tak','gak','ga','nggak','ngga','jangan',
108
+ 'no','not','never','dont',"don't",'without','tanpa',
109
+ }
110
+
111
+ INTENSIFIER_POS = {'sangat','banget','sekali','amat','luar biasa','super','paling','bgt'}
112
+ INTENSIFIER_NEG = {'kurang','agak','sedikit','hampir','nyaris'}
113
+
114
+
115
+ def _get_aspect(token: str) -> Optional[str]:
116
+ """Cari aspek untuk satu token."""
117
+ token = token.lower()
118
+ for aspect, keywords in ASPECT_LEXICON.items():
119
+ if token in keywords or any(kw in token for kw in keywords if len(kw) > 4):
120
+ return aspect
121
+ return None
122
+
123
+
124
+ def _sentiment_score_window(tokens: list, center_idx: int, window: int = 4) -> float:
125
+ """
126
+ Hitung skor sentimen dalam window Β±N kata dari posisi aspek.
127
+ Pertimbangkan negasi dan intensifier.
128
+ Return: float positif = positif, negatif = negatif, 0 = netral
129
+ """
130
+ start = max(0, center_idx - window)
131
+ end = min(len(tokens), center_idx + window + 1)
132
+ window_tokens = tokens[start:end]
133
+
134
+ score = 0.0
135
+ negated = False
136
+ intensify = 1.0
137
+
138
+ for i, tok in enumerate(window_tokens):
139
+ tl = tok.lower()
140
+ if tl in NEGATION_WORDS:
141
+ negated = True
142
+ continue
143
+ if tl in INTENSIFIER_POS:
144
+ intensify = 1.5
145
+ continue
146
+ if tl in INTENSIFIER_NEG:
147
+ intensify = 0.6
148
+ continue
149
+
150
+ if tl in SENTIMENT_POS:
151
+ s = 1.0 * intensify
152
+ score += -s if negated else s
153
+ negated = False
154
+ intensify = 1.0
155
+ elif tl in SENTIMENT_NEG:
156
+ s = -1.0 * intensify
157
+ score += -s if negated else s
158
+ negated = False
159
+ intensify = 1.0
160
+
161
+ return score
162
+
163
+
164
+ def _score_to_label(score: float) -> str:
165
+ if score > 0.3: return "Positive"
166
+ if score < -0.3: return "Negative"
167
+ return "Neutral"
168
+
169
+
170
+ def extract_aspects(text: str) -> list[dict]:
171
+ """
172
+ Ekstrak aspek dan sentimen dari satu teks.
173
+
174
+ Return: list of {aspect, sentiment, score, mention, context}
175
+ """
176
+ if not text or len(text.strip()) < 5:
177
+ return []
178
+
179
+ # Tokenisasi sederhana
180
+ clean = re.sub(r'[^\w\s]', ' ', text.lower())
181
+ tokens = clean.split()
182
+
183
+ results = []
184
+ seen_aspects = set()
185
+
186
+ for i, token in enumerate(tokens):
187
+ aspect = _get_aspect(token)
188
+ if aspect is None:
189
+ continue
190
+
191
+ # Hindari duplikat aspek dalam satu kalimat
192
+ if aspect in seen_aspects:
193
+ continue
194
+ seen_aspects.add(aspect)
195
+
196
+ score = _sentiment_score_window(tokens, i)
197
+ label = _score_to_label(score)
198
+
199
+ # Context window untuk display
200
+ start = max(0, i - 3)
201
+ end = min(len(tokens), i + 4)
202
+ context = ' '.join(tokens[start:end])
203
+
204
+ results.append({
205
+ 'aspect': aspect,
206
+ 'sentiment': label,
207
+ 'score': round(score, 3),
208
+ 'mention': token,
209
+ 'context': context,
210
+ })
211
+
212
+ return results
213
+
214
+
215
+ def analyze_absa(texts: list[str]) -> dict:
216
+ """
217
+ Jalankan ABSA pada list teks.
218
+
219
+ Return:
220
+ {
221
+ 'per_text': list of per-text results,
222
+ 'aggregate': {aspect: {Positive: N, Negative: N, Neutral: N, dominant: str}},
223
+ 'top_aspects': sorted list of most-mentioned aspects,
224
+ 'aspect_sentiment_map': {aspect: dominant_sentiment}
225
+ }
226
+ """
227
+ per_text = []
228
+ aggregate = defaultdict(lambda: {'Positive': 0, 'Negative': 0, 'Neutral': 0, 'total': 0})
229
+
230
+ for text in texts[:80]: # batasi untuk performa
231
+ aspects = extract_aspects(text)
232
+ per_text.append({'text': text[:100], 'aspects': aspects})
233
+ for a in aspects:
234
+ aggregate[a['aspect']][a['sentiment']] += 1
235
+ aggregate[a['aspect']]['total'] += 1
236
+
237
+ # Kalkulasi dominan per aspek
238
+ agg_result = {}
239
+ for aspect, counts in aggregate.items():
240
+ t = counts['total'] or 1
241
+ dominant = max(
242
+ ['Positive', 'Negative', 'Neutral'],
243
+ key=lambda s: counts[s]
244
+ )
245
+ agg_result[aspect] = {
246
+ 'Positive': counts['Positive'],
247
+ 'Negative': counts['Negative'],
248
+ 'Neutral': counts['Neutral'],
249
+ 'total': counts['total'],
250
+ 'pos_pct': round(counts['Positive'] / t * 100, 1),
251
+ 'neg_pct': round(counts['Negative'] / t * 100, 1),
252
+ 'neu_pct': round(counts['Neutral'] / t * 100, 1),
253
+ 'dominant': dominant,
254
+ }
255
+
256
+ # Sort by total mentions
257
+ top_aspects = sorted(
258
+ agg_result.items(),
259
+ key=lambda x: x[1]['total'],
260
+ reverse=True
261
+ )
262
+
263
+ aspect_sentiment_map = {
264
+ asp: data['dominant']
265
+ for asp, data in top_aspects
266
+ }
267
+
268
+ return {
269
+ 'per_text': per_text[:20], # kirim sample ke frontend
270
+ 'aggregate': agg_result,
271
+ 'top_aspects': [{'aspect': a, **d} for a, d in top_aspects[:8]],
272
+ 'aspect_sentiment_map': aspect_sentiment_map,
273
+ 'total_texts_analyzed': len(texts),
274
+ 'aspects_found': len(agg_result),
275
+ }