File size: 14,475 Bytes
512e2dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f8b018
512e2dc
47d5d45
4f8b018
 
512e2dc
4f8b018
512e2dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d567b4d
512e2dc
 
 
 
 
 
 
 
 
d567b4d
d688a86
512e2dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47d5d45
512e2dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47d5d45
512e2dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
import logging
import pickle
import re
from io import BytesIO

import streamlit as st
import pandas as pd
import numpy as np
import torch
import nltk
from langdetect import detect, LangDetectException

# NLP & deep-learning libraries
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

import setting


# ==========================================
# 1. SETUP ENVIRONMENT & RESOURCE LOADING
# ==========================================

# PyTorch device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Quietly download the NLTK resources when they are not installed yet.
# A LookupError from either lookup triggers the download of both resources.
try:
    nltk.data.find("corpora/stopwords")
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("stopwords", quiet=True)
    nltk.download("punkt", quiet=True)

# Build the Sastrawi stemmer once at import time so every call can reuse it.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


# ==========================================
# 2. CORE LOGIC: PREPROCESSING
# ==========================================


def reduce_repeating_chars(text, max_repeat=2):
    """Cap runs of one repeated character at ``max_repeat`` copies.

    Only runs longer than ``max_repeat`` are shortened; shorter runs are left
    untouched. Newlines are never treated as a repeated character because
    ``.`` does not match them.
    """
    long_run = re.compile(r"(.)\1{" + str(max_repeat) + r",}")
    return long_run.sub(lambda run: run.group(1) * max_repeat, text)


def normalize_slang_id(tokens):
    """Return a new list in which slang tokens are replaced via ``setting.SLANG_MAP``.

    Tokens without an entry in the map are copied through unchanged.
    """
    normalized = []
    for token in tokens:
        normalized.append(setting.SLANG_MAP.get(token, token))
    return normalized


def fix_ui_nya(text):
    """Split every occurrence of the fused form "uinya" into "ui nya".

    Per the original author's note, "ui" is not a KBBI word, so the Sastrawi
    stemmer cannot strip the "-nya" suffix from it; the split is done by hand.
    """
    fused_form = "uinya"
    split_form = "ui nya"
    return split_form.join(text.split(fused_form))


def build_keyword_set(ASPECT_KEYWORDS, lang):
    """Collect every aspect keyword of one language into a lower-cased set.

    ``ASPECT_KEYWORDS[lang]`` is read as a mapping from aspect name to an
    iterable of keyword strings; the aspect names themselves are discarded.
    """
    return {
        keyword.lower()
        for aspect_keywords in ASPECT_KEYWORDS[lang].values()
        for keyword in aspect_keywords
    }


def normalize_by_prefix(token, keywords):
    """Reduce ``token`` to the longest keyword it starts with.

    Used to strip suffixes from loan words (e.g. "uinya" -> "ui") that the
    stemmer does not know.

    Args:
        token: A single word.
        keywords: Collection of keyword strings (typically a set).

    Returns:
        The longest non-empty keyword that is a prefix of ``token``, or
        ``token`` itself when no keyword matches or when ``token`` already is
        a keyword.
    """
    # A token that is already a keyword must stay intact; otherwise a shorter
    # keyword would truncate it (e.g. "playlist" -> "play").
    if token in keywords:
        return token

    best_match = token
    best_length = 0
    for keyword in keywords:
        # Empty keywords are skipped: every string starts with "" and the
        # token would be blanked out.
        if keyword and len(keyword) > best_length and token.startswith(keyword):
            best_match = keyword
            best_length = len(keyword)
    return best_match


def normalize_text(text, keywords):
    """Lower-case ``text`` and run ``normalize_by_prefix`` on each word.

    The text is split on whitespace and the normalised words are re-joined
    with single spaces.
    """
    normalized_tokens = []
    for raw_token in text.lower().split():
        normalized_tokens.append(normalize_by_prefix(raw_token, keywords))
    return " ".join(normalized_tokens)


def clean_text_advanced(ASPECT_KEYWORDS, text, lang="en", use_stemming=True):
    """Clean a review text for aspect detection and sentiment scoring.

    Args:
        ASPECT_KEYWORDS: Mapping with "id" and "en" entries, each mapping an
            aspect name to its keywords. The keywords of both languages are
            used for prefix normalisation.
        text: Raw text. Anything that is not a ``str`` yields "".
        lang: "id" enables slang mapping, Sastrawi stemming and Indonesian
            stop words; any other value uses English stop words.
        use_stemming: Allow Sastrawi stemming for Indonesian text.

    Returns:
        The cleaned, space-joined tokens. The punctuation marks ``. , ! ?``
        are kept as separate tokens.
    """
    # Reject non-text input before doing any work.
    if not isinstance(text, str):
        return ""

    log = logging.getLogger(__name__)

    # Keywords of both languages; they drive the prefix normalisation of
    # words the stemmer does not know.
    keywords = build_keyword_set(ASPECT_KEYWORDS, "id") | build_keyword_set(
        ASPECT_KEYWORDS, "en"
    )

    # 1. Lower-case.
    text = text.lower()

    # 2. Remove URLs, mentions and hashtags.
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\@\w+|\#\w+", "", text)

    # 3. Remove stand-alone numbers; digits glued to letters (4g, mp3) stay.
    text = re.sub(r"\b\d+\b", "", text)

    # 4. Put spaces around . , ! ? so they become separate tokens.
    text = re.sub(r"([.,!?])", r" \1 ", text)

    # 5. Replace every other symbol with a space.
    text = re.sub(r"[^a-z0-9\s.,!?]", " ", text)

    # 6. Cap repeated characters at two ("baaangeeet" -> "baangeet").
    text = reduce_repeating_chars(text)

    # 7. Collapse whitespace.
    text = re.sub(r"\s+", " ", text).strip()

    # 8. Prefix-normalise words that are missing from the stemmer dictionary.
    text = normalize_text(text, keywords)

    # 9. Tokenise.
    tokens = text.split()

    # 10. Language-specific handling.
    if lang == "id":
        tokens = normalize_slang_id(tokens)

        # Stemming is limited to short inputs (< 30 tokens) to keep the
        # Streamlit UI responsive.
        if use_stemming and len(tokens) < 30:
            try:
                # Sastrawi works on a whole string, so re-join first.
                tokens = stemmer.stem(" ".join(tokens)).split()
            except Exception as exc:
                # Best effort: keep the unstemmed tokens, but leave a trace.
                log.warning("Stemming failed, keeping unstemmed tokens: %s", exc)

    # 11. Stop-word removal; negation words are kept because they flip the
    # sentiment.
    stop_language = "indonesian" if lang == "id" else "english"
    stops = set(stopwords.words(stop_language)) - setting.NEGATION_WORDS

    cleaned = " ".join(t for t in tokens if t not in stops)
    log.debug("clean_text_advanced(lang=%s) -> %r", lang, cleaned)
    return cleaned


# ==========================================
# 3. MODEL MANAGEMENT (CACHING SYSTEM)
# ==========================================


@st.cache_resource(show_spinner=False)
def load_all_models():
    """Load the English and Indonesian sentiment models and tokenizers.

    The result is cached with ``st.cache_resource`` so the models are not
    reloaded on every user interaction.

    Returns:
        On success, a dict ``{"en": (model, tokenizer), "id": (model, tokenizer)}``.
        On failure, an error and a hint are shown in the Streamlit UI and the
        tuple ``(None, None)`` is returned.

    NOTE(review): the success and failure paths return different shapes
    (dict vs. 2-tuple); callers have to cope with both.
    """
    repositories = {
        "en": "Hamusssss12/spotify-absa-english-v2",
        "id": "Hamusssss12/spotify-absa-indonesian-v2",
    }

    try:
        # Only the Transformer (BERT) models are loaded here; per the original
        # author's note the LSTM models are kept for comparison only.
        loaded = {}
        for lang_code, repo_id in repositories.items():
            tokenizer = AutoTokenizer.from_pretrained(repo_id)
            model = AutoModelForSequenceClassification.from_pretrained(repo_id)
            loaded[lang_code] = (model, tokenizer)
        return loaded

    except Exception as e:
        st.error(f"⚠️ Error Critical: Gagal memuat model AI. Pesan Error: {str(e)}")
        st.info("Pastikan folder 'models' berisi hasil ekstrak ZIP yang benar.")
        return None, None


# ==========================================
# 4. INFERENCE ENGINE (OTAK PREDIKSI)
# ==========================================


def detect_language(text):
    """Classify ``text`` as Indonesian ("id") or English ("en").

    ``langdetect`` is tried first; the codes "id" and "in" map to "id" and
    every other result to "en". When detection raises, a small whole-word
    heuristic is used instead.

    Args:
        text: Text to classify. Non-string input is classified as "en".

    Returns:
        "id" or "en".
    """
    try:
        lang = detect(text)
    except Exception:
        # Best-effort fallback. Whole words are compared, because a substring
        # test would see "di" in "audio" or "dan" in "dance".
        if not isinstance(text, str):
            return "en"
        words = set(re.findall(r"[a-z]+", text.lower()))
        return "id" if words & {"yang", "dan", "di", "aku"} else "en"
    return "id" if lang in ("id", "in") else "en"


def get_bert_prob(text, model, tokenizer, lang):
    """Return the probability that ``text`` is POSITIVE (0.0 - 1.0).

    Args:
        text: Text to score.
        model: Sequence-classification model; its output must expose ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        lang: "id" reads class 0 as the positive class; every other value
            reads class 1, matching the caller, which uses the English model
            for every non-Indonesian language.

    Returns:
        The positive-class probability as a NumPy scalar.
    """
    # Inference runs on the CPU, which works on Streamlit Cloud and on
    # machines without a GPU.
    model.to("cpu")

    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=128
    )

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=1).cpu().numpy()[0]

    # The two models use opposite label orders (Indonesian: 0 = positive,
    # English: 1 = positive). The old if/elif returned None for other codes.
    positive_index = 0 if lang == "id" else 1
    return probs[positive_index]


def get_smart_aspects(ASPECT_KEYWORDS, segment, lang):
    """Detect the aspects mentioned in ``segment`` and the keyword that fired.

    Args:
        ASPECT_KEYWORDS: Mapping ``lang -> {aspect: [keyword, ...]}``.
        segment: Text segment to scan; matching is case-insensitive.
        lang: Language code. When it has no vocabulary, the "en" vocabulary
            is used.

    Returns:
        A list of ``(aspect, keyword)`` tuples with at most one entry per
        aspect, e.g. ``[("Audio", "suara"), ("Price", "mahal")]``.

    Raises:
        KeyError: Neither ``lang`` nor "en" is present in ``ASPECT_KEYWORDS``.
    """
    text_lower = segment.lower()

    # The English fallback is looked up only when it is needed, so a mapping
    # without an "en" entry still works for the languages it does have.
    vocab = ASPECT_KEYWORDS.get(lang)
    if vocab is None:
        vocab = ASPECT_KEYWORDS["en"]

    detected = []
    for aspect, keywords in vocab.items():
        for key in keywords:
            # Word boundaries keep "ads" from matching inside "loads". The
            # keyword is lower-cased because the text is lower-cased too.
            pattern = r"\b" + re.escape(key.lower()) + r"\b"
            if re.search(pattern, text_lower):
                detected.append((aspect, key))
                break  # one trigger per aspect per segment is enough

    return detected


# Sentence punctuation plus contrast conjunctions that usually introduce a
# clause with a different sentiment. The groups are non-capturing so that
# re.split() does not hand the delimiters back as segments, and "hanya saja"
# precedes "hanya" so that the whole phrase is consumed.
_SEGMENT_SPLITTERS = {
    "id": re.compile(
        r"(?:"
        r"\.|!|\?|;|,\s|"
        r"\bakan tetapi\b|\btapi\b|\btp\b|\btetapi\b|\bnamun\b|\bmelainkan\b|"
        r"\bpadahal\b|\bsedangkan\b|\bsebaliknya\b|\bjustru\b|"
        r"\bwalaupun\b|\bwalau\b|\bmeskipun\b|\bmeski\b|\bkendati\b|\bbiarpun\b|"
        r"\bcuma\b|\bcman\b|\bcma\b|\bcm\b|\bhanya saja\b|\bhanya\b|"
        r"\bsayang\b|\bsayangnya\b|\bsyg\b|\bdisayangkan\b|"
        r"\bkecuali\b|\bselain itu\b"
        r")"
    ),
    "en": re.compile(
        r"(?:"
        r"\.|!|\?|;|,\s|"
        r"\bbut\b|\bhowever\b|\byet\b|\bnevertheless\b|\bnonetheless\b|"
        r"\balthough\b|\beven though\b|\bthough\b|\balbeit\b|"
        r"\bdespite\b|\bin spite of\b|\bregardless\b|"
        r"\bwhile\b|\bwhereas\b|\bon the other hand\b|"
        r"\bexcept\b|\bexception\b|\bunless\b|\bbarring\b|"
        r"\bunfortunately\b|\bsadly\b|\bregrettably\b|\bpity\b"
        r")"
    ),
}


def analyze_single_review_complete(ASPECT_KEYWORDS, text, models_tuple, lang="auto"):
    """Run the end-to-end ABSA pipeline on one review.

    Steps: pick language and model -> split into segments -> clean each
    segment -> detect aspects -> score segments -> aggregate per aspect ->
    score the whole text.

    Args:
        ASPECT_KEYWORDS: Mapping ``lang -> {aspect: [keyword, ...]}``.
        text: The review text.
        models_tuple: Either ``(models_en, models_id)`` or the dict
            ``{"en": ..., "id": ...}`` returned by ``load_all_models``; each
            entry is a ``(model, tokenizer)`` pair.
        lang: "id", "en" or "auto" (detect). Any other code is handled as "en".

    Returns:
        ``(global_label, global_confidence, aspects, lang)`` where ``aspects``
        maps an aspect name to ``{"label", "score", "trigger"}``. When the
        models are missing or ``text`` is not a string, the result is
        ``("Error", 0.0, {}, "en")``.
    """
    log = logging.getLogger(__name__)

    # 1. Resolve the models; both shapes produced in this module are accepted.
    if isinstance(models_tuple, dict):
        models_en, models_id = models_tuple.get("en"), models_tuple.get("id")
    elif models_tuple:
        models_en, models_id = models_tuple
    else:
        models_en = models_id = None

    if not models_en or not models_id or not isinstance(text, str):
        return "Error", 0.0, {}, "en"

    if lang == "auto":
        lang = detect_language(text)
    # Only two model pairs exist, so every non-Indonesian code means English.
    lang = "id" if lang == "id" else "en"
    model, tokenizer = models_id if lang == "id" else models_en

    # 2. Split into segments at punctuation and contrast conjunctions.
    #    Fragments shorter than two words are dropped.
    pieces = _SEGMENT_SPLITTERS[lang].split(text.lower())
    segments = [piece.strip() for piece in pieces if len(piece.split()) >= 2]
    if not segments:
        segments = [text]  # fallback for very short reviews

    # 3. Analyse each segment.
    aspect_sentiment_store = {}
    for seg in segments:
        seg_clean = clean_text_advanced(ASPECT_KEYWORDS, seg, lang, use_stemming=True)
        found_aspects = get_smart_aspects(ASPECT_KEYWORDS, seg_clean, lang)
        log.debug("segment=%r clean=%r aspects=%r", seg, seg_clean, found_aspects)
        if not found_aspects:
            continue

        # NOTE(review): the segment is scored on its raw text while the global
        # score below uses the cleaned text - confirm this is intended.
        pos_prob = get_bert_prob(seg, model, tokenizer, lang)
        for aspect_name, trigger_word in found_aspects:
            aspect_sentiment_store.setdefault(aspect_name, []).append(
                {"prob": pos_prob, "trigger": trigger_word}
            )

    # 4. Aggregate per aspect.
    final_aspects_output = {}
    for asp, data_list in aspect_sentiment_store.items():
        # Average when the aspect shows up in several segments.
        avg_prob = np.mean([d["prob"] for d in data_list])

        # Unique triggers in first-seen order (a set would shuffle them).
        triggers = list(dict.fromkeys(d["trigger"] for d in data_list))

        # Same rule as the global label: strictly above 0.5 is positive, so
        # an exact 0.5 can no longer leave label/score unassigned.
        if avg_prob > 0.5:
            label, score = "Positive", avg_prob
        else:
            label, score = "Negative", 1.0 - avg_prob

        final_aspects_output[asp] = {
            "label": label,
            "score": score,
            "trigger": ", ".join(triggers),
        }
    log.debug("aspects=%r", final_aspects_output)

    # 5. Global sentiment of the whole text. If cleaning removes everything,
    #    score the raw text instead of an empty string.
    clean_global = clean_text_advanced(ASPECT_KEYWORDS, text, lang, use_stemming=True)
    global_prob = get_bert_prob(clean_global or text, model, tokenizer, lang)

    global_label = "Positive" if global_prob > 0.5 else "Negative"
    global_conf = global_prob if global_label == "Positive" else 1.0 - global_prob

    return global_label, global_conf, final_aspects_output, lang


# ==========================================
# 5. FILE HANDLER UTILITIES
# ==========================================


def load_uploaded_file(uploaded_file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        uploaded_file: File-like object with a ``name`` attribute. Names
            ending in ".csv" (any letter case) are parsed as CSV; everything
            else is passed to ``pd.read_excel``.

    Returns:
        The parsed DataFrame, or ``None`` when the file cannot be read.
    """
    try:
        # Compare case-insensitively so "DATA.CSV" is not sent to the Excel
        # reader.
        if uploaded_file.name.lower().endswith(".csv"):
            return pd.read_csv(uploaded_file)
        return pd.read_excel(uploaded_file)
    except Exception as exc:
        # Callers test for None; the cause is logged so it is not lost.
        logging.getLogger(__name__).warning("Could not read uploaded file: %s", exc)
        return None


def find_text_column(df):
    """Pick the DataFrame column that holds the review text.

    The first column whose name matches a known candidate
    (case-insensitively) wins. Otherwise the first object-dtype column is
    used.

    Args:
        df: DataFrame to inspect. Column labels may be of any type.

    Returns:
        The column label, or ``None`` when nothing suitable exists.
    """
    candidates = {
        "content",
        "review",
        "text",
        "ulasan",
        "komentar",
        "feedback",
        "reviewtext",
    }
    for col in df.columns:
        # str() keeps numeric column labels from raising on .lower().
        if str(col).lower() in candidates:
            return col

    # No known name: fall back to the first object-dtype column, if any.
    object_columns = df.select_dtypes(include=["object"]).columns
    return next(iter(object_columns), None)


def convert_df_to_csv(df):
    """Serialise ``df`` to CSV without the index and return UTF-8 bytes.

    The bytes are meant to be handed to a download button.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")