Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import joblib
|
| 3 |
+
import re
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sastrawi.stemmer.stemmer_factory import StemmerFactory
|
| 6 |
+
from sastrawi.stopwords.stopwords_factory import StopWordRemoverFactory
|
| 7 |
+
import nltk
|
| 8 |
+
|
| 9 |
+
# --- Download NLTK data (only needs to run once) ---
# nltk.data.find() signals a missing resource by raising LookupError, so that
# is the exception to catch before falling back to a download.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
|
| 14 |
+
|
| 15 |
+
# --- 1. Load Pre-trained Model and Vectorizer ---
# Both artifacts are referenced by bare file name, so they are resolved
# against the process's current working directory (keep them next to app.py
# and start the app from that directory).
# NOTE: joblib.load unpickles its input; only load artifact files you trust.
model, vectorizer = (
    joblib.load(artifact_name)
    for artifact_name in ('best_svm_model.pkl', 'tfidf_vectorizer.pkl')
)
|
| 19 |
+
|
| 20 |
+
# --- 2. Recreate the Preprocessing Functions ---
# Build the Sastrawi stemmer and stop-word remover once at import time so
# every request reuses the same instances.
# NOTE(review): PySastrawi is usually imported as `Sastrawi.Stemmer.StemmerFactory`
# / `Sastrawi.StopWordRemover.StopWordRemoverFactory` -- confirm that the
# lowercase `sastrawi.*` import paths used at the top of this file resolve.
_stemmer_factory = StemmerFactory()
stemmer = _stemmer_factory.create_stemmer()

_stopword_factory = StopWordRemoverFactory()
stopword_remover = _stopword_factory.create_stop_word_remover()
|
| 24 |
+
|
| 25 |
+
# Slang dictionary from your notebook.
# Maps an informal/abbreviated token to its standard form; a replacement may
# expand to more than one word (e.g. 'gbs' -> 'tidak bisa').
slang_dict = {
    'yg': 'yang',
    'ga': 'tidak',
    'gak': 'tidak',
    'udh': 'sudah',
    'tdk': 'tidak',
    'bgt': 'banget',
    'dg': 'dengan',
    'klo': 'kalau',
    'kalo': 'kalau',
    'mksh': 'terima kasih',
    'terimakasih': 'terima kasih',
    'bgs': 'bagus',
    'ok': 'oke',
    'blm': 'belum',
    'sy': 'saya',
    'sya': 'saya',
    'ak': 'aku',
    'utk': 'untuk',
    'tpi': 'tapi',
    'tp': 'tapi',
    'jd': 'jadi',
    'jg': 'juga',
    'trs': 'terus',
    'skrg': 'sekarang',
    'bkin': 'bikin',
    'dr': 'dari',
    'dn': 'dan',
    'pke': 'pakai',
    'gausah': 'tidak usah',
    'ngga': 'tidak',
    'bkn': 'bukan',
    'sdh': 'sudah',
    'aja': 'saja',
    'lg': 'lagi',
    'mls': 'malas',
    'gk': 'tidak',
    'knp': 'kenapa',
    'krn': 'karena',
    'gmn': 'bagaimana',
    'gimana': 'bagaimana',
    'udah': 'sudah',
    'sm': 'sama',
    'gbs': 'tidak bisa',
    'nggak': 'tidak',
    'mantap': 'bagus',
    'cek': 'periksa',
    'bansos': 'bantuan sosial',
}
|
| 38 |
+
|
| 39 |
+
def preprocess_text(text):
    """Normalize a raw review string for the TF-IDF vectorizer.

    Steps, in order: strip digits, strip every character that is neither a
    word character nor whitespace, collapse whitespace, lowercase, replace
    slang tokens via ``slang_dict``, remove stop words, then stem.

    Args:
        text: The raw input string.

    Returns:
        The value returned by ``stemmer.stem`` for the cleaned text.
    """
    # 1. Cleaning: digits first, then punctuation, then redundant whitespace.
    without_digits = re.sub(r'\d+', '', text)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)
    collapsed = re.sub(r'\s+', ' ', without_punct).strip()

    # 2. Case folding.
    lowered = collapsed.lower()

    # 3. Slang normalization: tokens missing from the dictionary pass through.
    normalized = ' '.join(
        slang_dict.get(token, token) for token in lowered.split()
    )

    # 4. Stop-word removal, then 5. stemming.
    without_stopwords = stopword_remover.remove(normalized)
    return stemmer.stem(without_stopwords)
|
| 60 |
+
|
| 61 |
+
# --- 3. Prediction Function ---
def predict_sentiment(sentence):
    """Predict the sentiment label for a single review sentence.

    Args:
        sentence: Raw review text coming from the UI. ``None`` and
            empty/whitespace-only values are treated as "nothing to classify".

    Returns:
        The model's predicted label converted to ``str`` and capitalized, or
        an empty string when no text was supplied.
    """
    # Guard against missing input: None would crash inside the regex-based
    # preprocessing, and blank text would only produce a prediction for an
    # all-zero feature vector.
    if sentence is None or not str(sentence).strip():
        return ""

    # Preprocess the input sentence
    processed_text = preprocess_text(str(sentence))

    # Vectorize the text using the loaded TF-IDF vectorizer
    text_vector = vectorizer.transform([processed_text])

    # The model was trained with an additional 'thumbs_up_log_scaled' feature.
    # Since we only have a sentence, we'll assume a neutral value (0) for it.
    thumbs_up_feature = np.array([[0]])

    # np.hstack works on dense arrays, so the single TF-IDF row is densified
    # before the extra feature column is appended on the right.
    final_vector = np.hstack([text_vector.toarray(), thumbs_up_feature])

    # Predict using the loaded model
    prediction = model.predict(final_vector)

    # str() keeps this working even if the model's class labels are not
    # plain strings (e.g. integer-encoded classes).
    return str(prediction[0]).capitalize()
|
| 82 |
+
|
| 83 |
+
# --- 4. Create Gradio Interface ---
# Multi-line text box for the review to classify.
_review_input = gr.Textbox(
    lines=3,
    placeholder="Masukkan kalimat ulasan dalam Bahasa Indonesia...",
)

# Clickable sample reviews offered in the UI.
_sample_reviews = [
    ["aplikasinya bagus sekali dan sangat membantu"],
    ["tidak bisa daftar, gagal terus padahal sinyal bagus"],
    ["aplikasi ini biasa saja, tidak ada yang spesial"],
]

iface = gr.Interface(
    fn=predict_sentiment,
    inputs=_review_input,
    outputs="text",
    title="Analisis Sentimen Ulasan Aplikasi",
    description="Analisis sentimen untuk ulasan aplikasi 'Cek Bansos' menggunakan model SVM. Masukkan sebuah kalimat untuk memprediksi sentimennya (Positif, Negatif, atau Netral).",
    examples=_sample_reviews,
)

# --- 5. Launch the App ---
# Runs at module level, i.e. as soon as this file is executed.
iface.launch()
|