|
|
import re

from typing import Dict, List
from typing import Optional
from typing import Tuple

import emoji
import torch
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import BertTokenizer, BertForSequenceClassification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
import emoji |
|
|
|
|
|
|
|
|
class TextCleaner:
    """Normalizes raw social-media text before sentiment classification.

    Pipeline: lowercase, collapse whitespace, strip non-ASCII, mask
    @-mentions and URLs, drop emoji/emoticons and most punctuation, then
    collapse characters repeated three or more times.
    """

    def __init__(self):
        # Characters whose runs of 3+ repeats are collapsed to a single one:
        # common punctuation plus all lowercase ASCII letters.
        self.character = ['.', ',', ';', ':', '?', '!', '(', ')', '[', ']', '{', '}', '<', '>', '"', '/', '\'', '-', '@']
        self.character.extend([chr(i) for i in range(ord('a'), ord('z') + 1)])
        # Pre-compile one pattern per character so repeatcharClean does not
        # rebuild ~45 regexes on every call (it runs once per request).
        self._repeat_patterns = [
            (re.compile(re.escape(ch) + r'{3,}'), ch) for ch in self.character
        ]

    def repeatcharClean(self, text):
        """Collapse any run of 3+ identical tracked characters to one."""
        for pattern, replacement in self._repeat_patterns:
            text = pattern.sub(replacement, text)
        return text

    def clean_review(self, text):
        """Return a cleaned, single-spaced version of *text*.

        Masks mentions as '@USER' and URLs as 'HTTPURL' (the convention the
        IndoBERTweet tokenizer vocabulary expects), removes emoji and
        emoticons, strips most punctuation, and collapses repeated chars.
        """
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)
        # Drop non-ASCII (raw emoji bytes, fancy quotes, etc.).
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)

        # Token-level masking of mentions and URLs.
        masked_words = []
        for word in text.split(" "):
            if word.startswith('@') and len(word) > 1:
                word = '@USER'
            elif word.startswith('http'):
                word = 'HTTPURL'
            masked_words.append(word)
        text = " ".join(masked_words)

        # Any emoji that survived (shouldn't after the ASCII filter, but
        # demojize is cheap) becomes ':name:' and is then removed.
        text = emoji.demojize(text)
        text = re.sub(r':[A-Za-z_-]+:', ' ', text)

        # Text emoticons such as ':D', 'xD', ";')".
        text = re.sub(r"([xX;:]'?[dDpPvVoO3)(])", ' ', text)
        # Most punctuation; note '!', '?', '@', '-' and "'" are kept.
        text = re.sub(r'["#$%&()*+,./:;<=>\[\]\\^_`{|}~]', ' ', text)

        text = self.repeatcharClean(text)

        # Final whitespace normalization.
        text = re.sub(r'\s+', ' ', text).strip()

        return text
|
|
|
|
|
class SentimentPredictor:
    """Runs a fine-tuned BERT sequence classifier on CPU and maps its
    logits to Indonesian sentiment labels."""

    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model
        # Inference is pinned to CPU; the checkpoint is loaded with
        # map_location='cpu' at startup as well.
        self.device = torch.device("cpu")
        self.model.to(self.device)
        # Class index -> human-readable label (order fixed by training).
        self.label_mapping = {0: 'Positif', 1: 'Netral', 2: 'Negatif'}

    def predict(self, text: str) -> Tuple[str, float, Dict[str, float]]:
        """Classify *text* and return (label, confidence, per-label scores).

        Returns:
            sentiment: the predicted label ('Positif'/'Netral'/'Negatif').
            confidence: softmax probability of the predicted class.
            all_scores: label -> probability for every class.
        """
        # max_length=280 mirrors the maximum tweet length.
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=280)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # No gradients needed for inference.
        with torch.no_grad():
            outputs = self.model(**inputs)

        logits = outputs.logits
        # Single-example batch: take row 0 of the softmax.
        probabilities = torch.softmax(logits, dim=1)[0]

        confidence_score = probabilities.max().item()
        predicted_label_id = probabilities.argmax().item()
        sentiment = self.label_mapping[predicted_label_id]

        all_scores = {self.label_mapping[i]: prob.item() for i, prob in enumerate(probabilities)}
        return sentiment, confidence_score, all_scores
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- One-time startup: load tokenizer, base model, fine-tuned weights ----
print("Memuat model dan tokenizer...")

# IndoBERTweet: BERT pre-trained on Indonesian tweets (downloaded from the
# HuggingFace hub on first run).
tokenizer = BertTokenizer.from_pretrained('indolem/indobertweet-base-uncased')
model = BertForSequenceClassification.from_pretrained('indolem/indobertweet-base-uncased', num_labels=3)

# Fine-tuned 3-class sentiment weights produced by a separate training run.
model_path = 'model_indoBERTweet_100Epochs_sentiment.pth'
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted
# checkpoint files; consider weights_only=True on torch versions that have it.
state_dict = torch.load(model_path, map_location=torch.device('cpu'))
# NOTE(review): strict=False silently ignores missing/unexpected keys —
# confirm the checkpoint really matches this architecture.
model.load_state_dict(state_dict, strict=False)
model.eval()  # disable dropout etc. for deterministic inference

print("Model berhasil dimuat.")

# Shared singletons used by the route handlers below.
text_cleaner = TextCleaner()
sentiment_predictor = SentimentPredictor(tokenizer, model)

app = FastAPI(
    title="API Klasifikasi Sentimen",
    description="Sebuah API untuk menganalisis sentimen teks Bahasa Indonesia."
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TextInput(BaseModel):
    # Request body for POST /predict: one raw text to classify.
    text: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BatchTextInput(BaseModel):
    # Request body for POST /predict-batch: raw texts classified in order.
    texts: List[str]
|
|
|
|
|
|
|
|
class PredictionOutput(BaseModel):
    # Response body shared by /predict and /predict-batch.
    # Optional[str] (was `str = None`): the field legitimately defaults to
    # None, so the annotation must admit it.
    cleaned_text: Optional[str] = None
    # Predicted label: 'Positif' / 'Netral' / 'Negatif'.
    sentiment: str
    # Softmax probability of the predicted class.
    confidence: float
    # Label -> probability for every class.
    all_scores: Dict[str, float]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/")
def read_root():
    # Liveness / landing endpoint so clients can verify the API is up.
    welcome = {"message": "Selamat datang di API Klasifikasi Sentimen"}
    return welcome
|
|
|
|
|
@app.post("/predict", response_model=PredictionOutput)
def predict_sentiment(request: TextInput):
    # Clean the raw text the same way the training data was cleaned,
    # then classify the cleaned version.
    cleaned_text = text_cleaner.clean_review(request.text)
    sentiment, confidence, all_scores = sentiment_predictor.predict(cleaned_text)
    # Populate cleaned_text for parity with /predict-batch — previously it
    # was omitted here and serialized as null.
    return PredictionOutput(
        cleaned_text=cleaned_text,
        sentiment=sentiment,
        confidence=confidence,
        all_scores=all_scores
    )
|
|
|
|
|
@app.post("/predict-batch", response_model=List[PredictionOutput])
def predict_sentiment_batch(request: BatchTextInput):
    # Apply the same clean -> predict pipeline to each input text,
    # preserving input order in the response.
    def _classify(raw_text: str) -> PredictionOutput:
        cleaned = text_cleaner.clean_review(raw_text)
        label, score, scores = sentiment_predictor.predict(cleaned)
        return PredictionOutput(
            cleaned_text=cleaned,
            sentiment=label,
            confidence=score,
            all_scores=scores
        )

    return [_classify(item) for item in request.texts]