Spaces:
Sleeping
Sleeping
File size: 4,907 Bytes
d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 e34d011 d56fb76 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import torch
import torch.nn.functional as F
import re
import string
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Registry of selectable classifiers, keyed by the short identifier callers pass in.
#   name            - model repository id passed to ``from_pretrained``
#   tokenizer_name  - optional; tokenizer repository id when it differs from ``name``
#   desc            - human-readable description used in log messages
AVAILABLE_MODELS = {
    "toxic_bert": {
        "name": "Dzeisonov/indobert-toxic-classifier",
        "desc": "IndoBERT (Fine-tuned)",
    },
    "toxic_roberta": {
        "name": "Dzeisonov/indoroberta-toxic-classifier",
        "tokenizer_name": "flax-community/indonesian-roberta-base",
        "desc": "IndoRoBERTa (Fine-tuned)",
    },
    "toxic_bertweet": {
        "name": "Exqrch/IndoBERTweet-HateSpeech",
        "tokenizer_name": "indolem/indobertweet-base-uncased",
        "desc": "IndoBERTweet (Baseline Model)",
    },
}

# Global cache of models that have already been loaded, keyed like AVAILABLE_MODELS;
# each value is a dict with 'tokenizer' and 'model' entries.
loaded_models = {}
def get_model_and_tokenizer(model_key):
    """Return the ``(tokenizer, model)`` pair for ``model_key``, loading it lazily.

    Keys that are not in ``AVAILABLE_MODELS`` fall back to ``"toxic_bert"``.
    A pair that was loaded before is served from the module-level
    ``loaded_models`` cache. The tokenizer is loaded from the registry entry's
    ``tokenizer_name`` when present, otherwise from its ``name``.

    Returns ``(None, None)`` when loading raises; the error is printed and
    nothing is cached for that key.
    """
    # Unknown keys are redirected to the default model.
    if model_key not in AVAILABLE_MODELS:
        model_key = "toxic_bert"

    # Cache hit: hand back the pair that was stored on a previous call.
    if model_key in loaded_models:
        entry = loaded_models[model_key]
        return entry['tokenizer'], entry['model']

    config = AVAILABLE_MODELS[model_key]
    print(f"⏳ Sedang memuat model baru: {config['desc']} ...")

    try:
        model_path = config['name']
        # Use the dedicated tokenizer repository when one is configured,
        # otherwise the tokenizer comes from the model repository itself.
        tokenizer_path = config.get("tokenizer_name", model_path)

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)

        # Remember the pair so later calls skip loading.
        loaded_models[model_key] = {'tokenizer': tokenizer, 'model': model}
        print(f"✅ Model {config['desc']} berhasil dimuat!")
        return tokenizer, model
    except Exception as e:
        print(f"❌ Gagal memuat model {model_key}: {e}")
        return None, None
# Noise removed before classification: URLs (http... / www....), @mentions,
# the '#' sign itself (the hashtag word is kept) and runs of digits.
# The dot after "www" is escaped so that ordinary words containing "www"
# (e.g. "awww") are not mistaken for URLs.
_NOISE_PATTERN = re.compile(r"http\S+|www\.\S+|@\w+|#|\d+")
# Deletes every ASCII punctuation character in a single pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_WHITESPACE_PATTERN = re.compile(r"\s+")


def preprocess_text(text):
    """Clean raw text before it is passed to the model.

    Steps, in order: lower-case, strip URLs / @mentions / '#' / digits,
    delete ASCII punctuation, collapse whitespace runs to a single space and
    trim both ends.

    Args:
        text: The raw input. Anything that is not a non-empty ``str``
            yields ``""``.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str) or not text:
        return ""

    cleaned = _NOISE_PATTERN.sub("", text.lower())
    cleaned = cleaned.translate(_PUNCTUATION_TABLE)
    return _WHITESPACE_PATTERN.sub(" ", cleaned).strip()
# Lower-cased raw labels treated as the positive (toxic) class. Different
# checkpoints name their classes differently, so several spellings are accepted.
_TOXIC_RAW_LABELS = frozenset({"label_1", "toxic", "1", "hate speech"})


def _standardize_label(raw_label):
    """Map a model-specific class label to ``"Toxic"`` or ``"Non-Toxic"``."""
    normalized = str(raw_label).strip().lower()
    return "Toxic" if normalized in _TOXIC_RAW_LABELS else "Non-Toxic"


def predict_text(text, model_key):
    """Classify a single piece of text as Toxic or Non-Toxic.

    Args:
        text: The raw text to classify.
        model_key: Registry key forwarded to ``get_model_and_tokenizer``.

    Returns:
        A dict that always has the keys ``original_text``, ``text_clean``,
        ``label`` and ``score``:

        * ``label`` is ``"ERROR"`` (score ``"0%"``) when the model or
          tokenizer could not be obtained,
        * ``label`` is ``"Kosong"`` (score ``"0%"``) when nothing is left
          after preprocessing,
        * otherwise ``label`` is ``"Toxic"`` or ``"Non-Toxic"`` and ``score``
          is the softmax probability of the predicted class formatted as a
          percentage with one decimal (e.g. ``"98.5%"``).
    """
    tokenizer, model = get_model_and_tokenizer(model_key)
    # Compare against None explicitly: the loader signals failure with
    # (None, None), and object truthiness is not a reliable presence check.
    if model is None or tokenizer is None:
        return {"original_text": text, "text_clean": "", "label": "ERROR", "score": "0%"}

    clean_text = preprocess_text(text)
    if not clean_text:
        return {"original_text": text, "text_clean": "", "label": "Kosong", "score": "0%"}

    # Tokenize; inputs longer than 512 tokens are truncated.
    inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)

    label_id = torch.argmax(probs, dim=1).item()
    confidence = probs[0][label_id].item()

    # Translate the class index to the label name declared in the model config,
    # then normalize it to the two labels this module reports.
    predicted_label = model.config.id2label[label_id]

    return {
        "original_text": text,
        "text_clean": clean_text,
        "label": _standardize_label(predicted_label),
        "score": f"{confidence:.1%}",  # percentage format, e.g. 98.5%
    }
def _first_column_texts(df):
    """Return the non-missing values of the first column of ``df`` as strings."""
    # Drop missing cells first; otherwise astype(str) turns them into the
    # literal text "nan", which would then be classified as if a user wrote it.
    return df.iloc[:, 0].dropna().astype(str).tolist()


def process_file(file_obj, model_key, limit=50):
    """Classify the texts contained in an uploaded CSV, Excel or plain-text file.

    The file type is chosen from ``file_obj.filename`` (case-insensitive):
    ``.csv`` and ``.xlsx``/``.xls`` are read with pandas and the texts are
    taken from the first column; anything else is read as UTF-8 text with one
    entry per line.

    Args:
        file_obj: Upload object exposing ``filename`` and, for text files,
            a ``read()`` that returns bytes.
        model_key: Registry key forwarded to ``predict_text``.
        limit: Maximum number of leading entries to consider (default 50, kept
            small so a demo request does not time out). Blank entries inside
            that window are skipped.

    Returns:
        A list of ``predict_text`` results, or ``[]`` if anything fails while
        reading or classifying (the error is printed).
    """
    try:
        filename = file_obj.filename.lower()

        if filename.endswith('.csv'):
            # Assumes the text lives in the first column.
            texts = _first_column_texts(pd.read_csv(file_obj))
        elif filename.endswith(('.xlsx', '.xls')):
            texts = _first_column_texts(pd.read_excel(file_obj))
        else:
            # "utf-8-sig" also accepts plain UTF-8 but strips a leading BOM,
            # which would otherwise end up inside the first line.
            texts = file_obj.read().decode("utf-8-sig").splitlines()

        return [
            predict_text(text, model_key)
            for text in texts[:limit]
            if text.strip()
        ]
    except Exception as e:
        # Best-effort boundary: report the problem and return no results.
        print(f"Error processing file: {e}")
        return []