import torch
import torch.nn.functional as F
import re
import string
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

AVAILABLE_MODELS = {
    "toxic_bert": {
        "name": "Dzeisonov/indobert-toxic-classifier",
        "desc": "IndoBERT (Fine-tuned)"
    },
    "toxic_roberta": {
        "name": "Dzeisonov/indoroberta-toxic-classifier",
        "tokenizer_name": "flax-community/indonesian-roberta-base",
        "desc": "IndoRoBERTa (Fine-tuned)"
    },
    "toxic_bertweet": {
        "name": "Exqrch/IndoBERTweet-HateSpeech",
        "tokenizer_name": "indolem/indobertweet-base-uncased",
        "desc": "IndoBERTweet (Baseline Model)"
    }
}

# Global cache of models that have already been loaded
loaded_models = {}


def get_model_and_tokenizer(model_key):
    """
    Lazily load a model and its tokenizer.
    Automatically detects whether a separate tokenizer path is needed.
    """
    # Fall back to toxic_bert if the key is unknown
    if model_key not in AVAILABLE_MODELS:
        model_key = "toxic_bert"

    # Check the cache first
    if model_key in loaded_models:
        return loaded_models[model_key]['tokenizer'], loaded_models[model_key]['model']

    config = AVAILABLE_MODELS[model_key]
    print(f"⏳ Loading new model: {config['desc']} ...")

    try:
        # Fix: take the tokenizer name from 'tokenizer_name' if present;
        # otherwise fall back to the model's own 'name'.
        tokenizer_path = config.get("tokenizer_name", config['name'])
        model_path = config['name']

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)

        # Store in the cache
        loaded_models[model_key] = {'tokenizer': tokenizer, 'model': model}
        print(f"✅ Model {config['desc']} loaded successfully!")
        return tokenizer, model
    except Exception as e:
        print(f"❌ Failed to load model {model_key}: {e}")
        return None, None


def preprocess_text(text):
    """Clean the text before it is fed to the model."""
    if not isinstance(text, str) or not text:
        return ""
    text = text.lower()
    # Remove URLs, usernames, hashtag symbols, and digits
    # (dot in "www\." escaped so it no longer matches any character)
    text = re.sub(r"http\S+|www\.\S+|@\w+|#|\d+", "", text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse repeated whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text


def predict_text(text, model_key):
    """Run a prediction for a single sentence."""
    tokenizer, model = get_model_and_tokenizer(model_key)
    if model is None or tokenizer is None:
        return {"original_text": text, "label": "ERROR", "score": "0%"}

    clean_text = preprocess_text(text)
    if not clean_text:
        return {"original_text": text, "label": "Empty", "score": "0%"}

    # Tokenize
    inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    probs = F.softmax(outputs.logits, dim=-1)
    label_id = torch.argmax(probs, dim=1).item()
    confidence = probs[0][label_id].item()

    # Read the label string from the model config
    predicted_label = model.config.id2label[label_id]

    # Standardize the output label to Toxic / Non-Toxic,
    # covering the different label schemes the models may use
    if str(predicted_label) in ["LABEL_1", "Toxic", "toxic", "1", "Hate Speech"]:
        final_label = "Toxic"
    else:
        final_label = "Non-Toxic"

    return {
        "original_text": text,
        "text_clean": clean_text,
        "label": final_label,
        "score": f"{confidence:.1%}"  # percentage format (e.g. 98.5%)
    }


def process_file(file_obj, model_key):
    """Process an uploaded file (CSV/Excel/TXT)."""
    results = []
    texts = []

    try:
        filename = file_obj.filename.lower()

        # 1. CSV file
        if filename.endswith('.csv'):
            df = pd.read_csv(file_obj)
            # Assume the text is in the first column
            texts = df.iloc[:, 0].astype(str).tolist()
        # 2. Excel file (.xlsx / .xls)
        elif filename.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_obj)
            texts = df.iloc[:, 0].astype(str).tolist()
        # 3. Plain-text file
        else:
            content = file_obj.read().decode("utf-8")
            texts = content.splitlines()

        # Cap at 50 rows so the demo does not time out
        limit = 50
        for text in texts[:limit]:
            if text.strip():
                res = predict_text(text, model_key)
                results.append(res)
    except Exception as e:
        print(f"Error processing file: {e}")
        return []

    return results
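

# --- Usage sketch (illustrative, not part of the original app) ---
# A minimal smoke test, assuming the Hugging Face checkpoints above are
# downloadable. The sample sentence and the FakeUpload helper are hypothetical
# stand-ins for whatever the web framework passes in (e.g. a Flask/Werkzeug
# FileStorage, which likewise exposes .filename and .read()).
if __name__ == "__main__":
    import io

    # Single-sentence prediction
    print(predict_text("selamat pagi semua", "toxic_bert"))

    # Batch prediction from an in-memory "upload" hitting the TXT branch
    class FakeUpload(io.BytesIO):
        filename = "sample.txt"

    upload = FakeUpload("kalimat pertama\nkalimat kedua".encode("utf-8"))
    for row in process_file(upload, "toxic_bert"):
        print(row)  # e.g. {'original_text': ..., 'label': 'Non-Toxic', 'score': '98.5%'}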