Dzeisonov committed on
Commit
e34d011
·
1 Parent(s): d56fb76

Added Baseline Model

Browse files
Files changed (1) hide show
  1. model_utils.py +48 -22
model_utils.py CHANGED
@@ -5,54 +5,75 @@ import string
5
  import pandas as pd
6
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
 
8
- # ==========================================
9
- # KONFIGURASI MODEL
10
- # ==========================================
11
  AVAILABLE_MODELS = {
12
  "toxic_bert": {
13
  "name": "Dzeisonov/indobert-toxic-classifier",
14
- "desc": "IndoBERT (Base) - Toxic Classifier"
15
  },
16
  "toxic_roberta": {
17
  "name": "Dzeisonov/indoroberta-toxic-classifier",
18
- "desc": "IndoRoBERTa (Base) - Toxic Classifier"
 
 
 
 
 
 
19
  }
20
  }
21
 
22
- # Cache global
23
  loaded_models = {}
24
 
25
  def get_model_and_tokenizer(model_key):
26
- """Load model secara lazy loading."""
 
 
 
 
27
  if model_key not in AVAILABLE_MODELS:
28
  model_key = "toxic_bert"
29
 
 
30
  if model_key in loaded_models:
31
  return loaded_models[model_key]['tokenizer'], loaded_models[model_key]['model']
32
 
33
  config = AVAILABLE_MODELS[model_key]
34
- print(f"⏳ Sedang memuat model baru: {config['name']} ...")
35
 
36
  try:
37
- tokenizer = AutoTokenizer.from_pretrained(config['name'])
38
- model = AutoModelForSequenceClassification.from_pretrained(config['name'])
 
 
 
 
 
 
 
 
39
  loaded_models[model_key] = {'tokenizer': tokenizer, 'model': model}
40
- print("✅ Model berhasil dimuat!")
41
  return tokenizer, model
 
42
  except Exception as e:
43
- print(f"❌ Gagal memuat model: {e}")
44
  return None, None
45
 
46
  def preprocess_text(text):
 
47
  if not isinstance(text, str) or not text: return ""
48
  text = text.lower()
 
49
  text = re.sub(r"http\S+|www.\S+|@\w+|#|\d+", "", text)
 
50
  text = text.translate(str.maketrans("", "", string.punctuation))
 
51
  text = re.sub(r"\s+", " ", text).strip()
52
  return text
53
 
54
  def predict_text(text, model_key):
55
- """Prediksi satu kalimat."""
56
  tokenizer, model = get_model_and_tokenizer(model_key)
57
 
58
  if not model or not tokenizer:
@@ -62,6 +83,7 @@ def predict_text(text, model_key):
62
  if not clean_text:
63
  return {"original_text": text, "label": "Kosong", "score": "0%"}
64
 
 
65
  inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=512)
66
 
67
  with torch.no_grad():
@@ -70,10 +92,12 @@ def predict_text(text, model_key):
70
  label_id = torch.argmax(probs, dim=1).item()
71
  confidence = probs[0][label_id].item()
72
 
 
73
  predicted_label = model.config.id2label[label_id]
74
 
75
- # Standarisasi Label (Toxic / Non-Toxic)
76
- if predicted_label in ["LABEL_1", "Toxic", "toxic", "1"]:
 
77
  final_label = "Toxic"
78
  else:
79
  final_label = "Non-Toxic"
@@ -82,11 +106,11 @@ def predict_text(text, model_key):
82
  "original_text": text,
83
  "text_clean": clean_text,
84
  "label": final_label,
85
- "score": f"{confidence:.1%}" # Mengembalikan persentase (misal: 98.5%)
86
  }
87
 
88
  def process_file(file_obj, model_key):
89
- """Memproses file CSV, Excel, atau TXT."""
90
  results = []
91
  texts = []
92
 
@@ -96,20 +120,22 @@ def process_file(file_obj, model_key):
96
  # 1. Jika file CSV
97
  if filename.endswith('.csv'):
98
  df = pd.read_csv(file_obj)
99
- texts = df.iloc[:, 0].astype(str).tolist() # Ambil kolom pertama
 
100
 
101
  # 2. Jika file Excel (.xlsx / .xls)
102
  elif filename.endswith(('.xlsx', '.xls')):
103
  df = pd.read_excel(file_obj)
104
- texts = df.iloc[:, 0].astype(str).tolist() # Ambil kolom pertama
105
 
106
- # 3. Jika file TXT (fallback)
107
  else:
108
  content = file_obj.read().decode("utf-8")
109
  texts = content.splitlines()
110
 
111
- # Batasi 50 baris agar server tidak hang
112
- for text in texts[:50]:
 
113
  if text.strip():
114
  res = predict_text(text, model_key)
115
  results.append(res)
 
5
  import pandas as pd
6
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
 
 
 
 
8
  AVAILABLE_MODELS = {
9
  "toxic_bert": {
10
  "name": "Dzeisonov/indobert-toxic-classifier",
11
+ "desc": "IndoBERT (Fine-tuned)"
12
  },
13
  "toxic_roberta": {
14
  "name": "Dzeisonov/indoroberta-toxic-classifier",
15
+ "tokenizer_name": "flax-community/indonesian-roberta-base",
16
+ "desc": "IndoRoBERTa (Fine-tuned)"
17
+ },
18
+ "toxic_bertweet": {
19
+ "name": "Exqrch/IndoBERTweet-HateSpeech",
20
+ "tokenizer_name": "indolem/indobertweet-base-uncased",
21
+ "desc": "IndoBERTweet (Baseline Model)"
22
  }
23
  }
24
 
25
+ # Cache global untuk menyimpan model yang sudah di-load
26
  loaded_models = {}
27
 
28
  def get_model_and_tokenizer(model_key):
29
+ """
30
+ Load model dan tokenizer secara lazy loading.
31
+ Otomatis mendeteksi apakah perlu path tokenizer khusus atau tidak.
32
+ """
33
+ # Default ke toxic_bert jika key tidak ditemukan
34
  if model_key not in AVAILABLE_MODELS:
35
  model_key = "toxic_bert"
36
 
37
+ # Cek cache dulu
38
  if model_key in loaded_models:
39
  return loaded_models[model_key]['tokenizer'], loaded_models[model_key]['model']
40
 
41
  config = AVAILABLE_MODELS[model_key]
42
+ print(f"⏳ Sedang memuat model baru: {config['desc']} ...")
43
 
44
  try:
45
+ # LOGIKA PERBAIKAN:
46
+ # Ambil nama tokenizer dari 'tokenizer_name' jika ada,
47
+ # jika tidak ada, gunakan 'name' model biasa.
48
+ tokenizer_path = config.get("tokenizer_name", config['name'])
49
+ model_path = config['name']
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
52
+ model = AutoModelForSequenceClassification.from_pretrained(model_path)
53
+
54
+ # Simpan ke cache
55
  loaded_models[model_key] = {'tokenizer': tokenizer, 'model': model}
56
+ print(f"✅ Model {config['desc']} berhasil dimuat!")
57
  return tokenizer, model
58
+
59
  except Exception as e:
60
+ print(f"❌ Gagal memuat model {model_key}: {e}")
61
  return None, None
62
 
63
  def preprocess_text(text):
64
+ """Membersihkan teks sebelum masuk ke model."""
65
  if not isinstance(text, str) or not text: return ""
66
  text = text.lower()
67
+ # Hapus URL, username, hashtag, angka
68
  text = re.sub(r"http\S+|www.\S+|@\w+|#|\d+", "", text)
69
+ # Hapus tanda baca
70
  text = text.translate(str.maketrans("", "", string.punctuation))
71
+ # Hapus spasi berlebih
72
  text = re.sub(r"\s+", " ", text).strip()
73
  return text
74
 
75
  def predict_text(text, model_key):
76
+ """Melakukan prediksi untuk satu kalimat."""
77
  tokenizer, model = get_model_and_tokenizer(model_key)
78
 
79
  if not model or not tokenizer:
 
83
  if not clean_text:
84
  return {"original_text": text, "label": "Kosong", "score": "0%"}
85
 
86
+ # Tokenisasi
87
  inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=512)
88
 
89
  with torch.no_grad():
 
92
  label_id = torch.argmax(probs, dim=1).item()
93
  confidence = probs[0][label_id].item()
94
 
95
+ # Ambil label dari config model
96
  predicted_label = model.config.id2label[label_id]
97
 
98
+ # Standarisasi Label Output (Toxic / Non-Toxic)
99
+ # Menangani berbagai kemungkinan output label dari model yang berbeda
100
+ if str(predicted_label) in ["LABEL_1", "Toxic", "toxic", "1", "Hate Speech"]:
101
  final_label = "Toxic"
102
  else:
103
  final_label = "Non-Toxic"
 
106
  "original_text": text,
107
  "text_clean": clean_text,
108
  "label": final_label,
109
+ "score": f"{confidence:.1%}" # Format persentase (e.g. 98.5%)
110
  }
111
 
112
  def process_file(file_obj, model_key):
113
+ """Memproses file upload (CSV/Excel/TXT)."""
114
  results = []
115
  texts = []
116
 
 
120
  # 1. Jika file CSV
121
  if filename.endswith('.csv'):
122
  df = pd.read_csv(file_obj)
123
+ # Asumsi teks ada di kolom pertama
124
+ texts = df.iloc[:, 0].astype(str).tolist()
125
 
126
  # 2. Jika file Excel (.xlsx / .xls)
127
  elif filename.endswith(('.xlsx', '.xls')):
128
  df = pd.read_excel(file_obj)
129
+ texts = df.iloc[:, 0].astype(str).tolist()
130
 
131
+ # 3. Jika file TXT
132
  else:
133
  content = file_obj.read().decode("utf-8")
134
  texts = content.splitlines()
135
 
136
+ # Batasi maksimal 50 baris untuk demo agar tidak timeout
137
+ limit = 50
138
+ for text in texts[:limit]:
139
  if text.strip():
140
  res = predict_text(text, model_key)
141
  results.append(res)