Hoax Detection SahihAI 📰🤖
Sahih AI
Proyek ini bertujuan untuk mendeteksi apakah sebuah berita kemungkinan benar atau hoax menggunakan model deep learning berbasis TensorFlow. Model dan tokenizer di-host di 🤗 Hugging Face Hub.
Fitur
- Preprocessing teks (normalisasi slang, penghapusan stopwords, pembersihan tanda baca).
- Konversi emoji menjadi representasi kata.
- Tokenisasi & padding otomatis sebelum inference.
- Model klasifikasi biner (0 = Berita Benar, 1 = Berita Hoax).
Instalasi
Clone model:
git clone https://huggingface.co/ludyhasby/hoax_sahih_AI
cd hoax_sahih_AI
pip install -r requirements.txt
Inference
Berikut adalah contoh penggunaan model
1. Import Library yang diperlukan, packages statics sudah kami masukkan kedalam direktori
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import hf_hub_download
from statics import slang_dict2, stop_words, emoji_dict, max_length
import re
2. Deklarasikan beberapa fungsi penting
def load_important(token_hf):
    """Fetch the hoax-detection model and its tokenizer from the Hugging Face Hub.

    Args:
        token_hf: Hugging Face access token used when downloading the model file.

    Returns:
        Tuple ``(model, tokenizer)`` — the compiled-off Keras model and the
        fitted tokenizer unpickled from the repo.
    """
    # --- Step 1. Load Model ---
    model_file = hf_hub_download(
        repo_id="ludyhasby/hoax_sahih_AI",
        filename="hoax_detection.h5",
        token=token_hf,  # supply your own access token
    )
    loaded_model = tf.keras.models.load_model(model_file, compile=False)

    # --- Step 2. Load Tokenizer ---
    tokenizer_file = hf_hub_download(
        repo_id="ludyhasby/hoax_sahih_AI",
        filename="tokenizer_hoax.pickle",
    )
    with open(tokenizer_file, "rb") as fh:
        fitted_tokenizer = pickle.load(fh)

    return loaded_model, fitted_tokenizer
# --- Step 3. Preprocessing Things ---
def replace_emot(teks, emoji_dict):
    """Substitute each emoji in *teks* with a space plus its Indonesian tag.

    ``emoji_dict`` is a column-oriented mapping with parallel "Emoji" and
    "tag_indo" entries sharing the same index keys.
    """
    emojis = emoji_dict["Emoji"]
    tags = emoji_dict["tag_indo"]
    for idx in emojis:
        # Leading space keeps the tag separated from the preceding word.
        teks = teks.replace(emojis[idx], f" {tags[idx]}")
    return teks
def normalize_slang(text, slang_dict):
    """Replace slang tokens in *text* with their formal equivalents.

    Args:
        text: Input sentence.
        slang_dict: Column-oriented mapping with parallel "original" and
            "replacement" entries sharing the same index keys.

    Returns:
        The sentence with every whole-word slang occurrence normalized.
    """
    # Build a direct slang -> replacement lookup once, instead of rescanning
    # the entire slang table for every word (O(words + slang) vs O(words * slang)).
    lookup = {
        original: slang_dict["replacement"][idx]
        for idx, original in slang_dict["original"].items()
    }
    # BUG FIX: the old str.replace() approach substituted *substrings*, so a
    # slang entry like "gk" would also corrupt words such as "mengkaji".
    # Mapping token-by-token only replaces exact whole-word matches.
    return " ".join(lookup.get(word, word) for word in text.split())
def teks_to_pad(teks, trunc_type="post", pad_type="post"):
    """Tokenize a list of texts and pad/truncate them to the model's length.

    Args:
        teks: List of preprocessed text strings.
        trunc_type: Where to truncate sequences longer than ``max_length``
            ("pre" or "post").
        pad_type: Where to pad sequences shorter than ``max_length``
            ("pre" or "post").

    Returns:
        2-D array of shape ``(len(teks), max_length)`` ready for model input.
    """
    # BUG FIX: trunc_type and pad_type were previously undefined globals
    # (NameError at call time — they are not imported from statics on the
    # import line); they are now keyword parameters with sane defaults.
    teks_seq = tokenizer.texts_to_sequences(teks)
    teks_pad = pad_sequences(
        teks_seq, maxlen=max_length, truncating=trunc_type, padding=pad_type
    )
    return teks_pad
def preprocessing(text, emot_f, slang_dict, STOP_PREP):
    """Clean raw news text and convert it into a padded token sequence.

    Pipeline: emoji -> word tags, punctuation -> space, lowercase, drop the
    literal token "username", normalize slang, remove stopwords, strip
    digits, collapse spaces, then tokenize and pad.

    Args:
        text: Raw news text.
        emot_f: Emoji mapping forwarded to ``replace_emot``.
        slang_dict: Slang mapping forwarded to ``normalize_slang``.
        STOP_PREP: Collection of stopwords to remove (lowercase entries).

    Returns:
        Padded sequence array for a single sample, shape ``(1, max_length)``.
    """
    text = replace_emot(text, emot_f)
    text = re.sub(r'[^\w\s]', ' ', text)   # punctuation -> space
    text = text.lower()
    text = re.sub(r'username', '', text)   # drop anonymized mention placeholder
    text = normalize_slang(text, slang_dict)
    # BUG FIX: removed a leftover debug print(text) that leaked every input
    # to stdout during inference.
    # Split into words and drop stopwords.
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in STOP_PREP]
    text = ' '.join(filtered_words)
    text = re.sub(r"\d+", "", text)        # strip digits
    text = re.sub(r'[ ]+', ' ', text)      # collapse repeated spaces
    # Tokenizer + padding
    padded = teks_to_pad([text])
    return padded
def decode_label(encode):
    """Map a binary class code to its Indonesian verdict string.

    0 -> likely true news, 1 -> likely hoax, anything else -> out of bound.
    Uses ``==`` comparisons (not a dict lookup) so single-element arrays,
    as produced by the model, also work.
    """
    if encode == 0:
        return "Berita Kemungkinan Benar"
    if encode == 1:
        return "Berita Kemungkinan Salah [Hoax]"
    return "Out of Bound !"
def main_inference(teks, emoji_dict, slang_dict, STOP_PREP):
    """Run the full hoax-detection pipeline on one news text and print the verdict.

    Args:
        teks: Raw news text to classify.
        emoji_dict: Emoji -> Indonesian tag mapping.
        slang_dict: Slang -> formal word mapping.
        STOP_PREP: Stopword collection.

    Side effects:
        Prints the decoded label and the confidence of the predicted class.
    """
    # BUG FIX: the global slang_dict2 was previously passed here instead of
    # the slang_dict parameter, silently ignoring the caller's argument.
    padnya = preprocessing(teks, emoji_dict, slang_dict, STOP_PREP)
    pred = model.predict(padnya)
    bin_result = (pred >= 0.5).astype(int)
    # Report the confidence of the *predicted* class, not the raw sigmoid.
    probability = pred[0][0] if bin_result == 1 else 1 - pred[0][0]
    print(decode_label(bin_result))
    print(probability)
3. Contoh Penggunaan
# Prompt for a news article and run hoax detection on it.
# NOTE(review): this example never calls load_important(), yet
# main_inference reads the globals `model` and `tokenizer` — presumably
# `model, tokenizer = load_important(token)` must run first; verify.
berita = input("Silahkan masukkan berita Anda: ")
main_inference(berita, emoji_dict, slang_dict2, stop_words)
4. Struktur Model
.
├── hoax_detection.h5        # Model terlatih
├── tokenizer_hoax.pickle    # Tokenizer
├── hoax_inference.py        # Script Contoh Penggunaan Siap Gunakan (Lokal)
├── requirements.txt
├── statics.py               # Modul Statis yang diperlukan saat proses inferensia
├── logo_sahihAI.png
└── README.md
Copyright Ludy Hasby Aulia @ 2025