rdsarjito committed on
Commit
3ab25d3
Β·
1 Parent(s): feaf663
Files changed (1) hide show
  1. app.py +54 -77
app.py CHANGED
@@ -1,107 +1,84 @@
1
  import streamlit as st
2
  import torch
3
- import re
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
- import requests
6
- from bs4 import BeautifulSoup
7
- import os
 
 
 
 
 
8
 
9
- # === Konfigurasi Umum ===
10
- MODEL_PATH = 'model/alergen_model.pt' # Pastikan path dan nama file model benar
11
- LABELS = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
12
- MAX_LEN = 128
13
- DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
14
 
15
- # === Load Model & Tokenizer ===
16
  @st.cache_resource
17
  def load_model():
18
- tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
19
- model = AutoModelForSequenceClassification.from_pretrained(
20
- "indobenchmark/indobert-base-p2",
21
- num_labels=len(LABELS),
22
- problem_type="multi_label_classification"
23
- )
24
-
25
- # Load state dict dan target columns
26
- state = torch.load(MODEL_PATH, map_location=DEVICE)
27
- model.load_state_dict(state['model_state_dict'])
28
- target_columns = state['target_columns'] # Simpan target_columns
29
-
30
- model.to(DEVICE)
31
  model.eval()
32
-
 
 
33
  return tokenizer, model, target_columns
34
 
35
- # === Cleaning Teks ===
36
  def clean_text(text):
37
  text = text.replace('--', ' ')
38
  text = re.sub(r"http\S+", "", text)
39
- text = re.sub('\n', ' ', text)
40
  text = re.sub("[^a-zA-Z0-9\s]", " ", text)
41
  text = re.sub(" {2,}", " ", text)
42
- return text.lower().strip()
43
-
44
- # === Scrape dari Cookpad ===
45
- def scrape_ingredients(url):
46
- try:
47
- headers = {'User-Agent': 'Mozilla/5.0'}
48
- r = requests.get(url, headers=headers)
49
- soup = BeautifulSoup(r.content, 'html.parser')
50
- ingredients_div = soup.find('div', id='ingredients')
51
- if ingredients_div:
52
- return ingredients_div.get_text(separator=' ')
53
- except:
54
- return None
55
 
56
- # === Prediksi Alergen ===
57
- def predict_alergen(text, tokenizer, model, target_columns, threshold):
58
- text = clean_text(text)
59
  encoding = tokenizer.encode_plus(
60
- text,
61
  add_special_tokens=True,
62
- max_length=MAX_LEN,
63
  truncation=True,
64
- padding='max_length',
65
- return_tensors='pt'
66
  )
67
- input_ids = encoding['input_ids'].to(DEVICE)
68
- attention_mask = encoding['attention_mask'].to(DEVICE)
 
69
 
70
  with torch.no_grad():
71
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
72
- probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]
 
73
 
74
- return {label: float(prob) for label, prob in zip(target_columns, probs)}
 
75
 
76
- # === UI Streamlit ===
77
- st.set_page_config(page_title="Deteksi Alergen IndoBERT", page_icon="🍲")
78
- st.title("🍲 Deteksi Alergen dari Resep Cookpad (IndoBERT)")
79
 
80
  tokenizer, model, target_columns = load_model()
81
 
82
- input_mode = st.radio("Pilih input:", ["Teks Manual", "URL Cookpad"])
83
- if input_mode == "Teks Manual":
84
- user_input = st.text_area("πŸ“ Masukkan bahan makanan:")
85
- else:
86
- url = st.text_input("πŸ”— Masukkan URL Cookpad:")
87
- user_input = ""
88
- if url:
89
- scraped = scrape_ingredients(url)
90
- if scraped:
91
- user_input = scraped
92
- st.success("βœ… Berhasil mengambil bahan dari URL")
93
- st.text_area("πŸ“‹ Bahan dari URL:", value=user_input, height=200)
94
- else:
95
- st.error("❌ Gagal mengambil data dari URL.")
96
-
97
- threshold = st.slider("🎚 Threshold (default 0.5):", 0.0, 1.0, 0.5)
98
 
99
- if st.button("πŸš€ Prediksi"):
100
- if user_input.strip():
101
- result = predict_alergen(user_input, tokenizer, model, target_columns, threshold)
102
- st.subheader("πŸ“Š Hasil Prediksi Alergen:")
103
- for label, prob in result.items():
104
- status = "βœ… Ada" if prob >= threshold else "❌ Tidak Ada"
105
- st.write(f"- **{label}**: {status} ({prob:.2f})")
106
  else:
107
- st.warning("⚠️ Masukkan teks bahan atau URL terlebih dahulu.")
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import torch
3
+ import torch.nn as nn
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
+ import re
6
+
7
# ----- Define model class -----
class MultilabelBertClassifier(nn.Module):
    """IndoBERT sequence classifier with a freshly initialized multi-label head.

    The pretrained classifier layer is swapped for a new ``nn.Linear`` so the
    checkpoint's own head weights can be loaded on top of the backbone.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        backbone = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )
        # Replace the stock head with an untrained linear layer sized for our labels.
        backbone.classifier = nn.Linear(backbone.config.hidden_size, num_labels)
        self.bert = backbone

    def forward(self, input_ids, attention_mask):
        # Return raw logits; callers apply sigmoid for multi-label probabilities.
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits
 
 
17
 
18
# ----- Load model and tokenizer -----
@st.cache_resource
def load_model():
    """Load the fine-tuned allergen checkpoint and its tokenizer (cached).

    Returns:
        Tuple ``(tokenizer, model, target_columns)`` where ``target_columns``
        is the label list stored inside the checkpoint.
    """
    checkpoint_path = "model/alergen_model.pt"
    # NOTE(review): torch.load unpickles arbitrary objects — only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    labels = checkpoint["target_columns"]

    classifier = MultilabelBertClassifier(
        "indobenchmark/indobert-base-p1", num_labels=len(labels)
    )
    classifier.load_state_dict(checkpoint["model_state_dict"])
    classifier.eval()  # inference mode: disable dropout

    tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

    return tokenizer, classifier, labels
32
 
33
# ----- Preprocessing function -----
def clean_text(text):
    """Normalize raw recipe text for tokenization.

    Drops URLs, replaces every character that is not alphanumeric or
    whitespace with a space, collapses runs of spaces, and lowercases.

    Args:
        text: Raw ingredient text (may contain newlines and URLs).

    Returns:
        The cleaned, lowercased, stripped string.
    """
    text = text.replace('--', ' ')
    # Remove URLs before stripping punctuation, so "http" fragments don't survive.
    text = re.sub(r"http\S+", "", text)
    text = text.replace("\n", " ")
    # Raw strings: "\s" inside a plain literal is an invalid escape sequence
    # (DeprecationWarning today, SyntaxError in a future Python).
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip().lower()
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
# ----- Prediction function -----
def predict_alergens(text, tokenizer, model, target_columns, max_length=128, threshold=0.5):
    """Predict which allergens are present in a recipe text.

    Args:
        text: Raw ingredient text; normalized internally via ``clean_text``.
        tokenizer: HuggingFace tokenizer providing ``encode_plus``.
        model: Callable returning logits of shape (1, len(target_columns)).
        target_columns: Label names, in the model's output order.
        max_length: Token length to pad/truncate to.
        threshold: Sigmoid probability above which a label counts as present.
            Previously hard-coded to 0.5; the default preserves old behavior.

    Returns:
        Dict mapping each label name to a bool (True = allergen detected).
    """
    cleaned = clean_text(text)
    encoding = tokenizer.encode_plus(
        cleaned,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
        padding='max_length'
    )
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    with torch.no_grad():  # inference only: skip autograd bookkeeping
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(outputs)
        flags = (probs > threshold).squeeze(0).tolist()

    # zip pairs labels with predictions without manual indexing.
    return {label: bool(flag) for label, flag in zip(target_columns, flags)}
64
 
65
# ----- Streamlit App UI -----
st.title("Deteksi Alergen dari Resep")

tokenizer, model, target_columns = load_model()

# Input form: text area plus submit button, so the model runs only on submit.
with st.form("alergen_form"):
    input_text = st.text_area("Masukkan daftar bahan (ingredients):", height=200)
    submitted = st.form_submit_button("Deteksi Alergen")

if submitted:
    if not input_text.strip():
        # Empty submission: prompt for input instead of running the model.
        st.warning("Mohon masukkan teks bahan terlebih dahulu.")
    else:
        results = predict_alergens(input_text, tokenizer, model, target_columns)
        st.subheader("Hasil Deteksi Alergen:")
        for alergi, status in results.items():
            # Red banner when the allergen is detected, green when safe.
            if status:
                st.error(f"- {alergi.capitalize()}")
            else:
                st.success(f"- {alergi.capitalize()}: Aman")