Spaces:

Di12
/

KC_Classifier

Sleeping

App Files Files Community

Di12 committed on Jun 11, 2025

Commit

ca66ee2

1 Parent(s): 462f32f

Create app.py

Browse files

Files changed (1) hide show

app.py +74 -0

app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+# app.py
+import re
+import unicodedata
+from bs4 import BeautifulSoup
+import joblib
+import gradio as gr
+# —— 1. Preprocess functions —— #
+def clean_html(raw_html: str) -> str:
+    """Loại bỏ <img>, <math>, giữ text thuần."""
+    soup = BeautifulSoup(raw_html, "html.parser")
+    for img in soup.find_all("img"):
+        img.decompose()
+    for math_tag in soup.find_all("math"):
+        math_tag.decompose()
+    return soup.get_text(separator=" ", strip=True)
+def normalize_text(text: str) -> str:
+    """Lowercase, giữ unicode letters & digits, thay ký tự khác thành space."""
+    text = text.lower()
+    chars = []
+    for ch in text:
+        cat = unicodedata.category(ch)
+        if cat.startswith("L") or ch.isdigit() or ch.isspace():
+            chars.append(ch)
+        else:
+            chars.append(" ")
+    text = "".join(chars)
+    # xóa khoảng trắng thừa
+    return re.sub(r"\s+", " ", text).strip()
+def preprocess(content_html: str) -> str:
+    """Pipeline: HTML → clean → normalize"""
+    text = clean_html(content_html)
+    text = normalize_text(text)
+    return text
+# —— 2. Load vectorizer & model —— #
+vect = joblib.load("vectorizer.joblib")
+clf  = joblib.load("nbc_model.joblib")
+# —— 3. Inference function —— #
+def predict_kc(content_html: str) -> str:
+    """
+    Nhận HTML content, trả về mã KC dự đoán.
+    Nếu bỏ trống hoặc không parse được, trả về thông báo.
+    """
+    if not content_html or not isinstance(content_html, str):
+        return "Không có input hợp lệ."
+    text = preprocess(content_html)
+    if not text:
+        return "Nội dung không chứa ký tự để dự đoán."
+    Xv = vect.transform([text])
+    pred = clf.predict(Xv)[0]
+    return pred
+# —— 4. Xây dựng giao diện Gradio —— #
+demo = gr.Interface(
+    fn=predict_kc,
+    inputs=gr.Textbox(
+        lines=6,
+        placeholder="Dán HTML Content (có thể kèm <p>, <img>, <math>) vào đây…"
+    ),
+    outputs=gr.Label(num_top_classes=1, label="Mã KC dự đoán"),
+    title="Naive Bayes KC Predictor",
+    description="""
+        Nhập nội dung câu hỏi (HTML) và nhấn Submit để nhận về
+        mã kiến thức (KC) do model Naive Bayes dự đoán.
+    """,
+    allow_flagging="never",
+)
+if __name__ == "__main__":
+    demo.launch()