Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,20 +1,19 @@
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import unicodedata
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
-
import
|
| 5 |
import gradio as gr
|
| 6 |
|
|
|
|
| 7 |
def clean_html(raw_html: str) -> str:
|
| 8 |
-
"""Loại bỏ <img>, <math>, giữ text thuần."""
|
| 9 |
soup = BeautifulSoup(raw_html, "html.parser")
|
| 10 |
-
for img in soup.find_all("img"):
|
| 11 |
-
|
| 12 |
-
for math_tag in soup.find_all("math"):
|
| 13 |
-
math_tag.decompose()
|
| 14 |
return soup.get_text(separator=" ", strip=True)
|
| 15 |
|
| 16 |
def normalize_text(text: str) -> str:
|
| 17 |
-
"""Lowercase, giữ unicode letters & digits, thay ký tự khác thành space."""
|
| 18 |
text = text.lower()
|
| 19 |
chars = []
|
| 20 |
for ch in text:
|
|
@@ -23,42 +22,70 @@ def normalize_text(text: str) -> str:
|
|
| 23 |
chars.append(ch)
|
| 24 |
else:
|
| 25 |
chars.append(" ")
|
| 26 |
-
|
| 27 |
-
# xóa khoảng trắng thừa
|
| 28 |
-
return re.sub(r"\s+", " ", text).strip()
|
| 29 |
|
| 30 |
-
def preprocess(content_html: str) -> str
|
| 31 |
-
|
| 32 |
-
text = normalize_text(text)
|
| 33 |
-
return text
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
text = preprocess(content_html)
|
| 42 |
if not text:
|
| 43 |
-
return "Nội dung
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
return pred
|
| 47 |
|
| 48 |
-
|
| 49 |
-
fn=predict_kc,
|
| 50 |
-
inputs=gr.Textbox(
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
outputs=gr.Label(num_top_classes=1, label="Mã KC dự đoán"),
|
| 55 |
-
title="Naive Bayes KC Predictor",
|
| 56 |
-
description="""
|
| 57 |
-
Nhập nội dung câu hỏi (HTML) và nhấn Submit để nhận về
|
| 58 |
-
mã kiến thức (KC) do mô hình Naive Bayes dự đoán.
|
| 59 |
-
""",
|
| 60 |
-
allow_flagging="never",
|
| 61 |
)
|
| 62 |
|
| 63 |
if __name__ == "__main__":
|
| 64 |
-
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
import json
|
| 3 |
import re
|
| 4 |
import unicodedata
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
+
import numpy as np
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
+
# —— 1. Preprocessing —— #
def clean_html(raw_html: str) -> str:
    """Strip ``<img>`` and ``<math>`` tags from *raw_html*, return plain text.

    Matched tags (and everything inside them) are removed via
    ``decompose()``; the remaining text nodes are joined with single
    spaces and stripped.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    # PEP 8: one statement per line (was a compound `for ...: ...` one-liner).
    for img in soup.find_all("img"):
        img.decompose()
    # `math_tag` rather than `math`, to avoid shadowing the stdlib module name.
    for math_tag in soup.find_all("math"):
        math_tag.decompose()
    return soup.get_text(separator=" ", strip=True)
| 15 |
|
| 16 |
def normalize_text(text: str) -> str:
|
|
|
|
| 17 |
text = text.lower()
|
| 18 |
chars = []
|
| 19 |
for ch in text:
|
|
|
|
| 22 |
chars.append(ch)
|
| 23 |
else:
|
| 24 |
chars.append(" ")
|
| 25 |
+
return re.sub(r"\s+", " ", "".join(chars)).strip()
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
def preprocess(content_html: str) -> str:
    """Full cleaning pipeline: strip HTML markup, then normalize the text."""
    cleaned = clean_html(content_html)
    return normalize_text(cleaned)
|
|
|
|
|
|
| 29 |
|
| 30 |
+
# —— 2. Load the exported vectorizer vocabulary (token -> column index) —— #
with open("vectorizer.json", encoding="utf-8") as fh:
    vect_data = json.load(fh)
vocab = vect_data["vocabulary"]
# If the export also contains IDF weights: idf = np.array(vect_data["idf"])
| 35 |
|
| 36 |
+
# A CountVectorizer-like transform implemented with plain dict lookups + NumPy.
def transform_count(docs, vocabulary=None):
    """Build a dense document-term count matrix.

    Parameters
    ----------
    docs : list[str]
        Preprocessed documents; tokens are obtained with ``str.split()``.
    vocabulary : dict[str, int] | None
        Token -> column-index mapping. Defaults to the module-level
        ``vocab`` loaded from ``vectorizer.json`` (backward compatible:
        existing one-argument callers are unaffected).

    Returns
    -------
    numpy.ndarray
        Shape ``(n_docs, n_features)``, dtype float32. Tokens absent
        from the vocabulary are silently ignored, matching
        scikit-learn's transform-time behavior.
    """
    if vocabulary is None:
        vocabulary = vocab
    X = np.zeros((len(docs), len(vocabulary)), dtype=np.float32)
    for i, doc in enumerate(docs):
        for token in doc.split():
            idx = vocabulary.get(token)
            if idx is not None:
                X[i, idx] += 1.0
    return X
| 51 |
+
|
| 52 |
+
# NOTE: with a TfidfVectorizer export you would additionally apply the stored
# IDF weights before classification; this demo uses raw counts only.

# Load the exported Naive Bayes parameters from JSON (no pickle involved).
with open("nbc_model.json", encoding="utf-8") as fh:
    clf_data = json.load(fh)
classes = np.array(clf_data["classes"])                    # label per class row
class_log_prior = np.array(clf_data["class_log_prior"])    # log P(class)
feature_log_prob = np.array(clf_data["feature_log_prob"])  # log P(token | class)
| 60 |
+
|
| 61 |
+
def predict_nb_count(docs):
    """Classify *docs* with the exported multinomial Naive Bayes model.

    Builds the count matrix, scores every class by its unnormalized log
    posterior, and returns one predicted label per document.
    """
    counts = transform_count(docs)  # shape (n_docs, n_features)
    # log posterior (up to a constant): log P(c) + sum_t count_t * log P(t|c)
    joint_log_likelihood = class_log_prior + counts.dot(feature_log_prob.T)
    winners = np.argmax(joint_log_likelihood, axis=1)
    return classes[winners]
|
| 71 |
+
|
| 72 |
+
# —— 3. Gradio interface —— #
def predict_kc(content_html: str):
    """Gradio handler: raw HTML question content -> predicted KC label."""
    # Guard: nothing was entered at all.
    if not content_html:
        return "Chưa nhập content."
    cleaned = preprocess(content_html)
    # Guard: input reduced to nothing after HTML stripping / normalization.
    if not cleaned:
        return "Nội dung rỗng sau khi xử lý."
    return predict_nb_count([cleaned])[0]
|
|
|
|
| 81 |
|
| 82 |
+
# Build the web UI. PEP 8 fix: no spaces around `=` in keyword arguments.
interface = gr.Interface(
    fn=predict_kc,
    inputs=gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
    outputs=gr.Label(label="KC dự đoán"),
    title="NBC KC Predictor (no-pickle)",
    description="Dự đoán nhãn KC dựa trên Naive Bayes đã export JSON.",
)

if __name__ == "__main__":
    # Launch only when run as a script, not when imported.
    interface.launch()
|