# KC_Classifier / app.py
# Di12's picture
# Update app.py
# c1c6e2b verified
import json
import re
import unicodedata
from bs4 import BeautifulSoup
import numpy as np
import gradio as gr
def clean_html(raw_html: str) -> str:
    """Strip the markup from an HTML fragment and return its text.

    ``<img>`` and ``<math>`` elements are removed from the parsed tree
    (together with anything nested inside them) before the text is
    extracted.  The remaining text pieces are stripped and joined with
    single spaces.
    """
    document = BeautifulSoup(raw_html, "html.parser")
    # Images are dropped first, then formulas.
    for tag_name in ("img", "math"):
        for element in document.find_all(tag_name):
            element.decompose()
    return document.get_text(separator=" ", strip=True)
def normalize_text(text: str) -> str:
    """Lower-case *text* and reduce it to letters, digits and single spaces.

    The text is lower-cased and brought to Unicode NFC form, so that letters
    written as a base character plus combining marks (e.g. decomposed
    Vietnamese text) are handled as the single letters they represent
    instead of being broken apart.  Every character that is not a Unicode
    letter, a digit or whitespace is then replaced by a space, runs of
    whitespace collapse to one space, and leading/trailing whitespace is
    removed.

    NOTE(review): this assumes the vocabulary was built from composed (NFC)
    text -- confirm against the training pipeline.
    """
    # Compose after lower-casing: combining marks have a category starting
    # with "M" and would otherwise be turned into spaces, splitting words.
    composed = unicodedata.normalize("NFC", text.lower())
    kept = (
        ch
        if (
            unicodedata.category(ch).startswith("L")
            or ch.isdigit()
            or ch.isspace()
        )
        else " "
        for ch in composed
    )
    return re.sub(r"\s+", " ", "".join(kept)).strip()
def preprocess(content_html: str) -> str:
    """Turn raw question HTML into normalized plain text.

    The markup is stripped with ``clean_html`` and the resulting text is
    then passed through ``normalize_text``.
    """
    visible_text = clean_html(content_html)
    return normalize_text(visible_text)
# Load the exported vectorizer data.  ``vocab`` is looked up by token and
# yields the column index of that token in the count matrix.
with open("vectorizer.json", "r", encoding="utf-8") as f:
    vect_data = json.load(f)
vocab = vect_data["vocabulary"]
# CountVectorizer-like transform implemented directly on top of ``vocab``.
def transform_count(docs):
    """Build a bag-of-words count matrix for *docs*.

    docs: list of preprocessed strings; each one is split on whitespace.
    return: 2D numpy array (n_docs x n_features) of float32 counts, where
    n_features is ``len(vocab)``.  Tokens that are not in ``vocab`` are
    ignored.
    """
    counts = np.zeros((len(docs), len(vocab)), dtype=np.float32)
    for row, text in enumerate(docs):
        for word in text.split():
            col = vocab.get(word)
            if col is None:
                continue  # out-of-vocabulary token
            counts[row, col] += 1.0
    return counts
# Load the exported Naive Bayes parameters and keep them as numpy arrays.
# NOTE(review): feature_log_prob is presumably (n_classes x n_features),
# since it is transposed before being multiplied with the count matrix.
with open("nbc_model.json", "r", encoding="utf-8") as f:
    clf_data = json.load(f)
classes = np.asarray(clf_data["classes"])
class_log_prior = np.asarray(clf_data["class_log_prior"])
feature_log_prob = np.asarray(clf_data["feature_log_prob"])
def predict_nb_count(docs):
    """Predict a class label for every document in *docs*.

    docs: list of preprocessed strings; ``transform_count`` turns them into
    a doc-term count matrix that is scored with the Naive Bayes log
    probabilities.
    return: numpy array holding one entry of ``classes`` per document --
    the class with the highest score.
    """
    counts = transform_count(docs)  # shape (n_docs, n_feats)
    # Log posterior score: log_prior + X @ feature_log_prob.T
    scores = class_log_prior + counts.dot(feature_log_prob.T)
    best = scores.argmax(axis=1)
    return classes[best]
def predict_kc(content_html: str):
    """Predict the KC label for a single question given as HTML.

    Returns a fixed Vietnamese notice instead of a label when the input is
    empty, or when nothing is left of it after preprocessing.
    """
    if not content_html:
        return "Chưa nhập nội dung câu hỏi."

    cleaned = preprocess(content_html)
    if not cleaned:
        return "Nội dung rỗng sau khi xử lý."

    # The classifier works on a list of documents: wrap the single text and
    # take the first (only) prediction.
    predictions = predict_nb_count([cleaned])
    return predictions[0]
# Extra CSS: 18px font for textareas and for the output text.
css = """
textarea { font-size: 18px !important; }
.gradio-container .output-text { font-size: 18px !important; }
"""

# One text box in, one text box out, wired to ``predict_kc``.
interface = gr.Interface(
    fn=predict_kc,
    inputs=gr.Textbox(
        lines=6,
        placeholder="Dán nội dung câu hỏi dạng HTML",
        label="Nội dung câu hỏi ",
        elem_id="input-box",
    ),
    outputs=gr.Textbox(label="KC dự đoán", elem_id="output-box"),
    title="Demo dự đoán KC",
    description="Dự đoán nhãn KC từ câu hỏi dựa trên Naive Bayes.",
    css=css,
    allow_flagging="never",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()