"""Gradio demo that predicts a KC label for a question using Naive Bayes.

The vocabulary and the classifier parameters are read from JSON files at
import time; prediction is done with plain NumPy.
"""

import json
import re
import unicodedata

from bs4 import BeautifulSoup
import numpy as np
import gradio as gr


def clean_html(raw_html: str) -> str:
    """Return the visible text of *raw_html* with <img> and <math> removed."""
    soup = BeautifulSoup(raw_html, "html.parser")
    # Images first, then math, each dropped together with its subtree.
    for tag_name in ("img", "math"):
        for node in soup.find_all(tag_name):
            node.decompose()
    return soup.get_text(separator=" ", strip=True)


def _is_kept(ch: str) -> bool:
    """Tell whether *ch* is a letter (Unicode category L*), digit or space."""
    return (
        unicodedata.category(ch).startswith("L")
        or ch.isdigit()
        or ch.isspace()
    )


def normalize_text(text: str) -> str:
    """Lowercase *text* and reduce it to letters, digits and single spaces.

    Every other character becomes a space; runs of whitespace are then
    collapsed and the result is stripped.
    """
    lowered = text.lower()
    filtered = "".join(ch if _is_kept(ch) else " " for ch in lowered)
    return re.sub(r"\s+", " ", filtered).strip()


def preprocess(content_html: str) -> str:
    """Strip the HTML from *content_html*, then normalize the text."""
    plain = clean_html(content_html)
    return normalize_text(plain)


with open("vectorizer.json", encoding="utf-8") as f:
    vect_data = json.load(f)

# Token -> column index mapping used by transform_count.
vocab = vect_data["vocabulary"]


# CountVectorizer-like transform.
def transform_count(docs):
    """Build the document-term count matrix for *docs*.

    docs: list of preprocessed strings, tokenized on whitespace.
    return: 2D float32 numpy array (n_docs x n_features); tokens that are
        not in the vocabulary are ignored.
    """
    counts = np.zeros((len(docs), len(vocab)), dtype=np.float32)
    for row, doc in enumerate(docs):
        for col in map(vocab.get, doc.split()):
            if col is None:
                # Out-of-vocabulary token.
                continue
            counts[row, col] += 1.0
    return counts


with open("nbc_model.json", encoding="utf-8") as f:
    clf_data = json.load(f)

classes = np.array(clf_data["classes"])
class_log_prior = np.array(clf_data["class_log_prior"])
feature_log_prob = np.array(clf_data["feature_log_prob"])


def predict_nb_count(docs):
    """Predict a label for each document with the log-probability NB model.

    docs: list of preprocessed strings.
    return: numpy array holding one entry of ``classes`` per document.
    """
    counts = transform_count(docs)  # shape (n_docs, n_feats)
    # Log posterior (up to a constant): log prior + counts @ log-probs^T.
    log_likelihood = counts.dot(feature_log_prob.T)
    scores = class_log_prior + log_likelihood
    best = np.argmax(scores, axis=1)
    return classes[best]


def predict_kc(content_html: str):
    """Predict the KC label for a question given as HTML.

    Returns a user-facing message instead of a label when the input is
    empty or nothing is left after preprocessing.
    """
    if not content_html:
        return "Chưa nhập nội dung câu hỏi."

    cleaned = preprocess(content_html)
    if cleaned:
        return predict_nb_count([cleaned])[0]
    return "Nội dung rỗng sau khi xử lý."


# Adjacent literals keep the stylesheet text identical to the one-line form.
css = (
    " textarea { font-size: 18px !important; }"
    " .gradio-container .output-text { font-size: 18px !important; } "
)

interface = gr.Interface(
    fn=predict_kc,
    inputs=gr.Textbox(
        lines=6,
        placeholder="Dán nội dung câu hỏi dạng HTML",
        label="Nội dung câu hỏi ",
        elem_id="input-box",
    ),
    outputs=gr.Textbox(label="KC dự đoán", elem_id="output-box"),
    title="Demo dự đoán KC",
    description="Dự đoán nhãn KC từ câu hỏi dựa trên Naive Bayes.",
    css=css,
    allow_flagging="never",
)

if __name__ == "__main__":
    interface.launch()