Spaces:
Sleeping
Sleeping
File size: 2,868 Bytes
import json
import re
import unicodedata
from bs4 import BeautifulSoup
import numpy as np
import gradio as gr
def clean_html(raw_html: str) -> str:
    """Parse *raw_html*, drop all <img> and <math> elements, and return the remaining plain text."""
    soup = BeautifulSoup(raw_html, "html.parser")
    for tag_name in ("img", "math"):
        for tag in soup.find_all(tag_name):
            tag.decompose()
    # Space separator keeps adjacent text nodes from fusing into one token.
    return soup.get_text(separator=" ", strip=True)
def normalize_text(text: str) -> str:
    """Lowercase *text*, keep only letters/digits/whitespace, collapse runs of whitespace.

    Every character that is neither a Unicode letter (category L*), a digit,
    nor whitespace is replaced by a single space before collapsing.
    """
    lowered = text.lower()
    kept = "".join(
        c
        if unicodedata.category(c).startswith("L") or c.isdigit() or c.isspace()
        else " "
        for c in lowered
    )
    return re.sub(r"\s+", " ", kept).strip()
def preprocess(content_html: str) -> str:
    """Full preprocessing pipeline: strip HTML markup, then normalize the text."""
    cleaned = clean_html(content_html)
    return normalize_text(cleaned)
# Load the CountVectorizer state exported to JSON at startup; only the
# vocabulary (token -> column index mapping) is used at inference time.
with open("vectorizer.json", encoding="utf-8") as f:
    vect_data = json.load(f)
vocab = vect_data["vocabulary"]
# Implement CountVectorizer-like transform:
def transform_count(docs):
    """Build a document-term count matrix from preprocessed documents.

    docs: list of preprocessed (whitespace-tokenizable) strings.
    Returns a (n_docs, n_features) float32 numpy array of raw token counts;
    tokens missing from the vocabulary are silently ignored.
    """
    X = np.zeros((len(docs), len(vocab)), dtype=np.float32)
    lookup = vocab.get  # hoist the bound method out of the inner loop
    for row, doc in enumerate(docs):
        for word in doc.split():
            col = lookup(word)
            if col is not None:
                X[row, col] += 1.0
    return X
# Load the exported Naive Bayes parameters — presumably serialized from a
# scikit-learn MultinomialNB (classes_, class_log_prior_, feature_log_prob_);
# TODO confirm against the export script.
with open("nbc_model.json", encoding="utf-8") as f:
    clf_data = json.load(f)
classes = np.array(clf_data["classes"])
class_log_prior = np.array(clf_data["class_log_prior"])
feature_log_prob = np.array(clf_data["feature_log_prob"])
def predict_nb_count(docs):
    """Predict a class label per document with multinomial Naive Bayes in log space.

    docs: list of preprocessed strings.
    Returns an array of predicted labels (one per document).
    """
    counts = transform_count(docs)  # (n_docs, n_feats) term-count matrix
    # Log posterior per class: log prior + sum of per-token log likelihoods.
    scores = counts.dot(feature_log_prob.T) + class_log_prior
    best = np.argmax(scores, axis=1)
    return classes[best]
def predict_kc(content_html: str):
    """Gradio handler: return the predicted KC label for an HTML question body.

    Returns a Vietnamese status message when the input is empty or becomes
    empty after preprocessing.
    """
    if not content_html:
        return "Chưa nhập nội dung câu hỏi."
    cleaned = preprocess(content_html)
    if not cleaned:
        return "Nội dung rỗng sau khi xử lý."
    return predict_nb_count([cleaned])[0]
# Inline CSS: enlarge the input textarea and output text for the demo UI.
css = """
textarea { font-size: 18px !important; }
.gradio-container .output-text { font-size: 18px !important; }
"""
# Build the Gradio UI: one HTML-question textbox in, one predicted-label textbox out.
interface = gr.Interface(
    fn=predict_kc,
    inputs=gr.Textbox(
        lines=6,
        placeholder="Dán nội dung câu hỏi dạng HTML",
        label="Nội dung câu hỏi",  # fixed: stray trailing space in the visible label
        elem_id="input-box",
    ),
    outputs=gr.Textbox(label="KC dự đoán", elem_id="output-box"),
    title="Demo dự đoán KC",
    description="Dự đoán nhãn KC từ câu hỏi dựa trên Naive Bayes.",
    css=css,
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x (renamed to
    # flagging_mode) — confirm which Gradio version this Space pins.
    allow_flagging="never",
)
if __name__ == "__main__":
    # Start the Gradio server (blocks until the app is stopped).
    interface.launch()